diff --git a/HISTORY b/HISTORY
index 6fbe8b15..1bf78664 100644
--- a/HISTORY
+++ b/HISTORY
@@ -2,6 +2,8 @@
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
repmgr: add --version-number command line option (Ian)
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
+ repmgr: cluster show - differentiate between unreachable nodes
+ and nodes which are running but rejecting connections (Ian)
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
repmgr: prevent potential race condition in "standby switchover"
diff --git a/dbutils.h b/dbutils.h
index ef94b84c..ecfcb7cf 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -80,7 +80,8 @@ typedef enum
NODE_STATUS_UP,
NODE_STATUS_SHUTTING_DOWN,
NODE_STATUS_DOWN,
- NODE_STATUS_UNCLEAN_SHUTDOWN
+ NODE_STATUS_UNCLEAN_SHUTDOWN,
+ NODE_STATUS_REJECTED
} NodeStatus;
typedef enum
diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml
index 91672880..e675ddec 100644
--- a/doc/appendix-release-notes.sgml
+++ b/doc/appendix-release-notes.sgml
@@ -88,6 +88,18 @@ REPMGRD_OPTS="--daemonize=false"
+
+
+ repmgr cluster show:
+ differentiate between unreachable nodes and nodes which are running but rejecting connections.
+
+
+ This makes it possible to see whether a node is unreachable at network level,
+ or if it is running but rejecting connections for some reason.
+
+
+
+
Add --dry-run to repmgr standby promote (GitHub #522).
diff --git a/doc/repmgr-cluster-show.sgml b/doc/repmgr-cluster-show.sgml
index ce43ff34..997958ab 100644
--- a/doc/repmgr-cluster-show.sgml
+++ b/doc/repmgr-cluster-show.sgml
@@ -22,6 +22,14 @@
directly and can be run on any node in the cluster; this is also useful when analyzing
connectivity from a particular node.
+
+
+ Node availability is tested by connecting from the node where
+ repmgr cluster show is executed, and does not necessarily imply the node
+ is down. See repmgr cluster matrix and repmgr cluster crosscheck to get
+ better overviews of connections between nodes.
+
+
@@ -55,30 +63,48 @@
Notes
The column Role shows the expected server role according to the
- &repmgr; metadata. Status shows whether the server is running or unreachable.
+ &repmgr; metadata.
+
+
+ Status shows whether the server is running or unreachable.
If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually
- promoted to primary, this will be highlighted with an exclamation mark, e.g.:
+ promoted to primary, this will be highlighted with an exclamation mark.
+ If a connection to the node cannot be made, this will be highlighted with a question mark.
+ Note that the node will only be shown as ? unreachable
+ if a connection is not possible at network level; if the PostgreSQL instance on the
+ node is pingable but not accepting connections, it will be shown as ? running.
+
+
+ In the following example, executed on node3, node1 is not reachable
+ at network level and assumed to be down; node2 has been promoted to primary
+ (but node3 is not attached to it, and its metadata has not yet been updated);
+ node4 is running but rejecting connections (from node3 at least).
- $ repmgr -f /etc/repmgr.conf cluster show
+ ID | Name | Role | Status | Upstream | Location | Priority | Connection string
+ ----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
+ 1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
+ 2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
+ 3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr
+ 4 | node4 | standby | ? running | node1 | default | 100 | host=db_node4 dbname=repmgr user=repmgr
- ID | Name | Role | Status | Upstream | Location | Priority | Connection string
- ----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
- 1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
- 2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
- 3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr
-
- WARNING: following issues were detected
- - unable to connect to node "node1" (ID: 1)
- - node "node1" (ID: 1) is registered as an active primary but is unreachable
- - node "node2" (ID: 2) is registered as standby but running as primary
+ WARNING: following issues were detected
+ - unable to connect to node "node1" (ID: 1)
+ - node "node1" (ID: 1) is registered as an active primary but is unreachable
+ - node "node2" (ID: 2) is registered as standby but running as primary
+ - unable to connect to node "node4" (ID: 4)
HINT: execute with --verbose option to see connection error messages
-
- Node availability is tested by connecting from the node where
- repmgr cluster show is executed, and does not necessarily imply the node
- is down. See and to get
- better overviews of connections between nodes.
-
+
+ To diagnose connection issues, execute repmgr cluster show
+ with the --verbose option; this will display the error message
+ for each failed connection attempt.
+
+
+
+ Use repmgr cluster matrix and repmgr cluster crosscheck
+ to diagnose connection issues across the whole replication cluster.
+
+
diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c
index d37d7cc9..2c2e6026 100644
--- a/repmgr-action-cluster.c
+++ b/repmgr-action-cluster.c
@@ -155,7 +155,12 @@ do_cluster_show(void)
}
else
{
- cell->node_info->node_status = NODE_STATUS_DOWN;
+ /* check if node is reachable, but just not letting us in */
+ if (is_server_available(cell->node_info->conninfo))
+ cell->node_info->node_status = NODE_STATUS_REJECTED;
+ else
+ cell->node_info->node_status = NODE_STATUS_DOWN;
+
cell->node_info->recovery_type = RECTYPE_UNKNOWN;
connection_error_found = true;
@@ -230,6 +235,19 @@ do_cluster_show(void)
}
}
}
+ /* node is up but cannot connect */
+ else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
+ {
+ if (cell->node_info->active == true)
+ {
+ appendPQExpBufferStr(&details, "? running");
+ }
+ else
+ {
+ appendPQExpBufferStr(&details, "! running");
+ error_found = true;
+ }
+ }
/* node is unreachable */
else
{
@@ -303,6 +321,19 @@ do_cluster_show(void)
cell->node_info->node_name, cell->node_info->node_id);
}
}
+ /* node is up but cannot connect */
+ else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
+ {
+ if (cell->node_info->active == true)
+ {
+ appendPQExpBufferStr(&details, "? running");
+ }
+ else
+ {
+ appendPQExpBufferStr(&details, "! running");
+ error_found = true;
+ }
+ }
/* node is unreachable */
else
{
@@ -316,11 +347,10 @@ do_cluster_show(void)
}
else
{
- appendPQExpBufferStr(&details, "- failed");
- error_found = true;
+ appendPQExpBufferStr(&details, "- failed");
+ error_found = true;
}
}
-
}
break;
@@ -340,6 +370,20 @@ do_cluster_show(void)
error_found = true;
}
}
+ /* node is up but cannot connect */
+ else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
+ {
+ if (cell->node_info->active == true)
+ {
+ appendPQExpBufferStr(&details, "? rejected");
+ }
+ else
+ {
+ appendPQExpBufferStr(&details, "! failed");
+ error_found = true;
+ }
+
+ }
/* node is unreachable */
else
{