diff --git a/HISTORY b/HISTORY index 6fbe8b15..1bf78664 100644 --- a/HISTORY +++ b/HISTORY @@ -2,6 +2,8 @@ repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian) repmgr: add --version-number command line option (Ian) repmgr: add --compact option to "cluster show"; GitHub #521 (Ian) + repmgr: cluster show - differentiate between unreachable nodes + and nodes which are running but rejecting connections (Ian) repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian) repmgr: add "node check --data-directory-config"; GitHub #523 (Ian) repmgr: prevent potential race condition in "standby switchover" diff --git a/dbutils.h b/dbutils.h index ef94b84c..ecfcb7cf 100644 --- a/dbutils.h +++ b/dbutils.h @@ -80,7 +80,8 @@ typedef enum NODE_STATUS_UP, NODE_STATUS_SHUTTING_DOWN, NODE_STATUS_DOWN, - NODE_STATUS_UNCLEAN_SHUTDOWN + NODE_STATUS_UNCLEAN_SHUTDOWN, + NODE_STATUS_REJECTED } NodeStatus; typedef enum diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 91672880..e675ddec 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -88,6 +88,18 @@ REPMGRD_OPTS="--daemonize=false" + + + repmgr cluster show: + differentiate between unreachable nodes and nodes which are running but rejecting connections. + + + This makes it possible to see whether a node is unreachable at network level, + or if it is running but rejecting connections for some reason. + + + + Add --dry-run to repmgr standby promote (GitHub #522). diff --git a/doc/repmgr-cluster-show.sgml b/doc/repmgr-cluster-show.sgml index ce43ff34..997958ab 100644 --- a/doc/repmgr-cluster-show.sgml +++ b/doc/repmgr-cluster-show.sgml @@ -22,6 +22,14 @@ directly and can be run on any node in the cluster; this is also useful when analyzing connectivity from a particular node. + + + Node availability is tested by connecting from the node where + repmgr cluster show is executed, and does not necessarily imply the node + is down. 
See repmgr cluster matrix and repmgr cluster crosscheck to get + better overviews of connections between nodes. + + @@ -55,30 +63,48 @@ Notes The column Role shows the expected server role according to the - &repmgr; metadata. Status shows whether the server is running or unreachable. + &repmgr; metadata. + + + Status shows whether the server is running or unreachable. If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually - promoted to primary, this will be highlighted with an exclamation mark, e.g.: + promoted to primary, this will be highlighted with an exclamation mark. + If a connection to the node cannot be made, this will be highlighted with a question mark. + Note that the node will only be shown as ? unreachable + if a connection is not possible at network level; if the PostgreSQL instance on the + node is pingable but not accepting connections, it will be shown as ? running. + + + In the following example, executed on node3, node1 is not reachable + at network level and assumed to be down; node2 has been promoted to primary + (but node3 is not attached to it, and its metadata has not yet been updated); + node4 is running but rejecting connections (from node3 at least). - $ repmgr -f /etc/repmgr.conf cluster show + ID | Name | Role | Status | Upstream | Location | Priority | Connection string + ----+-------+---------+----------------------+----------+----------+----------+----------------------------------------- + 1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr + 2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr + 3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr + 4 | node4 | standby | ? 
running | node1 | default | 100 | host=db_node4 dbname=repmgr user=repmgr - ID | Name | Role | Status | Upstream | Location | Priority | Connection string - ----+-------+---------+----------------------+----------+----------+----------+----------------------------------------- - 1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr - 2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr - 3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr - - WARNING: following issues were detected - - unable to connect to node "node1" (ID: 1) - - node "node1" (ID: 1) is registered as an active primary but is unreachable - - node "node2" (ID: 2) is registered as standby but running as primary + WARNING: following issues were detected + - unable to connect to node "node1" (ID: 1) + - node "node1" (ID: 1) is registered as an active primary but is unreachable + - node "node2" (ID: 2) is registered as standby but running as primary + - unable to connect to node "node4" (ID: 4) HINT: execute with --verbose option to see connection error messages - - Node availability is tested by connecting from the node where - repmgr cluster show is executed, and does not necessarily imply the node - is down. See repmgr cluster matrix and repmgr cluster crosscheck to get - better overviews of connections between nodes. - + + To diagnose connection issues, execute repmgr cluster show + with the --verbose option; this will display the error message + for each failed connection attempt. + + + + Use repmgr cluster matrix and repmgr cluster crosscheck + to diagnose connection issues across the whole replication cluster. 
+ + diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c index d37d7cc9..2c2e6026 100644 --- a/repmgr-action-cluster.c +++ b/repmgr-action-cluster.c @@ -155,7 +155,12 @@ do_cluster_show(void) } else { - cell->node_info->node_status = NODE_STATUS_DOWN; + /* check if node is reachable, but just not letting us in */ + if (is_server_available(cell->node_info->conninfo)) + cell->node_info->node_status = NODE_STATUS_REJECTED; + else + cell->node_info->node_status = NODE_STATUS_DOWN; + cell->node_info->recovery_type = RECTYPE_UNKNOWN; connection_error_found = true; @@ -230,6 +235,19 @@ do_cluster_show(void) } } } + /* node is up but cannot connect */ + else if (cell->node_info->node_status == NODE_STATUS_REJECTED) + { + if (cell->node_info->active == true) + { + appendPQExpBufferStr(&details, "? running"); + } + else + { + appendPQExpBufferStr(&details, "! running"); + error_found = true; + } + } /* node is unreachable */ else { @@ -303,6 +321,19 @@ do_cluster_show(void) cell->node_info->node_name, cell->node_info->node_id); } } + /* node is up but cannot connect */ + else if (cell->node_info->node_status == NODE_STATUS_REJECTED) + { + if (cell->node_info->active == true) + { + appendPQExpBufferStr(&details, "? running"); + } + else + { + appendPQExpBufferStr(&details, "! running"); + error_found = true; + } + } /* node is unreachable */ else { @@ -316,11 +347,10 @@ do_cluster_show(void) } else { - appendPQExpBufferStr(&details, "- failed"); - error_found = true; + appendPQExpBufferStr(&details, "- failed"); + error_found = true; } } - } break; @@ -340,6 +370,20 @@ do_cluster_show(void) error_found = true; } } + /* node is up but cannot connect */ + else if (cell->node_info->node_status == NODE_STATUS_REJECTED) + { + if (cell->node_info->active == true) + { + appendPQExpBufferStr(&details, "? rejected"); + } + else + { + appendPQExpBufferStr(&details, "! failed"); + error_found = true; + } + + } /* node is unreachable */ else {