cluster show: differentiate unreachable status

Differentiate between unreachable nodes and nodes which are running but rejecting connections.
2026-06-01 03:39:05 +00:00 · 2019-02-15 15:55:36 +09:00
parent 9338a9e233
commit 3a5a4388c7
5 changed files with 109 additions and 24 deletions
@@ -2,6 +2,8 @@
        repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
        repmgr: add --version-number command line option (Ian)
        repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
 		repmgr: cluster show - differentiate between unreachable nodes
 		  and nodes which are running but rejecting connections (Ian)
        repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
        repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
        repmgr: prevent potential race condition in "standby switchover"
@@ -80,7 +80,8 @@ typedef enum
 	NODE_STATUS_UP,
 	NODE_STATUS_SHUTTING_DOWN,
 	NODE_STATUS_DOWN,
-	NODE_STATUS_UNCLEAN_SHUTDOWN
+	NODE_STATUS_UNCLEAN_SHUTDOWN,
 	NODE_STATUS_REJECTED
 } NodeStatus;
 typedef enum
@@ -88,6 +88,18 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
            </para>
          </listitem>
          <listitem>
            <para>
              <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
 			  differentiate between unreachable nodes and nodes which are running but rejecting connections.
            </para>
            <para>
 			  This makes it possible to see whether a node is unreachable at network level,
 			  or whether it is running but rejecting connections for some reason.
            </para>
          </listitem>
          <listitem>
            <para>
              Add <option>--dry-run</option> to <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> (GitHub #522).
@@ -22,6 +22,14 @@
      directly and can be run on any node in the cluster; this is also useful when analyzing
      connectivity from a particular node.
    </para>
    <para>
      Node availability is tested by connecting from the node where
      <command>repmgr cluster show</command> is executed; a failed connection does not
      necessarily imply the node is down. See <xref linkend="repmgr-cluster-matrix"> and
      <xref linkend="repmgr-cluster-crosscheck"> to get a better overview of connections between nodes.
    </para>
  </refsect1>
  <refsect1>
@@ -55,30 +63,48 @@
    <title>Notes</title>
    <para>
      The column <literal>Role</literal> shows the expected server role according to the
-      &repmgr; metadata. <literal>Status</literal> shows whether the server is running or unreachable.
+      &repmgr; metadata.
 	</para>
 	<para>
 	  <literal>Status</literal> shows whether the server is running or unreachable.
      If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually
-      promoted to primary, this will be highlighted with an exclamation mark, e.g.:
+      promoted to primary, this will be highlighted with an exclamation mark.
 	  If a connection to the node cannot be made, this will be highlighted with a question mark.
 	  Note that the node will only be shown as <literal>? unreachable</literal>
 	  if a connection is not possible at network level; if the PostgreSQL instance on the
 	  node is pingable but not accepting connections, it will be shown as <literal>? running</literal>.
 	</para>
 	<para>
 	  In the following example, executed on <literal>node3</literal>, <literal>node1</literal> is not reachable
 	  at network level and is assumed to be down; <literal>node2</literal> has been promoted to primary
 	  (but <literal>node3</literal> is not attached to it, and its metadata has not yet been updated);
 	  <literal>node4</literal> is running but rejecting connections (from <literal>node3</literal> at least).
      <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster show
+	 ID | Name  | Role    | Status               | Upstream | Location | Priority | Connection string
 	----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
 	 1  | node1 | primary | ? unreachable        |          | default  | 100      | host=db_node1 dbname=repmgr user=repmgr
 	 2  | node2 | standby | ! running as primary | node1    | default  | 100      | host=db_node2 dbname=repmgr user=repmgr
 	 3  | node3 | standby |   running            | node1    | default  | 100      | host=db_node3 dbname=repmgr user=repmgr
 	 4  | node4 | standby | ? running            | node1    | default  | 100      | host=db_node4 dbname=repmgr user=repmgr
-     ID | Name  | Role    | Status               | Upstream | Location | Priority | Connection string
+	WARNING: following issues were detected
-    ----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
+	  - unable to connect to node "node1" (ID: 1)
-     1  | node1 | primary | ? unreachable        |          | default  | 100      | host=db_node1 dbname=repmgr user=repmgr
+	  - node "node1" (ID: 1) is registered as an active primary but is unreachable
-     2  | node2 | standby | ! running as primary | node1    | default  | 100      | host=db_node2 dbname=repmgr user=repmgr
+	  - node "node2" (ID: 2) is registered as standby but running as primary
-     3  | node3 | standby |   running            | node1    | default  | 100      | host=db_node3 dbname=repmgr user=repmgr
+	  - unable to connect to node "node4" (ID: 4)
    WARNING: following issues were detected
      - unable to connect to node "node1" (ID: 1)
      - node "node1" (ID: 1) is registered as an active primary but is unreachable
      - node "node2" (ID: 2) is registered as standby but running as primary
    HINT: execute with --verbose option to see connection error messages</programlisting>
    </para>
-    <para>
+	<para>
-      Node availability is tested by connecting from the node where
+	  To diagnose connection issues, execute <command>repmgr cluster show</command>
-      <command>repmgr cluster show</command> is executed, and does not necessarily imply the node
+	  with the <option>--verbose</option> option; this will display the error message
-      is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
+	  for each failed connection attempt.
-      better overviews of connections between nodes.
+	</para>
-    </para>
+	<tip>
 	  <para>
 		Use <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck">
 		to diagnose connection issues across the whole replication cluster.
 	  </para>
 	</tip>
  </refsect1>
  <refsect1>
@@ -155,7 +155,12 @@ do_cluster_show(void)
 		}
 		else
 		{
-			cell->node_info->node_status = NODE_STATUS_DOWN;
+			/* check if node is reachable, but just not letting us in */
 			if (is_server_available(cell->node_info->conninfo))
 				cell->node_info->node_status = NODE_STATUS_REJECTED;
 			else
 				cell->node_info->node_status = NODE_STATUS_DOWN;
 			cell->node_info->recovery_type = RECTYPE_UNKNOWN;
 			connection_error_found = true;
@@ -230,6 +235,19 @@ do_cluster_show(void)
 							}
 						}
 					}
 					/* node is up but cannot connect */
 					else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
 					{
 						if (cell->node_info->active == true)
 						{
 							appendPQExpBufferStr(&details, "? running");
 						}
 						else
 						{
 							appendPQExpBufferStr(&details, "! running");
 								error_found = true;
 						}
 					}
 					/* node is unreachable */
 					else
 					{
@@ -303,6 +321,19 @@ do_cluster_show(void)
 													cell->node_info->node_name, cell->node_info->node_id);
 						}
 					}
 					/* node is up but cannot connect */
 					else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
 					{
 						if (cell->node_info->active == true)
 						{
 							appendPQExpBufferStr(&details, "? running");
 						}
 						else
 						{
 							appendPQExpBufferStr(&details, "! running");
 								error_found = true;
 						}
 					}
 					/* node is unreachable */
 					else
 					{
@@ -316,11 +347,10 @@ do_cluster_show(void)
 						}
 						else
 						{
-							appendPQExpBufferStr(&details, "- failed");
+								appendPQExpBufferStr(&details, "- failed");
-							error_found = true;
+								error_found = true;
 						}
 					}
 				}
 				break;
@@ -340,6 +370,20 @@ do_cluster_show(void)
 							error_found = true;
 						}
 					}
 					/* node is up but cannot connect */
 					else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
 					{
 						if (cell->node_info->active == true)
 						{
 							appendPQExpBufferStr(&details, "? rejected");
 						}
 						else
 						{
 							appendPQExpBufferStr(&details, "! failed");
 							error_found = true;
 						}
 					}
 					/* node is unreachable */
 					else
 					{