mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
cluster show: differentiate unreachable status
Differentiate between unreachable nodes and nodes which are running but rejecting connections.
This commit is contained in:
2
HISTORY
2
HISTORY
@@ -2,6 +2,8 @@
|
||||
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
|
||||
repmgr: add --version-number command line option (Ian)
|
||||
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
|
||||
repmgr: cluster show - differentiate between unreachable nodes
|
||||
and nodes which are running but rejecting connections (Ian)
|
||||
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
|
||||
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
||||
repmgr: prevent potential race condition in "standby switchover"
|
||||
|
||||
@@ -80,7 +80,8 @@ typedef enum
|
||||
NODE_STATUS_UP,
|
||||
NODE_STATUS_SHUTTING_DOWN,
|
||||
NODE_STATUS_DOWN,
|
||||
NODE_STATUS_UNCLEAN_SHUTDOWN
|
||||
NODE_STATUS_UNCLEAN_SHUTDOWN,
|
||||
NODE_STATUS_REJECTED
|
||||
} NodeStatus;
|
||||
|
||||
typedef enum
|
||||
|
||||
@@ -88,6 +88,18 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||
differentiate between unreachable nodes and nodes which are running but rejecting connections.
|
||||
</para>
|
||||
<para>
|
||||
This makes it possible to see whether a node is unreachable at network level,
|
||||
or whether it is running but rejecting connections for some reason.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add <option>--dry-run</option> to <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> (GitHub #522).
|
||||
|
||||
@@ -22,6 +22,14 @@
|
||||
directly and can be run on any node in the cluster; this is also useful when analyzing
|
||||
connectivity from a particular node.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Node availability is tested by connecting from the node where
|
||||
<command>repmgr cluster show</command> is executed, so a failed connection does not necessarily imply the node
|
||||
is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
|
||||
better overviews of connections between nodes.
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
@@ -55,30 +63,48 @@
|
||||
<title>Notes</title>
|
||||
<para>
|
||||
The column <literal>Role</literal> shows the expected server role according to the
|
||||
&repmgr; metadata. <literal>Status</literal> shows whether the server is running or unreachable.
|
||||
&repmgr; metadata.
|
||||
</para>
|
||||
<para>
|
||||
<literal>Status</literal> shows whether the server is running or unreachable.
|
||||
If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually
|
||||
promoted to primary, this will be highlighted with an exclamation mark, e.g.:
|
||||
promoted to primary, this will be highlighted with an exclamation mark.
|
||||
If a connection to the node cannot be made, this will be highlighted with a question mark.
|
||||
Note that the node will only be shown as <literal>? unreachable</literal>
|
||||
if a connection is not possible at network level; if the PostgreSQL instance on the
|
||||
node is pingable but not accepting connections, it will be shown as <literal>? running</literal>.
|
||||
</para>
|
||||
<para>
|
||||
In the following example, executed on <literal>node3</literal>, <literal>node1</literal> is not reachable
|
||||
at network level and assumed to be down; <literal>node2</literal> has been promoted to primary
|
||||
(but <literal>node3</literal> is not attached to it, and its metadata has not yet been updated);
|
||||
<literal>node4</literal> is running but rejecting connections (from <literal>node3</literal> at least).
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show
|
||||
ID | Name | Role | Status | Upstream | Location | Priority | Connection string
|
||||
----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
|
||||
1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr
|
||||
4 | node4 | standby | ? running | node1 | default | 100 | host=db_node4 dbname=repmgr user=repmgr
|
||||
|
||||
ID | Name | Role | Status | Upstream | Location | Priority | Connection string
|
||||
----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
|
||||
1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr
|
||||
|
||||
WARNING: following issues were detected
|
||||
- unable to connect to node "node1" (ID: 1)
|
||||
- node "node1" (ID: 1) is registered as an active primary but is unreachable
|
||||
- node "node2" (ID: 2) is registered as standby but running as primary
|
||||
WARNING: following issues were detected
|
||||
- unable to connect to node "node1" (ID: 1)
|
||||
- node "node1" (ID: 1) is registered as an active primary but is unreachable
|
||||
- node "node2" (ID: 2) is registered as standby but running as primary
|
||||
- unable to connect to node "node4" (ID: 4)
|
||||
HINT: execute with --verbose option to see connection error messages</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Node availability is tested by connecting from the node where
|
||||
<command>repmgr cluster show</command> is executed, so a failed connection does not necessarily imply the node
|
||||
is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
|
||||
better overviews of connections between nodes.
|
||||
</para>
|
||||
<para>
|
||||
To diagnose connection issues, execute <command>repmgr cluster show</command>
|
||||
with the <option>--verbose</option> option; this will display the error message
|
||||
for each failed connection attempt.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
Use <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck">
|
||||
to diagnose connection issues across the whole replication cluster.
|
||||
</para>
|
||||
</tip>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
|
||||
@@ -155,7 +155,12 @@ do_cluster_show(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||
/* check if node is reachable, but just not letting us in */
|
||||
if (is_server_available(cell->node_info->conninfo))
|
||||
cell->node_info->node_status = NODE_STATUS_REJECTED;
|
||||
else
|
||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||
|
||||
cell->node_info->recovery_type = RECTYPE_UNKNOWN;
|
||||
|
||||
connection_error_found = true;
|
||||
@@ -230,6 +235,19 @@ do_cluster_show(void)
|
||||
}
|
||||
}
|
||||
}
|
||||
/* node is up but cannot connect */
|
||||
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBufferStr(&details, "? running");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&details, "! running");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
/* node is unreachable */
|
||||
else
|
||||
{
|
||||
@@ -303,6 +321,19 @@ do_cluster_show(void)
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
/* node is up but cannot connect */
|
||||
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBufferStr(&details, "? running");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&details, "! running");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
/* node is unreachable */
|
||||
else
|
||||
{
|
||||
@@ -316,11 +347,10 @@ do_cluster_show(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&details, "- failed");
|
||||
error_found = true;
|
||||
appendPQExpBufferStr(&details, "- failed");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -340,6 +370,20 @@ do_cluster_show(void)
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
/* node is up but cannot connect */
|
||||
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBufferStr(&details, "? rejected");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&details, "! failed");
|
||||
error_found = true;
|
||||
}
|
||||
|
||||
}
|
||||
/* node is unreachable */
|
||||
else
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user