mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
cluster show: differentiate unreachable status
Differentiate between unreachable nodes and nodes which are running but rejecting connections.
This commit is contained in:
2
HISTORY
2
HISTORY
@@ -2,6 +2,8 @@
|
|||||||
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
|
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
|
||||||
repmgr: add --version-number command line option (Ian)
|
repmgr: add --version-number command line option (Ian)
|
||||||
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
|
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
|
||||||
|
repmgr: cluster show - differentiate between unreachable nodes
|
||||||
|
and nodes which are running but rejecting connections (Ian)
|
||||||
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
|
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
|
||||||
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
||||||
repmgr: prevent potential race condition in "standby switchover"
|
repmgr: prevent potential race condition in "standby switchover"
|
||||||
|
|||||||
@@ -80,7 +80,8 @@ typedef enum
|
|||||||
NODE_STATUS_UP,
|
NODE_STATUS_UP,
|
||||||
NODE_STATUS_SHUTTING_DOWN,
|
NODE_STATUS_SHUTTING_DOWN,
|
||||||
NODE_STATUS_DOWN,
|
NODE_STATUS_DOWN,
|
||||||
NODE_STATUS_UNCLEAN_SHUTDOWN
|
NODE_STATUS_UNCLEAN_SHUTDOWN,
|
||||||
|
NODE_STATUS_REJECTED
|
||||||
} NodeStatus;
|
} NodeStatus;
|
||||||
|
|
||||||
typedef enum
|
typedef enum
|
||||||
|
|||||||
@@ -88,6 +88,18 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||||
|
differentiate between unreachable nodes and nodes which are running but rejecting connections.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This makes it possible to see whether a node is unreachable at network level,
|
||||||
|
or if it is running but rejecting connections for some reason.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
Add <option>--dry-run</option> to <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> (GitHub #522).
|
Add <option>--dry-run</option> to <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> (GitHub #522).
|
||||||
|
|||||||
@@ -22,6 +22,14 @@
|
|||||||
directly and can be run on any node in the cluster; this is also useful when analyzing
|
directly and can be run on any node in the cluster; this is also useful when analyzing
|
||||||
connectivity from a particular node.
|
connectivity from a particular node.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Node availability is tested by connecting from the node where
|
||||||
|
<command>repmgr cluster show</command> is executed; an unreachable status does not necessarily imply the node
|
||||||
|
is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
|
||||||
|
better overviews of connections between nodes.
|
||||||
|
</para>
|
||||||
|
|
||||||
</refsect1>
|
</refsect1>
|
||||||
|
|
||||||
<refsect1>
|
<refsect1>
|
||||||
@@ -55,30 +63,48 @@
|
|||||||
<title>Notes</title>
|
<title>Notes</title>
|
||||||
<para>
|
<para>
|
||||||
The column <literal>Role</literal> shows the expected server role according to the
|
The column <literal>Role</literal> shows the expected server role according to the
|
||||||
&repmgr; metadata. <literal>Status</literal> shows whether the server is running or unreachable.
|
&repmgr; metadata.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
<literal>Status</literal> shows whether the server is running or unreachable.
|
||||||
If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually
|
If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually
|
||||||
promoted to primary, this will be highlighted with an exclamation mark, e.g.:
|
promoted to primary, this will be highlighted with an exclamation mark.
|
||||||
|
If a connection to the node cannot be made, this will be highlighted with a question mark.
|
||||||
|
Note that the node will only be shown as <literal>? unreachable</literal>
|
||||||
|
if a connection is not possible at network level; if the PostgreSQL instance on the
|
||||||
|
node is pingable but not accepting connections, it will be shown as <literal>? running</literal>.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
In the following example, executed on <literal>node3</literal>, <literal>node1</literal> is not reachable
|
||||||
|
at network level and assumed to be down; <literal>node2</literal> has been promoted to primary
|
||||||
|
(but <literal>node3</literal> is not attached to it, and the metadata on <literal>node3</literal> has not yet been updated);
|
||||||
|
<literal>node4</literal> is running but rejecting connections (from <literal>node3</literal> at least).
|
||||||
<programlisting>
|
<programlisting>
|
||||||
$ repmgr -f /etc/repmgr.conf cluster show
|
ID | Name | Role | Status | Upstream | Location | Priority | Connection string
|
||||||
|
----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
|
||||||
|
1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
|
||||||
|
2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
|
||||||
|
3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr
|
||||||
|
4 | node4 | standby | ? running | node1 | default | 100 | host=db_node4 dbname=repmgr user=repmgr
|
||||||
|
|
||||||
ID | Name | Role | Status | Upstream | Location | Priority | Connection string
|
WARNING: following issues were detected
|
||||||
----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
|
- unable to connect to node "node1" (ID: 1)
|
||||||
1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
|
- node "node1" (ID: 1) is registered as an active primary but is unreachable
|
||||||
2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
|
- node "node2" (ID: 2) is registered as standby but running as primary
|
||||||
3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr
|
- unable to connect to node "node4" (ID: 4)
|
||||||
|
|
||||||
WARNING: following issues were detected
|
|
||||||
- unable to connect to node "node1" (ID: 1)
|
|
||||||
- node "node1" (ID: 1) is registered as an active primary but is unreachable
|
|
||||||
- node "node2" (ID: 2) is registered as standby but running as primary
|
|
||||||
HINT: execute with --verbose option to see connection error messages</programlisting>
|
HINT: execute with --verbose option to see connection error messages</programlisting>
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
Node availability is tested by connecting from the node where
|
To diagnose connection issues, execute <command>repmgr cluster show</command>
|
||||||
<command>repmgr cluster show</command> is executed, and does not necessarily imply the node
|
with the <option>--verbose</option> option; this will display the error message
|
||||||
is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
|
for each failed connection attempt.
|
||||||
better overviews of connections between nodes.
|
</para>
|
||||||
</para>
|
<tip>
|
||||||
|
<para>
|
||||||
|
Use <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck">
|
||||||
|
to diagnose connection issues across the whole replication cluster.
|
||||||
|
</para>
|
||||||
|
</tip>
|
||||||
</refsect1>
|
</refsect1>
|
||||||
|
|
||||||
<refsect1>
|
<refsect1>
|
||||||
|
|||||||
@@ -155,7 +155,12 @@ do_cluster_show(void)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
/* check if node is reachable, but just not letting us in */
|
||||||
|
if (is_server_available(cell->node_info->conninfo))
|
||||||
|
cell->node_info->node_status = NODE_STATUS_REJECTED;
|
||||||
|
else
|
||||||
|
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||||
|
|
||||||
cell->node_info->recovery_type = RECTYPE_UNKNOWN;
|
cell->node_info->recovery_type = RECTYPE_UNKNOWN;
|
||||||
|
|
||||||
connection_error_found = true;
|
connection_error_found = true;
|
||||||
@@ -230,6 +235,19 @@ do_cluster_show(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* node is up but cannot connect */
|
||||||
|
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||||
|
{
|
||||||
|
if (cell->node_info->active == true)
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&details, "? running");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&details, "! running");
|
||||||
|
error_found = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
/* node is unreachable */
|
/* node is unreachable */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -303,6 +321,19 @@ do_cluster_show(void)
|
|||||||
cell->node_info->node_name, cell->node_info->node_id);
|
cell->node_info->node_name, cell->node_info->node_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* node is up but cannot connect */
|
||||||
|
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||||
|
{
|
||||||
|
if (cell->node_info->active == true)
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&details, "? running");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&details, "! running");
|
||||||
|
error_found = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
/* node is unreachable */
|
/* node is unreachable */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -316,11 +347,10 @@ do_cluster_show(void)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
appendPQExpBufferStr(&details, "- failed");
|
appendPQExpBufferStr(&details, "- failed");
|
||||||
error_found = true;
|
error_found = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@@ -340,6 +370,20 @@ do_cluster_show(void)
|
|||||||
error_found = true;
|
error_found = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* node is up but cannot connect */
|
||||||
|
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||||
|
{
|
||||||
|
if (cell->node_info->active == true)
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&details, "? rejected");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&details, "! failed");
|
||||||
|
error_found = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
/* node is unreachable */
|
/* node is unreachable */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user