cluster show/daemon status: report upstream node mismatches

When showing node information, check whether the node's own copy of its
record shows a different upstream from the one recorded on the node
where the command is executed.

This helps visualise situations where the cluster is in an
unexpected state, and provides a better idea of the actual state.

For example, if a cluster has divided somehow and a set of nodes are
following a new primary, when running "cluster show" etc., repmgr
will now show the name of the primary those nodes are actually
following, rather than the now outdated node name recorded
on the other side of the split. A warning will also be issued
about the situation.
This commit is contained in:
Ian Barwick
2019-05-14 12:58:18 +09:00
parent ae44012383
commit fca033fb9d
6 changed files with 121 additions and 40 deletions

View File

@@ -4,6 +4,8 @@
repmgr: add "--repmgrd-force-unpause" option to "standby switchover" (Ian)
repmgr: improve "--dry-run" behaviour for "standby promote" and
"standby switchover" (Ian)
repmgr: in "cluster show" and "daemon status", show upstream node name
as reported by each individual node (Ian)
repmgrd: monitor standbys attached to primary (Ian)
general: documentation converted to DocBook XML format (Ian)

View File

@@ -62,6 +62,24 @@
</para>
</listitem>
<listitem>
<para>
<link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>
and <link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>:
show the upstream node name as reported by each individual node - this helps visualise
situations where the cluster is in an unexpected state, and provides a better idea of the
actual cluster state.
</para>
<para>
For example, if a cluster has divided somehow and a set of nodes are
following a new primary, when running either of these commands, &repmgr;
will now show the name of the primary those nodes are actually
following, rather than the now outdated node name recorded
on the other side of the &quot;split&quot;. A warning will also be issued
about the unexpected situation.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>

View File

@@ -142,9 +142,16 @@ do_cluster_show(void)
}
}
/*
* TODO: count nodes marked as "? unreachable" and add a hint about
* the other cluster commands for better determining whether
* unreachable.
*/
for (cell = nodes.head; cell; cell = cell->next)
{
PQExpBufferData node_status;
PQExpBufferData upstream;
PQExpBufferData buf;
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
@@ -171,14 +178,18 @@ do_cluster_show(void)
}
initPQExpBuffer(&node_status);
initPQExpBuffer(&upstream);
if (format_node_status(cell->node_info, &node_status, &warnings) == true)
if (format_node_status(cell->node_info, &node_status, &upstream, &warnings) == true)
error_found = true;
snprintf(cell->node_info->details, sizeof(cell->node_info->details),
"%s", node_status.data);
snprintf(cell->node_info->upstream_node_name, sizeof(cell->node_info->upstream_node_name),
"%s", upstream.data);
termPQExpBuffer(&node_status);
termPQExpBuffer(&upstream);
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
@@ -191,6 +202,7 @@ do_cluster_show(void)
headers_show[SHOW_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type));
headers_show[SHOW_NAME].cur_length = strlen(cell->node_info->node_name);
headers_show[SHOW_STATUS].cur_length = strlen(cell->node_info->details);
headers_show[SHOW_UPSTREAM_NAME].cur_length = strlen(cell->node_info->upstream_node_name);
initPQExpBuffer(&buf);

View File

@@ -129,6 +129,8 @@ do_daemon_status(void)
for (cell = nodes.head; cell; cell = cell->next)
{
int j;
PQExpBufferData node_status;
PQExpBufferData upstream;
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
repmgrd_info[i]->node_id = cell->node_info->node_id;
@@ -229,15 +231,18 @@ do_daemon_status(void)
}
}
{
PQExpBufferData node_status;
initPQExpBuffer(&node_status);
initPQExpBuffer(&node_status);
initPQExpBuffer(&upstream);
(void)format_node_status(cell->node_info, &node_status, &warnings);
snprintf(repmgrd_info[i]->pg_running_text, sizeof(cell->node_info->details),
(void)format_node_status(cell->node_info, &node_status, &upstream, &warnings);
snprintf(repmgrd_info[i]->pg_running_text, sizeof(cell->node_info->details),
"%s", node_status.data);
termPQExpBuffer(&node_status);
}
snprintf(cell->node_info->upstream_node_name, sizeof(cell->node_info->upstream_node_name),
"%s", upstream.data);
termPQExpBuffer(&node_status);
termPQExpBuffer(&upstream);
PQfinish(cell->node_info->conn);

View File

@@ -244,7 +244,7 @@ extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *rem
extern void make_repmgrd_path(PQExpBufferData *output_buf);
/* display functions */
extern bool format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *warnings);
extern bool format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBufferData *upstream, ItemList *warnings);
extern void print_help_header(void);
extern void print_status_header(int cols, ColHeader *headers);

View File

@@ -2008,14 +2008,18 @@ check_cli_parameters(const int action)
bool
format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *warnings)
format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBufferData *upstream, ItemList *warnings)
{
bool error_found = false;
t_node_info remote_node_rec = T_NODE_INFO_INITIALIZER;
RecordStatus remote_node_rec_found = RECORD_NOT_FOUND;
if (PQstatus(node_info->conn) == CONNECTION_OK)
{
node_info->node_status = NODE_STATUS_UP;
node_info->recovery_type = get_recovery_type(node_info->conn);
/* get node's copy of its record so we can see what it thinks its status is */
remote_node_rec_found = get_node_record_with_upstream(node_info->conn, node_info->node_id, &remote_node_rec);
}
else
{
@@ -2028,11 +2032,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
node_info->recovery_type = RECTYPE_UNKNOWN;
}
/*
* TODO: count nodes marked as "? unreachable" and add a hint about
* the other cluster commands for better determining whether
* unreachable.
*/
/* format node status info */
switch (node_info->type)
{
case PRIMARY:
@@ -2045,16 +2045,16 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
switch (node_info->recovery_type)
{
case RECTYPE_PRIMARY:
appendPQExpBufferStr(details, "* running");
appendPQExpBufferStr(node_status, "* running");
break;
case RECTYPE_STANDBY:
appendPQExpBufferStr(details, "! running as standby");
appendPQExpBufferStr(node_status, "! running as standby");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is registered as primary but running as standby",
node_info->node_name, node_info->node_id);
break;
case RECTYPE_UNKNOWN:
appendPQExpBufferStr(details, "! unknown");
appendPQExpBufferStr(node_status, "! unknown");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) has unknown replication status",
node_info->node_name, node_info->node_id);
@@ -2065,14 +2065,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
{
if (node_info->recovery_type == RECTYPE_PRIMARY)
{
appendPQExpBufferStr(details, "! running");
appendPQExpBufferStr(node_status, "! running");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is running but the repmgr node record is inactive",
node_info->node_name, node_info->node_id);
}
else
{
appendPQExpBufferStr(details, "! running as standby");
appendPQExpBufferStr(node_status, "! running as standby");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is registered as an inactive primary but running as standby",
node_info->node_name, node_info->node_id);
@@ -2084,11 +2084,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
{
if (node_info->active == true)
{
appendPQExpBufferStr(details, "? running");
appendPQExpBufferStr(node_status, "? running");
}
else
{
appendPQExpBufferStr(details, "! running");
appendPQExpBufferStr(node_status, "! running");
error_found = true;
}
}
@@ -2098,7 +2098,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
/* node is unreachable but marked active */
if (node_info->active == true)
{
appendPQExpBufferStr(details, "? unreachable");
appendPQExpBufferStr(node_status, "? unreachable");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is registered as an active primary but is unreachable",
node_info->node_name, node_info->node_id);
@@ -2106,7 +2106,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
/* node is unreachable and marked as inactive */
else
{
appendPQExpBufferStr(details, "- failed");
appendPQExpBufferStr(node_status, "- failed");
error_found = true;
}
}
@@ -2122,16 +2122,16 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
switch (node_info->recovery_type)
{
case RECTYPE_STANDBY:
appendPQExpBufferStr(details, " running");
appendPQExpBufferStr(node_status, " running");
break;
case RECTYPE_PRIMARY:
appendPQExpBufferStr(details, "! running as primary");
appendPQExpBufferStr(node_status, "! running as primary");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is registered as standby but running as primary",
node_info->node_name, node_info->node_id);
break;
case RECTYPE_UNKNOWN:
appendPQExpBufferStr(details, "! unknown");
appendPQExpBufferStr(node_status, "! unknown");
item_list_append_format(
warnings,
"node \"%s\" (ID: %i) has unknown replication status",
@@ -2143,14 +2143,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
{
if (node_info->recovery_type == RECTYPE_STANDBY)
{
appendPQExpBufferStr(details, "! running");
appendPQExpBufferStr(node_status, "! running");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is running but the repmgr node record is inactive",
node_info->node_name, node_info->node_id);
}
else
{
appendPQExpBufferStr(details, "! running as primary");
appendPQExpBufferStr(node_status, "! running as primary");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is running as primary but the repmgr node record is inactive",
node_info->node_name, node_info->node_id);
@@ -2170,11 +2170,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
{
if (node_info->active == true)
{
appendPQExpBufferStr(details, "? running");
appendPQExpBufferStr(node_status, "? running");
}
else
{
appendPQExpBufferStr(details, "! running");
appendPQExpBufferStr(node_status, "! running");
error_found = true;
}
}
@@ -2184,14 +2184,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
/* node is unreachable but marked active */
if (node_info->active == true)
{
appendPQExpBufferStr(details, "? unreachable");
appendPQExpBufferStr(node_status, "? unreachable");
item_list_append_format(warnings,
"node \"%s\" (ID: %i) is registered as an active standby but is unreachable",
node_info->node_name, node_info->node_id);
}
else
{
appendPQExpBufferStr(details, "- failed");
appendPQExpBufferStr(node_status, "- failed");
error_found = true;
}
}
@@ -2206,11 +2206,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
{
if (node_info->active == true)
{
appendPQExpBufferStr(details, "* running");
appendPQExpBufferStr(node_status, "* running");
}
else
{
appendPQExpBufferStr(details, "! running");
appendPQExpBufferStr(node_status, "! running");
error_found = true;
}
}
@@ -2219,11 +2219,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
{
if (node_info->active == true)
{
appendPQExpBufferStr(details, "? rejected");
appendPQExpBufferStr(node_status, "? rejected");
}
else
{
appendPQExpBufferStr(details, "! failed");
appendPQExpBufferStr(node_status, "! failed");
error_found = true;
}
}
@@ -2232,11 +2232,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
{
if (node_info->active == true)
{
appendPQExpBufferStr(details, "? unreachable");
appendPQExpBufferStr(node_status, "? unreachable");
}
else
{
appendPQExpBufferStr(details, "- failed");
appendPQExpBufferStr(node_status, "- failed");
error_found = true;
}
}
@@ -2245,12 +2245,56 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w
case UNKNOWN:
{
/* this should never happen */
appendPQExpBufferStr(details, "? unknown node type");
appendPQExpBufferStr(node_status, "? unknown node type");
error_found = true;
}
break;
}
/* format node upstream info */
if (remote_node_rec_found == RECORD_NOT_FOUND)
{
/*
* Unable to retrieve the node's copy of its own record - copy the
* name from our own copy of the record
*/
appendPQExpBufferStr(upstream,
node_info->upstream_node_name);
}
else
{
if (node_info->upstream_node_id == remote_node_rec.upstream_node_id)
{
appendPQExpBufferStr(upstream,
node_info->upstream_node_name);
}
else
{
if (remote_node_rec.upstream_node_id == NO_UPSTREAM_NODE)
{
appendPQExpBufferChar(upstream, '!');
item_list_append_format(warnings,
"node \"%s\" (ID: %i) reports it has no upstream (expected: \"%s\")",
node_info->node_name,
node_info->node_id,
node_info->upstream_node_name);
}
else
{
appendPQExpBuffer(upstream,
"! %s", remote_node_rec.upstream_node_name);
item_list_append_format(warnings,
"node \"%s\" (ID: %i) reports a different upstream (reported: \"%s\", expected \"%s\")",
node_info->node_name,
node_info->node_id,
remote_node_rec.upstream_node_name,
node_info->upstream_node_name);
}
}
}
return error_found;
}