From fca033fb9d9cc0103d5d9afd91934a64ad8093b0 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 14 May 2019 12:58:18 +0900 Subject: [PATCH] cluster show/daemon status: report upstream node mismatches When showing node information, check if the node's copy of its record shows a different upstream to the one expected according to the node where the command is executed. This helps visualise situations where the cluster is in an unexpected state, and provide a better idea of the actual state. For example, if a cluster has divided somehow and a set of nodes are following a new primary, when running "cluster show" etc., repmgr will now show the name of the primary those nodes are actually following, rather than the now outdated node name recorded on the other side of the split. A warning will also be issued about the situation. --- HISTORY | 2 + doc/appendix-release-notes.sgml | 18 ++++++ repmgr-action-cluster.c | 14 ++++- repmgr-action-daemon.c | 19 +++--- repmgr-client-global.h | 2 +- repmgr-client.c | 106 ++++++++++++++++++++++---------- 6 files changed, 121 insertions(+), 40 deletions(-) diff --git a/HISTORY b/HISTORY index e2a2d034..1b285f41 100644 --- a/HISTORY +++ b/HISTORY @@ -4,6 +4,8 @@ repmgr: add "--repmgrd-force-unpause" option to "standby switchover" (Ian) repmgr: improve "--dry-run" behaviour for "standby promote" and "standby switchover" (Ian) + repmgr: in "cluster show" and "daemon status", show upstream node name + as reported by each individual node (Ian) repmgrd: monitor standbys attached to primary (Ian) general: documentation converted to DocBook XML format (Ian) diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 54bb0cf9..ad635842 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -62,6 +62,24 @@ + + + repmgr cluster show + and repmgr daemon status: + show the upstream node name as reported by each individual node - this helps visualise + situations where the cluster is in an unexpected state, and provide a better idea of the + actual cluster state. + + + For example, if a cluster has divided somehow and a set of nodes are + following a new primary, when running either of these commands, &repmgr; + will now show the name of the primary those nodes are actually + following, rather than the now outdated node name recorded + on the other side of the "split". A warning will also be issued + about the unexpected situation. + + + diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c index 08ef1700..99201e80 100644 --- a/repmgr-action-cluster.c +++ b/repmgr-action-cluster.c @@ -142,9 +142,16 @@ do_cluster_show(void) } } + /* + * TODO: count nodes marked as "? unreachable" and add a hint about + * the other cluster commands for better determining whether + * unreachable. + */ + for (cell = nodes.head; cell; cell = cell->next) { PQExpBufferData node_status; + PQExpBufferData upstream; PQExpBufferData buf; cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo); @@ -171,14 +178,18 @@ do_cluster_show(void) } initPQExpBuffer(&node_status); + initPQExpBuffer(&upstream); - if (format_node_status(cell->node_info, &node_status, &warnings) == true) + if (format_node_status(cell->node_info, &node_status, &upstream, &warnings) == true) error_found = true; snprintf(cell->node_info->details, sizeof(cell->node_info->details), "%s", node_status.data); + snprintf(cell->node_info->upstream_node_name, sizeof(cell->node_info->upstream_node_name), + "%s", upstream.data); termPQExpBuffer(&node_status); + termPQExpBuffer(&upstream); PQfinish(cell->node_info->conn); cell->node_info->conn = NULL; @@ -191,6 +202,7 @@ do_cluster_show(void) headers_show[SHOW_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type)); headers_show[SHOW_NAME].cur_length = strlen(cell->node_info->node_name); headers_show[SHOW_STATUS].cur_length = strlen(cell->node_info->details); + headers_show[SHOW_UPSTREAM_NAME].cur_length = strlen(cell->node_info->upstream_node_name); initPQExpBuffer(&buf); diff --git a/repmgr-action-daemon.c b/repmgr-action-daemon.c index fac287fc..1cee0b49 100644 --- a/repmgr-action-daemon.c +++ b/repmgr-action-daemon.c @@ -129,6 +129,8 @@ do_daemon_status(void) for (cell = nodes.head; cell; cell = cell->next) { int j; + PQExpBufferData node_status; + PQExpBufferData upstream; repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo)); repmgrd_info[i]->node_id = cell->node_info->node_id; @@ -229,15 +231,18 @@ do_daemon_status(void) } } - { - PQExpBufferData node_status; - initPQExpBuffer(&node_status); + initPQExpBuffer(&node_status); + initPQExpBuffer(&upstream); - (void)format_node_status(cell->node_info, &node_status, &warnings); - snprintf(repmgrd_info[i]->pg_running_text, sizeof(cell->node_info->details), + (void)format_node_status(cell->node_info, &node_status, &upstream, &warnings); + snprintf(repmgrd_info[i]->pg_running_text, sizeof(cell->node_info->details), "%s", node_status.data); - termPQExpBuffer(&node_status); - } + + snprintf(cell->node_info->upstream_node_name, sizeof(cell->node_info->upstream_node_name), + "%s", upstream.data); + + termPQExpBuffer(&node_status); + termPQExpBuffer(&upstream); PQfinish(cell->node_info->conn); diff --git a/repmgr-client-global.h b/repmgr-client-global.h index 3b5f1a33..b92d1042 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -244,7 +244,7 @@ extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *rem extern void make_repmgrd_path(PQExpBufferData *output_buf); /* display functions */ -extern bool format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *warnings); +extern bool format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBufferData *upstream, ItemList *warnings); extern void print_help_header(void); extern void print_status_header(int cols, ColHeader *headers); diff --git a/repmgr-client.c b/repmgr-client.c index 58f707da..44f0354d 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -2008,14 +2008,18 @@ check_cli_parameters(const int action) bool -format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *warnings) +format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBufferData *upstream, ItemList *warnings) { bool error_found = false; + t_node_info remote_node_rec = T_NODE_INFO_INITIALIZER; + RecordStatus remote_node_rec_found = RECORD_NOT_FOUND; if (PQstatus(node_info->conn) == CONNECTION_OK) { node_info->node_status = NODE_STATUS_UP; node_info->recovery_type = get_recovery_type(node_info->conn); + /* get node's copy of its record so we can see what it thinks its status is */ + remote_node_rec_found = get_node_record_with_upstream(node_info->conn, node_info->node_id, &remote_node_rec); } else { @@ -2028,11 +2032,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w node_info->recovery_type = RECTYPE_UNKNOWN; } - /* - * TODO: count nodes marked as "? unreachable" and add a hint about - * the other cluster commands for better determining whether - * unreachable. - */ + /* format node status info */ switch (node_info->type) { case PRIMARY: @@ -2045,16 +2045,16 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w switch (node_info->recovery_type) { case RECTYPE_PRIMARY: - appendPQExpBufferStr(details, "* running"); + appendPQExpBufferStr(node_status, "* running"); break; case RECTYPE_STANDBY: - appendPQExpBufferStr(details, "! running as standby"); + appendPQExpBufferStr(node_status, "! running as standby"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as primary but running as standby", node_info->node_name, node_info->node_id); break; case RECTYPE_UNKNOWN: - appendPQExpBufferStr(details, "! unknown"); + appendPQExpBufferStr(node_status, "! unknown"); item_list_append_format(warnings, "node \"%s\" (ID: %i) has unknown replication status", node_info->node_name, node_info->node_id); @@ -2065,14 +2065,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->recovery_type == RECTYPE_PRIMARY) { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is running but the repmgr node record is inactive", node_info->node_name, node_info->node_id); } else { - appendPQExpBufferStr(details, "! running as standby"); + appendPQExpBufferStr(node_status, "! running as standby"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as an inactive primary but running as standby", node_info->node_name, node_info->node_id); @@ -2084,11 +2084,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? running"); + appendPQExpBufferStr(node_status, "? running"); } else { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); error_found = true; } } @@ -2098,7 +2098,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w /* node is unreachable but marked active */ if (node_info->active == true) { - appendPQExpBufferStr(details, "? unreachable"); + appendPQExpBufferStr(node_status, "? unreachable"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as an active primary but is unreachable", node_info->node_name, node_info->node_id); @@ -2106,7 +2106,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w /* node is unreachable and marked as inactive */ else { - appendPQExpBufferStr(details, "- failed"); + appendPQExpBufferStr(node_status, "- failed"); error_found = true; } } @@ -2122,16 +2122,16 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w switch (node_info->recovery_type) { case RECTYPE_STANDBY: - appendPQExpBufferStr(details, " running"); + appendPQExpBufferStr(node_status, " running"); break; case RECTYPE_PRIMARY: - appendPQExpBufferStr(details, "! running as primary"); + appendPQExpBufferStr(node_status, "! running as primary"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as standby but running as primary", node_info->node_name, node_info->node_id); break; case RECTYPE_UNKNOWN: - appendPQExpBufferStr(details, "! unknown"); + appendPQExpBufferStr(node_status, "! unknown"); item_list_append_format( warnings, "node \"%s\" (ID: %i) has unknown replication status", @@ -2143,14 +2143,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->recovery_type == RECTYPE_STANDBY) { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is running but the repmgr node record is inactive", node_info->node_name, node_info->node_id); } else { - appendPQExpBufferStr(details, "! running as primary"); + appendPQExpBufferStr(node_status, "! running as primary"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is running as primary but the repmgr node record is inactive", node_info->node_name, node_info->node_id); @@ -2170,11 +2170,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? running"); + appendPQExpBufferStr(node_status, "? running"); } else { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); error_found = true; } } @@ -2184,14 +2184,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w /* node is unreachable but marked active */ if (node_info->active == true) { - appendPQExpBufferStr(details, "? unreachable"); + appendPQExpBufferStr(node_status, "? unreachable"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as an active standby but is unreachable", node_info->node_name, node_info->node_id); } else { - appendPQExpBufferStr(details, "- failed"); + appendPQExpBufferStr(node_status, "- failed"); error_found = true; } } @@ -2206,11 +2206,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "* running"); + appendPQExpBufferStr(node_status, "* running"); } else { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); error_found = true; } } @@ -2219,11 +2219,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? rejected"); + appendPQExpBufferStr(node_status, "? rejected"); } else { - appendPQExpBufferStr(details, "! failed"); + appendPQExpBufferStr(node_status, "! failed"); error_found = true; } } @@ -2232,11 +2232,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? unreachable"); + appendPQExpBufferStr(node_status, "? unreachable"); } else { - appendPQExpBufferStr(details, "- failed"); + appendPQExpBufferStr(node_status, "- failed"); error_found = true; } } @@ -2245,12 +2245,56 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w case UNKNOWN: { /* this should never happen */ - appendPQExpBufferStr(details, "? unknown node type"); + appendPQExpBufferStr(node_status, "? unknown node type"); error_found = true; } break; } + /* format node upstream info */ + + if (remote_node_rec_found == RECORD_NOT_FOUND) + { + /* + * Unable to retrieve the node's copy of its own record - copy the + * name from our own copy of the record + */ + appendPQExpBufferStr(upstream, + node_info->upstream_node_name); + } + else + { + if (node_info->upstream_node_id == remote_node_rec.upstream_node_id) + { + appendPQExpBufferStr(upstream, + node_info->upstream_node_name); + + } + else + { + if (remote_node_rec.upstream_node_id == NO_UPSTREAM_NODE) + { + appendPQExpBufferChar(upstream, '!'); + item_list_append_format(warnings, + "node \"%s\" (ID: %i) reports it has no upstream (expected: \"%s\")", + node_info->node_name, + node_info->node_id, + node_info->upstream_node_name); + } + else + { + appendPQExpBuffer(upstream, + "! %s", remote_node_rec.upstream_node_name); + item_list_append_format(warnings, + "node \"%s\" (ID: %i) reports a different upstream (reported: \"%s\", expected \"%s\")", + node_info->node_name, + node_info->node_id, + remote_node_rec.upstream_node_name, + node_info->upstream_node_name); + } + } + } + return error_found; }