diff --git a/HISTORY b/HISTORY index e2a2d034..1b285f41 100644 --- a/HISTORY +++ b/HISTORY @@ -4,6 +4,8 @@ repmgr: add "--repmgrd-force-unpause" option to "standby switchover" (Ian) repmgr: improve "--dry-run" behaviour for "standby promote" and "standby switchover" (Ian) + repmgr: in "cluster show" and "daemon status", show upstream node name + as reported by each individual node (Ian) repmgrd: monitor standbys attached to primary (Ian) general: documentation converted to DocBook XML format (Ian) diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 54bb0cf9..ad635842 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -62,6 +62,24 @@ + + + repmgr cluster show + and repmgr daemon status: + show the upstream node name as reported by each individual node - this helps visualise + situations where the cluster is in an unexpected state, and provides a better idea of the + actual cluster state. + + + For example, if a cluster has divided somehow and a set of nodes are + following a new primary, when running either of these commands, &repmgr; + will now show the name of the primary those nodes are actually + following, rather than the now outdated node name recorded + on the other side of the "split". A warning will also be issued + about the unexpected situation. + + + diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c index 08ef1700..99201e80 100644 --- a/repmgr-action-cluster.c +++ b/repmgr-action-cluster.c @@ -142,9 +142,16 @@ do_cluster_show(void) } } + /* + * TODO: count nodes marked as "? unreachable" and add a hint about + * the other cluster commands, which can help determine whether those + * nodes are really unreachable. 
+ */ + for (cell = nodes.head; cell; cell = cell->next) { PQExpBufferData node_status; + PQExpBufferData upstream; PQExpBufferData buf; cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo); @@ -171,14 +178,18 @@ do_cluster_show(void) } initPQExpBuffer(&node_status); + initPQExpBuffer(&upstream); - if (format_node_status(cell->node_info, &node_status, &warnings) == true) + if (format_node_status(cell->node_info, &node_status, &upstream, &warnings) == true) error_found = true; snprintf(cell->node_info->details, sizeof(cell->node_info->details), "%s", node_status.data); + snprintf(cell->node_info->upstream_node_name, sizeof(cell->node_info->upstream_node_name), + "%s", upstream.data); termPQExpBuffer(&node_status); + termPQExpBuffer(&upstream); PQfinish(cell->node_info->conn); cell->node_info->conn = NULL; @@ -191,6 +202,7 @@ do_cluster_show(void) headers_show[SHOW_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type)); headers_show[SHOW_NAME].cur_length = strlen(cell->node_info->node_name); headers_show[SHOW_STATUS].cur_length = strlen(cell->node_info->details); + headers_show[SHOW_UPSTREAM_NAME].cur_length = strlen(cell->node_info->upstream_node_name); initPQExpBuffer(&buf); diff --git a/repmgr-action-daemon.c b/repmgr-action-daemon.c index fac287fc..1cee0b49 100644 --- a/repmgr-action-daemon.c +++ b/repmgr-action-daemon.c @@ -129,6 +129,8 @@ do_daemon_status(void) for (cell = nodes.head; cell; cell = cell->next) { int j; + PQExpBufferData node_status; + PQExpBufferData upstream; repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo)); repmgrd_info[i]->node_id = cell->node_info->node_id; @@ -229,15 +231,18 @@ do_daemon_status(void) } } - { - PQExpBufferData node_status; - initPQExpBuffer(&node_status); + initPQExpBuffer(&node_status); + initPQExpBuffer(&upstream); - (void)format_node_status(cell->node_info, &node_status, &warnings); - snprintf(repmgrd_info[i]->pg_running_text, sizeof(cell->node_info->details), + 
(void)format_node_status(cell->node_info, &node_status, &upstream, &warnings); + snprintf(repmgrd_info[i]->pg_running_text, sizeof(cell->node_info->details), "%s", node_status.data); - termPQExpBuffer(&node_status); - } + + snprintf(cell->node_info->upstream_node_name, sizeof(cell->node_info->upstream_node_name), + "%s", upstream.data); + + termPQExpBuffer(&node_status); + termPQExpBuffer(&upstream); PQfinish(cell->node_info->conn); diff --git a/repmgr-client-global.h b/repmgr-client-global.h index 3b5f1a33..b92d1042 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -244,7 +244,7 @@ extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *rem extern void make_repmgrd_path(PQExpBufferData *output_buf); /* display functions */ -extern bool format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *warnings); +extern bool format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBufferData *upstream, ItemList *warnings); extern void print_help_header(void); extern void print_status_header(int cols, ColHeader *headers); diff --git a/repmgr-client.c b/repmgr-client.c index 58f707da..44f0354d 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -2008,14 +2008,18 @@ check_cli_parameters(const int action) bool -format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *warnings) +format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBufferData *upstream, ItemList *warnings) { bool error_found = false; + t_node_info remote_node_rec = T_NODE_INFO_INITIALIZER; + RecordStatus remote_node_rec_found = RECORD_NOT_FOUND; if (PQstatus(node_info->conn) == CONNECTION_OK) { node_info->node_status = NODE_STATUS_UP; node_info->recovery_type = get_recovery_type(node_info->conn); + /* get node's copy of its record so we can see what it thinks its status is */ + remote_node_rec_found = get_node_record_with_upstream(node_info->conn, node_info->node_id, 
&remote_node_rec); } else { @@ -2028,11 +2032,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w node_info->recovery_type = RECTYPE_UNKNOWN; } - /* - * TODO: count nodes marked as "? unreachable" and add a hint about - * the other cluster commands for better determining whether - * unreachable. - */ + /* format node status info */ switch (node_info->type) { case PRIMARY: @@ -2045,16 +2045,16 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w switch (node_info->recovery_type) { case RECTYPE_PRIMARY: - appendPQExpBufferStr(details, "* running"); + appendPQExpBufferStr(node_status, "* running"); break; case RECTYPE_STANDBY: - appendPQExpBufferStr(details, "! running as standby"); + appendPQExpBufferStr(node_status, "! running as standby"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as primary but running as standby", node_info->node_name, node_info->node_id); break; case RECTYPE_UNKNOWN: - appendPQExpBufferStr(details, "! unknown"); + appendPQExpBufferStr(node_status, "! unknown"); item_list_append_format(warnings, "node \"%s\" (ID: %i) has unknown replication status", node_info->node_name, node_info->node_id); @@ -2065,14 +2065,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->recovery_type == RECTYPE_PRIMARY) { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is running but the repmgr node record is inactive", node_info->node_name, node_info->node_id); } else { - appendPQExpBufferStr(details, "! running as standby"); + appendPQExpBufferStr(node_status, "! 
running as standby"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as an inactive primary but running as standby", node_info->node_name, node_info->node_id); @@ -2084,11 +2084,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? running"); + appendPQExpBufferStr(node_status, "? running"); } else { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); error_found = true; } } @@ -2098,7 +2098,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w /* node is unreachable but marked active */ if (node_info->active == true) { - appendPQExpBufferStr(details, "? unreachable"); + appendPQExpBufferStr(node_status, "? unreachable"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as an active primary but is unreachable", node_info->node_name, node_info->node_id); @@ -2106,7 +2106,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w /* node is unreachable and marked as inactive */ else { - appendPQExpBufferStr(details, "- failed"); + appendPQExpBufferStr(node_status, "- failed"); error_found = true; } } @@ -2122,16 +2122,16 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w switch (node_info->recovery_type) { case RECTYPE_STANDBY: - appendPQExpBufferStr(details, " running"); + appendPQExpBufferStr(node_status, " running"); break; case RECTYPE_PRIMARY: - appendPQExpBufferStr(details, "! running as primary"); + appendPQExpBufferStr(node_status, "! running as primary"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as standby but running as primary", node_info->node_name, node_info->node_id); break; case RECTYPE_UNKNOWN: - appendPQExpBufferStr(details, "! unknown"); + appendPQExpBufferStr(node_status, "! 
unknown"); item_list_append_format( warnings, "node \"%s\" (ID: %i) has unknown replication status", @@ -2143,14 +2143,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->recovery_type == RECTYPE_STANDBY) { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is running but the repmgr node record is inactive", node_info->node_name, node_info->node_id); } else { - appendPQExpBufferStr(details, "! running as primary"); + appendPQExpBufferStr(node_status, "! running as primary"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is running as primary but the repmgr node record is inactive", node_info->node_name, node_info->node_id); @@ -2170,11 +2170,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? running"); + appendPQExpBufferStr(node_status, "? running"); } else { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); error_found = true; } } @@ -2184,14 +2184,14 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w /* node is unreachable but marked active */ if (node_info->active == true) { - appendPQExpBufferStr(details, "? unreachable"); + appendPQExpBufferStr(node_status, "? 
unreachable"); item_list_append_format(warnings, "node \"%s\" (ID: %i) is registered as an active standby but is unreachable", node_info->node_name, node_info->node_id); } else { - appendPQExpBufferStr(details, "- failed"); + appendPQExpBufferStr(node_status, "- failed"); error_found = true; } } @@ -2206,11 +2206,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "* running"); + appendPQExpBufferStr(node_status, "* running"); } else { - appendPQExpBufferStr(details, "! running"); + appendPQExpBufferStr(node_status, "! running"); error_found = true; } } @@ -2219,11 +2219,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? rejected"); + appendPQExpBufferStr(node_status, "? rejected"); } else { - appendPQExpBufferStr(details, "! failed"); + appendPQExpBufferStr(node_status, "! failed"); error_found = true; } } @@ -2232,11 +2232,11 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w { if (node_info->active == true) { - appendPQExpBufferStr(details, "? unreachable"); + appendPQExpBufferStr(node_status, "? unreachable"); } else { - appendPQExpBufferStr(details, "- failed"); + appendPQExpBufferStr(node_status, "- failed"); error_found = true; } } @@ -2245,12 +2245,56 @@ format_node_status(t_node_info *node_info, PQExpBufferData *details, ItemList *w case UNKNOWN: { /* this should never happen */ - appendPQExpBufferStr(details, "? unknown node type"); + appendPQExpBufferStr(node_status, "? 
unknown node type"); error_found = true; } break; } + /* format node upstream info */ + + if (remote_node_rec_found == RECORD_NOT_FOUND) + { + /* + * Unable to retrieve the node's copy of its own record - copy the + * name from our own copy of the record + */ + appendPQExpBufferStr(upstream, + node_info->upstream_node_name); + } + else + { + if (node_info->upstream_node_id == remote_node_rec.upstream_node_id) + { + appendPQExpBufferStr(upstream, + node_info->upstream_node_name); + + } + else + { + if (remote_node_rec.upstream_node_id == NO_UPSTREAM_NODE) + { + appendPQExpBufferChar(upstream, '!'); + item_list_append_format(warnings, + "node \"%s\" (ID: %i) reports it has no upstream (expected: \"%s\")", + node_info->node_name, + node_info->node_id, + node_info->upstream_node_name); + } + else + { + appendPQExpBuffer(upstream, + "! %s", remote_node_rec.upstream_node_name); + item_list_append_format(warnings, + "node \"%s\" (ID: %i) reports a different upstream (reported: \"%s\", expected \"%s\")", + node_info->node_name, + node_info->node_id, + remote_node_rec.upstream_node_name, + node_info->upstream_node_name); + } + } + } + return error_found; }