diff --git a/HISTORY b/HISTORY index ae2c3323..c9457ba6 100644 --- a/HISTORY +++ b/HISTORY @@ -4,6 +4,8 @@ repmgr: "standby clone" - don't copy external config files in --dry-run mode; GitHub #491 (Ian) repmgr: add "cluster_cleanup" event; GitHub #492 (Ian) + repmgr: (standby switchover) improve detection of free walsenders; + GitHub #495 (Ian) repmgrd: ensure that sending SIGHUP always results in the log file being reopened; GitHub #485 (Ian) repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian) diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 351f9fd1..8c33fded 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -21,6 +21,22 @@ + + repmgr enhancements + + + + + + repmgr standby switchover: + improve detection of free walsenders. (GitHub #495). + + + + + + + repmgr enhancements diff --git a/doc/switchover.sgml b/doc/switchover.sgml index 660406bb..84444e69 100644 --- a/doc/switchover.sgml +++ b/doc/switchover.sgml @@ -60,6 +60,13 @@ &repmgr; being able to shut down the current primary server quickly and cleanly. + + Ensure that the promotion candidate has sufficient free walsenders available + (PostgreSQL configuration item max_wal_senders), and if replication + slots are in use, at least one free slot is available for the demotion candidate ( + PostgreSQL configuration item max_replication_slots). + + Ensure that a passwordless SSH connection is possible from the promotion candidate (standby) to the demotion candidate (current primary). If --siblings-follow diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 44c12760..69c98bc5 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -2841,6 +2841,12 @@ do_standby_switchover(void) int reachable_sibling_nodes_with_slot_count = 0; int unreachable_sibling_node_count = 0; + /* number of free walsenders required on promotion candidate */ + int min_required_wal_senders = 1; + + /* this will be calculated as max_wal_senders - COUNT(*) FROM pg_stat_replication */ + int available_wal_senders = 0; + /* number of free replication slots required on promotion candidate */ int min_required_free_slots = 0; @@ -3110,6 +3116,178 @@ do_standby_switchover(void) } termPQExpBuffer(&command_output); + + // check walsenders here + /* + * populate local node record with current state of various replication-related + * values, so we can check for sufficient walsenders and replication slots + */ + get_node_replication_stats(local_conn, server_version_num, &local_node_record); + + available_wal_senders = local_node_record.max_wal_senders - + local_node_record.attached_wal_receivers; + + /* + * If --siblings-follow specified, get list and check they're reachable + * (if not just issue a warning) + */ + get_active_sibling_node_records(local_conn, + local_node_record.node_id, + local_node_record.upstream_node_id, + &sibling_nodes); + + if (runtime_options.siblings_follow == false) + { + if (sibling_nodes.node_count > 0) + { + log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"), + sibling_nodes.node_count); + log_detail(_("these nodes will remain attached to the current primary")); + } + } + else + { + char host[MAXLEN] = ""; + NodeInfoListCell *cell; + + log_verbose(LOG_INFO, _("%i active sibling nodes found"), + sibling_nodes.node_count); + + if (sibling_nodes.node_count == 0) + { + log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist")); + } + else + { + /* include walsender for promotion candidate in total */ + + + for (cell = sibling_nodes.head; cell; cell = cell->next) + { + /* get host from node record */ + get_conninfo_value(cell->node_info->conninfo, "host", host); + r = test_ssh_connection(host, runtime_options.remote_user); + + if (r != 0) + { + cell->node_info->reachable = false; + unreachable_sibling_node_count++; + } + else + { + cell->node_info->reachable = true; + reachable_sibling_node_count++; + min_required_wal_senders++; + + if (cell->node_info->slot_name[0] != '\0') + { + reachable_sibling_nodes_with_slot_count++; + min_required_free_slots++; + } + } + } + + if (unreachable_sibling_node_count > 0) + { + if (runtime_options.force == false) + { + log_error(_("%i of %i sibling nodes unreachable via SSH:"), + unreachable_sibling_node_count, + sibling_nodes.node_count); + } + else + { + log_warning(_("%i of %i sibling nodes unreachable via SSH:"), + unreachable_sibling_node_count, + sibling_nodes.node_count); + } + + /* display list of unreachable sibling nodes */ + for (cell = sibling_nodes.head; cell; cell = cell->next) + { + if (cell->node_info->reachable == true) + continue; + log_detail(" %s (ID: %i)", + cell->node_info->node_name, + cell->node_info->node_id); + } + + if (runtime_options.force == false) + { + log_hint(_("use -F/--force to proceed in any case")); + PQfinish(local_conn); + exit(ERR_BAD_CONFIG); + } + + if (runtime_options.dry_run == true) + { + log_detail(_("F/--force specified, would proceed anyway")); + } + else + { + log_detail(_("F/--force specified, proceeding anyway")); + } + } + else + { + char *msg = _("all sibling nodes are reachable via SSH"); + + if (runtime_options.dry_run == true) + { + log_info("%s", msg); + } + else + { + log_verbose(LOG_INFO, "%s", msg); + } + } + } + } + + + /* + * check there are sufficient free walsenders - obviously there's potential + * for a later race condition if some walsenders come into use before the + * switchover operation gets around to attaching the sibling nodes, but + * this should catch any actual existing configuration issue (and if anyone's + * performing a switchover in such an unstable environment, they only have + * themselves to blame). + */ + if (available_wal_senders < min_required_wal_senders) + { + if (runtime_options.force == false || runtime_options.dry_run == true) + { + log_error(_("insufficient free walsenders on promotion candidate")); + log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"), + min_required_wal_senders, + available_wal_senders); + log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case")); + + if (runtime_options.dry_run == false) + { + PQfinish(local_conn); + exit(ERR_BAD_CONFIG); + } + } + else + { + log_warning(_("insufficient free walsenders on promotion candidate")); + log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"), + min_required_wal_senders, + available_wal_senders); + } + } + else + { + if (runtime_options.dry_run == true) + { + log_info(_("%i walsenders required, %i available"), + min_required_wal_senders, + available_wal_senders); + } + } + + /* check demotion candidate can make replication connection to promotion candidate */ { initPQExpBuffer(&remote_command_str); @@ -3353,171 +3531,6 @@ do_standby_switchover(void) PQfinish(remote_conn); - /* - * populate local node record with current state of various replication-related - * values, so we can check for sufficient walsenders and replication slots - */ - get_node_replication_stats(local_conn, server_version_num, &local_node_record); - - /* - * If --siblings-follow specified, get list and check they're reachable - * (if not just issue a warning) - */ - get_active_sibling_node_records(local_conn, - local_node_record.node_id, - local_node_record.upstream_node_id, - &sibling_nodes); - - if (runtime_options.siblings_follow == false) - { - if (sibling_nodes.node_count > 0) - { - log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"), - sibling_nodes.node_count); - log_detail(_("these nodes will remain attached to the current primary")); - } - } - else - { - char host[MAXLEN] = ""; - NodeInfoListCell *cell; - - log_verbose(LOG_INFO, _("%i active sibling nodes found"), - sibling_nodes.node_count); - - if (sibling_nodes.node_count == 0) - { - log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist")); - } - else - { - /* include walsender for promotion candidate in total */ - int min_required_wal_senders = 1; - int available_wal_senders = local_node_record.max_wal_senders - - local_node_record.attached_wal_receivers; - - for (cell = sibling_nodes.head; cell; cell = cell->next) - { - /* get host from node record */ - get_conninfo_value(cell->node_info->conninfo, "host", host); - r = test_ssh_connection(host, runtime_options.remote_user); - - if (r != 0) - { - cell->node_info->reachable = false; - unreachable_sibling_node_count++; - } - else - { - cell->node_info->reachable = true; - reachable_sibling_node_count++; - min_required_wal_senders++; - - if (cell->node_info->slot_name[0] != '\0') - { - reachable_sibling_nodes_with_slot_count++; - min_required_free_slots++; - } - } - } - - if (unreachable_sibling_node_count > 0) - { - if (runtime_options.force == false) - { - log_error(_("%i of %i sibling nodes unreachable via SSH:"), - unreachable_sibling_node_count, - sibling_nodes.node_count); - } - else - { - log_warning(_("%i of %i sibling nodes unreachable via SSH:"), - unreachable_sibling_node_count, - sibling_nodes.node_count); - } - - /* display list of unreachable sibling nodes */ - for (cell = sibling_nodes.head; cell; cell = cell->next) - { - if (cell->node_info->reachable == true) - continue; - log_detail(" %s (ID: %i)", - cell->node_info->node_name, - cell->node_info->node_id); - } - - if (runtime_options.force == false) - { - log_hint(_("use -F/--force to proceed in any case")); - PQfinish(local_conn); - exit(ERR_BAD_CONFIG); - } - - if (runtime_options.dry_run == true) - { - log_detail(_("F/--force specified, would proceed anyway")); - } - else - { - log_detail(_("F/--force specified, proceeding anyway")); - } - } - else - { - char *msg = _("all sibling nodes are reachable via SSH"); - - if (runtime_options.dry_run == true) - { - log_info("%s", msg); - } - else - { - log_verbose(LOG_INFO, "%s", msg); - } - } - - /* - * check there are sufficient free walsenders - obviously there's potential - * for a later race condition if some walsenders come into use before the - * switchover operation gets around to attaching the sibling nodes, but - * this should catch any actual existing configuration issue. - */ - if (available_wal_senders < min_required_wal_senders) - { - if (runtime_options.force == false || runtime_options.dry_run == true) - { - log_error(_("insufficient free walsenders to attach all sibling nodes")); - log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"), - min_required_wal_senders, - available_wal_senders); - log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case")); - - if (runtime_options.dry_run == false) - { - PQfinish(local_conn); - exit(ERR_BAD_CONFIG); - } - } - else - { - log_warning(_("insufficient free walsenders to attach all sibling nodes")); - log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"), - min_required_wal_senders, - available_wal_senders); - } - } - else - { - if (runtime_options.dry_run == true) - { - log_info(_("%i walsenders required, %i available"), - min_required_wal_senders, - available_wal_senders); - } - } - } - } - /* * if replication slots are required by demotion candidate and/or siblings,