Mirror of https://github.com/EnterpriseDB/repmgr.git (synced 2026-03-23 07:06:30 +00:00)
"standby switchover": additional sanity checks
Check that sufficient walsenders will be available on the promotion candidate and, if replication slots are in use, that enough free slots will be available. Note that these checks can't guarantee the walsenders/slots will still be available at the appropriate points during the switchover process, but they do ensure that existing configuration problems are caught. Implements GitHub #371.
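These capacity checks can be approximated by hand on the promotion candidate with a query along the following lines (a sketch for illustration only, not the exact query repmgr issues):

    SELECT current_setting('max_wal_senders')::INT
             - (SELECT COUNT(*) FROM pg_catalog.pg_stat_replication)
               AS free_wal_senders,
           current_setting('max_replication_slots')::INT
             - (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE)
               AS free_replication_slots;

Running "repmgr standby switchover --dry-run" reports the outcome of these checks without performing the switchover.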
HISTORY
@@ -3,6 +3,8 @@
     server and logging output is not explicitly redirected (Ian)
 repmgr: improve switchover log messages and exit code when old primary could
     not be shut down cleanly (Ian)
+repmgr: add check for sufficient walsenders/replication slots before executing
+    a switchover; GitHub #371 (Ian)
 repmgr: add --dry-run mode to "repmgr standby follow"; GitHub #368 (Ian)
 repmgr: provide information about the primary node for "standby_register" and
     "standby_follow" event notifications; GitHub #375 (Ian)
@@ -2667,8 +2667,8 @@ get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *no
 		appendPQExpBuffer(&query,
 						  " current_setting('max_replication_slots')::INT AS max_replication_slots, "
 						  " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, "
-						  " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = TRUE) AS active_replication_slots, "
-						  " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = FALSE) AS inactive_replication_slots, ");
+						  " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE) AS active_replication_slots, "
+						  " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE) AS inactive_replication_slots, ");
 	}
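A note on the change from "active = TRUE" to "active IS TRUE": for a NULL boolean, "= TRUE" evaluates to NULL, so the row would satisfy neither count, whereas "IS TRUE" and "IS FALSE" always return a definite true or false, guaranteeing the two counts partition the total. In practice "active" is unlikely to be NULL here; "IS TRUE" is simply the more defensive spelling, as a quick check shows:

    SELECT NULL::BOOLEAN = TRUE   AS equals_true,   -- NULL
           NULL::BOOLEAN IS TRUE  AS is_true,       -- false
           NULL::BOOLEAN IS FALSE AS is_false;      -- false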
@@ -47,6 +47,14 @@
       </para>
     </listitem>
 
+    <listitem>
+      <para>
+        add check for sufficient walsenders and replication slots on the promotion candidate before executing
+        <command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>
+        (GitHub #371)
+      </para>
+    </listitem>
+
     <listitem>
       <para>
         add --dry-run mode to <command><link linkend="repmgr-standby-switchover">repmgr standby follow</link></command>
@@ -2170,8 +2170,13 @@ do_standby_switchover(void)
 
 	/* store list of sibling nodes if --siblings-follow specified */
 	NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
 	int			reachable_sibling_node_count = 0;
+	int			reachable_sibling_nodes_with_slot_count = 0;
 	int			unreachable_sibling_node_count = 0;
 
+	/* number of free replication slots required on promotion candidate */
+	int			min_required_free_slots = 0;
+
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
 
 	/*
@@ -2254,9 +2259,9 @@ do_standby_switchover(void)
 	}
 
 	/*
-	 * Check this standby is attached to the demotion candidate TODO: - check
-	 * standby is attached to demotion candidate - check application_name in
-	 * pg_stat_replication
+	 * Check this standby is attached to the demotion candidate
+	 * TODO:
+	 *  - check application_name in pg_stat_replication
 	 */
 
 	if (local_node_record.upstream_node_id != remote_node_record.node_id)
@@ -2276,6 +2281,11 @@ do_standby_switchover(void)
 	/* this will fill the %p event notification parameter */
 	event_info.former_primary_id = remote_node_record.node_id;
 
+	/* keep a running total of how many nodes will require a replication slot */
+	if (remote_node_record.slot_name[0] != '\0')
+	{
+		min_required_free_slots++;
+	}
 	/*
 	 * If --force-rewind specified, check pg_rewind can be used, and
 	 * pre-emptively fetch the list of configuration files which should be
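The demotion candidate counts towards the slot total only if its node record has a slot name configured. In a live cluster the relevant metadata can be inspected directly (assuming the standard repmgr metadata schema):

    SELECT node_id, node_name, slot_name
      FROM repmgr.nodes;

A node with an empty or NULL slot_name does not use a replication slot and does not increase the minimum.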
@@ -2618,9 +2628,15 @@ do_standby_switchover(void)
 
 	PQfinish(remote_conn);
 
+	/*
+	 * populate local node record with current state of various replication-related
+	 * values, so we can check for sufficient walsenders and replication slots
+	 */
+	get_node_replication_stats(local_conn, source_server_version_num, &local_node_record);
+
 	/*
 	 * If --siblings-follow specified, get list and check they're reachable
 	 * (if not just issue a warning)
 	 */
 	get_active_sibling_node_records(local_conn,
 									local_node_record.node_id,
@@ -2650,6 +2666,11 @@ do_standby_switchover(void)
 	}
 	else
 	{
+		/* include walsender for promotion candidate in total */
+		int			min_required_wal_senders = 1;
+		int			available_wal_senders = local_node_record.max_wal_senders -
+			local_node_record.attached_wal_receivers;
+
 		for (cell = sibling_nodes.head; cell; cell = cell->next)
 		{
 			/* get host from node record */
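The walsender arithmetic works from two values gathered above by get_node_replication_stats(): the configured maximum and the number of walsenders currently in use. They can be viewed by hand with something like:

    SELECT current_setting('max_wal_senders')::INT AS max_wal_senders,
           (SELECT COUNT(*) FROM pg_catalog.pg_stat_replication) AS attached_wal_receivers;

The minimum requirement starts at 1, for the demotion candidate which will reattach to the promotion candidate as a standby, and grows by one for each reachable sibling node (counted in the next hunk).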
@@ -2664,6 +2685,14 @@ do_standby_switchover(void)
 			else
 			{
 				cell->node_info->reachable = true;
 				reachable_sibling_node_count++;
+				min_required_wal_senders++;
+
+				if (cell->node_info->slot_name[0] != '\0')
+				{
+					reachable_sibling_nodes_with_slot_count++;
+					min_required_free_slots++;
+				}
 			}
 		}
@@ -2682,6 +2711,7 @@ do_standby_switchover(void)
 						sibling_nodes.node_count);
 		}
 
+		/* display list of unreachable sibling nodes */
 		for (cell = sibling_nodes.head; cell; cell = cell->next)
 		{
 			if (cell->node_info->reachable == true)
@@ -2698,7 +2728,14 @@ do_standby_switchover(void)
 				exit(ERR_BAD_CONFIG);
 			}
 
-			log_detail(_("-F/--force specified, proceeding anyway"));
+			if (runtime_options.dry_run == true)
+			{
+				log_detail(_("-F/--force specified, would proceed anyway"));
+			}
+			else
+			{
+				log_detail(_("-F/--force specified, proceeding anyway"));
+			}
 		}
 		else
 		{
@@ -2713,22 +2750,118 @@ do_standby_switchover(void)
 				log_verbose(LOG_INFO, "%s", msg);
 			}
 		}
 
+		/*
+		 * check there are sufficient free walsenders - obviously there's potential
+		 * for a later race condition if some walsenders come into use before the
+		 * switchover operation gets around to attaching the sibling nodes, but
+		 * this should catch any actual existing configuration issue.
+		 */
+		if (available_wal_senders < min_required_wal_senders)
+		{
+			if (runtime_options.force == false || runtime_options.dry_run == true)
+			{
+				log_error(_("insufficient free walsenders to attach all sibling nodes"));
+				log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"),
+						   min_required_wal_senders,
+						   available_wal_senders);
+				log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case"));
+
+				if (runtime_options.dry_run == false)
+				{
+					PQfinish(local_conn);
+					exit(ERR_BAD_CONFIG);
+				}
+			}
+			else
+			{
+				log_warning(_("insufficient free walsenders to attach all sibling nodes"));
+				log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"),
+						   min_required_wal_senders,
+						   available_wal_senders);
+			}
+		}
+		else
+		{
+			if (runtime_options.dry_run == true)
+			{
+				log_info(_("%i walsenders required, %i available"),
+						 min_required_wal_senders,
+						 available_wal_senders);
+			}
+		}
 	}
 
+	/*
+	 * if replication slots are required by demotion candidate and/or siblings,
+	 * check the promotion candidate has sufficient free slots
+	 */
+	if (min_required_free_slots > 0)
+	{
+		int			available_slots = local_node_record.max_replication_slots -
+			local_node_record.active_replication_slots;
+
+		log_debug("minimum of %i free slots (%i for siblings) required; %i available",
+				  min_required_free_slots,
+				  reachable_sibling_nodes_with_slot_count,
+				  available_slots);
+
+		if (available_slots < min_required_free_slots)
+		{
+			if (runtime_options.force == false || runtime_options.dry_run == true)
+			{
+				log_error(_("insufficient free replication slots to attach all nodes"));
+				log_detail(_("at least %i additional replication slots required but only %i free slots available on promotion candidate"),
+						   min_required_free_slots,
+						   available_slots);
+				log_hint(_("increase parameter \"max_replication_slots\" or use -F/--force to proceed in any case"));
+
+				if (runtime_options.dry_run == false)
+				{
+					PQfinish(local_conn);
+					exit(ERR_BAD_CONFIG);
+				}
+			}
+		}
+		else
+		{
+			if (runtime_options.dry_run == true)
+			{
+				log_info(_("%i replication slots required, %i available"),
+						 min_required_free_slots,
+						 available_slots);
+			}
+		}
+	}
 
 	/* PQfinish(local_conn); */
 
 	/*
 	 * Sanity checks completed - prepare for the switchover
 	 */
 
-	log_notice(_("local node \"%s\" (ID: %i) will be promoted to primary; "
-				 "current primary \"%s\" (ID: %i) will be demoted to standby"),
-			   local_node_record.node_name,
-			   local_node_record.node_id,
-			   remote_node_record.node_name,
-			   remote_node_record.node_id);
+	if (runtime_options.dry_run == true)
+	{
+		log_notice(_("local node \"%s\" (ID: %i) would be promoted to primary; "
+					 "current primary \"%s\" (ID: %i) would be demoted to standby"),
+				   local_node_record.node_name,
+				   local_node_record.node_id,
+				   remote_node_record.node_name,
+				   remote_node_record.node_id);
+	}
+	else
+	{
+		log_notice(_("local node \"%s\" (ID: %i) will be promoted to primary; "
+					 "current primary \"%s\" (ID: %i) will be demoted to standby"),
+				   local_node_record.node_name,
+				   local_node_record.node_id,
+				   remote_node_record.node_name,
+				   remote_node_record.node_id);
+	}
 
 	/*
 	 * Stop the remote primary
 	 *
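Note that the free-slot figure subtracts only active slots from max_replication_slots: an inactive slot left behind by a node is presumably expected to be reused when that node reattaches, so it still counts as available capacity here. The promotion candidate's slot usage can be broken down with a query in the same style as the one repmgr itself builds:

    SELECT current_setting('max_replication_slots')::INT AS max_replication_slots,
           (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE)  AS active_slots,
           (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE) AS inactive_slots;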
@@ -2759,8 +2892,7 @@ do_standby_switchover(void)
 
 	/* XXX handle failure */
 
-	(void) remote_command(
-		remote_host,
+	(void) remote_command(remote_host,
 						  runtime_options.remote_user,
 						  remote_command_str.data,
 						  &command_output);