From 38e3aae0534f5a2aa929d1cfc236b71f08a534eb Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 25 Sep 2018 11:30:01 +0900 Subject: [PATCH] repmgr: add parameter "shutdown_check_timeout" Previously, "repmgr standby switchover" used the configuration file parameters "reconnect_interval" and "reconnect_attempts" to define a timeout to determine whether the current primary (demotion candidate) has shut down. However, these parameters are intended for primary failure detection and are generally lower in value, while a controlled shutdown may take longer, resulting in the switchover being aborted as repmgr was not waiting long enough. To prevent this happening, parameter "shutdown_check_timeout" has been added. This complements the existing "standby_reconnect_timeout" parameter used by "repmgr standby switchover". Implements GitHub #504. --- HISTORY | 4 ++++ configfile.c | 3 +++ configfile.h | 2 ++ doc/repmgr-standby-switchover.sgml | 30 ++++++++++++++++-------------- repmgr-action-standby.c | 12 ++++++------ repmgr.conf.sample | 2 ++ repmgr.h | 1 + 7 files changed, 34 insertions(+), 20 deletions(-) diff --git a/HISTORY b/HISTORY index 996033cc..0c1fa8a4 100644 --- a/HISTORY +++ b/HISTORY @@ -1,3 +1,7 @@ +4.2.0 2018-??-?? 
+ repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover"; + GitHub #504 (Ian) + 4.1.1 2018-09-05 logging: explicitly log the text of failed queries as ERRORs to assist logfile analysis; GitHub #498 diff --git a/configfile.c b/configfile.c index c4c060cc..4e345991 100644 --- a/configfile.c +++ b/configfile.c @@ -335,6 +335,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * * standby switchover settings *------------------------ */ + options->shutdown_check_timeout = DEFAULT_SHUTDOWN_CHECK_TIMEOUT; options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT; /*----------------- @@ -545,6 +546,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0); /* standby switchover settings */ + else if (strcmp(name, "shutdown_check_timeout") == 0) + options->shutdown_check_timeout = repmgr_atoi(value, name, error_list, 0); else if (strcmp(name, "standby_reconnect_timeout") == 0) options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0); diff --git a/configfile.h b/configfile.h index abe87225..975c8f8e 100644 --- a/configfile.h +++ b/configfile.h @@ -103,6 +103,7 @@ typedef struct int standby_follow_timeout; /* standby switchover settings */ + int shutdown_check_timeout; int standby_reconnect_timeout; /* node rejoin settings */ @@ -181,6 +182,7 @@ typedef struct DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \ DEFAULT_STANDBY_FOLLOW_TIMEOUT, \ /* standby switchover settings */ \ + DEFAULT_SHUTDOWN_CHECK_TIMEOUT, \ DEFAULT_STANDBY_RECONNECT_TIMEOUT, \ /* node rejoin settings */ \ DEFAULT_NODE_REJOIN_TIMEOUT, \ diff --git a/doc/repmgr-standby-switchover.sgml b/doc/repmgr-standby-switchover.sgml index 30c69a5b..89140bb4 100644 --- a/doc/repmgr-standby-switchover.sgml +++ b/doc/repmgr-standby-switchover.sgml @@ -141,19 +141,7 @@ Note that following parameters in repmgr.conf are relevant to the switchover 
operation: - - - reconnect_attempts: number of times to check the original primary - for a clean shutdown after executing the shutdown command, before aborting - - - - - reconnect_interval: interval (in seconds) to check the original - primary for a clean shutdown after executing the shutdown command (up to a maximum - of reconnect_attempts tries) - - + replication_lag_critical: @@ -163,10 +151,24 @@ + + + shutdown_check_timeout: maximum number of seconds to wait for the + demotion candidate (current primary) to shut down, before aborting the switchover (default: 60 seconds). + + + + In versions prior to &repmgr; 4.2, repmgr standby switchover would + use the values defined in reconnect_attempts and reconnect_interval + to determine the timeout for demotion candidate shutdown. + + + + standby_reconnect_timeout: - number of seconds to attempt to wait for the demoted primary + maximum number of seconds to wait for the demoted primary to reconnect to the promoted primary (default: 60 seconds) diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 6de5769b..e6f91e75 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -3666,13 +3666,14 @@ do_standby_switchover(void) /* loop for timeout waiting for current primary to stop */ - for (i = 0; i < config_file_options.reconnect_attempts; i++) + for (i = 0; i < config_file_options.shutdown_check_timeout; i++) { /* Check whether primary is available */ PGPing ping_res; - log_info(_("checking primary status; %i of %i attempts"), - i + 1, config_file_options.reconnect_attempts); + log_info(_("checking for primary shutdown; %i of %i attempts (\"shutdown_check_timeout\")"), + i + 1, config_file_options.shutdown_check_timeout); + ping_res = PQping(remote_conninfo); log_debug("ping status is: %s", print_pqping_status(ping_res)); @@ -3741,9 +3742,8 @@ do_standby_switchover(void) termPQExpBuffer(&command_output); } - log_debug("sleeping %i seconds (\"reconnect_interval\") until next check", 
config_file_options.reconnect_interval); - sleep(config_file_options.reconnect_interval); + log_debug("sleeping 1 second until next check"); + sleep(1); } if (shutdown_success == false) diff --git a/repmgr.conf.sample b/repmgr.conf.sample index b5b5d710..28296f40 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -231,6 +231,8 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh" # These settings apply when switching roles between a primary and a standby # ("repmgr standby switchover"). +#shutdown_check_timeout=60 # The max length of time (in seconds) to wait for the demotion + # candidate (current primary) to shut down #standby_reconnect_timeout=60 # The max length of time (in seconds) to wait # for the demoted standby to reconnect to the promoted # primary (note: this value should be equal to or greater diff --git a/repmgr.h b/repmgr.h index 1aad9684..8bf4ec4f 100644 --- a/repmgr.h +++ b/repmgr.h @@ -84,6 +84,7 @@ #define DEFAULT_WAIT_START 30 /* seconds */ #define DEFAULT_PROMOTE_CHECK_TIMEOUT 60 /* seconds */ #define DEFAULT_PROMOTE_CHECK_INTERVAL 1 /* seconds */ +#define DEFAULT_SHUTDOWN_CHECK_TIMEOUT 60 /* seconds */ #define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */ #define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */