From a3f90d2bbae29e69c575ca3ebc61933f227a74dd Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 6 Mar 2019 15:38:39 +0900 Subject: [PATCH] Add configuration option "sibling_nodes_disconnect_timeout" This controls the maximum length of time in seconds that repmgrd will wait for other standbys to disconnect their WAL receivers in a failover situation. This setting is only used when "standby_disconnect_on_failover" is set to "true". --- configfile.c | 3 +++ configfile.h | 3 ++- repmgr.conf.sample | 5 +++++ repmgr.h | 1 + repmgrd-physical.c | 10 ++++------ 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/configfile.c b/configfile.c index fdbfe5fc..82223151 100644 --- a/configfile.c +++ b/configfile.c @@ -359,6 +359,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */ memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file)); options->standby_disconnect_on_failover = false; + options->sibling_nodes_disconnect_timeout = DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT; options->connection_check_type = CHECK_PING; /*------------- @@ -622,6 +623,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * strncpy(options->repmgrd_pid_file, value, MAXPGPATH); else if (strcmp(name, "standby_disconnect_on_failover") == 0) options->standby_disconnect_on_failover = parse_bool(value, name, error_list); + else if (strcmp(name, "sibling_nodes_disconnect_timeout") == 0) + options->sibling_nodes_disconnect_timeout = repmgr_atoi(value, name, error_list, 0); else if (strcmp(name, "connection_check_type") == 0) { if (strcasecmp(value, "ping") == 0) diff --git a/configfile.h b/configfile.h index 319a5959..e4b257a5 100644 --- a/configfile.h +++ b/configfile.h @@ -142,6 +142,7 @@ typedef struct int repmgrd_standby_startup_timeout; char repmgrd_pid_file[MAXPGPATH]; bool standby_disconnect_on_failover; + int 
sibling_nodes_disconnect_timeout; ConnectionCheckType connection_check_type; /* BDR settings */ @@ -214,7 +215,7 @@ typedef struct false, -1, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ - -1, "", false, CHECK_PING, \ + -1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, CHECK_PING, \ /* BDR settings */ \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ /* service settings */ \ diff --git a/repmgr.conf.sample b/repmgr.conf.sample index c0f7edce..67edfcd0 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -322,6 +322,11 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh" # "--no-pid-file" will force PID file creation to be skipped. # Note: there is normally no need to set this, particularly if # repmgr was installed from packages. +#standby_disconnect_on_failover=false # If "true", in a failover situation wait for all standbys to + # disconnect their WAL receivers before electing a new primary +#sibling_nodes_disconnect_timeout=30 # If "standby_disconnect_on_failover" is "true", maximum length of time (in seconds) + # to wait for other standbys to confirm they have disconnected their + # WAL receivers #------------------------------------------------------------------------------ # service control commands diff --git a/repmgr.h b/repmgr.h index ceac63aa..b9e37abf 100644 --- a/repmgr.h +++ b/repmgr.h @@ -91,6 +91,7 @@ #define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */ #define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */ #define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */ +#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */ #define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */ diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 72e0abd1..f0341f1d 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -1989,8 +1989,6 @@ do_primary_failover(void) static NodeInfoList check_sibling_nodes = T_NODE_INFO_LIST_INITIALIZER; int i; - // XXX make configurable - int 
sibling_nodes_disconnect_timeout = 30; bool sibling_node_wal_receiver_connected = false; if (PQserverVersion(local_conn) < 90500) @@ -2016,7 +2014,7 @@ do_primary_failover(void) local_node_info.upstream_node_id, &check_sibling_nodes); - for (i = 0; i < sibling_nodes_disconnect_timeout; i++) + for (i = 0; i < config_file_options.sibling_nodes_disconnect_timeout; i++) { for (cell = check_sibling_nodes.head; cell; cell = cell->next) { @@ -2048,13 +2046,13 @@ do_primary_failover(void) } log_debug("sleeping %i of max %i seconds (\"sibling_nodes_disconnect_timeout\")", - i + 1, sibling_nodes_disconnect_timeout) - sleep(1); + i + 1, config_file_options.sibling_nodes_disconnect_timeout); + sleep(1); } if (sibling_node_wal_receiver_connected == true) { - // XXX what do we do here? abort or continue? make configurable? + /* TODO: prevent any such nodes becoming promotion candidates */ log_warning(_("WAL receiver still connected on at least one sibling node")); } else