diff --git a/configfile.c b/configfile.c index 76a8b542..367938fc 100644 --- a/configfile.c +++ b/configfile.c @@ -333,6 +333,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT; options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT; + /*------------------------ + * standby switchover settings + *------------------------ + */ + options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT; + /*----------------- * repmgrd settings *----------------- @@ -352,7 +358,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->degraded_monitoring_timeout = -1; options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT; options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT; - options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT; + options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */ /*------------- * witness settings @@ -539,6 +545,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * else if (strcmp(name, "standby_follow_timeout") == 0) options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0); + /* standby switchover settings */ + else if (strcmp(name, "standby_reconnect_timeout") == 0) + options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0); + + /* node rejoin settings */ + else if (strcmp(name, "node_rejoin_timeout") == 0) + options->node_rejoin_timeout = repmgr_atoi(value, name, error_list, 0); + /* node check settings */ else if (strcmp(name, "archive_ready_warning") == 0) options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1); @@ -588,8 +602,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->async_query_timeout = repmgr_atoi(value, name, error_list, 0); else if (strcmp(name, 
"primary_notification_timeout") == 0) options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0); - else if (strcmp(name, "standby_reconnect_timeout") == 0) - options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0); + else if (strcmp(name, "repmgrd_standby_startup_timeout") == 0) + options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0); /* witness settings */ else if (strcmp(name, "witness_sync_interval") == 0) @@ -771,6 +785,18 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * PQconninfoFree(conninfo_options); } + + /* set values for parameters which default to other parameters */ + + /* + * From 4.1, "repmgrd_standby_startup_timeout" replaces "standby_reconnect_timeout" + * in repmgrd; fall back to "standby_reconnect_timeout" if no value explicitly provided + */ + if (options->repmgrd_standby_startup_timeout == -1) + { + options->repmgrd_standby_startup_timeout = options->standby_reconnect_timeout; + } + /* add warning about changed "barman_" parameter meanings */ if ((options->barman_host[0] == '\0' && options->barman_server[0] != '\0') || (options->barman_host[0] != '\0' && options->barman_server[0] == '\0')) @@ -795,6 +821,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * item_list_append(error_list, _("\replication_lag_critical\" must be greater than \"replication_lag_warning\"")); } + + if (options->standby_reconnect_timeout < options->node_rejoin_timeout) + { + item_list_append(error_list, + _("\"standby_reconnect_timeout\" must be equal to or greater than \"node_rejoin_timeout\"")); + } } @@ -1017,6 +1049,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL * - promote_delay * - reconnect_attempts * - reconnect_interval + * - repmgrd_standby_startup_timeout * - retry_promote_interval_secs * * non-changeable options @@ -1233,6 +1266,15 @@ reload_config(t_configuration_options 
*orig_options) config_changed = true; } + /* repmgrd_standby_startup_timeout */ + if (orig_options->repmgrd_standby_startup_timeout != new_options.repmgrd_standby_startup_timeout) + { + orig_options->repmgrd_standby_startup_timeout = new_options.repmgrd_standby_startup_timeout; + log_info(_("\"repmgrd_standby_startup_timeout\" is now \"%i\""), new_options.repmgrd_standby_startup_timeout); + + config_changed = true; + } + /* * Handle changes to logging configuration */ diff --git a/configfile.h b/configfile.h index abfd6d0c..2119478c 100644 --- a/configfile.h +++ b/configfile.h @@ -102,6 +102,12 @@ typedef struct int primary_follow_timeout; int standby_follow_timeout; + /* standby switchover settings */ + int standby_reconnect_timeout; + + /* node rejoin settings */ + int node_rejoin_timeout; + /* node check settings */ int archive_ready_warning; int archive_ready_critical; @@ -124,7 +130,7 @@ typedef struct int degraded_monitoring_timeout; int async_query_timeout; int primary_notification_timeout; - int standby_reconnect_timeout; + int repmgrd_standby_startup_timeout; /* BDR settings */ bool bdr_local_monitoring_only; @@ -173,6 +179,10 @@ typedef struct /* standby follow settings */ \ DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \ DEFAULT_STANDBY_FOLLOW_TIMEOUT, \ + /* standby switchover settings */ \ + DEFAULT_STANDBY_RECONNECT_TIMEOUT, \ + /* node rejoin settings */ \ + DEFAULT_NODE_REJOIN_TIMEOUT, \ /* node check settings */ \ DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \ DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \ @@ -186,7 +196,7 @@ typedef struct false, -1, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ - DEFAULT_STANDBY_RECONNECT_TIMEOUT, \ + -1, \ /* BDR settings */ \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ /* service settings */ \ diff --git a/doc/repmgr-node-rejoin.sgml b/doc/repmgr-node-rejoin.sgml index f8015ce1..fd9928af 100644 --- a/doc/repmgr-node-rejoin.sgml +++ b/doc/repmgr-node-rejoin.sgml @@ 
-115,7 +115,24 @@ + + Configuration file settings + + + + + node_rejoin_timeout: + the maximum length of time (in seconds) to wait for + the node to reconnect to the replication cluster (default: + 60 seconds; must not be greater than the value set in + standby_reconnect_timeout). + + + + + + Event notifications diff --git a/doc/repmgr-standby-switchover.sgml b/doc/repmgr-standby-switchover.sgml index a063d421..da401c66 100644 --- a/doc/repmgr-standby-switchover.sgml +++ b/doc/repmgr-standby-switchover.sgml @@ -154,8 +154,8 @@ standby_reconnect_timeout: - Number of seconds to attempt to reconnect to the demoted primary - once it has been restarted. + number of seconds to wait for the demoted primary + to reconnect to the promoted primary (default: 60 seconds) diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 81cf72c0..07389e36 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -2274,19 +2274,19 @@ do_node_rejoin(void) { log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"), config_file_options.node_id, - i + 1, config_file_options.standby_reconnect_timeout); + i + 1, config_file_options.node_rejoin_timeout); } else { log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts", config_file_options.node_id, - i + 1, config_file_options.standby_reconnect_timeout); + i + 1, config_file_options.node_rejoin_timeout); } sleep(1); } - for (; i < config_file_options.standby_reconnect_timeout; i++) + for (; i < config_file_options.node_rejoin_timeout; i++) { success = is_downstream_node_attached(upstream_conn, config_file_options.node_name); @@ -2301,13 +2301,13 @@ do_node_rejoin(void) { log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"), config_file_options.node_id, - i + 1, config_file_options.standby_reconnect_timeout); + i + 1, config_file_options.node_rejoin_timeout); } else { log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i 
of max %i attempts", config_file_options.node_id, - i + 1, config_file_options.standby_reconnect_timeout); + i + 1, config_file_options.node_rejoin_timeout); } sleep(1); diff --git a/repmgr.conf.sample b/repmgr.conf.sample index 35deee44..8a178c78 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -207,7 +207,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh" #------------------------------------------------------------------------------ -# Standby follow settings +# "standby follow" settings #------------------------------------------------------------------------------ # These settings apply when instructing a standby to follow the new primary @@ -219,6 +219,28 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh" # for the standby to connect to the primary +#------------------------------------------------------------------------------ +# "standby switchover" settings +#------------------------------------------------------------------------------ + +# These settings apply when switching roles between a primary and a standby +# ("repmgr standby switchover"). 
+ +#standby_reconnect_timeout=60 # The max length of time (in seconds) to wait + # for the demoted primary to reconnect to the promoted + # primary (note: this value should be equal to or greater + # than that set for "node_rejoin_timeout") + +#------------------------------------------------------------------------------ +# "node rejoin" settings +#------------------------------------------------------------------------------ + +# These settings apply when reintegrating a node into a replication cluster +# with "repmgr node rejoin" + +#node_rejoin_timeout=60 # The maximum length of time (in seconds) to wait for + # the node to reconnect to the replication cluster + #------------------------------------------------------------------------------ # Barman options #------------------------------------------------------------------------------ @@ -265,8 +287,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh" #primary_notification_timeout=60 # Interval (in seconds) which repmgrd on a standby # will wait for a notification from the new primary, # before falling back to degraded monitoring -#standby_reconnect_timeout=60 # Interval (in seconds) which repmgrd on a standby will wait - # to reconnect to the local node after executing "follow_command" +#repmgrd_standby_startup_timeout=60 # Interval (in seconds) which repmgrd on a standby will wait + # for the local node to restart and become ready to accept connections after + # executing "follow_command" (defaults to the value set in "standby_reconnect_timeout") #monitoring_history=no # Whether to write monitoring data to the "montoring_history" table #monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data diff --git a/repmgr.h b/repmgr.h index 8bdac74a..1aad9684 100644 --- a/repmgr.h +++ b/repmgr.h @@ -85,6 +85,7 @@ #define DEFAULT_PROMOTE_CHECK_TIMEOUT 60 /* seconds */ #define DEFAULT_PROMOTE_CHECK_INTERVAL 1 /* seconds */ #define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* 
seconds */ +#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */ #ifndef RECOVERY_COMMAND_FILE #define RECOVERY_COMMAND_FILE "recovery.conf" diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 26da52d4..55af0e78 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -1941,7 +1941,7 @@ do_upstream_standby_failover(void) * completes, so poll for a while until we get a connection. */ - for (i = 0; i < config_file_options.standby_reconnect_timeout; i++) + for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++) { local_conn = establish_db_connection(local_node_info.conninfo, false); @@ -1950,7 +1950,7 @@ do_upstream_standby_failover(void) log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node", i + 1, - config_file_options.standby_reconnect_timeout); + config_file_options.repmgrd_standby_startup_timeout); sleep(1); } @@ -2391,7 +2391,7 @@ follow_new_primary(int new_primary_id) * completes, so poll for a while until we get a connection. */ - for (i = 0; i < config_file_options.standby_reconnect_timeout; i++) + for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++) { local_conn = establish_db_connection(local_node_info.conninfo, false); @@ -2400,7 +2400,7 @@ follow_new_primary(int new_primary_id) log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node", i + 1, - config_file_options.standby_reconnect_timeout); + config_file_options.repmgrd_standby_startup_timeout); sleep(1); }