De-overload configuration file parameter "standby_reconnect_timeout"

Currently the (very generic sounding) "standby_reconnect_timeout" configuration
file parameter is used in several different contexts and it would be useful
to have more granular control over the different timeouts it's used to configure.

This patch introduces "node_rejoin_timeout", used in place of "standby_reconnect_timeout"
(which wasn't documented) when "repmgr node rejoin" is executed, to determine
how long to wait for the node to rejoin the replication cluster.

Additionally "repmgrd_standby_startup_timeout" is introduced as a timeout for
failover situations, when repmgrd executes "repmgr standby follow" to follow
a new primary, and waits for the standby to restart and become available
for connections.

"standby_reconnect_timeout" is now only relevant for "repmgr standby switchover".

Implements GitHub #454.
This commit is contained in:
Ian Barwick
2018-06-28 17:53:17 +09:00
parent 080a29c33b
commit b2081dca52
8 changed files with 112 additions and 19 deletions

View File

@@ -333,6 +333,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT; options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT; options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
/*------------------------
* standby switchover settings
*------------------------
*/
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
/*----------------- /*-----------------
* repmgrd settings * repmgrd settings
*----------------- *-----------------
@@ -352,7 +358,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->degraded_monitoring_timeout = -1; options->degraded_monitoring_timeout = -1;
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT; options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT; options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT; options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
/*------------- /*-------------
* witness settings * witness settings
@@ -539,6 +545,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
else if (strcmp(name, "standby_follow_timeout") == 0) else if (strcmp(name, "standby_follow_timeout") == 0)
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0); options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
/* standby switchover settings */
else if (strcmp(name, "standby_reconnect_timeout") == 0)
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
/* node rejoin settings */
else if (strcmp(name, "node_rejoin_timeout") == 0)
options->node_rejoin_timeout = repmgr_atoi(value, name, error_list, 0);
/* node check settings */ /* node check settings */
else if (strcmp(name, "archive_ready_warning") == 0) else if (strcmp(name, "archive_ready_warning") == 0)
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1); options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
@@ -588,8 +602,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0); options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "primary_notification_timeout") == 0) else if (strcmp(name, "primary_notification_timeout") == 0)
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0); options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "standby_reconnect_timeout") == 0) else if (strcmp(name, "repmgrd_standby_startup_timeout") == 0)
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0); options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
/* witness settings */ /* witness settings */
else if (strcmp(name, "witness_sync_interval") == 0) else if (strcmp(name, "witness_sync_interval") == 0)
@@ -771,6 +785,18 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
PQconninfoFree(conninfo_options); PQconninfoFree(conninfo_options);
} }
/* set values for parameters which default to other parameters */
/*
* From 4.1, "repmgrd_standby_startup_timeout" replaces "standby_reconnect_timeout"
* in repmgrd; fall back to "standby_reconnect_timeout" if no value explicitly provided
*/
if (options->repmgrd_standby_startup_timeout == -1)
{
options->repmgrd_standby_startup_timeout = options->standby_reconnect_timeout;
}
/* add warning about changed "barman_" parameter meanings */ /* add warning about changed "barman_" parameter meanings */
if ((options->barman_host[0] == '\0' && options->barman_server[0] != '\0') || if ((options->barman_host[0] == '\0' && options->barman_server[0] != '\0') ||
(options->barman_host[0] != '\0' && options->barman_server[0] == '\0')) (options->barman_host[0] != '\0' && options->barman_server[0] == '\0'))
@@ -795,6 +821,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
item_list_append(error_list, item_list_append(error_list,
_("\replication_lag_critical\" must be greater than \"replication_lag_warning\"")); _("\replication_lag_critical\" must be greater than \"replication_lag_warning\""));
} }
if (options->standby_reconnect_timeout < options->node_rejoin_timeout)
{
item_list_append(error_list,
_("\"standby_reconnect_timeout\" must be equal to or greater than \"node_rejoin_timeout\""));
}
} }
@@ -1017,6 +1049,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
* - promote_delay * - promote_delay
* - reconnect_attempts * - reconnect_attempts
* - reconnect_interval * - reconnect_interval
* - repmgrd_standby_startup_timeout
* - retry_promote_interval_secs * - retry_promote_interval_secs
* *
* non-changeable options * non-changeable options
@@ -1233,6 +1266,15 @@ reload_config(t_configuration_options *orig_options)
config_changed = true; config_changed = true;
} }
/* repmgrd_standby_startup_timeout */
if (orig_options->repmgrd_standby_startup_timeout != new_options.repmgrd_standby_startup_timeout)
{
orig_options->repmgrd_standby_startup_timeout = new_options.repmgrd_standby_startup_timeout;
log_info(_("\"repmgrd_standby_startup_timeout\" is now \"%i\""), new_options.repmgrd_standby_startup_timeout);
config_changed = true;
}
/* /*
* Handle changes to logging configuration * Handle changes to logging configuration
*/ */

View File

@@ -102,6 +102,12 @@ typedef struct
int primary_follow_timeout; int primary_follow_timeout;
int standby_follow_timeout; int standby_follow_timeout;
/* standby switchover settings */
int standby_reconnect_timeout;
/* node rejoin settings */
int node_rejoin_timeout;
/* node check settings */ /* node check settings */
int archive_ready_warning; int archive_ready_warning;
int archive_ready_critical; int archive_ready_critical;
@@ -124,7 +130,7 @@ typedef struct
int degraded_monitoring_timeout; int degraded_monitoring_timeout;
int async_query_timeout; int async_query_timeout;
int primary_notification_timeout; int primary_notification_timeout;
int standby_reconnect_timeout; int repmgrd_standby_startup_timeout;
/* BDR settings */ /* BDR settings */
bool bdr_local_monitoring_only; bool bdr_local_monitoring_only;
@@ -173,6 +179,10 @@ typedef struct
/* standby follow settings */ \ /* standby follow settings */ \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \ DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \ DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
/* standby switchover settings */ \
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
/* node rejoin settings */ \
DEFAULT_NODE_REJOIN_TIMEOUT, \
/* node check settings */ \ /* node check settings */ \
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \ DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \ DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
@@ -186,7 +196,7 @@ typedef struct
false, -1, \ false, -1, \
DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \ -1, \
/* BDR settings */ \ /* BDR settings */ \
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
/* service settings */ \ /* service settings */ \

View File

@@ -115,7 +115,24 @@
</variablelist> </variablelist>
</refsect1> </refsect1>
<refsect1>
<title>Configuration file settings</title>
<para>
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara>
<literal>node_rejoin_timeout</literal>:
the maximum length of time (in seconds) to wait for
the node to reconnect to the replication cluster (default: 60 seconds;
must not exceed the value set in
<literal>standby_reconnect_timeout</literal>).
</simpara>
</listitem>
</itemizedlist>
</para>
</refsect1>
<refsect1> <refsect1>
<title>Event notifications</title> <title>Event notifications</title>
<para> <para>

View File

@@ -154,8 +154,8 @@
<listitem> <listitem>
<simpara> <simpara>
<literal>standby_reconnect_timeout</literal>: <literal>standby_reconnect_timeout</literal>:
Number of seconds to attempt to reconnect to the demoted primary number of seconds to wait for the demoted primary
once it has been restarted. to reconnect to the promoted primary (default: 60 seconds)
</simpara> </simpara>
</listitem> </listitem>

View File

@@ -2274,19 +2274,19 @@ do_node_rejoin(void)
{ {
log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"), log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
config_file_options.node_id, config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout); i + 1, config_file_options.node_rejoin_timeout);
} }
else else
{ {
log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts", log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
config_file_options.node_id, config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout); i + 1, config_file_options.node_rejoin_timeout);
} }
sleep(1); sleep(1);
} }
for (; i < config_file_options.standby_reconnect_timeout; i++) for (; i < config_file_options.node_rejoin_timeout; i++)
{ {
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name); success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
@@ -2301,13 +2301,13 @@ do_node_rejoin(void)
{ {
log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"), log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"),
config_file_options.node_id, config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout); i + 1, config_file_options.node_rejoin_timeout);
} }
else else
{ {
log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts", log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
config_file_options.node_id, config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout); i + 1, config_file_options.node_rejoin_timeout);
} }
sleep(1); sleep(1);

View File

@@ -207,7 +207,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Standby follow settings # "standby follow" settings
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# These settings apply when instructing a standby to follow the new primary # These settings apply when instructing a standby to follow the new primary
@@ -219,6 +219,28 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# for the standby to connect to the primary # for the standby to connect to the primary
#------------------------------------------------------------------------------
# "standby switchover" settings
#------------------------------------------------------------------------------
# These settings apply when switching roles between a primary and a standby
# ("repmgr standby switchover").
#standby_reconnect_timeout=60 # The max length of time (in seconds) to wait
# for the demoted primary to reconnect to the promoted
# primary (note: this value should be equal to or greater
# than that set for "node_rejoin_timeout")
#------------------------------------------------------------------------------
# "node rejoin" settings
#------------------------------------------------------------------------------
# These settings apply when reintegrating a node into a replication cluster
# with "repmgr node rejoin"
#node_rejoin_timeout=60 # The maximum length of time (in seconds) to wait for
# the node to reconnect to the replication cluster
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Barman options # Barman options
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
@@ -265,8 +287,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
#primary_notification_timeout=60 # Interval (in seconds) which repmgrd on a standby #primary_notification_timeout=60 # Interval (in seconds) which repmgrd on a standby
# will wait for a notification from the new primary, # will wait for a notification from the new primary,
# before falling back to degraded monitoring # before falling back to degraded monitoring
#standby_reconnect_timeout=60 # Interval (in seconds) which repmgrd on a standby will wait #repmgrd_standby_startup_timeout=60 # Interval (in seconds) which repmgrd on a standby will wait
# to reconnect to the local node after executing "follow_command" # for the local node to restart and become ready to accept connections after
# executing "follow_command" (defaults to the value set in "standby_reconnect_timeout")
#monitoring_history=no # Whether to write monitoring data to the "monitoring_history" table #monitoring_history=no # Whether to write monitoring data to the "monitoring_history" table
#monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data #monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data

View File

@@ -85,6 +85,7 @@
#define DEFAULT_PROMOTE_CHECK_TIMEOUT 60 /* seconds */ #define DEFAULT_PROMOTE_CHECK_TIMEOUT 60 /* seconds */
#define DEFAULT_PROMOTE_CHECK_INTERVAL 1 /* seconds */ #define DEFAULT_PROMOTE_CHECK_INTERVAL 1 /* seconds */
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */ #define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
#ifndef RECOVERY_COMMAND_FILE #ifndef RECOVERY_COMMAND_FILE
#define RECOVERY_COMMAND_FILE "recovery.conf" #define RECOVERY_COMMAND_FILE "recovery.conf"

View File

@@ -1941,7 +1941,7 @@ do_upstream_standby_failover(void)
* completes, so poll for a while until we get a connection. * completes, so poll for a while until we get a connection.
*/ */
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++) for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
{ {
local_conn = establish_db_connection(local_node_info.conninfo, false); local_conn = establish_db_connection(local_node_info.conninfo, false);
@@ -1950,7 +1950,7 @@ do_upstream_standby_failover(void)
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node", log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
i + 1, i + 1,
config_file_options.standby_reconnect_timeout); config_file_options.repmgrd_standby_startup_timeout);
sleep(1); sleep(1);
} }
@@ -2391,7 +2391,7 @@ follow_new_primary(int new_primary_id)
* completes, so poll for a while until we get a connection. * completes, so poll for a while until we get a connection.
*/ */
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++) for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
{ {
local_conn = establish_db_connection(local_node_info.conninfo, false); local_conn = establish_db_connection(local_node_info.conninfo, false);
@@ -2400,7 +2400,7 @@ follow_new_primary(int new_primary_id)
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node", log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
i + 1, i + 1,
config_file_options.standby_reconnect_timeout); config_file_options.repmgrd_standby_startup_timeout);
sleep(1); sleep(1);
} }