mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
De-overload configuration file parameter "standby_reconnect_timeout"
Currently the (very generic sounding) "standby_reconnect_timeout" configuration file parameter is used in several different contexts and it would be useful to have more granular control over the different timeouts it's used to configure. This patch introduces "node_rejoin_timeout", used in place of "standby_reconnect_timeout" (which wasn't documented) when "repmgr node rejoin" is executed, to determine how long to wait for the node to rejoin the replication cluster. Additionally "repmgrd_standby_startup_timeout" is introduced as a timeout for failover situations, when repmgrd executes "repmgr standby follow" to follow a new primary, and waits for the standby to restart and become available for connections. "standby_reconnect_timeout" is now only relevant for "repmgr standby switchover". Implements GitHub #454.
This commit is contained in:
48
configfile.c
48
configfile.c
@@ -333,6 +333,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
||||||
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
|
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
|
||||||
|
|
||||||
|
/*------------------------
|
||||||
|
* standby switchover settings
|
||||||
|
*------------------------
|
||||||
|
*/
|
||||||
|
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
||||||
|
|
||||||
/*-----------------
|
/*-----------------
|
||||||
* repmgrd settings
|
* repmgrd settings
|
||||||
*-----------------
|
*-----------------
|
||||||
@@ -352,7 +358,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->degraded_monitoring_timeout = -1;
|
options->degraded_monitoring_timeout = -1;
|
||||||
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
|
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
|
||||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||||
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
|
||||||
|
|
||||||
/*-------------
|
/*-------------
|
||||||
* witness settings
|
* witness settings
|
||||||
@@ -539,6 +545,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
else if (strcmp(name, "standby_follow_timeout") == 0)
|
else if (strcmp(name, "standby_follow_timeout") == 0)
|
||||||
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
|
/* standby switchover settings */
|
||||||
|
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
||||||
|
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
|
/* node rejoin settings */
|
||||||
|
else if (strcmp(name, "node_rejoin_timeout") == 0)
|
||||||
|
options->node_rejoin_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* node check settings */
|
/* node check settings */
|
||||||
else if (strcmp(name, "archive_ready_warning") == 0)
|
else if (strcmp(name, "archive_ready_warning") == 0)
|
||||||
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
|
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
|
||||||
@@ -588,8 +602,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
else if (strcmp(name, "primary_notification_timeout") == 0)
|
else if (strcmp(name, "primary_notification_timeout") == 0)
|
||||||
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
else if (strcmp(name, "repmgrd_standby_startup_timeout") == 0)
|
||||||
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* witness settings */
|
/* witness settings */
|
||||||
else if (strcmp(name, "witness_sync_interval") == 0)
|
else if (strcmp(name, "witness_sync_interval") == 0)
|
||||||
@@ -771,6 +785,18 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
PQconninfoFree(conninfo_options);
|
PQconninfoFree(conninfo_options);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* set values for parameters which default to other parameters */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* From 4.1, "repmgrd_standby_startup_timeout" replaces "standby_reconnect_timeout"
|
||||||
|
* in repmgrd; fall back to "standby_reconnect_timeout" if no value explicitly provided
|
||||||
|
*/
|
||||||
|
if (options->repmgrd_standby_startup_timeout == -1)
|
||||||
|
{
|
||||||
|
options->repmgrd_standby_startup_timeout = options->standby_reconnect_timeout;
|
||||||
|
}
|
||||||
|
|
||||||
/* add warning about changed "barman_" parameter meanings */
|
/* add warning about changed "barman_" parameter meanings */
|
||||||
if ((options->barman_host[0] == '\0' && options->barman_server[0] != '\0') ||
|
if ((options->barman_host[0] == '\0' && options->barman_server[0] != '\0') ||
|
||||||
(options->barman_host[0] != '\0' && options->barman_server[0] == '\0'))
|
(options->barman_host[0] != '\0' && options->barman_server[0] == '\0'))
|
||||||
@@ -795,6 +821,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
item_list_append(error_list,
|
item_list_append(error_list,
|
||||||
_("\replication_lag_critical\" must be greater than \"replication_lag_warning\""));
|
_("\replication_lag_critical\" must be greater than \"replication_lag_warning\""));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (options->standby_reconnect_timeout < options->node_rejoin_timeout)
|
||||||
|
{
|
||||||
|
item_list_append(error_list,
|
||||||
|
_("\"standby_reconnect_timeout\" must be equal to or greater than \"node_rejoin_timeout\""));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1017,6 +1049,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
|||||||
* - promote_delay
|
* - promote_delay
|
||||||
* - reconnect_attempts
|
* - reconnect_attempts
|
||||||
* - reconnect_interval
|
* - reconnect_interval
|
||||||
|
* - repmgrd_standby_startup_timeout
|
||||||
* - retry_promote_interval_secs
|
* - retry_promote_interval_secs
|
||||||
*
|
*
|
||||||
* non-changeable options
|
* non-changeable options
|
||||||
@@ -1233,6 +1266,15 @@ reload_config(t_configuration_options *orig_options)
|
|||||||
config_changed = true;
|
config_changed = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* repmgrd_standby_startup_timeout */
|
||||||
|
if (orig_options->repmgrd_standby_startup_timeout != new_options.repmgrd_standby_startup_timeout)
|
||||||
|
{
|
||||||
|
orig_options->repmgrd_standby_startup_timeout = new_options.repmgrd_standby_startup_timeout;
|
||||||
|
log_info(_("\"repmgrd_standby_startup_timeout\" is now \"%i\""), new_options.repmgrd_standby_startup_timeout);
|
||||||
|
|
||||||
|
config_changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Handle changes to logging configuration
|
* Handle changes to logging configuration
|
||||||
*/
|
*/
|
||||||
|
|||||||
14
configfile.h
14
configfile.h
@@ -102,6 +102,12 @@ typedef struct
|
|||||||
int primary_follow_timeout;
|
int primary_follow_timeout;
|
||||||
int standby_follow_timeout;
|
int standby_follow_timeout;
|
||||||
|
|
||||||
|
/* standby switchover settings */
|
||||||
|
int standby_reconnect_timeout;
|
||||||
|
|
||||||
|
/* node rejoin settings */
|
||||||
|
int node_rejoin_timeout;
|
||||||
|
|
||||||
/* node check settings */
|
/* node check settings */
|
||||||
int archive_ready_warning;
|
int archive_ready_warning;
|
||||||
int archive_ready_critical;
|
int archive_ready_critical;
|
||||||
@@ -124,7 +130,7 @@ typedef struct
|
|||||||
int degraded_monitoring_timeout;
|
int degraded_monitoring_timeout;
|
||||||
int async_query_timeout;
|
int async_query_timeout;
|
||||||
int primary_notification_timeout;
|
int primary_notification_timeout;
|
||||||
int standby_reconnect_timeout;
|
int repmgrd_standby_startup_timeout;
|
||||||
|
|
||||||
/* BDR settings */
|
/* BDR settings */
|
||||||
bool bdr_local_monitoring_only;
|
bool bdr_local_monitoring_only;
|
||||||
@@ -173,6 +179,10 @@ typedef struct
|
|||||||
/* standby follow settings */ \
|
/* standby follow settings */ \
|
||||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||||
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
|
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
|
||||||
|
/* standby switchover settings */ \
|
||||||
|
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
||||||
|
/* node rejoin settings */ \
|
||||||
|
DEFAULT_NODE_REJOIN_TIMEOUT, \
|
||||||
/* node check settings */ \
|
/* node check settings */ \
|
||||||
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
|
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
|
||||||
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
||||||
@@ -186,7 +196,7 @@ typedef struct
|
|||||||
false, -1, \
|
false, -1, \
|
||||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||||
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
-1, \
|
||||||
/* BDR settings */ \
|
/* BDR settings */ \
|
||||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||||
/* service settings */ \
|
/* service settings */ \
|
||||||
|
|||||||
@@ -115,7 +115,24 @@
|
|||||||
|
|
||||||
</variablelist>
|
</variablelist>
|
||||||
</refsect1>
|
</refsect1>
|
||||||
|
<refsect1>
|
||||||
|
<title>Configuration file settings</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>node_rejoin_timeout</literal>:
|
||||||
|
the maximum length of time (in seconds) to wait for
|
||||||
|
the node to reconnect to the replication cluster (defaults to
|
||||||
|
the value set in <literal>standby_reconnect_timeout</literal>,
|
||||||
|
60 seconds).
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
</refsect1>
|
||||||
<refsect1>
|
<refsect1>
|
||||||
<title>Event notifications</title>
|
<title>Event notifications</title>
|
||||||
<para>
|
<para>
|
||||||
|
|||||||
@@ -154,8 +154,8 @@
|
|||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
<literal>standby_reconnect_timeout</literal>:
|
<literal>standby_reconnect_timeout</literal>:
|
||||||
Number of seconds to attempt to reconnect to the demoted primary
|
number of seconds to wait for the demoted primary
|
||||||
once it has been restarted.
|
to reconnect to the promoted primary (default: 60 seconds)
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
|||||||
@@ -2274,19 +2274,19 @@ do_node_rejoin(void)
|
|||||||
{
|
{
|
||||||
log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
|
log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
i + 1, config_file_options.standby_reconnect_timeout);
|
i + 1, config_file_options.node_rejoin_timeout);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
|
log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
i + 1, config_file_options.standby_reconnect_timeout);
|
i + 1, config_file_options.node_rejoin_timeout);
|
||||||
}
|
}
|
||||||
|
|
||||||
sleep(1);
|
sleep(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (; i < config_file_options.standby_reconnect_timeout; i++)
|
for (; i < config_file_options.node_rejoin_timeout; i++)
|
||||||
{
|
{
|
||||||
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
|
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
|
||||||
|
|
||||||
@@ -2301,13 +2301,13 @@ do_node_rejoin(void)
|
|||||||
{
|
{
|
||||||
log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"),
|
log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"),
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
i + 1, config_file_options.standby_reconnect_timeout);
|
i + 1, config_file_options.node_rejoin_timeout);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
|
log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
i + 1, config_file_options.standby_reconnect_timeout);
|
i + 1, config_file_options.node_rejoin_timeout);
|
||||||
}
|
}
|
||||||
|
|
||||||
sleep(1);
|
sleep(1);
|
||||||
|
|||||||
@@ -207,7 +207,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
|
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# Standby follow settings
|
# "standby follow" settings
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
|
|
||||||
# These settings apply when instructing a standby to follow the new primary
|
# These settings apply when instructing a standby to follow the new primary
|
||||||
@@ -219,6 +219,28 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
# for the standby to connect to the primary
|
# for the standby to connect to the primary
|
||||||
|
|
||||||
|
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
# "standby switchover" settings
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# These settings apply when switching roles between a primary and a standby
|
||||||
|
# ("repmgr standby switchover").
|
||||||
|
|
||||||
|
#standby_reconnect_timeout=60 # The max length of time (in seconds) to wait
|
||||||
|
# for the demoted primary to reconnect to the promoted
|
||||||
|
# primary (note: this value should be equal to or greater
|
||||||
|
# than that set for "node_rejoin_timeout")
|
||||||
|
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
# "node rejoin" settings
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# These settings apply when reintegrating a node into a replication cluster
|
||||||
|
# with "repmgr node rejoin"
|
||||||
|
|
||||||
|
#node_rejoin_timeout=60 # The maximum length of time (in seconds) to wait for
|
||||||
|
# the node to reconnect to the replication cluster
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# Barman options
|
# Barman options
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
@@ -265,8 +287,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
#primary_notification_timeout=60 # Interval (in seconds) which repmgrd on a standby
|
#primary_notification_timeout=60 # Interval (in seconds) which repmgrd on a standby
|
||||||
# will wait for a notification from the new primary,
|
# will wait for a notification from the new primary,
|
||||||
# before falling back to degraded monitoring
|
# before falling back to degraded monitoring
|
||||||
#standby_reconnect_timeout=60 # Interval (in seconds) which repmgrd on a standby will wait
|
#repmgrd_standby_startup_timeout=60 # Interval (in seconds) which repmgrd on a standby will wait
|
||||||
# to reconnect to the local node after executing "follow_command"
|
# for the local node to restart and become ready to accept connections after
|
||||||
|
# executing "follow_command" (defaults to the value set in "standby_reconnect_timeout")
|
||||||
|
|
||||||
#monitoring_history=no # Whether to write monitoring data to the "monitoring_history" table
|
#monitoring_history=no # Whether to write monitoring data to the "monitoring_history" table
|
||||||
#monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data
|
#monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data
|
||||||
|
|||||||
1
repmgr.h
1
repmgr.h
@@ -85,6 +85,7 @@
|
|||||||
#define DEFAULT_PROMOTE_CHECK_TIMEOUT 60 /* seconds */
|
#define DEFAULT_PROMOTE_CHECK_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_PROMOTE_CHECK_INTERVAL 1 /* seconds */
|
#define DEFAULT_PROMOTE_CHECK_INTERVAL 1 /* seconds */
|
||||||
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
||||||
|
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
||||||
|
|
||||||
#ifndef RECOVERY_COMMAND_FILE
|
#ifndef RECOVERY_COMMAND_FILE
|
||||||
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
||||||
|
|||||||
@@ -1941,7 +1941,7 @@ do_upstream_standby_failover(void)
|
|||||||
* completes, so poll for a while until we get a connection.
|
* completes, so poll for a while until we get a connection.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
|
||||||
{
|
{
|
||||||
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||||
|
|
||||||
@@ -1950,7 +1950,7 @@ do_upstream_standby_failover(void)
|
|||||||
|
|
||||||
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
|
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
|
||||||
i + 1,
|
i + 1,
|
||||||
config_file_options.standby_reconnect_timeout);
|
config_file_options.repmgrd_standby_startup_timeout);
|
||||||
sleep(1);
|
sleep(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2391,7 +2391,7 @@ follow_new_primary(int new_primary_id)
|
|||||||
* completes, so poll for a while until we get a connection.
|
* completes, so poll for a while until we get a connection.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
|
||||||
{
|
{
|
||||||
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||||
|
|
||||||
@@ -2400,7 +2400,7 @@ follow_new_primary(int new_primary_id)
|
|||||||
|
|
||||||
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
|
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
|
||||||
i + 1,
|
i + 1,
|
||||||
config_file_options.standby_reconnect_timeout);
|
config_file_options.repmgrd_standby_startup_timeout);
|
||||||
sleep(1);
|
sleep(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user