mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Add configuration option "sibling_nodes_disconnect_timeout"
This controls the maximum length of time in seconds that repmgrd will wait for other standbys to disconnect their WAL receivers in a failover situation. This setting is only used when "standby_disconnect_on_failover" is set to "true".
This commit is contained in:
@@ -359,6 +359,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
|
||||
memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
|
||||
options->standby_disconnect_on_failover = false;
|
||||
options->sibling_nodes_disconnect_timeout = DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT;
|
||||
options->connection_check_type = CHECK_PING;
|
||||
|
||||
/*-------------
|
||||
@@ -622,6 +623,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
|
||||
else if (strcmp(name, "standby_disconnect_on_failover") == 0)
|
||||
options->standby_disconnect_on_failover = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "sibling_nodes_disconnect_timeout") == 0)
|
||||
options->sibling_nodes_disconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "connection_check_type") == 0)
|
||||
{
|
||||
if (strcasecmp(value, "ping") == 0)
|
||||
|
||||
@@ -142,6 +142,7 @@ typedef struct
|
||||
int repmgrd_standby_startup_timeout;
|
||||
char repmgrd_pid_file[MAXPGPATH];
|
||||
bool standby_disconnect_on_failover;
|
||||
int sibling_nodes_disconnect_timeout;
|
||||
ConnectionCheckType connection_check_type;
|
||||
|
||||
/* BDR settings */
|
||||
@@ -214,7 +215,7 @@ typedef struct
|
||||
false, -1, \
|
||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
-1, "", false, CHECK_PING, \
|
||||
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, CHECK_PING, \
|
||||
/* BDR settings */ \
|
||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||
/* service settings */ \
|
||||
|
||||
@@ -322,6 +322,11 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# "--no-pid-file" will force PID file creation to be skipped.
|
||||
# Note: there is normally no need to set this, particularly if
|
||||
# repmgr was installed from packages.
|
||||
#standby_disconnect_on_failover=false # If "true", in a failover situation wait for all standbys to
|
||||
# disconnect their WAL receivers before electing a new primary
|
||||
#sibling_nodes_disconnect_timeout=30 # If "standby_disconnect_on_failover" is "true", the maximum length of time (in seconds)
|
||||
# to wait for other standbys to confirm they have disconnected their
|
||||
# WAL receivers
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# service control commands
|
||||
|
||||
1
repmgr.h
1
repmgr.h
@@ -91,6 +91,7 @@
|
||||
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
|
||||
|
||||
#define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */
|
||||
|
||||
|
||||
@@ -1989,8 +1989,6 @@ do_primary_failover(void)
|
||||
static NodeInfoList check_sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
int i;
|
||||
|
||||
// XXX make configurable
|
||||
int sibling_nodes_disconnect_timeout = 30;
|
||||
bool sibling_node_wal_receiver_connected = false;
|
||||
|
||||
if (PQserverVersion(local_conn) < 90500)
|
||||
@@ -2016,7 +2014,7 @@ do_primary_failover(void)
|
||||
local_node_info.upstream_node_id,
|
||||
&check_sibling_nodes);
|
||||
|
||||
for (i = 0; i < sibling_nodes_disconnect_timeout; i++)
|
||||
for (i = 0; i < config_file_options.sibling_nodes_disconnect_timeout; i++)
|
||||
{
|
||||
for (cell = check_sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
@@ -2048,13 +2046,13 @@ do_primary_failover(void)
|
||||
}
|
||||
|
||||
log_debug("sleeping %i of max %i seconds (\"sibling_nodes_disconnect_timeout\")",
|
||||
i + 1, sibling_nodes_disconnect_timeout)
|
||||
i + 1, config_file_options.sibling_nodes_disconnect_timeout);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
if (sibling_node_wal_receiver_connected == true)
|
||||
{
|
||||
// XXX what do we do here? abort or continue? make configurable?
|
||||
/* TODO: prevent any such nodes becoming promotion candidates */
|
||||
log_warning(_("WAL receiver still connected on at least one sibling node"));
|
||||
}
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user