From e10d9fd39353215042ac1f4cf3653ce5d57b8459 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 6 Oct 2020 13:22:38 +0900 Subject: [PATCH] EXPERIMENTAL: synchronise try_primary_reconnect()'s reconnection loop Per proposal in GitHub #662, this patch attempts to synchronise each repmgrd's primary reconnection attempts to prevent potential race conditions. This relies on each node's clock being correctly synchronised. Currently this change is experimental and is not enabled by default. It can be enabled by setting the repmgr.conf parameter "reconnect_loop_sync". --- configdata.c | 14 ++++++++++++++ configfile.h | 12 +++++++++++- repmgrd-physical.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/configdata.c b/configdata.c index d6cde08d..0e55a161 100644 --- a/configdata.c +++ b/configdata.c @@ -881,6 +881,20 @@ struct ConfigFileSetting config_file_settings[] = { .strmaxlen = sizeof(config_file_options.ssh_options) }, {} }, + /* ================================== + * undocumented experimental settings + * ================================== + */ + /* reconnect_loop_sync */ + { + "reconnect_loop_sync", + CONFIG_BOOL, + { .boolptr = &config_file_options.reconnect_loop_sync }, + { .booldefault = false }, + {}, + {}, + {} + }, /* ========================== * undocumented test settings * ========================== diff --git a/configfile.h b/configfile.h index 9cfe83ec..da2ae106 100644 --- a/configfile.h +++ b/configfile.h @@ -238,7 +238,17 @@ typedef struct char rsync_options[MAXLEN]; char ssh_options[MAXLEN]; - /* undocumented test settings */ + /* + * undocumented settings + * + * These settings are for testing or experimental features + * and may be changed without notice. 
+ */ + + /* experimental settings */ + bool reconnect_loop_sync; + + /* test settings */ int promote_delay; int failover_delay; char connection_check_query[MAXLEN]; diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 8b5ca337..9e412947 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -5377,6 +5377,11 @@ try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info) for (i = 0; i < max_attempts; i++) { + time_t started_at = time(NULL); + int up_to; + bool sleep_now = false; + bool max_sleep_seconds; + log_info(_("checking state of node \"%s\" (ID: %i), %i of %i attempts"), node_info->node_name, node_info->node_id, @@ -5442,12 +5447,31 @@ try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info) node_info->node_id); } - if (i + 1 < max_attempts) + /* + * Experimental behaviour, see GitHub #662. + */ + if (config_file_options.reconnect_loop_sync == true) + { + up_to = (time(NULL) - started_at); + max_sleep_seconds = (up_to == 0) + ? config_file_options.reconnect_interval + : (up_to % config_file_options.reconnect_interval); + if (i + 1 <= max_attempts) + sleep_now = true; + } + else + { + max_sleep_seconds = config_file_options.reconnect_interval; + if (i + 1 < max_attempts) + sleep_now = true; + } + + if (sleep_now == true) { int j; - log_info(_("sleeping %i seconds until next reconnection attempt"), - config_file_options.reconnect_interval); - for (j = 0; j < config_file_options.reconnect_interval; j++) + log_info(_("sleeping up to %i seconds until next reconnection attempt"), + max_sleep_seconds); + for (j = 0; j < max_sleep_seconds; j++) { int new_primary_node_id; if (get_new_primary(local_conn, &new_primary_node_id) == true && new_primary_node_id != UNKNOWN_NODE_ID)