From 68a9745e7e08835e6a4f8953bb63b895e92a80f1 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 7 Jun 2018 14:38:15 +0900 Subject: [PATCH] standby follow: check node has connected to new primary After restarting the standby, poll pg_stat_replication on the upstream until the standby connects, and exit with an error if it doesn't by the timeout defined in "standby_follow_timeout". Implements GitHub #444. --- configfile.c | 20 ++++++-- configfile.h | 9 +++- doc/repmgr-standby-follow.sgml | 16 +++++-- repmgr-action-standby.c | 83 +++++++++++++++++++++++++++++----- repmgr.conf.sample | 4 +- repmgr.h | 1 + 6 files changed, 109 insertions(+), 24 deletions(-) diff --git a/configfile.c b/configfile.c index 03017074..f7d81178 100644 --- a/configfile.c +++ b/configfile.c @@ -319,13 +319,20 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->use_primary_conninfo_password = false; memset(options->passfile, 0, sizeof(options->passfile)); - /*----------------------- + /*------------------------- * standby promote settings - *------------------------ + *------------------------- */ options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT; options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL; + /*------------------------ + * standby follow settings + *------------------------ + */ + options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT; + options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT; + /*----------------- * repmgrd settings *----------------- @@ -345,7 +352,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->degraded_monitoring_timeout = -1; options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT; options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT; - options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT; options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT; /*------------- @@ -527,6 +533,12 
@@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * else if (strcmp(name, "promote_check_interval") == 0) options->promote_check_interval = repmgr_atoi(value, name, error_list, 1); + /* standby follow settings */ + else if (strcmp(name, "primary_follow_timeout") == 0) + options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0); + else if (strcmp(name, "standby_follow_timeout") == 0) + options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0); + /* node check settings */ else if (strcmp(name, "archive_ready_warning") == 0) options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1); @@ -576,8 +588,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->async_query_timeout = repmgr_atoi(value, name, error_list, 0); else if (strcmp(name, "primary_notification_timeout") == 0) options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0); - else if (strcmp(name, "primary_follow_timeout") == 0) - options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0); else if (strcmp(name, "standby_reconnect_timeout") == 0) options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0); diff --git a/configfile.h b/configfile.h index d86aa907..abfd6d0c 100644 --- a/configfile.h +++ b/configfile.h @@ -98,6 +98,10 @@ typedef struct int promote_check_timeout; int promote_check_interval; + /* standby follow settings */ + int primary_follow_timeout; + int standby_follow_timeout; + /* node check settings */ int archive_ready_warning; int archive_ready_critical; @@ -120,7 +124,6 @@ typedef struct int degraded_monitoring_timeout; int async_query_timeout; int primary_notification_timeout; - int primary_follow_timeout; int standby_reconnect_timeout; /* BDR settings */ @@ -167,6 +170,9 @@ typedef struct false, "", "", { NULL, NULL }, "", false, "", false, "", \ /* standby promote settings */ \ DEFAULT_PROMOTE_CHECK_TIMEOUT, 
DEFAULT_PROMOTE_CHECK_INTERVAL, \ + /* standby follow settings */ \ + DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \ + DEFAULT_STANDBY_FOLLOW_TIMEOUT, \ /* node check settings */ \ DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \ DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \ @@ -180,7 +186,6 @@ typedef struct false, -1, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ - DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \ DEFAULT_STANDBY_RECONNECT_TIMEOUT, \ /* BDR settings */ \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ diff --git a/doc/repmgr-standby-follow.sgml b/doc/repmgr-standby-follow.sgml index e7a07306..2245ecf5 100644 --- a/doc/repmgr-standby-follow.sgml +++ b/doc/repmgr-standby-follow.sgml @@ -26,10 +26,18 @@ running. It can only be used to attach an active standby to the current primary node (and not to another standby). - - To re-add an inactive node to the replication cluster, see - - + + + To re-add an inactive node to the replication cluster, use + . + + + + + repmgr standby follow will wait up to + standby_follow_timeout seconds (default: 30) + to verify the standby has actually connected to the new primary. 
+ diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index c382232c..efd350b3 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -2352,6 +2352,74 @@ do_standby_follow(void) &follow_output, &follow_error_code); + /* unable to restart the standby */ + if (success == false) + { + create_event_notification_extended( + primary_conn, + &config_file_options, + config_file_options.node_id, + "standby_follow", + success, + follow_output.data, + &event_info); + + PQfinish(primary_conn); + + log_notice(_("STANDBY FOLLOW failed")); + if (strlen( follow_output.data )) + log_detail("%s", follow_output.data); + + termPQExpBuffer(&follow_output); + exit(follow_error_code); + } + + termPQExpBuffer(&follow_output); + + initPQExpBuffer(&follow_output); + + /* + * Wait up to "standby_follow_timeout" seconds for standby to connect to + * upstream. + * For 9.6 and later, we could check pg_stat_wal_receiver on the local node. + */ + + /* assume success, necessary if standby_follow_timeout is zero */ + success = true; + + for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++) + { + success = is_downstream_node_attached(primary_conn, config_file_options.node_name); + if (success == true) + break; + + log_verbose(LOG_DEBUG, "sleeping %i of max %i seconds waiting for standby to attach to primary", + timer + 1, + config_file_options.standby_follow_timeout); + sleep(1); + } + + if (success == true) + { + log_notice(_("STANDBY FOLLOW successful")); + appendPQExpBuffer(&follow_output, + "standby attached to upstream node \"%s\" (node ID: %i)", + primary_node_record.node_name, + primary_node_id); + } + else + { + log_error(_("STANDBY FOLLOW failed")); + appendPQExpBuffer(&follow_output, + "standby did not attach to upstream node \"%s\" (node ID: %i) after %i seconds", + primary_node_record.node_name, + primary_node_id, + config_file_options.standby_follow_timeout); + + } + + log_detail("%s", follow_output.data); + 
create_event_notification_extended( primary_conn, &config_file_options, @@ -2363,20 +2431,11 @@ do_standby_follow(void) PQfinish(primary_conn); - if (success == false) - { - log_notice(_("STANDBY FOLLOW failed")); - log_detail("%s", follow_output.data); - - termPQExpBuffer(&follow_output); - exit(follow_error_code); - } - - log_notice(_("STANDBY FOLLOW successful")); - log_detail("%s", follow_output.data); - termPQExpBuffer(&follow_output); + if (success == false) + exit(ERR_FOLLOW_FAIL); + return; } diff --git a/repmgr.conf.sample b/repmgr.conf.sample index 84611173..35deee44 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -213,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh" # These settings apply when instructing a standby to follow the new primary # ("repmgr standby follow"). -#primary_follow_timeout=60 # The length of time (in seconds) to wait +#primary_follow_timeout=60 # The max length of time (in seconds) to wait # for the new primary to become available +#standby_follow_timeout=30 # The max length of time (in seconds) to wait + # for the standby to connect to the primary #------------------------------------------------------------------------------ diff --git a/repmgr.h b/repmgr.h index a9f4043b..392139d6 100644 --- a/repmgr.h +++ b/repmgr.h @@ -70,6 +70,7 @@ #define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */ #define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */ #define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */ +#define DEFAULT_STANDBY_FOLLOW_TIMEOUT 30 /* seconds */ #define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */ #define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */ #define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */