standby follow: check node has connect to new primary

After restarting the standby, poll pg_stat_replication on the upstream
until the standby connects, and exit with an error if it doesn't by the
timeout defined in "standby_follow_timeout".

Implments GitHub #444.
This commit is contained in:
Ian Barwick
2018-06-07 14:38:15 +09:00
parent e12fbb7b4d
commit efc388065e
6 changed files with 109 additions and 24 deletions

View File

@@ -319,13 +319,20 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->use_primary_conninfo_password = false; options->use_primary_conninfo_password = false;
memset(options->passfile, 0, sizeof(options->passfile)); memset(options->passfile, 0, sizeof(options->passfile));
/*----------------------- /*-------------------------
* standby promote settings * standby promote settings
*------------------------ *-------------------------
*/ */
options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT; options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL; options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;
/*------------------------
* standby follow settings
*------------------------
*/
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
/*----------------- /*-----------------
* repmgrd settings * repmgrd settings
*----------------- *-----------------
@@ -345,7 +352,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->degraded_monitoring_timeout = -1; options->degraded_monitoring_timeout = -1;
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT; options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT; options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT; options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
/*------------- /*-------------
@@ -527,6 +533,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
else if (strcmp(name, "promote_check_interval") == 0) else if (strcmp(name, "promote_check_interval") == 0)
options->promote_check_interval = repmgr_atoi(value, name, error_list, 1); options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);
/* standby follow settings */
else if (strcmp(name, "primary_follow_timeout") == 0)
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "standby_follow_timeout") == 0)
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
/* node check settings */ /* node check settings */
else if (strcmp(name, "archive_ready_warning") == 0) else if (strcmp(name, "archive_ready_warning") == 0)
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1); options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
@@ -576,8 +588,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0); options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "primary_notification_timeout") == 0) else if (strcmp(name, "primary_notification_timeout") == 0)
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0); options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "primary_follow_timeout") == 0)
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "standby_reconnect_timeout") == 0) else if (strcmp(name, "standby_reconnect_timeout") == 0)
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0); options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);

View File

@@ -98,6 +98,10 @@ typedef struct
int promote_check_timeout; int promote_check_timeout;
int promote_check_interval; int promote_check_interval;
/* standby follow settings */
int primary_follow_timeout;
int standby_follow_timeout;
/* node check settings */ /* node check settings */
int archive_ready_warning; int archive_ready_warning;
int archive_ready_critical; int archive_ready_critical;
@@ -120,7 +124,6 @@ typedef struct
int degraded_monitoring_timeout; int degraded_monitoring_timeout;
int async_query_timeout; int async_query_timeout;
int primary_notification_timeout; int primary_notification_timeout;
int primary_follow_timeout;
int standby_reconnect_timeout; int standby_reconnect_timeout;
/* BDR settings */ /* BDR settings */
@@ -167,6 +170,9 @@ typedef struct
false, "", "", { NULL, NULL }, "", false, "", false, "", \ false, "", "", { NULL, NULL }, "", false, "", false, "", \
/* standby promote settings */ \ /* standby promote settings */ \
DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \ DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
/* standby follow settings */ \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
/* node check settings */ \ /* node check settings */ \
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \ DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \ DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
@@ -180,7 +186,6 @@ typedef struct
false, -1, \ false, -1, \
DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \ DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
/* BDR settings */ \ /* BDR settings */ \
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \

View File

@@ -26,10 +26,18 @@
running. It can only be used to attach an active standby to the current primary node running. It can only be used to attach an active standby to the current primary node
(and not to another standby). (and not to another standby).
</para> </para>
<para> <tip>
To re-add an inactive node to the replication cluster, see <para>
<xref linkend="repmgr-node-rejoin"> To re-add an inactive node to the replication cluster, use
</para> <xref linkend="repmgr-node-rejoin">.
</para>
</tip>
<para>
<command>repmgr standby follow</command> will wait up to
<varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
to verify the standby has actually connected to the new primary.
</para>
</refsect1> </refsect1>

View File

@@ -2352,6 +2352,74 @@ do_standby_follow(void)
&follow_output, &follow_output,
&follow_error_code); &follow_error_code);
/* unable to restart the standby */
if (success == false)
{
create_event_notification_extended(
primary_conn,
&config_file_options,
config_file_options.node_id,
"standby_follow",
success,
follow_output.data,
&event_info);
PQfinish(primary_conn);
log_notice(_("STANDBY FOLLOW failed"));
if (strlen( follow_output.data ))
log_detail("%s", follow_output.data);
termPQExpBuffer(&follow_output);
exit(follow_error_code);
}
termPQExpBuffer(&follow_output);
initPQExpBuffer(&follow_output);
/*
* Wait up to "standby_follow_timeout" seconds for standby to connect to
* upstream.
* For 9.6 and later, we could check pg_stat_wal_receiver on the local node.
*/
/* assume success, necessary if standby_follow_timeout is zero */
success = true;
for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++)
{
success = is_downstream_node_attached(primary_conn, config_file_options.node_name);
if (success == true)
break;
log_verbose(LOG_DEBUG, "sleeping %i of max %i seconds waiting for standby to attach to primary",
timer + 1,
config_file_options.standby_follow_timeout);
sleep(1);
}
if (success == true)
{
log_notice(_("STANDBY FOLLOW successful"));
appendPQExpBuffer(&follow_output,
"standby attached to upstream node \"%s\" (node ID: %i)",
primary_node_record.node_name,
primary_node_id);
}
else
{
log_error(_("STANDBY FOLLOW failed"));
appendPQExpBuffer(&follow_output,
"standby did not attach to upstream node \"%s\" (node ID: %i) after %i seconds",
primary_node_record.node_name,
primary_node_id,
config_file_options.standby_follow_timeout);
}
log_detail("%s", follow_output.data);
create_event_notification_extended( create_event_notification_extended(
primary_conn, primary_conn,
&config_file_options, &config_file_options,
@@ -2363,20 +2431,11 @@ do_standby_follow(void)
PQfinish(primary_conn); PQfinish(primary_conn);
if (success == false)
{
log_notice(_("STANDBY FOLLOW failed"));
log_detail("%s", follow_output.data);
termPQExpBuffer(&follow_output);
exit(follow_error_code);
}
log_notice(_("STANDBY FOLLOW successful"));
log_detail("%s", follow_output.data);
termPQExpBuffer(&follow_output); termPQExpBuffer(&follow_output);
if (success == false)
exit(ERR_FOLLOW_FAIL);
return; return;
} }

View File

@@ -213,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# These settings apply when instructing a standby to follow the new primary # These settings apply when instructing a standby to follow the new primary
# ("repmgr standby follow"). # ("repmgr standby follow").
#primary_follow_timeout=60 # The length of time (in seconds) to wait #primary_follow_timeout=60 # The max length of time (in seconds) to wait
# for the new primary to become available # for the new primary to become available
#standby_follow_timeout=15 # The max length of time (in seconds) to wait
# for the standby to connect to the primary
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------

View File

@@ -70,6 +70,7 @@
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */ #define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */ #define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */ #define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */
#define DEFAULT_STANDBY_FOLLOW_TIMEOUT 30 /* seconds */
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */ #define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
#define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */ #define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */
#define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */ #define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */