standby follow: check node has connected to new primary

After restarting the standby, poll pg_stat_replication on the upstream
until the standby connects, and exit with an error if it has not connected
within the timeout defined in "standby_follow_timeout".

Implements GitHub #444.
This commit is contained in:
Ian Barwick
2018-06-07 14:38:15 +09:00
parent 20ce53e2d2
commit 68a9745e7e
6 changed files with 109 additions and 24 deletions

View File

@@ -319,13 +319,20 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->use_primary_conninfo_password = false;
memset(options->passfile, 0, sizeof(options->passfile));
/*-----------------------
/*-------------------------
* standby promote settings
*------------------------
*-------------------------
*/
options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;
/*------------------------
* standby follow settings
*------------------------
*/
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
/*-----------------
* repmgrd settings
*-----------------
@@ -345,7 +352,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->degraded_monitoring_timeout = -1;
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
/*-------------
@@ -527,6 +533,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
else if (strcmp(name, "promote_check_interval") == 0)
options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);
/* standby follow settings */
else if (strcmp(name, "primary_follow_timeout") == 0)
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "standby_follow_timeout") == 0)
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
/* node check settings */
else if (strcmp(name, "archive_ready_warning") == 0)
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
@@ -576,8 +588,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "primary_notification_timeout") == 0)
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "primary_follow_timeout") == 0)
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "standby_reconnect_timeout") == 0)
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);

View File

@@ -98,6 +98,10 @@ typedef struct
int promote_check_timeout;
int promote_check_interval;
/* standby follow settings */
int primary_follow_timeout;
int standby_follow_timeout;
/* node check settings */
int archive_ready_warning;
int archive_ready_critical;
@@ -120,7 +124,6 @@ typedef struct
int degraded_monitoring_timeout;
int async_query_timeout;
int primary_notification_timeout;
int primary_follow_timeout;
int standby_reconnect_timeout;
/* BDR settings */
@@ -167,6 +170,9 @@ typedef struct
false, "", "", { NULL, NULL }, "", false, "", false, "", \
/* standby promote settings */ \
DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
/* standby follow settings */ \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
/* node check settings */ \
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
@@ -180,7 +186,6 @@ typedef struct
false, -1, \
DEFAULT_ASYNC_QUERY_TIMEOUT, \
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
/* BDR settings */ \
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \

View File

@@ -26,10 +26,18 @@
running. It can only be used to attach an active standby to the current primary node
(and not to another standby).
</para>
<para>
To re-add an inactive node to the replication cluster, see
<xref linkend="repmgr-node-rejoin">
</para>
<tip>
<para>
To re-add an inactive node to the replication cluster, use
<xref linkend="repmgr-node-rejoin">.
</para>
</tip>
<para>
<command>repmgr standby follow</command> will wait up to
<varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
to verify the standby has actually connected to the new primary.
</para>
</refsect1>

View File

@@ -2352,6 +2352,74 @@ do_standby_follow(void)
&follow_output,
&follow_error_code);
/* unable to restart the standby */
if (success == false)
{
create_event_notification_extended(
primary_conn,
&config_file_options,
config_file_options.node_id,
"standby_follow",
success,
follow_output.data,
&event_info);
PQfinish(primary_conn);
log_notice(_("STANDBY FOLLOW failed"));
if (strlen( follow_output.data ))
log_detail("%s", follow_output.data);
termPQExpBuffer(&follow_output);
exit(follow_error_code);
}
termPQExpBuffer(&follow_output);
initPQExpBuffer(&follow_output);
/*
* Wait up to "standby_follow_timeout" seconds for standby to connect to
* upstream.
* For 9.6 and later, we could check pg_stat_wal_receiver on the local node.
*/
/* assume success, necessary if standby_follow_timeout is zero */
success = true;
for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++)
{
success = is_downstream_node_attached(primary_conn, config_file_options.node_name);
if (success == true)
break;
log_verbose(LOG_DEBUG, "sleeping %i of max %i seconds waiting for standby to attach to primary",
timer + 1,
config_file_options.standby_follow_timeout);
sleep(1);
}
if (success == true)
{
log_notice(_("STANDBY FOLLOW successful"));
appendPQExpBuffer(&follow_output,
"standby attached to upstream node \"%s\" (node ID: %i)",
primary_node_record.node_name,
primary_node_id);
}
else
{
log_error(_("STANDBY FOLLOW failed"));
appendPQExpBuffer(&follow_output,
"standby did not attach to upstream node \"%s\" (node ID: %i) after %i seconds",
primary_node_record.node_name,
primary_node_id,
config_file_options.standby_follow_timeout);
}
log_detail("%s", follow_output.data);
create_event_notification_extended(
primary_conn,
&config_file_options,
@@ -2363,20 +2431,11 @@ do_standby_follow(void)
PQfinish(primary_conn);
if (success == false)
{
log_notice(_("STANDBY FOLLOW failed"));
log_detail("%s", follow_output.data);
termPQExpBuffer(&follow_output);
exit(follow_error_code);
}
log_notice(_("STANDBY FOLLOW successful"));
log_detail("%s", follow_output.data);
termPQExpBuffer(&follow_output);
if (success == false)
exit(ERR_FOLLOW_FAIL);
return;
}

View File

@@ -213,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# These settings apply when instructing a standby to follow the new primary
# ("repmgr standby follow").
#primary_follow_timeout=60 # The length of time (in seconds) to wait
#primary_follow_timeout=60 # The max length of time (in seconds) to wait
# for the new primary to become available
#standby_follow_timeout=30 # The max length of time (in seconds) to wait
# for the standby to connect to the primary
#------------------------------------------------------------------------------

View File

@@ -70,6 +70,7 @@
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */
#define DEFAULT_STANDBY_FOLLOW_TIMEOUT 30 /* seconds */
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
#define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */
#define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */