mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
standby follow: check node has connect to new primary
After restarting the standby, poll pg_stat_replication on the upstream until the standby connects, and exit with an error if it doesn't by the timeout defined in "standby_follow_timeout". Implments GitHub #444.
This commit is contained in:
20
configfile.c
20
configfile.c
@@ -319,13 +319,20 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->use_primary_conninfo_password = false;
|
||||
memset(options->passfile, 0, sizeof(options->passfile));
|
||||
|
||||
/*-----------------------
|
||||
/*-------------------------
|
||||
* standby promote settings
|
||||
*------------------------
|
||||
*-------------------------
|
||||
*/
|
||||
options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
|
||||
options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;
|
||||
|
||||
/*------------------------
|
||||
* standby follow settings
|
||||
*------------------------
|
||||
*/
|
||||
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
||||
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
|
||||
|
||||
/*-----------------
|
||||
* repmgrd settings
|
||||
*-----------------
|
||||
@@ -345,7 +352,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->degraded_monitoring_timeout = -1;
|
||||
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
|
||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
||||
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
||||
|
||||
/*-------------
|
||||
@@ -527,6 +533,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
else if (strcmp(name, "promote_check_interval") == 0)
|
||||
options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);
|
||||
|
||||
/* standby follow settings */
|
||||
else if (strcmp(name, "primary_follow_timeout") == 0)
|
||||
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "standby_follow_timeout") == 0)
|
||||
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
/* node check settings */
|
||||
else if (strcmp(name, "archive_ready_warning") == 0)
|
||||
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
|
||||
@@ -576,8 +588,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "primary_notification_timeout") == 0)
|
||||
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "primary_follow_timeout") == 0)
|
||||
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
||||
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
|
||||
@@ -98,6 +98,10 @@ typedef struct
|
||||
int promote_check_timeout;
|
||||
int promote_check_interval;
|
||||
|
||||
/* standby follow settings */
|
||||
int primary_follow_timeout;
|
||||
int standby_follow_timeout;
|
||||
|
||||
/* node check settings */
|
||||
int archive_ready_warning;
|
||||
int archive_ready_critical;
|
||||
@@ -120,7 +124,6 @@ typedef struct
|
||||
int degraded_monitoring_timeout;
|
||||
int async_query_timeout;
|
||||
int primary_notification_timeout;
|
||||
int primary_follow_timeout;
|
||||
int standby_reconnect_timeout;
|
||||
|
||||
/* BDR settings */
|
||||
@@ -167,6 +170,9 @@ typedef struct
|
||||
false, "", "", { NULL, NULL }, "", false, "", false, "", \
|
||||
/* standby promote settings */ \
|
||||
DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
|
||||
/* standby follow settings */ \
|
||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
|
||||
/* node check settings */ \
|
||||
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
|
||||
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
||||
@@ -180,7 +186,6 @@ typedef struct
|
||||
false, -1, \
|
||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
||||
/* BDR settings */ \
|
||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||
|
||||
@@ -26,10 +26,18 @@
|
||||
running. It can only be used to attach an active standby to the current primary node
|
||||
(and not to another standby).
|
||||
</para>
|
||||
<para>
|
||||
To re-add an inactive node to the replication cluster, see
|
||||
<xref linkend="repmgr-node-rejoin">
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
To re-add an inactive node to the replication cluster, use
|
||||
<xref linkend="repmgr-node-rejoin">.
|
||||
</para>
|
||||
</tip>
|
||||
|
||||
<para>
|
||||
<command>repmgr standby follow</command> will wait up to
|
||||
<varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
|
||||
to verify the standby has actually connected to the new primary.
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -2352,6 +2352,74 @@ do_standby_follow(void)
|
||||
&follow_output,
|
||||
&follow_error_code);
|
||||
|
||||
/* unable to restart the standby */
|
||||
if (success == false)
|
||||
{
|
||||
create_event_notification_extended(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"standby_follow",
|
||||
success,
|
||||
follow_output.data,
|
||||
&event_info);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
|
||||
log_notice(_("STANDBY FOLLOW failed"));
|
||||
if (strlen( follow_output.data ))
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
exit(follow_error_code);
|
||||
}
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
|
||||
initPQExpBuffer(&follow_output);
|
||||
|
||||
/*
|
||||
* Wait up to "standby_follow_timeout" seconds for standby to connect to
|
||||
* upstream.
|
||||
* For 9.6 and later, we could check pg_stat_wal_receiver on the local node.
|
||||
*/
|
||||
|
||||
/* assume success, necessary if standby_follow_timeout is zero */
|
||||
success = true;
|
||||
|
||||
for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++)
|
||||
{
|
||||
success = is_downstream_node_attached(primary_conn, config_file_options.node_name);
|
||||
if (success == true)
|
||||
break;
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i of max %i seconds waiting for standby to attach to primary",
|
||||
timer + 1,
|
||||
config_file_options.standby_follow_timeout);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
if (success == true)
|
||||
{
|
||||
log_notice(_("STANDBY FOLLOW successful"));
|
||||
appendPQExpBuffer(&follow_output,
|
||||
"standby attached to upstream node \"%s\" (node ID: %i)",
|
||||
primary_node_record.node_name,
|
||||
primary_node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("STANDBY FOLLOW failed"));
|
||||
appendPQExpBuffer(&follow_output,
|
||||
"standby did not attach to upstream node \"%s\" (node ID: %i) after %i seconds",
|
||||
primary_node_record.node_name,
|
||||
primary_node_id,
|
||||
config_file_options.standby_follow_timeout);
|
||||
|
||||
}
|
||||
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
create_event_notification_extended(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
@@ -2363,20 +2431,11 @@ do_standby_follow(void)
|
||||
|
||||
PQfinish(primary_conn);
|
||||
|
||||
if (success == false)
|
||||
{
|
||||
log_notice(_("STANDBY FOLLOW failed"));
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
exit(follow_error_code);
|
||||
}
|
||||
|
||||
log_notice(_("STANDBY FOLLOW successful"));
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
|
||||
if (success == false)
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -213,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# These settings apply when instructing a standby to follow the new primary
|
||||
# ("repmgr standby follow").
|
||||
|
||||
#primary_follow_timeout=60 # The length of time (in seconds) to wait
|
||||
#primary_follow_timeout=60 # The max length of time (in seconds) to wait
|
||||
# for the new primary to become available
|
||||
#standby_follow_timeout=15 # The max length of time (in seconds) to wait
|
||||
# for the standby to connect to the primary
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
1
repmgr.h
1
repmgr.h
@@ -70,6 +70,7 @@
|
||||
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_STANDBY_FOLLOW_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */
|
||||
#define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */
|
||||
|
||||
Reference in New Issue
Block a user