mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
standby follow: check node has connected to new primary
After restarting the standby, poll pg_stat_replication on the upstream until the standby connects, and exit with an error if it doesn't by the timeout defined in "standby_follow_timeout". Implements GitHub #444.
This commit is contained in:
20
configfile.c
20
configfile.c
@@ -319,13 +319,20 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->use_primary_conninfo_password = false;
|
options->use_primary_conninfo_password = false;
|
||||||
memset(options->passfile, 0, sizeof(options->passfile));
|
memset(options->passfile, 0, sizeof(options->passfile));
|
||||||
|
|
||||||
/*-----------------------
|
/*-------------------------
|
||||||
* standby promote settings
|
* standby promote settings
|
||||||
*------------------------
|
*-------------------------
|
||||||
*/
|
*/
|
||||||
options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
|
options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
|
||||||
options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;
|
options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;
|
||||||
|
|
||||||
|
/*------------------------
|
||||||
|
* standby follow settings
|
||||||
|
*------------------------
|
||||||
|
*/
|
||||||
|
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
||||||
|
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
|
||||||
|
|
||||||
/*-----------------
|
/*-----------------
|
||||||
* repmgrd settings
|
* repmgrd settings
|
||||||
*-----------------
|
*-----------------
|
||||||
@@ -345,7 +352,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->degraded_monitoring_timeout = -1;
|
options->degraded_monitoring_timeout = -1;
|
||||||
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
|
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
|
||||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||||
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
|
||||||
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
||||||
|
|
||||||
/*-------------
|
/*-------------
|
||||||
@@ -527,6 +533,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
else if (strcmp(name, "promote_check_interval") == 0)
|
else if (strcmp(name, "promote_check_interval") == 0)
|
||||||
options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);
|
options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);
|
||||||
|
|
||||||
|
/* standby follow settings */
|
||||||
|
else if (strcmp(name, "primary_follow_timeout") == 0)
|
||||||
|
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
else if (strcmp(name, "standby_follow_timeout") == 0)
|
||||||
|
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* node check settings */
|
/* node check settings */
|
||||||
else if (strcmp(name, "archive_ready_warning") == 0)
|
else if (strcmp(name, "archive_ready_warning") == 0)
|
||||||
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
|
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
|
||||||
@@ -576,8 +588,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
else if (strcmp(name, "primary_notification_timeout") == 0)
|
else if (strcmp(name, "primary_notification_timeout") == 0)
|
||||||
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
else if (strcmp(name, "primary_follow_timeout") == 0)
|
|
||||||
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
|
||||||
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
||||||
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
|
|||||||
@@ -98,6 +98,10 @@ typedef struct
|
|||||||
int promote_check_timeout;
|
int promote_check_timeout;
|
||||||
int promote_check_interval;
|
int promote_check_interval;
|
||||||
|
|
||||||
|
/* standby follow settings */
|
||||||
|
int primary_follow_timeout;
|
||||||
|
int standby_follow_timeout;
|
||||||
|
|
||||||
/* node check settings */
|
/* node check settings */
|
||||||
int archive_ready_warning;
|
int archive_ready_warning;
|
||||||
int archive_ready_critical;
|
int archive_ready_critical;
|
||||||
@@ -120,7 +124,6 @@ typedef struct
|
|||||||
int degraded_monitoring_timeout;
|
int degraded_monitoring_timeout;
|
||||||
int async_query_timeout;
|
int async_query_timeout;
|
||||||
int primary_notification_timeout;
|
int primary_notification_timeout;
|
||||||
int primary_follow_timeout;
|
|
||||||
int standby_reconnect_timeout;
|
int standby_reconnect_timeout;
|
||||||
|
|
||||||
/* BDR settings */
|
/* BDR settings */
|
||||||
@@ -167,6 +170,9 @@ typedef struct
|
|||||||
false, "", "", { NULL, NULL }, "", false, "", false, "", \
|
false, "", "", { NULL, NULL }, "", false, "", false, "", \
|
||||||
/* standby promote settings */ \
|
/* standby promote settings */ \
|
||||||
DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
|
DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
|
||||||
|
/* standby follow settings */ \
|
||||||
|
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||||
|
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
|
||||||
/* node check settings */ \
|
/* node check settings */ \
|
||||||
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
|
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
|
||||||
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
||||||
@@ -180,7 +186,6 @@ typedef struct
|
|||||||
false, -1, \
|
false, -1, \
|
||||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
|
||||||
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
||||||
/* BDR settings */ \
|
/* BDR settings */ \
|
||||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||||
|
|||||||
@@ -26,10 +26,18 @@
|
|||||||
running. It can only be used to attach an active standby to the current primary node
|
running. It can only be used to attach an active standby to the current primary node
|
||||||
(and not to another standby).
|
(and not to another standby).
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<tip>
|
||||||
To re-add an inactive node to the replication cluster, see
|
<para>
|
||||||
<xref linkend="repmgr-node-rejoin">
|
To re-add an inactive node to the replication cluster, use
|
||||||
</para>
|
<xref linkend="repmgr-node-rejoin">.
|
||||||
|
</para>
|
||||||
|
</tip>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
<command>repmgr standby follow</command> will wait up to
|
||||||
|
<varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
|
||||||
|
to verify the standby has actually connected to the new primary.
|
||||||
|
</para>
|
||||||
|
|
||||||
</refsect1>
|
</refsect1>
|
||||||
|
|
||||||
|
|||||||
@@ -2352,6 +2352,74 @@ do_standby_follow(void)
|
|||||||
&follow_output,
|
&follow_output,
|
||||||
&follow_error_code);
|
&follow_error_code);
|
||||||
|
|
||||||
|
/* unable to restart the standby */
|
||||||
|
if (success == false)
|
||||||
|
{
|
||||||
|
create_event_notification_extended(
|
||||||
|
primary_conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"standby_follow",
|
||||||
|
success,
|
||||||
|
follow_output.data,
|
||||||
|
&event_info);
|
||||||
|
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
|
||||||
|
log_notice(_("STANDBY FOLLOW failed"));
|
||||||
|
if (strlen( follow_output.data ))
|
||||||
|
log_detail("%s", follow_output.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&follow_output);
|
||||||
|
exit(follow_error_code);
|
||||||
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&follow_output);
|
||||||
|
|
||||||
|
initPQExpBuffer(&follow_output);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Wait up to "standby_follow_timeout" seconds for standby to connect to
|
||||||
|
* upstream.
|
||||||
|
* For 9.6 and later, we could check pg_stat_wal_receiver on the local node.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* assume success, necessary if standby_follow_timeout is zero */
|
||||||
|
success = true;
|
||||||
|
|
||||||
|
for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++)
|
||||||
|
{
|
||||||
|
success = is_downstream_node_attached(primary_conn, config_file_options.node_name);
|
||||||
|
if (success == true)
|
||||||
|
break;
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "sleeping %i of max %i seconds waiting for standby to attach to primary",
|
||||||
|
timer + 1,
|
||||||
|
config_file_options.standby_follow_timeout);
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (success == true)
|
||||||
|
{
|
||||||
|
log_notice(_("STANDBY FOLLOW successful"));
|
||||||
|
appendPQExpBuffer(&follow_output,
|
||||||
|
"standby attached to upstream node \"%s\" (node ID: %i)",
|
||||||
|
primary_node_record.node_name,
|
||||||
|
primary_node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_error(_("STANDBY FOLLOW failed"));
|
||||||
|
appendPQExpBuffer(&follow_output,
|
||||||
|
"standby did not attach to upstream node \"%s\" (node ID: %i) after %i seconds",
|
||||||
|
primary_node_record.node_name,
|
||||||
|
primary_node_id,
|
||||||
|
config_file_options.standby_follow_timeout);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
log_detail("%s", follow_output.data);
|
||||||
|
|
||||||
create_event_notification_extended(
|
create_event_notification_extended(
|
||||||
primary_conn,
|
primary_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
@@ -2363,20 +2431,11 @@ do_standby_follow(void)
|
|||||||
|
|
||||||
PQfinish(primary_conn);
|
PQfinish(primary_conn);
|
||||||
|
|
||||||
if (success == false)
|
|
||||||
{
|
|
||||||
log_notice(_("STANDBY FOLLOW failed"));
|
|
||||||
log_detail("%s", follow_output.data);
|
|
||||||
|
|
||||||
termPQExpBuffer(&follow_output);
|
|
||||||
exit(follow_error_code);
|
|
||||||
}
|
|
||||||
|
|
||||||
log_notice(_("STANDBY FOLLOW successful"));
|
|
||||||
log_detail("%s", follow_output.data);
|
|
||||||
|
|
||||||
termPQExpBuffer(&follow_output);
|
termPQExpBuffer(&follow_output);
|
||||||
|
|
||||||
|
if (success == false)
|
||||||
|
exit(ERR_FOLLOW_FAIL);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -213,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
# These settings apply when instructing a standby to follow the new primary
|
# These settings apply when instructing a standby to follow the new primary
|
||||||
# ("repmgr standby follow").
|
# ("repmgr standby follow").
|
||||||
|
|
||||||
#primary_follow_timeout=60 # The length of time (in seconds) to wait
|
#primary_follow_timeout=60 # The max length of time (in seconds) to wait
|
||||||
# for the new primary to become available
|
# for the new primary to become available
|
||||||
|
#standby_follow_timeout=30 # The max length of time (in seconds) to wait
|
||||||
|
# for the standby to connect to the primary
|
||||||
|
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
|
|||||||
1
repmgr.h
1
repmgr.h
@@ -70,6 +70,7 @@
|
|||||||
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
|
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
|
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */
|
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */
|
||||||
|
#define DEFAULT_STANDBY_FOLLOW_TIMEOUT 30 /* seconds */
|
||||||
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
|
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
|
||||||
#define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */
|
#define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */
|
||||||
#define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */
|
#define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */
|
||||||
|
|||||||
Reference in New Issue
Block a user