mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
"standby switchover": avoid potential race condition with WAL location check
Immediately after the demotion candidate (primary) has shut down, we can't be absolutely sure that the walreceiver has flushed all WAL to disk, so checking pg_last_wal_receive_lsn() at that point might not reflect the actual last available WAL location. To handle this, we'll loop for a while (timeout controlled by configuration parameter "wal_receive_check_timeout") before finally deciding whether the standby is still behind the shut-down primary. Addresses issue raised in GitHub #518.
This commit is contained in:
2
HISTORY
2
HISTORY
@@ -4,6 +4,8 @@
|
|||||||
repmgr: add --terse option to "cluster show"; GitHub #521 (Ian)
|
repmgr: add --terse option to "cluster show"; GitHub #521 (Ian)
|
||||||
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
|
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
|
||||||
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
||||||
|
repmgr: prevent potential race condition in "standby switchover"
|
||||||
|
when checking received WAL location; GitHub #518 (Ian)
|
||||||
repmgr: ensure "standby switchover" verifies repmgr can read the
|
repmgr: ensure "standby switchover" verifies repmgr can read the
|
||||||
data directory on the demotion candidate; GitHub #523 (Ian)
|
data directory on the demotion candidate; GitHub #523 (Ian)
|
||||||
repmgr: when executing "standby follow" and "node rejoin", check that
|
repmgr: when executing "standby follow" and "node rejoin", check that
|
||||||
|
|||||||
@@ -335,6 +335,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
*/
|
*/
|
||||||
options->shutdown_check_timeout = DEFAULT_SHUTDOWN_CHECK_TIMEOUT;
|
options->shutdown_check_timeout = DEFAULT_SHUTDOWN_CHECK_TIMEOUT;
|
||||||
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
||||||
|
options->wal_receive_check_timeout = DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT;
|
||||||
|
|
||||||
/*-----------------
|
/*-----------------
|
||||||
* repmgrd settings
|
* repmgrd settings
|
||||||
@@ -557,6 +558,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->shutdown_check_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->shutdown_check_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
||||||
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
else if (strcmp(name, "wal_receive_check_timeout") == 0)
|
||||||
|
options->wal_receive_check_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* node rejoin settings */
|
/* node rejoin settings */
|
||||||
else if (strcmp(name, "node_rejoin_timeout") == 0)
|
else if (strcmp(name, "node_rejoin_timeout") == 0)
|
||||||
|
|||||||
@@ -106,6 +106,7 @@ typedef struct
|
|||||||
/* standby switchover settings */
|
/* standby switchover settings */
|
||||||
int shutdown_check_timeout;
|
int shutdown_check_timeout;
|
||||||
int standby_reconnect_timeout;
|
int standby_reconnect_timeout;
|
||||||
|
int wal_receive_check_timeout;
|
||||||
|
|
||||||
/* node rejoin settings */
|
/* node rejoin settings */
|
||||||
int node_rejoin_timeout;
|
int node_rejoin_timeout;
|
||||||
@@ -189,6 +190,7 @@ typedef struct
|
|||||||
/* standby switchover settings */ \
|
/* standby switchover settings */ \
|
||||||
DEFAULT_SHUTDOWN_CHECK_TIMEOUT, \
|
DEFAULT_SHUTDOWN_CHECK_TIMEOUT, \
|
||||||
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
||||||
|
DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT, \
|
||||||
/* node rejoin settings */ \
|
/* node rejoin settings */ \
|
||||||
DEFAULT_NODE_REJOIN_TIMEOUT, \
|
DEFAULT_NODE_REJOIN_TIMEOUT, \
|
||||||
/* node check settings */ \
|
/* node check settings */ \
|
||||||
|
|||||||
@@ -105,6 +105,15 @@
|
|||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Add check to <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>
|
||||||
|
when comparing received WAL on the standby to the primary's shutdown location to avoid a potential
|
||||||
|
race condition if the standby's walreceiver has not yet flushed all received WAL to disk.
|
||||||
|
GitHub #518.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|||||||
@@ -168,20 +168,6 @@
|
|||||||
|
|
||||||
<variablelist>
|
<variablelist>
|
||||||
|
|
||||||
<varlistentry>
|
|
||||||
<indexterm>
|
|
||||||
<primary>x</primary>
|
|
||||||
<secondary>with "repmgr standby switchover "</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<term><option></option></term>
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
</varlistentry>
|
|
||||||
|
|
||||||
|
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
<indexterm>
|
<indexterm>
|
||||||
<primary>replication_lag_critical</primary>
|
<primary>replication_lag_critical</primary>
|
||||||
@@ -207,7 +193,7 @@
|
|||||||
<term><option>shutdown_check_timeout</option></term>
|
<term><option>shutdown_check_timeout</option></term>
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
maximum number of seconds to wait for the
|
The maximum number of seconds to wait for the
|
||||||
demotion candidate (current primary) to shut down, before aborting the switchover.
|
demotion candidate (current primary) to shut down, before aborting the switchover.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
@@ -225,7 +211,25 @@
|
|||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
<varlistentry>
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>wal_receive_check_timeout</primary>
|
||||||
|
<secondary>with "repmgr standby switchover "</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<term><option>wal_receive_check_timeout</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
After the primary has shut down, the maximum number of seconds to wait for the
|
||||||
|
walreceiver on the standby to flush WAL to disk before comparing WAL receive location
|
||||||
|
with the primary's shutdown location (default: 30 seconds).
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
<indexterm>
|
<indexterm>
|
||||||
<primary>standby_reconnect_timeout</primary>
|
<primary>standby_reconnect_timeout</primary>
|
||||||
<secondary>with "repmgr standby switchover "</secondary>
|
<secondary>with "repmgr standby switchover "</secondary>
|
||||||
@@ -234,8 +238,8 @@
|
|||||||
<term><option>standby_reconnect_timeout</option></term>
|
<term><option>standby_reconnect_timeout</option></term>
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
maximum number of seconds to attempt to wait for the demotion candidate (former primary)
|
The maximum number of seconds to attempt to wait for the demotion candidate (former primary)
|
||||||
to reconnect to the promoted primary (default: 60 seconds)
|
to reconnect to the promoted primary (default: 60 seconds)
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
Note that this parameter is set on the node where <command>repmgr standby switchover</command>
|
Note that this parameter is set on the node where <command>repmgr standby switchover</command>
|
||||||
@@ -245,7 +249,6 @@
|
|||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
<indexterm>
|
<indexterm>
|
||||||
<primary>node_rejoin_timeout</primary>
|
<primary>node_rejoin_timeout</primary>
|
||||||
@@ -265,7 +268,7 @@
|
|||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
However, this value <emphasis>must</emphasis> be less than <option>standby_reconnect_timeout</option> on the
|
However, this value <emphasis>must</emphasis> be less than <option>standby_reconnect_timeout</option> on the
|
||||||
promotion candidate (node where <command>repmgr standby switchover</command> is executed).
|
promotion candidate (the node where <command>repmgr standby switchover</command> is executed).
|
||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|||||||
@@ -3423,7 +3423,6 @@ do_standby_switchover(void)
|
|||||||
{
|
{
|
||||||
/* include walsender for promotion candidate in total */
|
/* include walsender for promotion candidate in total */
|
||||||
|
|
||||||
|
|
||||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
/* get host from node record */
|
/* get host from node record */
|
||||||
@@ -4180,7 +4179,37 @@ do_standby_switchover(void)
|
|||||||
log_verbose(LOG_INFO, _("successfully reconnected to local node"));
|
log_verbose(LOG_INFO, _("successfully reconnected to local node"));
|
||||||
}
|
}
|
||||||
|
|
||||||
get_replication_info(local_conn, &replication_info);
|
/*
|
||||||
|
* Compare standby's last WAL receive location with the primary's last
|
||||||
|
* checkpoint LSN. We'll loop for a while as it's possible the standby's
|
||||||
|
* walreceiver has not yet flushed all received WAL to disk.
|
||||||
|
*/
|
||||||
|
{
|
||||||
|
bool notice_emitted = false;
|
||||||
|
|
||||||
|
for (i = 0; i < config_file_options.wal_receive_check_timeout; i++)
|
||||||
|
{
|
||||||
|
get_replication_info(local_conn, &replication_info);
|
||||||
|
if (replication_info.last_wal_receive_lsn >= remote_last_checkpoint_lsn)
|
||||||
|
break;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We'll only output this notice if it looks like we're going to have
|
||||||
|
* to wait for WAL to be flushed.
|
||||||
|
*/
|
||||||
|
if (notice_emitted == false)
|
||||||
|
{
|
||||||
|
log_notice(_("waiting up to %i seconds (parameter \"wal_receive_check_timeout\") for received WAL to flush to disk"),
|
||||||
|
config_file_options.wal_receive_check_timeout);
|
||||||
|
|
||||||
|
notice_emitted = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
log_info(_("sleeping %i of maximum %i seconds waiting for standby to flush received WAL to disk"),
|
||||||
|
i + 1, config_file_options.wal_receive_check_timeout);
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (replication_info.last_wal_receive_lsn < remote_last_checkpoint_lsn)
|
if (replication_info.last_wal_receive_lsn < remote_last_checkpoint_lsn)
|
||||||
{
|
{
|
||||||
@@ -4200,6 +4229,10 @@ do_standby_switchover(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log_debug("local node last receive LSN is %X/%X, primary shutdown checkpoint LSN is %X/%X",
|
||||||
|
format_lsn(replication_info.last_wal_receive_lsn),
|
||||||
|
format_lsn(remote_last_checkpoint_lsn));
|
||||||
|
|
||||||
/* promote standby (local node) */
|
/* promote standby (local node) */
|
||||||
_do_standby_promote_internal(local_conn, server_version_num);
|
_do_standby_promote_internal(local_conn, server_version_num);
|
||||||
|
|
||||||
|
|||||||
@@ -241,6 +241,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
# for the demoted standby to reconnect to the promoted
|
# for the demoted standby to reconnect to the promoted
|
||||||
# primary (note: this value should be equal to or greater
|
# primary (note: this value should be equal to or greater
|
||||||
# than that set for "node_rejoin_timeout")
|
# than that set for "node_rejoin_timeout")
|
||||||
|
#wal_receive_check_timeout=30 # The max length of time (in seconds) to wait for the walreceiver
|
||||||
|
# on the standby to flush WAL to disk before comparing location
|
||||||
|
# with the shut-down primary
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# "node rejoin" settings
|
# "node rejoin" settings
|
||||||
|
|||||||
1
repmgr.h
1
repmgr.h
@@ -88,6 +88,7 @@
|
|||||||
#define DEFAULT_SHUTDOWN_CHECK_TIMEOUT 60 /* seconds */
|
#define DEFAULT_SHUTDOWN_CHECK_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
||||||
|
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
||||||
|
|
||||||
#ifndef RECOVERY_COMMAND_FILE
|
#ifndef RECOVERY_COMMAND_FILE
|
||||||
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
||||||
|
|||||||
Reference in New Issue
Block a user