mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
repmgr: poll demoted primary after restart during switchover
During a switchover operation, once the demoted primary has been restarted as a standby, repmgr attempts to reconnect to verify its status and drop any redundant replication slots. However, it's possible that the standby may still be in the startup phase, so poll for "standby_reconnect_timeout" seconds before giving up. Addresses GitHub #408.
This commit is contained in:
2
HISTORY
2
HISTORY
@@ -1,6 +1,8 @@
|
||||
4.0.5 2018-??-??
|
||||
repmgr: fix display of conninfo parsing error messages (Ian)
|
||||
repmgr: fix "repmgr cluster crosscheck" output; GitHub #389 (Ian)
|
||||
repmgr: poll demoted primary after restart as a standby during a
|
||||
switchover operation; GitHub #408 (Ian)
|
||||
repmgrd: fix memory leaks in witness code (AndrzejNowicki, Martín)
|
||||
repmgrd: set "connect_timeout=2" when pinging a server (Ian)
|
||||
|
||||
|
||||
@@ -115,6 +115,48 @@
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Configuration file settings</title>
|
||||
|
||||
<para>
|
||||
Note that the following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||
switchover operation:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_attempts</literal>: number of times to check the original primary
|
||||
for a clean shutdown after executing the shutdown command, before aborting
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
|
||||
primary for a clean shutdown after executing the shutdown command (up to a maximum
|
||||
of <literal>reconnect_attempts</literal> tries)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>replication_lag_critical</literal>:
|
||||
if replication lag (in seconds) on the standby exceeds this value, the
|
||||
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
||||
is provided)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>standby_reconnect_timeout</literal>:
|
||||
number of seconds to attempt to reconnect to the demoted primary
|
||||
once it has been restarted.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Execution</title>
|
||||
|
||||
|
||||
@@ -170,34 +170,16 @@
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<para>
|
||||
Note that the following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||
switchover operation:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_attempts</literal>: number of times to check the original primary
|
||||
for a clean shutdown after executing the shutdown command, before aborting
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
|
||||
primary for a clean shutdown after executing the shutdown command (up to a maximum
|
||||
of <literal>reconnect_attempts</literal> tries)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>replication_lag_critical</literal>:
|
||||
if replication lag (in seconds) on the standby exceeds this value, the
|
||||
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
||||
is provided)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
See <xref linkend="repmgr-standby-switchover"> for a full list of available
|
||||
command line options and <filename>repmgr.conf</filename> settings relevant
|
||||
to performing a switchover.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="switchover-execution" xreflabel="Executing the switchover command">
|
||||
|
||||
@@ -3570,8 +3570,23 @@ x */
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
/* clean up remote node */
|
||||
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
|
||||
/*
|
||||
* Clean up remote node. It's possible that the standby is still starting up,
|
||||
* so poll for a while until we get a connection.
|
||||
*/
|
||||
|
||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
||||
{
|
||||
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
|
||||
|
||||
if (PQstatus(remote_conn) == CONNECTION_OK)
|
||||
break;
|
||||
|
||||
log_info(_("sleeping 1 second; %i of %i attempts (\"standby_reconnect_timeout\") to reconnect to demoted primary"),
|
||||
i + 1,
|
||||
config_file_options.standby_reconnect_timeout);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
/* check new standby (old primary) is reachable */
|
||||
if (PQstatus(remote_conn) != CONNECTION_OK)
|
||||
@@ -3584,6 +3599,11 @@ x */
|
||||
log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"),
|
||||
local_node_record.node_name,
|
||||
remote_node_record.node_name);
|
||||
|
||||
if (config_file_options.use_replication_slots == true)
|
||||
{
|
||||
log_hint(_("any inactive replication slots on the old primary will need to be dropped manually"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user