mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 16:46:28 +00:00
repmgr: poll demoted primary after restart during switchover
During a switchover operation, once the demoted primary has been restarted as a standby, repmgr attempts to reconnect to verify its status and drop any redundant replication slots. However, it's possible the standby may still be in the startup phase, so poll for up to "standby_reconnect_timeout" seconds before giving up. Addresses GitHub #408.
This commit is contained in:
2
HISTORY
2
HISTORY
@@ -1,6 +1,8 @@
|
|||||||
4.0.5 2018-??-??
|
4.0.5 2018-??-??
|
||||||
repmgr: fix display of conninfo parsing error messages (Ian)
|
repmgr: fix display of conninfo parsing error messages (Ian)
|
||||||
repmgr: fix "repmgr cluster crosscheck" output; GitHub #389 (Ian)
|
repmgr: fix "repmgr cluster crosscheck" output; GitHub #389 (Ian)
|
||||||
|
repmgr: poll demoted primary after restart as a standby during a
|
||||||
|
switchover operation; GitHub #408 (Ian)
|
||||||
repmgrd: fix memory leaks in witness code (AndrzejNowicki, Martín)
|
repmgrd: fix memory leaks in witness code (AndrzejNowicki, Martín)
|
||||||
|
|
||||||
4.0.4 2018-03-09
|
4.0.4 2018-03-09
|
||||||
|
|||||||
@@ -115,6 +115,48 @@
|
|||||||
|
|
||||||
</refsect1>
|
</refsect1>
|
||||||
|
|
||||||
|
<refsect1>
|
||||||
|
<title>Configuration file settings</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Note that the following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||||
|
switchover operation:
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>reconnect_attempts</literal>: number of times to check the original primary
|
||||||
|
for a clean shutdown after executing the shutdown command, before aborting
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
|
||||||
|
primary for a clean shutdown after executing the shutdown command (up to a maximum
|
||||||
|
of <literal>reconnect_attempts</literal> tries)
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>replication_lag_critical</literal>:
|
||||||
|
if replication lag (in seconds) on the standby exceeds this value, the
|
||||||
|
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
||||||
|
is provided)
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>standby_reconnect_timeout</literal>:
|
||||||
|
Number of seconds to attempt to reconnect to the demoted primary
|
||||||
|
once it has been restarted.
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
</refsect1>
|
||||||
|
|
||||||
|
|
||||||
<refsect1>
|
<refsect1>
|
||||||
<title>Execution</title>
|
<title>Execution</title>
|
||||||
|
|
||||||
|
|||||||
@@ -170,34 +170,16 @@
|
|||||||
</para>
|
</para>
|
||||||
</important>
|
</important>
|
||||||
|
|
||||||
<para>
|
|
||||||
Note that following parameters in <filename>repmgr.conf</filename> are relevant to the
|
|
||||||
switchover operation:
|
|
||||||
<itemizedlist spacing="compact" mark="bullet">
|
|
||||||
<listitem>
|
|
||||||
<simpara>
|
|
||||||
<literal>reconnect_attempts</literal>: number of times to check the original primary
|
|
||||||
for a clean shutdown after executing the shutdown command, before aborting
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
<listitem>
|
|
||||||
<simpara>
|
|
||||||
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
|
|
||||||
primary for a clean shutdown after executing the shutdown command (up to a maximum
|
|
||||||
of <literal>reconnect_attempts</literal> tries)
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
<listitem>
|
|
||||||
<simpara>
|
|
||||||
<literal>replication_lag_critical</literal>:
|
|
||||||
if replication lag (in seconds) on the standby exceeds this value, the
|
|
||||||
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
|
||||||
is provided)
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
</itemizedlist>
|
<note>
|
||||||
</para>
|
<simpara>
|
||||||
|
See <xref linkend="repmgr-standby-switchover"> for a full list of available
|
||||||
|
command line options and <filename>repmgr.conf</filename> settings relevant
|
||||||
|
to performing a switchover.
|
||||||
|
</simpara>
|
||||||
|
</note>
|
||||||
|
|
||||||
|
|
||||||
</sect1>
|
</sect1>
|
||||||
|
|
||||||
<sect1 id="switchover-execution" xreflabel="Executing the switchover command">
|
<sect1 id="switchover-execution" xreflabel="Executing the switchover command">
|
||||||
|
|||||||
@@ -3667,8 +3667,23 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
termPQExpBuffer(&command_output);
|
termPQExpBuffer(&command_output);
|
||||||
|
|
||||||
/* clean up remote node */
|
/*
|
||||||
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
|
* Clean up remote node. It's possible that the standby is still starting up,
|
||||||
|
* so poll for a while until we get a connection.
|
||||||
|
*/
|
||||||
|
|
||||||
|
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
||||||
|
{
|
||||||
|
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
|
||||||
|
|
||||||
|
if (PQstatus(remote_conn) == CONNECTION_OK)
|
||||||
|
break;
|
||||||
|
|
||||||
|
log_info(_("sleeping 1 second; %i of %i attempts (\"standby_reconnect_timeout\") to reconnect to demoted primary"),
|
||||||
|
i + 1,
|
||||||
|
config_file_options.standby_reconnect_timeout);
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
|
||||||
/* check new standby (old primary) is reachable */
|
/* check new standby (old primary) is reachable */
|
||||||
if (PQstatus(remote_conn) != CONNECTION_OK)
|
if (PQstatus(remote_conn) != CONNECTION_OK)
|
||||||
@@ -3681,6 +3696,11 @@ do_standby_switchover(void)
|
|||||||
log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"),
|
log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"),
|
||||||
local_node_record.node_name,
|
local_node_record.node_name,
|
||||||
remote_node_record.node_name);
|
remote_node_record.node_name);
|
||||||
|
|
||||||
|
if (config_file_options.use_replication_slots == true)
|
||||||
|
{
|
||||||
|
log_hint(_("any inactive replication slots on the old primary will need to be dropped manually"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user