repmgr: poll demoted primary after restart during switchover

During a switchover operation, once the demoted primary has been restarted
as a standby, repmgr attempts to reconnect to verify its status and drop
any redundant replication slots. However, it's possible the standby may still
be in the startup phase, so poll for "standby_reconnect_timeout" seconds
before giving up.

Addresses GitHub #408.
This commit is contained in:
Ian Barwick
2018-03-27 15:58:18 +09:00
parent deea4f69f7
commit 6f9a1f975e
4 changed files with 75 additions and 29 deletions

View File

@@ -3667,8 +3667,23 @@ do_standby_switchover(void)
termPQExpBuffer(&command_output);
/* clean up remote node */
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
/*
* Clean up remote node. It's possible that the standby is still starting up,
* so poll for a while until we get a connection.
*/
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
{
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
if (PQstatus(remote_conn) == CONNECTION_OK)
break;
log_info(_("sleeping 1 second; %i of %i attempts (\"standby_reconnect_timeout\") to reconnect to demoted primary"),
i + 1,
config_file_options.standby_reconnect_timeout);
sleep(1);
}
/* check new standby (old primary) is reachable */
if (PQstatus(remote_conn) != CONNECTION_OK)
@@ -3681,6 +3696,11 @@ do_standby_switchover(void)
log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"),
local_node_record.node_name,
remote_node_record.node_name);
if (config_file_options.use_replication_slots == true)
{
log_hint(_("any inactive replication slots on the old primary will need to be dropped manually"));
}
}
else
{