mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 08:56:29 +00:00
repmgr: improve replication status checking during switchover
When checking the new standby's record in pg_stat_replication, keep polling until the expected status is reported, and only give up once the timeout has been exceeded. Previously repmgr would report an error if the status was "startup", even though that state is not a problem.
This commit is contained in:
122
repmgr.c
122
repmgr.c
@@ -4180,8 +4180,8 @@ do_standby_promote(void)
|
|||||||
/*
|
/*
|
||||||
* Promote standby to master.
|
* Promote standby to master.
|
||||||
*
|
*
|
||||||
* `pg_ctl promote` returns immediately and has no -w option, so we
|
* `pg_ctl promote` returns immediately and (prior to 10.0) has no -w option
|
||||||
* can't be sure when or if the promotion completes.
|
* so we can't be sure when or if the promotion completes.
|
||||||
* For now we'll poll the server until the default timeout (60 seconds)
|
* For now we'll poll the server until the default timeout (60 seconds)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@@ -5275,69 +5275,95 @@ do_standby_switchover(void)
|
|||||||
exit(ERR_SWITCHOVER_FAIL);
|
exit(ERR_SWITCHOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* A connection was made and it was determined the standby is in recovery */
|
||||||
log_debug("new standby is in recovery\n");
|
log_debug("new standby is in recovery\n");
|
||||||
|
|
||||||
/* Check for entry in pg_stat_replication */
|
/* Check for entry in the new master's pg_stat_replication */
|
||||||
|
|
||||||
local_conn = establish_db_connection(options.conninfo, true);
|
|
||||||
|
|
||||||
query_result = get_node_replication_state(local_conn, remote_node_record.name, remote_node_replication_state);
|
|
||||||
|
|
||||||
if (query_result == -1)
|
|
||||||
{
|
{
|
||||||
|
int i,
|
||||||
|
replication_check_timeout = 60,
|
||||||
|
replication_check_interval = 2;
|
||||||
|
bool replication_connection_ok = false;
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
initPQExpBuffer(&event_details);
|
|
||||||
appendPQExpBuffer(&event_details,
|
|
||||||
_("unable to retrieve replication status for node %i"),
|
|
||||||
remote_node_id);
|
|
||||||
log_err("%s\n", event_details.data);
|
|
||||||
create_event_record(local_conn,
|
|
||||||
&options,
|
|
||||||
options.node,
|
|
||||||
"standby_switchover",
|
|
||||||
false,
|
|
||||||
event_details.data);
|
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
PQfinish(local_conn);
|
|
||||||
exit(ERR_SWITCHOVER_FAIL);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (query_result == 0)
|
initPQExpBuffer(&event_details);
|
||||||
{
|
|
||||||
log_err(_("node %i not replicating\n"), remote_node_id);
|
local_conn = establish_db_connection(options.conninfo, true);
|
||||||
}
|
|
||||||
else
|
i = 0;
|
||||||
{
|
for (;;)
|
||||||
/* XXX we should poll for a while in case the node takes time to connect to the primary */
|
|
||||||
if (strcmp(remote_node_replication_state, "streaming") == 0 ||
|
|
||||||
strcmp(remote_node_replication_state, "catchup") == 0)
|
|
||||||
{
|
{
|
||||||
log_verbose(LOG_NOTICE, _("node %i is replicating in state \"%s\"\n"), remote_node_id, remote_node_replication_state);
|
query_result = get_node_replication_state(local_conn, remote_node_record.name, remote_node_replication_state);
|
||||||
}
|
|
||||||
else
|
if (query_result == -1)
|
||||||
{
|
{
|
||||||
/*
|
appendPQExpBuffer(&event_details,
|
||||||
* Other possible replication states are:
|
_("unable to retrieve replication status for node %i"),
|
||||||
* - startup
|
remote_node_id);
|
||||||
* - backup
|
log_warning("%s\n", event_details.data);
|
||||||
* - UNKNOWN
|
}
|
||||||
*/
|
else if (query_result == 0)
|
||||||
PQExpBufferData event_details;
|
{
|
||||||
|
log_warning(_("pg_stat_replication record for node %i not yet found\n"), remote_node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (strcmp(remote_node_replication_state, "streaming") == 0 ||
|
||||||
|
strcmp(remote_node_replication_state, "catchup") == 0)
|
||||||
|
{
|
||||||
|
log_verbose(LOG_NOTICE, _("node %i is replicating in state \"%s\"\n"), remote_node_id, remote_node_replication_state);
|
||||||
|
replication_connection_ok = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (strcmp(remote_node_replication_state, "startup") == 0)
|
||||||
|
{
|
||||||
|
log_verbose(LOG_NOTICE, _("node %i is starting up replication\n"), remote_node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Other possible replication states are:
|
||||||
|
* - backup
|
||||||
|
* - UNKNOWN
|
||||||
|
*/
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("node %i has unexpected replication state \"%s\""),
|
||||||
|
remote_node_id, remote_node_replication_state);
|
||||||
|
log_warning("%s\n", event_details.data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i >= replication_check_timeout)
|
||||||
|
break;
|
||||||
|
|
||||||
|
sleep(replication_check_interval);
|
||||||
|
|
||||||
|
i += replication_check_interval;
|
||||||
|
|
||||||
|
/* Reinitialise the string buffer */
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
appendPQExpBuffer(&event_details,
|
}
|
||||||
_("node %i has unexpected replication state \"%s\""),
|
|
||||||
remote_node_id, remote_node_replication_state);
|
/*
|
||||||
log_err("%s\n", event_details.data);
|
* We were unable to establish that the new standby had a pg_stat_replication
|
||||||
|
* record within the timeout period, so fail with whatever error message
|
||||||
|
* was placed in the string buffer.
|
||||||
|
*/
|
||||||
|
if (replication_connection_ok == false)
|
||||||
|
{
|
||||||
create_event_record(local_conn,
|
create_event_record(local_conn,
|
||||||
&options,
|
&options,
|
||||||
options.node,
|
options.node,
|
||||||
"standby_switchover",
|
"standby_switchover",
|
||||||
false,
|
false,
|
||||||
event_details.data);
|
event_details.data);
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
exit(ERR_SWITCHOVER_FAIL);
|
exit(ERR_SWITCHOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
@@ -1513,7 +1513,6 @@ do_master_failover(void)
|
|||||||
*/
|
*/
|
||||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
/* XXX */
|
|
||||||
log_info(_("At this point, it could be some race conditions "
|
log_info(_("At this point, it could be some race conditions "
|
||||||
"that are acceptable, assume the node is restarting "
|
"that are acceptable, assume the node is restarting "
|
||||||
"and starting failover procedure\n"));
|
"and starting failover procedure\n"));
|
||||||
|
|||||||
Reference in New Issue
Block a user