mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
repmgrd: catch corner case in standby connection handle check
If repmgrd marks the local node as unavailable when it was actually restarting, and a failover event occurred before the next local node check, failover will continue with the stale connection handle. Add a final local node check just before starting the failover process, so repmgrd can reconnect if it wasn't able to before.
This commit is contained in:
@@ -2390,7 +2390,9 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
|
||||
" UPDATE repmgr.nodes "
|
||||
" SET active = FALSE "
|
||||
" WHERE type = 'primary' "
|
||||
" AND active IS TRUE ");
|
||||
" AND active IS TRUE "
|
||||
" AND node_id != %i ",
|
||||
this_node_id);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
@@ -2412,7 +2414,8 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
|
||||
appendPQExpBuffer(&query,
|
||||
" UPDATE repmgr.nodes"
|
||||
" SET type = 'primary', "
|
||||
" upstream_node_id = NULL "
|
||||
" upstream_node_id = NULL, "
|
||||
" active = TRUE "
|
||||
" WHERE node_id = %i ",
|
||||
this_node_id);
|
||||
|
||||
@@ -3856,6 +3859,8 @@ connection_ping(PGconn *conn)
|
||||
{
|
||||
PGresult *res = PQexec(conn, "SELECT TRUE");
|
||||
|
||||
log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res)));
|
||||
|
||||
PQclear(res);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -750,6 +750,17 @@ monitor_streaming_standby(void)
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
close_connection(&upstream_conn);
|
||||
|
||||
/*
|
||||
* if local node is unreachable, make a last-minute attempt to reconnect
|
||||
* before continuing with the failover process
|
||||
*/
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
}
|
||||
|
||||
upstream_conn = try_reconnect(&upstream_node_info);
|
||||
|
||||
/* Node has recovered - log and continue */
|
||||
@@ -985,6 +996,15 @@ loop:
|
||||
}
|
||||
}
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
|
||||
{
|
||||
update_monitoring_history();
|
||||
}
|
||||
else
|
||||
{
|
||||
connection_ping(local_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* handle local node failure
|
||||
*
|
||||
@@ -1069,15 +1089,6 @@ loop:
|
||||
}
|
||||
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
|
||||
{
|
||||
update_monitoring_history();
|
||||
}
|
||||
else
|
||||
{
|
||||
connection_ping(local_conn);
|
||||
}
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
Reference in New Issue
Block a user