From 242fa287b4f399b8ff9f77e152cd853cba9f8f8a Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 24 Apr 2018 21:49:55 +0900 Subject: [PATCH] repmgrd: catch corner case in standby connection handle check If repmgrd marks the local node as unavailable, and it was actually restarting but a failover event occurred before the next local node check, failover will continue with the stale connection handle. Add a final local node check just before starting the failover process, so repmgrd can reconnect if it wasn't able to before. --- dbutils.c | 9 +++++++-- repmgrd-physical.c | 29 ++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/dbutils.c b/dbutils.c index f8c91657..42214480 100644 --- a/dbutils.c +++ b/dbutils.c @@ -2390,7 +2390,9 @@ update_node_record_set_primary(PGconn *conn, int this_node_id) " UPDATE repmgr.nodes " " SET active = FALSE " " WHERE type = 'primary' " - " AND active IS TRUE "); + " AND active IS TRUE " + " AND node_id != %i ", + this_node_id); res = PQexec(conn, query.data); termPQExpBuffer(&query); @@ -2412,7 +2414,8 @@ update_node_record_set_primary(PGconn *conn, int this_node_id) appendPQExpBuffer(&query, " UPDATE repmgr.nodes" " SET type = 'primary', " - " upstream_node_id = NULL " + " upstream_node_id = NULL, " + " active = TRUE " " WHERE node_id = %i ", this_node_id); @@ -3856,6 +3859,8 @@ connection_ping(PGconn *conn) { PGresult *res = PQexec(conn, "SELECT TRUE"); + log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res))); + PQclear(res); return; } diff --git a/repmgrd-physical.c b/repmgrd-physical.c index e8db03f4..af209222 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -750,6 +750,17 @@ monitor_streaming_standby(void) termPQExpBuffer(&event_details); close_connection(&upstream_conn); + + /* + * if local node is unreachable, make a last-minute attempt to reconnect + * before continuing with the failover process + */ + + if (PQstatus(local_conn) != 
CONNECTION_OK) + { + check_connection(&local_node_info, &local_conn); + } + upstream_conn = try_reconnect(&upstream_node_info); /* Node has recovered - log and continue */ @@ -985,6 +996,15 @@ loop: } } + if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true) + { + update_monitoring_history(); + } + else + { + connection_ping(local_conn); + } + /* * handle local node failure * @@ -1069,15 +1089,6 @@ loop: } - if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true) - { - update_monitoring_history(); - } - else - { - connection_ping(local_conn); - } - if (got_SIGHUP) { log_debug("SIGHUP received");