From a863dc7f6cdc650ad29452229cd5d3bbbd198384 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 13 May 2020 17:32:19 +0900 Subject: [PATCH] repmgrd: additional check for the upstream connection It's possible the upstream server was intermittently unavailable in the interval between checks, invalidating the upstream connection. With check types "ping" and "connection", the connection would not be restored, so if the availability check was successful, additionally verify the upstream connection and restore if necessary. Addresses GitHub #633. --- repmgrd-physical.c | 3 +-- repmgrd.c | 60 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 15 deletions(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 07d46a7a..f27cc406 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -1443,6 +1443,7 @@ monitor_streaming_standby(void) while (true) { log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo); + if (check_upstream_connection(&upstream_conn, upstream_node_info.conninfo) == true) { set_upstream_last_seen(local_conn, upstream_node_info.node_id); @@ -3088,8 +3089,6 @@ do_primary_failover(void) } - - static void update_monitoring_history(void) { diff --git a/repmgrd.c b/repmgrd.c index 4f01ce47..d1cd6767 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -813,32 +813,65 @@ check_upstream_connection(PGconn **conn, const char *conninfo) /* Check the connection status twice in case it changes after reset */ bool twice = false; - if (config_file_options.connection_check_type == CHECK_PING) - return is_server_available(conninfo); - if (config_file_options.connection_check_type == CHECK_CONNECTION) + log_debug("connection check type is \"%s\"", + print_connection_check_type(config_file_options.connection_check_type)); + /* + * For the check types which do not involve using the existing database + * connection, we'll perform the actual check, then as an additional + * safeguard verify that the connection is still valid (as it might have + * gone away during a brief outage between checks). + */ + if (config_file_options.connection_check_type != CHECK_QUERY) { bool success = true; - PGconn *test_conn = PQconnectdb(conninfo); - log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo); - - if (PQstatus(test_conn) != CONNECTION_OK) + if (config_file_options.connection_check_type == CHECK_PING) { - log_warning(_("unable to connect to \"%s\""), conninfo); - log_detail("\n%s", PQerrorMessage(test_conn)); - success = false; + success = is_server_available(conninfo); } - PQfinish(test_conn); + else if (config_file_options.connection_check_type == CHECK_CONNECTION) + { + PGconn *test_conn = PQconnectdb(conninfo); - return success; + log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo); + + if (PQstatus(test_conn) != CONNECTION_OK) + { + log_warning(_("unable to connect to \"%s\""), conninfo); + log_detail("\n%s", PQerrorMessage(test_conn)); + success = false; + } + PQfinish(test_conn); + } + + if (success == false) + return false; + + if (PQstatus(*conn) == CONNECTION_OK) + return true; + + /* + * Checks have succeeded, but the open connection to the primary has gone away, + * possibly due to a brief outage between monitoring intervals - attempt to + * reset it. + */ + log_notice(_("upstream is available but upstream connection has gone away, resetting")); + + PQfinish(*conn); + *conn = PQconnectdb(conninfo); + + if (PQstatus(*conn) == CONNECTION_OK) + return true; + + return false; } for (;;) { if (PQstatus(*conn) != CONNECTION_OK) { - log_debug("check_upstream_connection(): connection not OK"); + log_debug("check_upstream_connection(): upstream connection has gone away, resetting"); if (twice) return false; /* reconnect */ @@ -873,6 +906,7 @@ check_upstream_connection(PGconn **conn, const char *conninfo) return false; /* reconnect */ + log_debug("check_upstream_connection(): upstream connection not available, resetting"); PQfinish(*conn); *conn = PQconnectdb(conninfo); twice = true;