From 68ad58f5fcb2e0f2a73c627365da026c77638b2c Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 13 May 2020 17:32:19 +0900 Subject: [PATCH] repmgrd: additional check for the upstream connection It's possible the upstream server was intermittently unavailable in the interval between checks, invalidating the upstream connection. With check types "ping" and "connection", the connection would not be restored, so if the availability check was successful, additionally verify the upstream connection and restore if necessary. Addresses GitHub #633. --- repmgrd-physical.c | 3 +-- repmgrd.c | 60 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 15 deletions(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 7851a060..65e2b53c 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -1443,6 +1443,7 @@ monitor_streaming_standby(void) while (true) { log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo); + if (check_upstream_connection(&upstream_conn, upstream_node_info.conninfo) == true) { set_upstream_last_seen(local_conn, upstream_node_info.node_id); @@ -3088,8 +3089,6 @@ do_primary_failover(void) } - - static void update_monitoring_history(void) { diff --git a/repmgrd.c b/repmgrd.c index 383a6aa9..0ccfb556 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -817,32 +817,65 @@ check_upstream_connection(PGconn **conn, const char *conninfo) /* Check the connection status twice in case it changes after reset */ bool twice = false; - if (config_file_options.connection_check_type == CHECK_PING) - return is_server_available(conninfo); - if (config_file_options.connection_check_type == CHECK_CONNECTION) + log_debug("connection check type is \"%s\"", + print_connection_check_type(config_file_options.connection_check_type)); + /* + * For the check types which do not involve using the existing database + * connection, we'll perform the actual check, then as an additional + * safeguard verify that the connection is still valid (as it might have + * gone away during a brief outage between checks). + */ + if (config_file_options.connection_check_type != CHECK_QUERY) { bool success = true; - PGconn *test_conn = PQconnectdb(conninfo); - log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo); - - if (PQstatus(test_conn) != CONNECTION_OK) + if (config_file_options.connection_check_type == CHECK_PING) { - log_warning(_("unable to connect to \"%s\""), conninfo); - log_detail("\n%s", PQerrorMessage(test_conn)); - success = false; + success = is_server_available(conninfo); } - PQfinish(test_conn); + else if (config_file_options.connection_check_type == CHECK_CONNECTION) + { + PGconn *test_conn = PQconnectdb(conninfo); - return success; + log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo); + + if (PQstatus(test_conn) != CONNECTION_OK) + { + log_warning(_("unable to connect to \"%s\""), conninfo); + log_detail("\n%s", PQerrorMessage(test_conn)); + success = false; + } + PQfinish(test_conn); + } + + if (success == false) + return false; + + if (PQstatus(*conn) == CONNECTION_OK) + return true; + + /* + * Checks have succeeded, but the open connection to the primary has gone away, + * possibly due to a brief outage between monitoring intervals - attempt to + * reset it. + */ + log_notice(_("upstream is available but upstream connection has gone away, resetting")); + + PQfinish(*conn); + *conn = PQconnectdb(conninfo); + + if (PQstatus(*conn) == CONNECTION_OK) + return true; + + return false; } for (;;) { if (PQstatus(*conn) != CONNECTION_OK) { - log_debug("check_upstream_connection(): connection not OK"); + log_debug("check_upstream_connection(): upstream connection has gone away, resetting"); if (twice) return false; /* reconnect */ @@ -877,6 +910,7 @@ check_upstream_connection(PGconn **conn, const char *conninfo) return false; /* reconnect */ + log_debug("check_upstream_connection(): upstream connection not available, resetting"); PQfinish(*conn); *conn = PQconnectdb(conninfo); twice = true;