repmgrd: additional check for the upstream connection

It's possible the upstream server was intermittently unavailable in
the interval between checks, invalidating the upstream connection.
With check types "ping" and "connection", the connection would not be
restored, so if the availability check was successful, additionally
verify the upstream connection and restore if necessary.

Addresses GitHub #633.
This commit is contained in:
Ian Barwick
2020-05-13 17:32:19 +09:00
parent 9b6fe6858a
commit a863dc7f6c
2 changed files with 48 additions and 15 deletions

View File

@@ -1443,6 +1443,7 @@ monitor_streaming_standby(void)
while (true)
{
log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
if (check_upstream_connection(&upstream_conn, upstream_node_info.conninfo) == true)
{
set_upstream_last_seen(local_conn, upstream_node_info.node_id);
@@ -3088,8 +3089,6 @@ do_primary_failover(void)
}
static void
update_monitoring_history(void)
{

View File

@@ -813,32 +813,65 @@ check_upstream_connection(PGconn **conn, const char *conninfo)
/* Check the connection status twice in case it changes after reset */
bool twice = false;
if (config_file_options.connection_check_type == CHECK_PING)
return is_server_available(conninfo);
if (config_file_options.connection_check_type == CHECK_CONNECTION)
log_debug("connection check type is \"%s\"",
print_connection_check_type(config_file_options.connection_check_type));
/*
* For the check types which do not involve using the existing database
* connection, we'll perform the actual check, then as an additional
* safeguard verify that the connection is still valid (as it might have
* gone away during a brief outage between checks).
*/
if (config_file_options.connection_check_type != CHECK_QUERY)
{
bool success = true;
PGconn *test_conn = PQconnectdb(conninfo);
log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo);
if (PQstatus(test_conn) != CONNECTION_OK)
if (config_file_options.connection_check_type == CHECK_PING)
{
log_warning(_("unable to connect to \"%s\""), conninfo);
log_detail("\n%s", PQerrorMessage(test_conn));
success = false;
success = is_server_available(conninfo);
}
PQfinish(test_conn);
else if (config_file_options.connection_check_type == CHECK_CONNECTION)
{
PGconn *test_conn = PQconnectdb(conninfo);
return success;
log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo);
if (PQstatus(test_conn) != CONNECTION_OK)
{
log_warning(_("unable to connect to \"%s\""), conninfo);
log_detail("\n%s", PQerrorMessage(test_conn));
success = false;
}
PQfinish(test_conn);
}
if (success == false)
return false;
if (PQstatus(*conn) == CONNECTION_OK)
return true;
/*
* Checks have succeeded, but the open connection to the primary has gone away,
* possibly due to a brief outage between monitoring intervals - attempt to
* reset it.
*/
log_notice(_("upstream is available but upstream connection has gone away, resetting"));
PQfinish(*conn);
*conn = PQconnectdb(conninfo);
if (PQstatus(*conn) == CONNECTION_OK)
return true;
return false;
}
for (;;)
{
if (PQstatus(*conn) != CONNECTION_OK)
{
log_debug("check_upstream_connection(): connection not OK");
log_debug("check_upstream_connection(): upstream connection has gone away, resetting");
if (twice)
return false;
/* reconnect */
@@ -873,6 +906,7 @@ check_upstream_connection(PGconn **conn, const char *conninfo)
return false;
/* reconnect */
log_debug("check_upstream_connection(): upstream connection not available, resetting");
PQfinish(*conn);
*conn = PQconnectdb(conninfo);
twice = true;