repmgrd: improve reconnection handling

Previously, if the server being monitored was not available, repmgrd
would always close the existing connection handle and open a new one.

However, in some cases, e.g. a brief network outage, the existing
connection handle is still good and does not need to be reopened.

Reopening the connection unnecessarily could be particularly problematic
if monitoring_history is on, as it risks leaving orphaned sessions on the
primary which (given a sufficiently unstable network) could lead to all
available backends being occupied.

Instead, during an outage we now use a new connection to verify
the server is accessible; if the old connection is still available
(e.g. following a short network interruption) we continue using that;
if not (e.g. the server was restarted), we use the new one.
This commit is contained in:
Ian Barwick
2018-08-30 10:24:06 +09:00
parent 3b8586d82a
commit 17e75f6b31
8 changed files with 59 additions and 25 deletions

View File

@@ -770,10 +770,10 @@ show_help(void)
}
PGconn *
try_reconnect(t_node_info *node_info)
void
try_reconnect(PGconn **conn, t_node_info *node_info)
{
PGconn *conn;
PGconn *our_conn;
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
int i;
@@ -782,7 +782,6 @@ try_reconnect(t_node_info *node_info)
initialize_conninfo_params(&conninfo_params, false);
/* we assume by now the conninfo string is parseable */
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
@@ -805,18 +804,47 @@ try_reconnect(t_node_info *node_info)
* degraded monitoring? - make that configurable
*/
conn = establish_db_connection_by_params(&conninfo_params, false);
our_conn = establish_db_connection_by_params(&conninfo_params, false);
if (PQstatus(conn) == CONNECTION_OK)
if (PQstatus(our_conn) == CONNECTION_OK)
{
free_conninfo_params(&conninfo_params);
log_info(_("connection to node %i succeeded"), node_info.node_id);
if (PQstatus(*conn) == CONNECTION_BAD)
{
log_verbose(LOG_INFO, "original connection handle returned CONNECTION_BAD, using new connection");
close_connection(conn);
*conn = our_conn;
}
else
{
ExecStatusType ping_result;
ping_result = connection_ping(*conn);
if (ping_result != PGRES_TUPLES_OK)
{
log_info("original connnection no longer available, using new connection");
close_connection(conn);
*conn = our_conn;
}
else
{
log_info(_("original connection is still available"));
PQfinish(our_conn);
}
}
node_info->node_status = NODE_STATUS_UP;
return conn;
return;
}
close_connection(&conn);
log_notice(_("unable to reconnect to node"));
close_connection(&our_conn);
log_notice(_("unable to reconnect to node %i"), node_info.node_id);
}
if (i + 1 < max_attempts)
@@ -835,7 +863,7 @@ try_reconnect(t_node_info *node_info)
free_conninfo_params(&conninfo_params);
return NULL;
return;
}