repmgrd: improve reconnection handling

Previously, if the server being monitored was not available, repmgrd
would always close the existing connection handle and open a new one.

However, in some cases, e.g. a brief network outage, the existing
connection handle is still good and does not need to be reopened.

Reopening the connection could be particularly problematic if
monitoring_history is on, as it risks leaving orphan sessions on the
primary which (given a sufficiently unstable network) could lead to
all available backends being occupied.

Instead, during an outage we now use a new connection to verify
the server is accessible; if the old connection is still available
(e.g. following a short network interruption) we continue using it;
if not (e.g. the server was restarted), we use the new one.
This commit is contained in:
Ian Barwick
2018-08-30 10:24:06 +09:00
parent 3b8586d82a
commit 17e75f6b31
8 changed files with 59 additions and 25 deletions

View File

@@ -14,6 +14,7 @@
repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian) repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian)
repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian) repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian)
repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian) repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian)
repmgrd: improve reconnection handling (Ian)
4.1.0 2018-07-31 4.1.0 2018-07-31
repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian) repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian)

View File

@@ -4074,17 +4074,20 @@ is_server_available_params(t_conninfo_param_list *param_list)
/* /*
* Simple throw-away query to stop a connection handle going stale * Simple throw-away query to stop a connection handle going stale.
*/ */
void ExecStatusType
connection_ping(PGconn *conn) connection_ping(PGconn *conn)
{ {
PGresult *res = PQexec(conn, "SELECT TRUE"); PGresult *res = PQexec(conn, "SELECT TRUE");
ExecStatusType ping_result;
log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res))); log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res)));
ping_result = PQresultStatus(res);
PQclear(res); PQclear(res);
return;
return ping_result;
} }

View File

@@ -475,7 +475,7 @@ int wait_connection_availability(PGconn *conn, long long timeout);
/* node availability functions */ /* node availability functions */
bool is_server_available(const char *conninfo); bool is_server_available(const char *conninfo);
bool is_server_available_params(t_conninfo_param_list *param_list); bool is_server_available_params(t_conninfo_param_list *param_list);
void connection_ping(PGconn *conn); ExecStatusType connection_ping(PGconn *conn);
/* monitoring functions */ /* monitoring functions */
void void

View File

@@ -96,6 +96,12 @@
</para> </para>
</listitem> </listitem>
<listitem>
<para>
Improve reconnection handling. (GitHub #480).
</para>
</listitem>
</itemizedlist> </itemizedlist>
</para> </para>
</sect2> </sect2>

View File

@@ -214,7 +214,8 @@ monitor_bdr(void)
log_warning(_("unable to connect to node %s (ID %i)"), log_warning(_("unable to connect to node %s (ID %i)"),
cell->node_info->node_name, cell->node_info->node_id); cell->node_info->node_name, cell->node_info->node_id);
cell->node_info->conn = try_reconnect(cell->node_info); //cell->node_info->conn = try_reconnect(cell->node_info);
try_reconnect(&cell->node_info->conn, cell->node_info);
/* node has recovered - log and continue */ /* node has recovered - log and continue */
if (cell->node_info->node_status == NODE_STATUS_UP) if (cell->node_info->node_status == NODE_STATUS_UP)

View File

@@ -288,8 +288,6 @@ monitor_streaming_primary(void)
local_node_info.node_status = NODE_STATUS_UNKNOWN; local_node_info.node_status = NODE_STATUS_UNKNOWN;
close_connection(&local_conn);
/* /*
* as we're monitoring the primary, no point in trying to * as we're monitoring the primary, no point in trying to
* write the event to the database * write the event to the database
@@ -305,7 +303,7 @@ monitor_streaming_primary(void)
termPQExpBuffer(&event_details); termPQExpBuffer(&event_details);
local_conn = try_reconnect(&local_node_info); try_reconnect(&local_conn, &local_node_info);
if (local_node_info.node_status == NODE_STATUS_UP) if (local_node_info.node_status == NODE_STATUS_UP)
{ {
@@ -744,8 +742,6 @@ monitor_streaming_standby(void)
log_warning("%s", event_details.data); log_warning("%s", event_details.data);
termPQExpBuffer(&event_details); termPQExpBuffer(&event_details);
close_connection(&upstream_conn);
/* /*
* if local node is unreachable, make a last-minute attempt to reconnect * if local node is unreachable, make a last-minute attempt to reconnect
* before continuing with the failover process * before continuing with the failover process
@@ -756,7 +752,7 @@ monitor_streaming_standby(void)
check_connection(&local_node_info, &local_conn); check_connection(&local_node_info, &local_conn);
} }
upstream_conn = try_reconnect(&upstream_node_info); try_reconnect(&upstream_conn, &upstream_node_info);
/* Node has recovered - log and continue */ /* Node has recovered - log and continue */
if (upstream_node_info.node_status == NODE_STATUS_UP) if (upstream_node_info.node_status == NODE_STATUS_UP)
@@ -1087,7 +1083,7 @@ loop:
* if monitoring not in use, we'll need to ensure the local connection * if monitoring not in use, we'll need to ensure the local connection
* handle isn't stale * handle isn't stale
*/ */
connection_ping(local_conn); (void) connection_ping(local_conn);
} }
/* /*
@@ -1304,8 +1300,7 @@ monitor_streaming_witness(void)
true, true,
event_details.data); event_details.data);
close_connection(&primary_conn); try_reconnect(&primary_conn, &upstream_node_info);
primary_conn = try_reconnect(&upstream_node_info);
/* Node has recovered - log and continue */ /* Node has recovered - log and continue */
if (upstream_node_info.node_status == NODE_STATUS_UP) if (upstream_node_info.node_status == NODE_STATUS_UP)

View File

@@ -770,10 +770,10 @@ show_help(void)
} }
PGconn * void
try_reconnect(t_node_info *node_info) try_reconnect(PGconn **conn, t_node_info *node_info)
{ {
PGconn *conn; PGconn *our_conn;
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER; t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
int i; int i;
@@ -782,7 +782,6 @@ try_reconnect(t_node_info *node_info)
initialize_conninfo_params(&conninfo_params, false); initialize_conninfo_params(&conninfo_params, false);
/* we assume by now the conninfo string is parseable */ /* we assume by now the conninfo string is parseable */
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false); (void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
@@ -805,18 +804,47 @@ try_reconnect(t_node_info *node_info)
* degraded monitoring? - make that configurable * degraded monitoring? - make that configurable
*/ */
conn = establish_db_connection_by_params(&conninfo_params, false); our_conn = establish_db_connection_by_params(&conninfo_params, false);
if (PQstatus(conn) == CONNECTION_OK) if (PQstatus(our_conn) == CONNECTION_OK)
{ {
free_conninfo_params(&conninfo_params); free_conninfo_params(&conninfo_params);
log_info(_("connection to node %i succeeded"), node_info.node_id);
if (PQstatus(*conn) == CONNECTION_BAD)
{
log_verbose(LOG_INFO, "original connection handle returned CONNECTION_BAD, using new connection");
close_connection(conn);
*conn = our_conn;
}
else
{
ExecStatusType ping_result;
ping_result = connection_ping(*conn);
if (ping_result != PGRES_TUPLES_OK)
{
log_info("original connnection no longer available, using new connection");
close_connection(conn);
*conn = our_conn;
}
else
{
log_info(_("original connection is still available"));
PQfinish(our_conn);
}
}
node_info->node_status = NODE_STATUS_UP; node_info->node_status = NODE_STATUS_UP;
return conn;
return;
} }
close_connection(&conn); close_connection(&our_conn);
log_notice(_("unable to reconnect to node")); log_notice(_("unable to reconnect to node %i"), node_info.node_id);
} }
if (i + 1 < max_attempts) if (i + 1 < max_attempts)
@@ -835,7 +863,7 @@ try_reconnect(t_node_info *node_info)
free_conninfo_params(&conninfo_params); free_conninfo_params(&conninfo_params);
return NULL; return;
} }

View File

@@ -21,7 +21,7 @@ extern t_node_info local_node_info;
extern PGconn *local_conn; extern PGconn *local_conn;
extern bool startup_event_logged; extern bool startup_event_logged;
PGconn *try_reconnect(t_node_info *node_info); void try_reconnect(PGconn **conn, t_node_info *node_info);
int calculate_elapsed(instr_time start_time); int calculate_elapsed(instr_time start_time);
const char *print_monitoring_state(MonitoringState monitoring_state); const char *print_monitoring_state(MonitoringState monitoring_state);