mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
repmgrd: improve reconnection handling
Previously, if the server being monitored was not available, repmgrd would always close the existing connection handle and open a new one. However, in some cases, e.g. a brief network outage, the existing connection handle is still good and does not need to be reopened. This could be particularly problematic if monitoring_history is on, as this risks leaving orphan sessions on the primary which (given a sufficiently unstable network) could lead to all available backends being occupied. Instead, during an outage we now use a new connection to verify the server is accessible; if the old connection is still available (e.g. following a short network interruption) we continue using that; if not (e.g. the server was restarted), we use the new one.
This commit is contained in:
1
HISTORY
1
HISTORY
@@ -14,6 +14,7 @@
|
|||||||
repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian)
|
repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian)
|
||||||
repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian)
|
repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian)
|
||||||
repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian)
|
repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian)
|
||||||
|
repmgrd: improve reconnection handling (Ian)
|
||||||
|
|
||||||
4.1.0 2018-07-31
|
4.1.0 2018-07-31
|
||||||
repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian)
|
repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian)
|
||||||
|
|||||||
@@ -4074,17 +4074,20 @@ is_server_available_params(t_conninfo_param_list *param_list)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Simple throw-away query to stop a connection handle going stale
|
* Simple throw-away query to stop a connection handle going stale.
|
||||||
*/
|
*/
|
||||||
void
|
ExecStatusType
|
||||||
connection_ping(PGconn *conn)
|
connection_ping(PGconn *conn)
|
||||||
{
|
{
|
||||||
PGresult *res = PQexec(conn, "SELECT TRUE");
|
PGresult *res = PQexec(conn, "SELECT TRUE");
|
||||||
|
ExecStatusType ping_result;
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res)));
|
log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res)));
|
||||||
|
|
||||||
|
ping_result = PQresultStatus(res);
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return;
|
|
||||||
|
return ping_result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -475,7 +475,7 @@ int wait_connection_availability(PGconn *conn, long long timeout);
|
|||||||
/* node availability functions */
|
/* node availability functions */
|
||||||
bool is_server_available(const char *conninfo);
|
bool is_server_available(const char *conninfo);
|
||||||
bool is_server_available_params(t_conninfo_param_list *param_list);
|
bool is_server_available_params(t_conninfo_param_list *param_list);
|
||||||
void connection_ping(PGconn *conn);
|
ExecStatusType connection_ping(PGconn *conn);
|
||||||
|
|
||||||
/* monitoring functions */
|
/* monitoring functions */
|
||||||
void
|
void
|
||||||
|
|||||||
@@ -96,6 +96,12 @@
|
|||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Improve reconnection handling. (GitHub #480).
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|||||||
@@ -214,7 +214,8 @@ monitor_bdr(void)
|
|||||||
|
|
||||||
log_warning(_("unable to connect to node %s (ID %i)"),
|
log_warning(_("unable to connect to node %s (ID %i)"),
|
||||||
cell->node_info->node_name, cell->node_info->node_id);
|
cell->node_info->node_name, cell->node_info->node_id);
|
||||||
cell->node_info->conn = try_reconnect(cell->node_info);
|
//cell->node_info->conn = try_reconnect(cell->node_info);
|
||||||
|
try_reconnect(&cell->node_info->conn, cell->node_info);
|
||||||
|
|
||||||
/* node has recovered - log and continue */
|
/* node has recovered - log and continue */
|
||||||
if (cell->node_info->node_status == NODE_STATUS_UP)
|
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||||
|
|||||||
@@ -288,8 +288,6 @@ monitor_streaming_primary(void)
|
|||||||
|
|
||||||
local_node_info.node_status = NODE_STATUS_UNKNOWN;
|
local_node_info.node_status = NODE_STATUS_UNKNOWN;
|
||||||
|
|
||||||
close_connection(&local_conn);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* as we're monitoring the primary, no point in trying to
|
* as we're monitoring the primary, no point in trying to
|
||||||
* write the event to the database
|
* write the event to the database
|
||||||
@@ -305,7 +303,7 @@ monitor_streaming_primary(void)
|
|||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
local_conn = try_reconnect(&local_node_info);
|
try_reconnect(&local_conn, &local_node_info);
|
||||||
|
|
||||||
if (local_node_info.node_status == NODE_STATUS_UP)
|
if (local_node_info.node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
@@ -744,8 +742,6 @@ monitor_streaming_standby(void)
|
|||||||
log_warning("%s", event_details.data);
|
log_warning("%s", event_details.data);
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
close_connection(&upstream_conn);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if local node is unreachable, make a last-minute attempt to reconnect
|
* if local node is unreachable, make a last-minute attempt to reconnect
|
||||||
* before continuing with the failover process
|
* before continuing with the failover process
|
||||||
@@ -756,7 +752,7 @@ monitor_streaming_standby(void)
|
|||||||
check_connection(&local_node_info, &local_conn);
|
check_connection(&local_node_info, &local_conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
upstream_conn = try_reconnect(&upstream_node_info);
|
try_reconnect(&upstream_conn, &upstream_node_info);
|
||||||
|
|
||||||
/* Node has recovered - log and continue */
|
/* Node has recovered - log and continue */
|
||||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||||
@@ -1087,7 +1083,7 @@ loop:
|
|||||||
* if monitoring not in use, we'll need to ensure the local connection
|
* if monitoring not in use, we'll need to ensure the local connection
|
||||||
* handle isn't stale
|
* handle isn't stale
|
||||||
*/
|
*/
|
||||||
connection_ping(local_conn);
|
(void) connection_ping(local_conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -1304,8 +1300,7 @@ monitor_streaming_witness(void)
|
|||||||
true,
|
true,
|
||||||
event_details.data);
|
event_details.data);
|
||||||
|
|
||||||
close_connection(&primary_conn);
|
try_reconnect(&primary_conn, &upstream_node_info);
|
||||||
primary_conn = try_reconnect(&upstream_node_info);
|
|
||||||
|
|
||||||
/* Node has recovered - log and continue */
|
/* Node has recovered - log and continue */
|
||||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||||
|
|||||||
48
repmgrd.c
48
repmgrd.c
@@ -770,10 +770,10 @@ show_help(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
PGconn *
|
void
|
||||||
try_reconnect(t_node_info *node_info)
|
try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||||
{
|
{
|
||||||
PGconn *conn;
|
PGconn *our_conn;
|
||||||
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
@@ -782,7 +782,6 @@ try_reconnect(t_node_info *node_info)
|
|||||||
|
|
||||||
initialize_conninfo_params(&conninfo_params, false);
|
initialize_conninfo_params(&conninfo_params, false);
|
||||||
|
|
||||||
|
|
||||||
/* we assume by now the conninfo string is parseable */
|
/* we assume by now the conninfo string is parseable */
|
||||||
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
|
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
|
||||||
|
|
||||||
@@ -805,18 +804,47 @@ try_reconnect(t_node_info *node_info)
|
|||||||
* degraded monitoring? - make that configurable
|
* degraded monitoring? - make that configurable
|
||||||
*/
|
*/
|
||||||
|
|
||||||
conn = establish_db_connection_by_params(&conninfo_params, false);
|
our_conn = establish_db_connection_by_params(&conninfo_params, false);
|
||||||
|
|
||||||
if (PQstatus(conn) == CONNECTION_OK)
|
if (PQstatus(our_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
free_conninfo_params(&conninfo_params);
|
free_conninfo_params(&conninfo_params);
|
||||||
|
|
||||||
|
log_info(_("connection to node %i succeeded"), node_info->node_id);
|
||||||
|
|
||||||
|
if (PQstatus(*conn) == CONNECTION_BAD)
|
||||||
|
{
|
||||||
|
log_verbose(LOG_INFO, "original connection handle returned CONNECTION_BAD, using new connection");
|
||||||
|
close_connection(conn);
|
||||||
|
*conn = our_conn;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ExecStatusType ping_result;
|
||||||
|
|
||||||
|
ping_result = connection_ping(*conn);
|
||||||
|
|
||||||
|
if (ping_result != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_info("original connection no longer available, using new connection");
|
||||||
|
close_connection(conn);
|
||||||
|
*conn = our_conn;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_info(_("original connection is still available"));
|
||||||
|
|
||||||
|
PQfinish(our_conn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
node_info->node_status = NODE_STATUS_UP;
|
node_info->node_status = NODE_STATUS_UP;
|
||||||
return conn;
|
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
close_connection(&conn);
|
close_connection(&our_conn);
|
||||||
log_notice(_("unable to reconnect to node"));
|
log_notice(_("unable to reconnect to node %i"), node_info->node_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i + 1 < max_attempts)
|
if (i + 1 < max_attempts)
|
||||||
@@ -835,7 +863,7 @@ try_reconnect(t_node_info *node_info)
|
|||||||
|
|
||||||
free_conninfo_params(&conninfo_params);
|
free_conninfo_params(&conninfo_params);
|
||||||
|
|
||||||
return NULL;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ extern t_node_info local_node_info;
|
|||||||
extern PGconn *local_conn;
|
extern PGconn *local_conn;
|
||||||
extern bool startup_event_logged;
|
extern bool startup_event_logged;
|
||||||
|
|
||||||
PGconn *try_reconnect(t_node_info *node_info);
|
void try_reconnect(PGconn **conn, t_node_info *node_info);
|
||||||
|
|
||||||
int calculate_elapsed(instr_time start_time);
|
int calculate_elapsed(instr_time start_time);
|
||||||
const char *print_monitoring_state(MonitoringState monitoring_state);
|
const char *print_monitoring_state(MonitoringState monitoring_state);
|
||||||
|
|||||||
Reference in New Issue
Block a user