diff --git a/HISTORY b/HISTORY index f0179482..996033cc 100644 --- a/HISTORY +++ b/HISTORY @@ -14,6 +14,7 @@ repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian) repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian) repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian) + repmgrd: improve reconnection handling (Ian) 4.1.0 2018-07-31 repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian) diff --git a/dbutils.c b/dbutils.c index 8b006dee..7139e0a0 100644 --- a/dbutils.c +++ b/dbutils.c @@ -4074,17 +4074,20 @@ is_server_available_params(t_conninfo_param_list *param_list) /* - * Simple throw-away query to stop a connection handle going stale + * Simple throw-away query to stop a connection handle going stale. */ -void +ExecStatusType connection_ping(PGconn *conn) { PGresult *res = PQexec(conn, "SELECT TRUE"); + ExecStatusType ping_result; log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res))); + ping_result = PQresultStatus(res); PQclear(res); - return; + + return ping_result; } diff --git a/dbutils.h b/dbutils.h index 1978a577..0df3d7fa 100644 --- a/dbutils.h +++ b/dbutils.h @@ -475,7 +475,7 @@ int wait_connection_availability(PGconn *conn, long long timeout); /* node availability functions */ bool is_server_available(const char *conninfo); bool is_server_available_params(t_conninfo_param_list *param_list); -void connection_ping(PGconn *conn); +ExecStatusType connection_ping(PGconn *conn); /* monitoring functions */ void diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 02fd96b9..b59fc073 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -96,6 +96,12 @@ + + + Improve reconnection handling. (GitHub #480). 
+ + + diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c index 8c3c1169..40732756 100644 --- a/repmgrd-bdr.c +++ b/repmgrd-bdr.c @@ -214,7 +214,7 @@ monitor_bdr(void) log_warning(_("unable to connect to node %s (ID %i)"), cell->node_info->node_name, cell->node_info->node_id); - cell->node_info->conn = try_reconnect(cell->node_info); + try_reconnect(&cell->node_info->conn, cell->node_info); /* node has recovered - log and continue */ if (cell->node_info->node_status == NODE_STATUS_UP) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index fd2537ee..a04ad5e4 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -288,8 +288,6 @@ monitor_streaming_primary(void) local_node_info.node_status = NODE_STATUS_UNKNOWN; - close_connection(&local_conn); - /* * as we're monitoring the primary, no point in trying to * write the event to the database @@ -305,7 +303,7 @@ monitor_streaming_primary(void) termPQExpBuffer(&event_details); - local_conn = try_reconnect(&local_node_info); + try_reconnect(&local_conn, &local_node_info); if (local_node_info.node_status == NODE_STATUS_UP) { @@ -744,8 +742,6 @@ monitor_streaming_standby(void) log_warning("%s", event_details.data); termPQExpBuffer(&event_details); - close_connection(&upstream_conn); - /* * if local node is unreachable, make a last-minute attempt to reconnect * before continuing with the failover process @@ -756,7 +752,7 @@ monitor_streaming_standby(void) check_connection(&local_node_info, &local_conn); } - upstream_conn = try_reconnect(&upstream_node_info); + try_reconnect(&upstream_conn, &upstream_node_info); /* Node has recovered - log and continue */ if (upstream_node_info.node_status == NODE_STATUS_UP) @@ -1087,7 +1083,7 @@ loop: * if monitoring not in use, we'll need to ensure the local connection * handle isn't stale */ - connection_ping(local_conn); + (void) connection_ping(local_conn); } /* @@ -1304,8 +1300,7 @@ monitor_streaming_witness(void) true, 
event_details.data); - close_connection(&primary_conn); - primary_conn = try_reconnect(&upstream_node_info); + try_reconnect(&primary_conn, &upstream_node_info); /* Node has recovered - log and continue */ if (upstream_node_info.node_status == NODE_STATUS_UP) diff --git a/repmgrd.c b/repmgrd.c index ed7f6b61..07c863d9 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -770,10 +770,10 @@ show_help(void) } -PGconn * -try_reconnect(t_node_info *node_info) +void +try_reconnect(PGconn **conn, t_node_info *node_info) { - PGconn *conn; + PGconn *our_conn; t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER; int i; @@ -782,7 +782,6 @@ try_reconnect(t_node_info *node_info) initialize_conninfo_params(&conninfo_params, false); - /* we assume by now the conninfo string is parseable */ (void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false); @@ -805,18 +804,47 @@ try_reconnect(t_node_info *node_info) * degraded monitoring? - make that configurable */ - conn = establish_db_connection_by_params(&conninfo_params, false); + our_conn = establish_db_connection_by_params(&conninfo_params, false); - if (PQstatus(conn) == CONNECTION_OK) + if (PQstatus(our_conn) == CONNECTION_OK) { free_conninfo_params(&conninfo_params); + log_info(_("connection to node %i succeeded"), node_info->node_id); + + if (PQstatus(*conn) == CONNECTION_BAD) + { + log_verbose(LOG_INFO, "original connection handle returned CONNECTION_BAD, using new connection"); + close_connection(conn); + *conn = our_conn; + } + else + { + ExecStatusType ping_result; + + ping_result = connection_ping(*conn); + + if (ping_result != PGRES_TUPLES_OK) + { + log_info("original connection no longer available, using new connection"); + close_connection(conn); + *conn = our_conn; + } + else + { + log_info(_("original connection is still available")); + + PQfinish(our_conn); + } + } + node_info->node_status = NODE_STATUS_UP; - return conn; + return; } - close_connection(&conn); - log_notice(_("unable 
to reconnect to node")); + close_connection(&our_conn); + log_notice(_("unable to reconnect to node %i"), node_info->node_id); } if (i + 1 < max_attempts) @@ -835,7 +863,7 @@ try_reconnect(t_node_info *node_info) free_conninfo_params(&conninfo_params); - return NULL; + return; } diff --git a/repmgrd.h b/repmgrd.h index ad811214..0f8f3706 100644 --- a/repmgrd.h +++ b/repmgrd.h @@ -21,7 +21,7 @@ extern t_node_info local_node_info; extern PGconn *local_conn; extern bool startup_event_logged; -PGconn *try_reconnect(t_node_info *node_info); +void try_reconnect(PGconn **conn, t_node_info *node_info); int calculate_elapsed(instr_time start_time); const char *print_monitoring_state(MonitoringState monitoring_state);