From 48a0aa3bf7eb6875ba87717698c17ed85ca30825 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Mon, 17 Jul 2017 14:56:52 +0900 Subject: [PATCH] repmgrd: improve failover handling Make retry frequency/interval configurable as per streaming replication. --- repmgrd-bdr.c | 60 +++++++++++++++++++++++++++++++++++++++++----- repmgrd-physical.c | 1 + 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c index 59f34eaf..3fbbb79a 100644 --- a/repmgrd-bdr.c +++ b/repmgrd-bdr.c @@ -31,6 +31,8 @@ monitor_bdr(void) NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER; t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER; RecordStatus record_status; + NodeInfoListCell *cell; + PQExpBufferData event_details; /* sanity check local database */ log_info(_("connecting to local database '%s'"), @@ -129,13 +131,18 @@ monitor_bdr(void) */ get_all_node_records(local_conn, &nodes); + /* we're expecting all (both) nodes to be up */ + for (cell = nodes.head; cell; cell = cell->next) + { + cell->node_info->node_status = NODE_STATUS_UP; + } + log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id); log_info(_("starting continuous bdr node monitoring")); while (true) { - NodeInfoListCell *cell; /* monitoring loop */ log_verbose(LOG_DEBUG, "bdr check loop..."); @@ -168,14 +175,53 @@ monitor_bdr(void) { if (is_server_available(cell->node_info->conninfo) == false) { - instr_time upstream_node_unreachable_start; + /* node is down, we were expecting it to be up */ + if (cell->node_info->node_status == NODE_STATUS_UP) + { + instr_time node_unreachable_start; + INSTR_TIME_SET_CURRENT(node_unreachable_start); - INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start); + cell->node_info->node_status = NODE_STATUS_DOWN; + if (cell->node_info->conn != NULL) + { + PQfinish(cell->node_info->conn); + cell->node_info->conn = NULL; + } - // XXX improve - log_warning("connection problem! to node %i", cell->node_info->node_id); - do_bdr_failover(&nodes, cell->node_info); + cell->node_info->conn = try_reconnect(cell->node_info); + + /* Node has recovered - log and continue */ + if (cell->node_info->node_status == NODE_STATUS_UP) + { + int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start); + + initPQExpBuffer(&event_details); + + appendPQExpBuffer(&event_details, + _("reconnected to node %i after %i seconds"), + cell->node_info->node_id, + node_unreachable_elapsed); + log_notice("%s", event_details.data); + + create_event_notification(cell->node_info->conn, + &config_file_options, + config_file_options.node_id, + "repmgrd_upstream_reconnect", + true, + event_details.data); + termPQExpBuffer(&event_details); + + goto loop; + } + + /* still down after reconnect attempt(s) */ + if (cell->node_info->node_status == NODE_STATUS_DOWN) + { + do_bdr_failover(&nodes, cell->node_info); + goto loop; + } + } } } break; @@ -193,6 +239,8 @@ monitor_bdr(void) } } + loop: + if (got_SIGHUP) { /* diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 5f8b2007..f8b2463c 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -503,6 +503,7 @@ monitor_streaming_standby(void) PQfinish(upstream_conn); upstream_conn = try_reconnect(&upstream_node_info); + /* Node has recovered - log and continue */ if (upstream_node_info.node_status == NODE_STATUS_UP) { int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);