From 5e9d50f8cab15bbdca6f8c0feb511d341a68f831 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 15 Nov 2017 11:34:17 +0900 Subject: [PATCH] repmgrd: finalize witness failover handling --- repmgrd-physical.c | 120 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 117 insertions(+), 3 deletions(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 3d6b3799..683790d7 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -867,8 +867,8 @@ monitor_streaming_standby(void) follow_new_primary(follow_node_id); } } + clear_node_info_list(&standby_nodes); } - } } @@ -1183,7 +1183,93 @@ monitor_streaming_witness(void) if (monitoring_state == MS_DEGRADED) { - // XXX + int degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start); + + log_debug("monitoring node %i in degraded state for %i seconds", + upstream_node_info.node_id, + degraded_monitoring_elapsed); + + if (is_server_available(upstream_node_info.conninfo) == true) + { + primary_conn = establish_db_connection(upstream_node_info.conninfo, false); + + if (PQstatus(primary_conn) == CONNECTION_OK) + { + upstream_node_info.node_status = NODE_STATUS_UP; + monitoring_state = MS_NORMAL; + + initPQExpBuffer(&event_details); + + appendPQExpBuffer(&event_details, + _("reconnected to upstream node %i after %i seconds, resuming monitoring"), + upstream_node_info.node_id, + degraded_monitoring_elapsed); + + create_event_notification(primary_conn, + &config_file_options, + config_file_options.node_id, + "repmgrd_upstream_reconnect", + true, + event_details.data); + + log_notice("%s", event_details.data); + termPQExpBuffer(&event_details); + + goto loop; + } + } + else + { + /* + * unable to connect to former primary - check if another node + * has been promoted + */ + + NodeInfoListCell *cell; + int follow_node_id = UNKNOWN_NODE_ID; + + get_active_sibling_node_records(local_conn, + local_node_info.node_id, + local_node_info.upstream_node_id, + &standby_nodes); + + if (standby_nodes.node_count > 0) + { + log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count); + for (cell = standby_nodes.head; cell; cell = cell->next) + { + /* skip local node check, we did that above */ + if (cell->node_info->node_id == local_node_info.node_id) + { + continue; + } + + cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false); + + if (PQstatus(cell->node_info->conn) != CONNECTION_OK) + { + log_debug("unable to connect to %i ... ", cell->node_info->node_id); + continue; + } + + if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY) + { + follow_node_id = cell->node_info->node_id; + PQfinish(cell->node_info->conn); + cell->node_info->conn = NULL; + break; + } + PQfinish(cell->node_info->conn); + cell->node_info->conn = NULL; + } + + if (follow_node_id != UNKNOWN_NODE_ID) + { + witness_follow_new_primary(follow_node_id); + } + } + clear_node_info_list(&standby_nodes); + } } loop: @@ -1229,6 +1315,31 @@ loop: } } + + if (got_SIGHUP) + { + log_debug("SIGHUP received"); + + if (reload_config(&config_file_options)) + { + PQfinish(local_conn); + local_conn = establish_db_connection(config_file_options.conninfo, true); + + if (*config_file_options.log_file) + { + FILE *fd; + + fd = freopen(config_file_options.log_file, "a", stderr); + if (fd == NULL) + { + fprintf(stderr, "error reopening stderr to \"%s\": %s", + config_file_options.log_file, strerror(errno)); + } + } + } + got_SIGHUP = false; + } + sleep(config_file_options.monitor_interval_secs); } #endif @@ -2491,8 +2602,8 @@ bool do_witness_failover(void) failover_state = FAILOVER_STATE_NONE; return true; - case FAILOVER_STATE_FOLLOW_FAIL: + case FAILOVER_STATE_FOLLOW_FAIL: /* * for whatever reason we were unable to follow the new primary - * continue monitoring in degraded state @@ -2503,6 +2614,9 @@ bool do_witness_failover(void) return false; default: + monitoring_state = MS_DEGRADED; + INSTR_TIME_SET_CURRENT(degraded_monitoring_start); + return false; } /* should never reach here */