From fed6fba4ef0221437df750d896e6d2aef881e4bf Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 27 Jul 2017 14:13:39 +0900 Subject: [PATCH] repmgrd: more fixes for BDR node recovery --- dbutils.c | 8 +++++++- repmgrd-bdr.c | 39 +++++++++++++++++++++++++-------------- repmgrd.c | 3 ++- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/dbutils.c b/dbutils.c index a75dd3d8..66ffaa96 100644 --- a/dbutils.c +++ b/dbutils.c @@ -3204,6 +3204,8 @@ get_bdr_node_replication_slot_status(PGconn *conn, const char *node_name) " WHERE node_name = '%s') ", node_name); + log_verbose(LOG_DEBUG, "get_bdr_node_replication_slot_status():\n %s", query.data); + res = PQexec(conn, query.data); termPQExpBuffer(&query); @@ -3248,7 +3250,11 @@ get_bdr_other_node_name(PGconn *conn, int node_id, char *node_name) { strncpy(node_name, PQgetvalue(res, 0, 0), MAXLEN); } - + else + { + log_warning(_("get_bdr_other_node_name(): unable to execute query\n %s"), + PQerrorMessage(conn)); + } PQclear(res); return; diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c index a208e689..1982f534 100644 --- a/repmgrd-bdr.c +++ b/repmgrd-bdr.c @@ -207,7 +207,7 @@ monitor_bdr(void) create_event_notification(cell->node_info->conn, &config_file_options, config_file_options.node_id, - "repmgrd_upstream_reconnect", + "repmgrd_bdr_reconnect", true, event_details.data); termPQExpBuffer(&event_details); @@ -296,6 +296,14 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) monitored_node->monitoring_state = MS_DEGRADED; INSTR_TIME_SET_CURRENT(degraded_monitoring_start); + /* terminate local connection if this is the failed node */ + if (monitored_node->node_id == local_node_info.node_id) + { + PQfinish(local_conn); + local_conn = NULL; + } + + /* get other node */ for (cell = nodes->head; cell; cell = cell->next) @@ -346,7 +354,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) { PQfinish(next_node_conn); log_notice(_("record for node %i has already been set inactive"), - cell->node_info->node_id); + failed_node.node_id); return; } @@ -409,6 +417,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) PQfinish(next_node_conn); + return; } @@ -416,7 +425,6 @@ static void do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) { PGconn *recovered_node_conn; - PGconn *slot_check_conn; PQExpBufferData event_details; t_event_info event_info = T_EVENT_INFO_INITIALIZER; @@ -426,6 +434,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) char node_name[MAXLEN] = ""; + log_debug("handling recovery for monitored node %i", monitored_node->node_id); + recovered_node_conn = establish_db_connection(monitored_node->conninfo, false); if (PQstatus(recovered_node_conn) != CONNECTION_OK) @@ -434,18 +444,17 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) return; } - /* determine which replication slot to look fore */ - if (monitored_node->node_id == local_node_info.node_id) + if (PQstatus(local_conn) != CONNECTION_OK) { - slot_check_conn = recovered_node_conn; - get_bdr_other_node_name(recovered_node_conn, local_node_info.node_id, node_name); - } - else - { - slot_check_conn = local_conn; - strncpy(node_name, monitored_node->node_name, MAXLEN); + log_debug("no local conn"); + local_conn = establish_db_connection(config_file_options.conninfo, true); } + // double-check local conn + + get_bdr_other_node_name(local_conn, local_node_info.node_id, node_name); + + for (i = 0; i < config_file_options.bdr_recovery_timeout; i++) { ReplSlotStatus slot_status; @@ -453,7 +462,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) log_debug("checking for state of replication slot for node \"%s\"", node_name); slot_status = get_bdr_node_replication_slot_status( - slot_check_conn, + local_conn, node_name); if (slot_status == SLOT_ACTIVE) @@ -463,7 +472,6 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) } sleep(1); - continue; } @@ -483,6 +491,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start); monitored_node->monitoring_state = MS_NORMAL; + monitored_node->node_status = NODE_STATUS_UP; initPQExpBuffer(&event_details); @@ -535,6 +544,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) } } + update_node_record_set_active(local_conn, monitored_node->node_id, true); + termPQExpBuffer(&event_details); PQfinish(recovered_node_conn); diff --git a/repmgrd.c b/repmgrd.c index efbef52c..099321c5 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -619,7 +619,8 @@ try_reconnect(t_node_info *node_info) for (i = 0; i < max_attempts; i++) { - log_info(_("checking state of node, %i of %i attempts"), i, max_attempts); + log_info(_("checking state of node %i, %i of %i attempts"), + node_info->node_id, i, max_attempts); if (is_server_available(node_info->conninfo) == true) { log_notice(_("node has recovered, reconnecting"));