diff --git a/dbutils.c b/dbutils.c
index d264c4f6..df833014 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -1395,7 +1395,7 @@ get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_i
 
 	if (record_status == RECORD_NOT_FOUND)
 	{
-		log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %s",
+		log_verbose(LOG_DEBUG, "get_node_record_by_name(): no record found for node %s",
 					node_name);
 	}
 
@@ -3250,10 +3250,18 @@ am_bdr_failover_handler(PGconn *conn, int node_id)
 					  "SELECT repmgr.am_bdr_failover_handler(%i)",
 					  node_id);
 
-
 	res = PQexec(conn, query.data);
 	termPQExpBuffer(&query);
 
+	/* a failed query leaves no row for the PQgetvalue() call below to read */
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute function repmgr.am_bdr_failover_handler():\n %s"),
+				  PQerrorMessage(conn));
+		PQclear(res);
+		return false;
+	}
+
 	am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0) ? true : false;
 
 	PQclear(res);
diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c
index 3fbbb79a..ae3f32fe 100644
--- a/repmgrd-bdr.c
+++ b/repmgrd-bdr.c
@@ -290,6 +290,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
 	PQExpBufferData event_details;
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
 	t_node_info target_node = T_NODE_INFO_INITIALIZER;
+	t_node_info failed_node = T_NODE_INFO_INITIALIZER;
+	RecordStatus record_status;
 
 	monitored_node->monitoring_state = MS_DEGRADED;
 	INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
@@ -310,9 +312,9 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
 
 		if (PQstatus(next_node_conn) == CONNECTION_OK)
 		{
-			RecordStatus record_status = get_node_record(next_node_conn,
-														 cell->node_info->node_id,
-														 &target_node);
+			record_status = get_node_record(next_node_conn,
+											cell->node_info->node_id,
+											&target_node);
 
 			if (record_status == RECORD_FOUND)
 			{
@@ -332,6 +334,23 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
 		return;
 	}
 
+	/*
+	 * check if the node record for the failed node is still marked as active;
+	 * if not, it means the other node has done the "failover" already
+	 */
+
+	record_status = get_node_record(next_node_conn,
+									monitored_node->node_id,
+									&failed_node);
+
+	if (record_status == RECORD_FOUND && failed_node.active == false)
+	{
+		PQfinish(next_node_conn);
+		log_notice(_("record for node %i has already been set inactive"),
+				   monitored_node->node_id);
+		return;
+	}
+
 	if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
 	{
 		PQfinish(next_node_conn);
@@ -360,9 +379,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
 			 target_node.node_name,
 			 target_node.node_id);
 
-
-
-
 	/*
 	 * Create an event record
 	 *