mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
repmgrd: additional check to ensure only one node handles failover
It's possible for the "failover" to be completed by one repmgrd before the other has had a chance to react, in which case the am_bdr_failover_handler() check will not apply. Instead, check whether the failed node's record has already been set to "inactive".
This commit is contained in:
12
dbutils.c
12
dbutils.c
@@ -1395,7 +1395,7 @@ get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_i
|
||||
|
||||
if (record_status == RECORD_NOT_FOUND)
|
||||
{
|
||||
log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %s",
|
||||
log_verbose(LOG_DEBUG, "get_node_record_by_name(): no record found for node %s",
|
||||
node_name);
|
||||
}
|
||||
|
||||
@@ -3250,10 +3250,18 @@ am_bdr_failover_handler(PGconn *conn, int node_id)
|
||||
"SELECT repmgr.am_bdr_failover_handler(%i)",
|
||||
node_id);
|
||||
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_error(_("unable to execute function repmgr.am_bdr_failover_handler():\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0) ? true : false;
|
||||
|
||||
PQclear(res);
|
||||
|
||||
@@ -290,6 +290,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
PQExpBufferData event_details;
|
||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status;
|
||||
|
||||
monitored_node->monitoring_state = MS_DEGRADED;
|
||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||
@@ -310,9 +312,9 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
|
||||
if (PQstatus(next_node_conn) == CONNECTION_OK)
|
||||
{
|
||||
RecordStatus record_status = get_node_record(next_node_conn,
|
||||
cell->node_info->node_id,
|
||||
&target_node);
|
||||
record_status = get_node_record(next_node_conn,
|
||||
cell->node_info->node_id,
|
||||
&target_node);
|
||||
|
||||
if (record_status == RECORD_FOUND)
|
||||
{
|
||||
@@ -332,6 +334,23 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the node record for the failed node is still marked as active,
|
||||
* if not it means the other node has done the "failover" already
|
||||
*/
|
||||
|
||||
record_status = get_node_record(next_node_conn,
|
||||
monitored_node->node_id,
|
||||
&failed_node);
|
||||
|
||||
if (record_status == RECORD_FOUND && failed_node.active == false)
|
||||
{
|
||||
PQfinish(next_node_conn);
|
||||
log_notice(_("record for node %i has already been set inactive"),
|
||||
cell->node_info->node_id);
|
||||
return;
|
||||
}
|
||||
|
||||
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
|
||||
{
|
||||
PQfinish(next_node_conn);
|
||||
@@ -360,9 +379,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
target_node.node_name,
|
||||
target_node.node_id);
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Create an event record
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user