mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 00:26:30 +00:00
repmgrd: additional check to ensure only one node handles failover
It's possible for the "failover" to be completed by one repmgrd before the other has had a chance to react, in which case the am_bdr_failover_handler() check will not apply. Instead, check whether the node record has already been set to "inactive".
This commit is contained in:
12
dbutils.c
12
dbutils.c
@@ -1395,7 +1395,7 @@ get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_i
|
|||||||
|
|
||||||
if (record_status == RECORD_NOT_FOUND)
|
if (record_status == RECORD_NOT_FOUND)
|
||||||
{
|
{
|
||||||
log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %s",
|
log_verbose(LOG_DEBUG, "get_node_record_by_name(): no record found for node %s",
|
||||||
node_name);
|
node_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3250,10 +3250,18 @@ am_bdr_failover_handler(PGconn *conn, int node_id)
|
|||||||
"SELECT repmgr.am_bdr_failover_handler(%i)",
|
"SELECT repmgr.am_bdr_failover_handler(%i)",
|
||||||
node_id);
|
node_id);
|
||||||
|
|
||||||
|
|
||||||
res = PQexec(conn, query.data);
|
res = PQexec(conn, query.data);
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_error(_("unable to execute function repmgr.am_bdr_failover_handler():\n %s"),
|
||||||
|
PQerrorMessage(conn));
|
||||||
|
PQclear(res);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0) ? true : false;
|
am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0) ? true : false;
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|||||||
@@ -290,6 +290,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
||||||
|
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
|
||||||
|
RecordStatus record_status;
|
||||||
|
|
||||||
monitored_node->monitoring_state = MS_DEGRADED;
|
monitored_node->monitoring_state = MS_DEGRADED;
|
||||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||||
@@ -310,9 +312,9 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
if (PQstatus(next_node_conn) == CONNECTION_OK)
|
if (PQstatus(next_node_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
RecordStatus record_status = get_node_record(next_node_conn,
|
record_status = get_node_record(next_node_conn,
|
||||||
cell->node_info->node_id,
|
cell->node_info->node_id,
|
||||||
&target_node);
|
&target_node);
|
||||||
|
|
||||||
if (record_status == RECORD_FOUND)
|
if (record_status == RECORD_FOUND)
|
||||||
{
|
{
|
||||||
@@ -332,6 +334,23 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* check if the node record for the failed node is still marked as active,
|
||||||
|
* if not it means the other node has done the "failover" already
|
||||||
|
*/
|
||||||
|
|
||||||
|
record_status = get_node_record(next_node_conn,
|
||||||
|
monitored_node->node_id,
|
||||||
|
&failed_node);
|
||||||
|
|
||||||
|
if (record_status == RECORD_FOUND && failed_node.active == false)
|
||||||
|
{
|
||||||
|
PQfinish(next_node_conn);
|
||||||
|
log_notice(_("record for node %i has already been set inactive"),
|
||||||
|
cell->node_info->node_id);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
|
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
|
||||||
{
|
{
|
||||||
PQfinish(next_node_conn);
|
PQfinish(next_node_conn);
|
||||||
@@ -360,9 +379,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
target_node.node_name,
|
target_node.node_name,
|
||||||
target_node.node_id);
|
target_node.node_id);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create an event record
|
* Create an event record
|
||||||
*
|
*
|
||||||
|
|||||||
Reference in New Issue
Block a user