repmgrd: additional check to ensure only one node handles failover

It's possible for the "failover" to be completed by one repmgrd before the
other has had a chance to react, in which case the am_bdr_failover_handler()
check will not apply. Instead, check whether the node record has already been
set to "inactive".
This commit is contained in:
Ian Barwick
2017-07-17 16:47:42 +09:00
parent 48a0aa3bf7
commit 2c8dd49831
2 changed files with 32 additions and 8 deletions

View File

@@ -1395,7 +1395,7 @@ get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_i
if (record_status == RECORD_NOT_FOUND)
{
log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %s",
log_verbose(LOG_DEBUG, "get_node_record_by_name(): no record found for node %s",
node_name);
}
@@ -3250,10 +3250,18 @@ am_bdr_failover_handler(PGconn *conn, int node_id)
"SELECT repmgr.am_bdr_failover_handler(%i)",
node_id);
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_error(_("unable to execute function repmgr.am_bdr_failover_handler():\n %s"),
PQerrorMessage(conn));
PQclear(res);
return false;
}
am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0) ? true : false;
PQclear(res);

View File

@@ -290,6 +290,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
PQExpBufferData event_details;
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
t_node_info target_node = T_NODE_INFO_INITIALIZER;
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
RecordStatus record_status;
monitored_node->monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
@@ -310,9 +312,9 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
if (PQstatus(next_node_conn) == CONNECTION_OK)
{
RecordStatus record_status = get_node_record(next_node_conn,
cell->node_info->node_id,
&target_node);
record_status = get_node_record(next_node_conn,
cell->node_info->node_id,
&target_node);
if (record_status == RECORD_FOUND)
{
@@ -332,6 +334,23 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
return;
}
/*
* check if the node record for the failed node is still marked as active,
* if not it means the other node has done the "failover" already
*/
record_status = get_node_record(next_node_conn,
monitored_node->node_id,
&failed_node);
if (record_status == RECORD_FOUND && failed_node.active == false)
{
PQfinish(next_node_conn);
log_notice(_("record for node %i has already been set inactive"),
cell->node_info->node_id);
return;
}
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
{
PQfinish(next_node_conn);
@@ -360,9 +379,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
target_node.node_name,
target_node.node_id);
/*
* Create an event record
*