repmgrd: catch cases where more than one node has initiated voting

The node(s) with higher ID will "yield", leaving the decision making
up to the node with the lower ID.

This happens very rarely, usually when the random delay is close
enough on two or mode nodes that vote initiation is simultaneous.
This commit is contained in:
Ian Barwick
2017-07-18 17:04:24 +09:00
parent 248525ccba
commit 6e270b2faf
4 changed files with 93 additions and 29 deletions

View File

@@ -2622,7 +2622,8 @@ get_voting_status(PGconn *conn)
return voting_status;
}
int
VoteRequestResult
request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term)
{
PQExpBufferData query;
@@ -2648,8 +2649,36 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
/* check for NULL */
if (PQgetisnull(res, 0, 0))
{
log_debug("XXX NULL returned by repmgr.request_vote()");
return 0;
PQclear(res);
log_debug("NULL returned by repmgr.request_vote()");
/*
* get the node's last receive location anyway
* TODO: have repmgr.request_vote() return two values
*/
initPQExpBuffer(&query);
appendPQExpBuffer(
&query,
#if (PG_VERSION_NUM >= 100000)
"SELECT pg_catalog.pg_last_wal_receive_lsn()");
#else
"SELECT pg_catalog.pg_last_xlog_receive_location()");
#endif
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
if (PQresultStatus(res) == PGRES_TUPLES_OK)
{
other_node->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 0));
}
PQclear(res);
return VR_VOTE_REFUSED;
}
other_node->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 0));
@@ -2664,7 +2693,7 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
if (lsn_diff > 0)
{
log_debug("local node is ahead");
return 1;
return VR_POSITIVE_VOTE;
}
@@ -2672,7 +2701,7 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
if (lsn_diff < 0)
{
log_debug("other node is ahead");
return 0;
return VR_NEGATIVE_VOTE;
}
/* tiebreak */
@@ -2681,12 +2710,12 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
if (this_node->priority < other_node->priority)
{
log_debug("other node has higher priority");
return 0;
return VR_NEGATIVE_VOTE;
}
/* still tiebreak - we're the candidate, so we win */
log_debug("win by default");
return 1;
return VR_POSITIVE_VOTE;
}

View File

@@ -52,6 +52,12 @@ typedef enum {
NODE_STATUS_DOWN
} NodeStatus;
typedef enum {
VR_VOTE_REFUSED = -1,
VR_POSITIVE_VOTE,
VR_NEGATIVE_VOTE
} VoteRequestResult;
/*
* Struct to store node information
*/
@@ -310,7 +316,7 @@ bool is_server_available(const char *conninfo);
/* node voting functions */
NodeVotingStatus get_voting_status(PGconn *conn);
int request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
VoteRequestResult request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
int set_voting_status_initiated(PGconn *conn);
bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
void notify_follow_primary(PGconn *conn, int primary_node_id);

View File

@@ -206,13 +206,11 @@ request_vote(PG_FUNCTION_ARGS)
initStringInfo(&query);
#if (PG_VERSION_NUM >= 100000)
appendStringInfo(
&query,
#if (PG_VERSION_NUM >= 100000)
"SELECT pg_catalog.pg_last_wal_receive_lsn()");
#else
appendStringInfo(
&query,
"SELECT pg_catalog.pg_last_xlog_receive_location()");
#endif

View File

@@ -680,7 +680,7 @@ do_primary_failover(void)
/* attempt to initiate voting process */
ElectionResult election_result = do_election();
/* XXX add pre-event notification here */
/* TODO add pre-event notification here */
failover_state = FAILOVER_STATE_UNKNOWN;
log_debug("election result: %s", _print_election_result(election_result));
@@ -702,13 +702,7 @@ do_primary_failover(void)
log_info(_("I am the candidate but did not get all votes; will now determine the best candidate"));
/* reset node list */
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
upstream_node_info.node_id,
&standby_nodes);
/* standby_nodes is in the state created by do_election() */
best_candidate = poll_best_candidate(&standby_nodes);
/*
@@ -1220,12 +1214,14 @@ poll_best_candidate(NodeInfoList *standby_nodes)
NodeInfoListCell *cell;
t_node_info *best_candidate = &local_node_info;
// XXX ensure standby_nodes is set correctly
/*
* we need to definitively decide the best candidate, as in some corner
* cases we could end up with two candidate nodes, so they should each
* come to the same conclusion
* come to the same conclusion.
*
* XXX check there are no cases where the standby node's LSN is
* not set
*/
for (cell = standby_nodes->head; cell; cell = cell->next)
{
@@ -1248,9 +1244,17 @@ poll_best_candidate(NodeInfoList *standby_nodes)
log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
best_candidate = cell->node_info;
}
if (cell->node_info->conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
{
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
}
}
log_info(_("best candidate is %i"), best_candidate->node_id);
log_info(_("best candidate is node %s (node ID: %i)"),
best_candidate->node_name,
best_candidate->node_id);
return best_candidate;
}
@@ -1653,23 +1657,51 @@ do_election(void)
for (cell = standby_nodes.head; cell; cell = cell->next)
{
VoteRequestResult vote_result;
log_debug("checking node %i...", cell->node_info->node_id);
/* ignore unreachable nodes */
if (cell->node_info->node_status != NODE_STATUS_UP)
continue;
votes_for_me += request_vote(cell->node_info->conn,
&local_node_info,
cell->node_info,
electoral_term);
vote_result = request_vote(cell->node_info->conn,
&local_node_info,
cell->node_info,
electoral_term);
switch (vote_result)
{
case VR_VOTE_REFUSED:
if (cell->node_info->node_id < local_node_info.node_id)
{
log_debug(_("node %i refused vote, their ID is lower, yielding"),
cell->node_info->node_id);
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
clear_node_info_list(&standby_nodes);
reset_node_voting_status();
log_debug("other node is candidate, returning NOT CANDIDATE");
return ELECTION_NOT_CANDIDATE;
}
log_debug(_("no vote recevied from %i, our ID is lower, not yielding"),
cell->node_info->node_id);
break;
case VR_POSITIVE_VOTE:
votes_for_me += 1;
break;
case VR_NEGATIVE_VOTE:
break;
}
if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn)
{
/* register if another node is ahead of us */
other_node_is_ahead = true;
}
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
}
/* vote for myself, but only if I believe no-one else is ahead */
@@ -1722,4 +1754,3 @@ close_connections_physical()
}
}