repmgrd: catch cases where more than one node has initiated voting

The node(s) with higher ID will "yield", leaving the decision making
up to the node with the lower ID.

This happens very rarely, usually when the random delay is close
enough on two or more nodes that vote initiation is simultaneous.
This commit is contained in:
Ian Barwick
2017-07-18 17:04:24 +09:00
parent 248525ccba
commit 6e270b2faf
4 changed files with 93 additions and 29 deletions

View File

@@ -2622,7 +2622,8 @@ get_voting_status(PGconn *conn)
return voting_status; return voting_status;
} }
int
VoteRequestResult
request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term) request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term)
{ {
PQExpBufferData query; PQExpBufferData query;
@@ -2648,8 +2649,36 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
/* check for NULL */ /* check for NULL */
if (PQgetisnull(res, 0, 0)) if (PQgetisnull(res, 0, 0))
{ {
log_debug("XXX NULL returned by repmgr.request_vote()"); PQclear(res);
return 0;
log_debug("NULL returned by repmgr.request_vote()");
/*
* get the node's last receive location anyway
* TODO: have repmgr.request_vote() return two values
*/
initPQExpBuffer(&query);
appendPQExpBuffer(
&query,
#if (PG_VERSION_NUM >= 100000)
"SELECT pg_catalog.pg_last_wal_receive_lsn()");
#else
"SELECT pg_catalog.pg_last_xlog_receive_location()");
#endif
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
if (PQresultStatus(res) == PGRES_TUPLES_OK)
{
other_node->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 0));
}
PQclear(res);
return VR_VOTE_REFUSED;
} }
other_node->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 0)); other_node->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 0));
@@ -2664,7 +2693,7 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
if (lsn_diff > 0) if (lsn_diff > 0)
{ {
log_debug("local node is ahead"); log_debug("local node is ahead");
return 1; return VR_POSITIVE_VOTE;
} }
@@ -2672,7 +2701,7 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
if (lsn_diff < 0) if (lsn_diff < 0)
{ {
log_debug("other node is ahead"); log_debug("other node is ahead");
return 0; return VR_NEGATIVE_VOTE;
} }
/* tiebreak */ /* tiebreak */
@@ -2681,12 +2710,12 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
if (this_node->priority < other_node->priority) if (this_node->priority < other_node->priority)
{ {
log_debug("other node has higher priority"); log_debug("other node has higher priority");
return 0; return VR_NEGATIVE_VOTE;
} }
/* still tiebreak - we're the candidate, so we win */ /* still tiebreak - we're the candidate, so we win */
log_debug("win by default"); log_debug("win by default");
return 1; return VR_POSITIVE_VOTE;
} }

View File

@@ -52,6 +52,12 @@ typedef enum {
NODE_STATUS_DOWN NODE_STATUS_DOWN
} NodeStatus; } NodeStatus;
/*
 * Result of asking another node for its vote, as returned by request_vote().
 */
typedef enum {
	/* repmgr.request_vote() on the other node returned NULL */
	VR_VOTE_REFUSED = -1,
	/* local node is ahead on LSN, or wins the tiebreak */
	VR_POSITIVE_VOTE = 0,
	/* other node is ahead on LSN, or has higher priority */
	VR_NEGATIVE_VOTE = 1
} VoteRequestResult;
/* /*
* Struct to store node information * Struct to store node information
*/ */
@@ -310,7 +316,7 @@ bool is_server_available(const char *conninfo);
/* node voting functions */ /* node voting functions */
NodeVotingStatus get_voting_status(PGconn *conn); NodeVotingStatus get_voting_status(PGconn *conn);
int request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); VoteRequestResult request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
int set_voting_status_initiated(PGconn *conn); int set_voting_status_initiated(PGconn *conn);
bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
void notify_follow_primary(PGconn *conn, int primary_node_id); void notify_follow_primary(PGconn *conn, int primary_node_id);

View File

@@ -206,13 +206,11 @@ request_vote(PG_FUNCTION_ARGS)
initStringInfo(&query); initStringInfo(&query);
#if (PG_VERSION_NUM >= 100000)
appendStringInfo( appendStringInfo(
&query, &query,
#if (PG_VERSION_NUM >= 100000)
"SELECT pg_catalog.pg_last_wal_receive_lsn()"); "SELECT pg_catalog.pg_last_wal_receive_lsn()");
#else #else
appendStringInfo(
&query,
"SELECT pg_catalog.pg_last_xlog_receive_location()"); "SELECT pg_catalog.pg_last_xlog_receive_location()");
#endif #endif

View File

@@ -680,7 +680,7 @@ do_primary_failover(void)
/* attempt to initiate voting process */ /* attempt to initiate voting process */
ElectionResult election_result = do_election(); ElectionResult election_result = do_election();
/* XXX add pre-event notification here */ /* TODO add pre-event notification here */
failover_state = FAILOVER_STATE_UNKNOWN; failover_state = FAILOVER_STATE_UNKNOWN;
log_debug("election result: %s", _print_election_result(election_result)); log_debug("election result: %s", _print_election_result(election_result));
@@ -702,13 +702,7 @@ do_primary_failover(void)
log_info(_("I am the candidate but did not get all votes; will now determine the best candidate")); log_info(_("I am the candidate but did not get all votes; will now determine the best candidate"));
/* standby_nodes is in the state created by do_election() */
/* reset node list */
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
upstream_node_info.node_id,
&standby_nodes);
best_candidate = poll_best_candidate(&standby_nodes); best_candidate = poll_best_candidate(&standby_nodes);
/* /*
@@ -1220,12 +1214,14 @@ poll_best_candidate(NodeInfoList *standby_nodes)
NodeInfoListCell *cell; NodeInfoListCell *cell;
t_node_info *best_candidate = &local_node_info; t_node_info *best_candidate = &local_node_info;
// XXX ensure standby_nodes is set correctly
/* /*
* we need to definitively decide the best candidate, as in some corner * we need to definitively decide the best candidate, as in some corner
* cases we could end up with two candidate nodes, so they should each * cases we could end up with two candidate nodes, so they should each
* come to the same conclusion * come to the same conclusion.
*
* XXX check there are no cases where the standby node's LSN is
* not set
*/ */
for (cell = standby_nodes->head; cell; cell = cell->next) for (cell = standby_nodes->head; cell; cell = cell->next)
{ {
@@ -1248,9 +1244,17 @@ poll_best_candidate(NodeInfoList *standby_nodes)
log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id); log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
best_candidate = cell->node_info; best_candidate = cell->node_info;
} }
if (cell->node_info->conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
{
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
}
} }
log_info(_("best candidate is %i"), best_candidate->node_id); log_info(_("best candidate is node %s (node ID: %i)"),
best_candidate->node_name,
best_candidate->node_id);
return best_candidate; return best_candidate;
} }
@@ -1653,23 +1657,51 @@ do_election(void)
for (cell = standby_nodes.head; cell; cell = cell->next) for (cell = standby_nodes.head; cell; cell = cell->next)
{ {
VoteRequestResult vote_result;
log_debug("checking node %i...", cell->node_info->node_id); log_debug("checking node %i...", cell->node_info->node_id);
/* ignore unreachable nodes */ /* ignore unreachable nodes */
if (cell->node_info->node_status != NODE_STATUS_UP) if (cell->node_info->node_status != NODE_STATUS_UP)
continue; continue;
votes_for_me += request_vote(cell->node_info->conn, vote_result = request_vote(cell->node_info->conn,
&local_node_info, &local_node_info,
cell->node_info, cell->node_info,
electoral_term); electoral_term);
switch (vote_result)
{
case VR_VOTE_REFUSED:
if (cell->node_info->node_id < local_node_info.node_id)
{
log_debug(_("node %i refused vote, their ID is lower, yielding"),
cell->node_info->node_id);
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
clear_node_info_list(&standby_nodes);
reset_node_voting_status();
log_debug("other node is candidate, returning NOT CANDIDATE");
return ELECTION_NOT_CANDIDATE;
}
log_debug(_("no vote recevied from %i, our ID is lower, not yielding"),
cell->node_info->node_id);
break;
case VR_POSITIVE_VOTE:
votes_for_me += 1;
break;
case VR_NEGATIVE_VOTE:
break;
}
if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn) if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn)
{ {
/* register if another node is ahead of us */ /* register if another node is ahead of us */
other_node_is_ahead = true; other_node_is_ahead = true;
} }
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
} }
/* vote for myself, but only if I believe no-one else is ahead */ /* vote for myself, but only if I believe no-one else is ahead */
@@ -1722,4 +1754,3 @@ close_connections_physical()
} }
} }