From 6e270b2faf8da75937c2320258a68da206c76f79 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 18 Jul 2017 17:04:24 +0900 Subject: [PATCH] repmgrd: catch cases where more than one node has initiated voting The node(s) with higher ID will "yield", leaving the decision making up to the node with the lower ID. This happens very rarely, usually when the random delay is close enough on two or mode nodes that vote initiation is simultaneous. --- dbutils.c | 43 ++++++++++++++++++++++++----- dbutils.h | 8 +++++- repmgr.c | 4 +-- repmgrd-physical.c | 67 +++++++++++++++++++++++++++++++++------------- 4 files changed, 93 insertions(+), 29 deletions(-) diff --git a/dbutils.c b/dbutils.c index df833014..8fcba31f 100644 --- a/dbutils.c +++ b/dbutils.c @@ -2622,7 +2622,8 @@ get_voting_status(PGconn *conn) return voting_status; } -int + +VoteRequestResult request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term) { PQExpBufferData query; @@ -2648,8 +2649,36 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int /* check for NULL */ if (PQgetisnull(res, 0, 0)) { - log_debug("XXX NULL returned by repmgr.request_vote()"); - return 0; + PQclear(res); + + log_debug("NULL returned by repmgr.request_vote()"); + + /* + * get the node's last receive location anyway + * TODO: have repmgr.request_vote() return two values + */ + + initPQExpBuffer(&query); + + appendPQExpBuffer( + &query, +#if (PG_VERSION_NUM >= 100000) + "SELECT pg_catalog.pg_last_wal_receive_lsn()"); +#else + "SELECT pg_catalog.pg_last_xlog_receive_location()"); +#endif + + res = PQexec(conn, query.data); + termPQExpBuffer(&query); + + if (PQresultStatus(res) == PGRES_TUPLES_OK) + { + other_node->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 0)); + } + + PQclear(res); + + return VR_VOTE_REFUSED; } other_node->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 0)); @@ -2664,7 +2693,7 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int if (lsn_diff > 0) { log_debug("local node is ahead"); - return 1; + return VR_POSITIVE_VOTE; } @@ -2672,7 +2701,7 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int if (lsn_diff < 0) { log_debug("other node is ahead"); - return 0; + return VR_NEGATIVE_VOTE; } /* tiebreak */ @@ -2681,12 +2710,12 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int if (this_node->priority < other_node->priority) { log_debug("other node has higher priority"); - return 0; + return VR_NEGATIVE_VOTE; } /* still tiebreak - we're the candidate, so we win */ log_debug("win by default"); - return 1; + return VR_POSITIVE_VOTE; } diff --git a/dbutils.h b/dbutils.h index 380ab83b..533ce75e 100644 --- a/dbutils.h +++ b/dbutils.h @@ -52,6 +52,12 @@ typedef enum { NODE_STATUS_DOWN } NodeStatus; +typedef enum { + VR_VOTE_REFUSED = -1, + VR_POSITIVE_VOTE, + VR_NEGATIVE_VOTE +} VoteRequestResult; + /* * Struct to store node information */ @@ -310,7 +316,7 @@ bool is_server_available(const char *conninfo); /* node voting functions */ NodeVotingStatus get_voting_status(PGconn *conn); -int request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); +VoteRequestResult request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); int set_voting_status_initiated(PGconn *conn); bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); void notify_follow_primary(PGconn *conn, int primary_node_id); diff --git a/repmgr.c b/repmgr.c index 867d40e7..697f178a 100644 --- a/repmgr.c +++ b/repmgr.c @@ -206,13 +206,11 @@ request_vote(PG_FUNCTION_ARGS) initStringInfo(&query); -#if (PG_VERSION_NUM >= 100000) appendStringInfo( &query, +#if (PG_VERSION_NUM >= 100000) "SELECT pg_catalog.pg_last_wal_receive_lsn()"); #else - appendStringInfo( - &query, "SELECT pg_catalog.pg_last_xlog_receive_location()"); #endif diff --git a/repmgrd-physical.c b/repmgrd-physical.c index f8b2463c..7fcd7be6 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -680,7 +680,7 @@ do_primary_failover(void) /* attempt to initiate voting process */ ElectionResult election_result = do_election(); - /* XXX add pre-event notification here */ + /* TODO add pre-event notification here */ failover_state = FAILOVER_STATE_UNKNOWN; log_debug("election result: %s", _print_election_result(election_result)); @@ -702,13 +702,7 @@ do_primary_failover(void) log_info(_("I am the candidate but did not get all votes; will now determine the best candidate")); - - /* reset node list */ - get_active_sibling_node_records(local_conn, - local_node_info.node_id, - upstream_node_info.node_id, - &standby_nodes); - + /* standby_nodes is in the state created by do_election() */ best_candidate = poll_best_candidate(&standby_nodes); /* @@ -1220,12 +1214,14 @@ poll_best_candidate(NodeInfoList *standby_nodes) NodeInfoListCell *cell; t_node_info *best_candidate = &local_node_info; - // XXX ensure standby_nodes is set correctly /* * we need to definitively decide the best candidate, as in some corner * cases we could end up with two candidate nodes, so they should each - * come to the same conclusion + * come to the same conclusion. + * + * XXX check there are no cases where the standby node's LSN is + * not set */ for (cell = standby_nodes->head; cell; cell = cell->next) { @@ -1248,9 +1244,17 @@ poll_best_candidate(NodeInfoList *standby_nodes) log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id); best_candidate = cell->node_info; } + + if (cell->node_info->conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK) + { + PQfinish(cell->node_info->conn); + cell->node_info->conn = NULL; + } } - log_info(_("best candidate is %i"), best_candidate->node_id); + log_info(_("best candidate is node %s (node ID: %i)"), + best_candidate->node_name, + best_candidate->node_id); return best_candidate; } @@ -1653,23 +1657,51 @@ do_election(void) for (cell = standby_nodes.head; cell; cell = cell->next) { + VoteRequestResult vote_result; + log_debug("checking node %i...", cell->node_info->node_id); /* ignore unreachable nodes */ if (cell->node_info->node_status != NODE_STATUS_UP) continue; - votes_for_me += request_vote(cell->node_info->conn, - &local_node_info, - cell->node_info, - electoral_term); + vote_result = request_vote(cell->node_info->conn, + &local_node_info, + cell->node_info, + electoral_term); + + switch (vote_result) + { + case VR_VOTE_REFUSED: + if (cell->node_info->node_id < local_node_info.node_id) + { + log_debug(_("node %i refused vote, their ID is lower, yielding"), + cell->node_info->node_id); + PQfinish(cell->node_info->conn); + cell->node_info->conn = NULL; + clear_node_info_list(&standby_nodes); + + reset_node_voting_status(); + log_debug("other node is candidate, returning NOT CANDIDATE"); + return ELECTION_NOT_CANDIDATE; + } + + log_debug(_("no vote recevied from %i, our ID is lower, not yielding"), + cell->node_info->node_id); + break; + + case VR_POSITIVE_VOTE: + votes_for_me += 1; + break; + case VR_NEGATIVE_VOTE: + break; + } if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn) { /* register if another node is ahead of us */ other_node_is_ahead = true; } - PQfinish(cell->node_info->conn); - cell->node_info->conn = NULL; + } /* vote for myself, but only if I believe no-one else is ahead */ @@ -1722,4 +1754,3 @@ close_connections_physical() } } -