From 79d21b516b943db820f475146204ce202541738a Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 8 Nov 2017 14:19:52 +0900 Subject: [PATCH] repmgrd: fixes to failover handling get_new_primary() returns NULL if no notification for the new primary has been received, but the code was expecting it to return UNKNOWN_NODE_ID, which was causing repmgrd to prematurely drop out of the new primary detection loop if no notification had been received by the time the loop started. Also store the electoral term as a single row, single column table, to ensure that all repmgrds see the same turn. It is then bumped by the winning node after it gets promoted. Various logging improvements. --- dbutils.c | 88 +++++++++++++++++++++++++++++++++++++++------- dbutils.h | 4 ++- repmgr--4.0.sql | 20 +++++++++-- repmgr.c | 17 +++++---- repmgrd-physical.c | 34 ++++++++++++++---- 5 files changed, 134 insertions(+), 29 deletions(-) diff --git a/dbutils.c b/dbutils.c index 33b9c37d..c7ecef9e 100644 --- a/dbutils.c +++ b/dbutils.c @@ -3578,9 +3578,52 @@ delete_monitoring_records(PGconn *primary_conn, int keep_history) /* * node voting functions * - * These are intended to run under repmgrd and rely on shared memory + * These are intended to run under repmgrd and mainly rely on shared memory */ +int +get_current_term(PGconn *conn) +{ + PGresult *res = NULL; + int term = -1; + + res = PQexec(conn, "SELECT term FROM repmgr.voting_term"); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + log_error(_("unable to query repmgr.voting_term:\n %s"), + PQerrorMessage(conn)); + PQclear(res); + return -1; + } + + term = atoi(PQgetvalue(res, 0, 0)); + + PQclear(res); + return term; +} + + +void +increment_current_term(PGconn *conn) +{ + PGresult *res = NULL; + + res = PQexec(conn, "UPDATE repmgr.voting_term SET term = term + 1"); + + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + log_error(_("unable to increment repmgr.voting_term:\n %s"), + PQerrorMessage(conn)); + PQclear(res); + return; + } + + PQclear(res); + return; +} + + NodeVotingStatus get_voting_status(PGconn *conn) { @@ -3700,19 +3743,26 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int } -int -set_voting_status_initiated(PGconn *conn) +void +set_voting_status_initiated(PGconn *conn, int electoral_term) { + PQExpBufferData query; PGresult *res = NULL; - int electoral_term = 0; - res = PQexec(conn, "SELECT repmgr.set_voting_status_initiated()"); + initPQExpBuffer(&query); + + appendPQExpBuffer(&query, + "SELECT repmgr.set_voting_status_initiated(%i)", + electoral_term); + + res = PQexec(conn, query.data); + termPQExpBuffer(&query); electoral_term = atoi(PQgetvalue(res, 0, 0)); PQclear(res); - return electoral_term; + return; } @@ -3748,7 +3798,6 @@ notify_follow_primary(PGconn *conn, int primary_node_id) PQExpBufferData query; PGresult *res = NULL; - initPQExpBuffer(&query); appendPQExpBuffer(&query, @@ -3756,10 +3805,17 @@ notify_follow_primary(PGconn *conn, int primary_node_id) primary_node_id); log_verbose(LOG_DEBUG, "notify_follow_primary():\n %s", query.data); - /* XXX handle failure */ res = PQexec(conn, query.data); termPQExpBuffer(&query); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + log_error(_("unable to execute repmgr.notify_follow_primary():\n %s"), + PQerrorMessage(conn)); + PQclear(res); + return; + } + if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_error(_("unable to execute repmgr.notify_follow_primary():\n %s"), @@ -3786,16 +3842,24 @@ get_new_primary(PGconn *conn, int *primary_node_id) res = PQexec(conn, query.data); termPQExpBuffer(&query); - /* XXX handle error */ - new_primary_node_id = atoi(PQgetvalue(res, 0, 0)); - - if (new_primary_node_id == UNKNOWN_NODE_ID) + if (PQresultStatus(res) != PGRES_TUPLES_OK) { + log_error(_("unable to execute repmgr.reset_voting_status():\n %s"), + PQerrorMessage(conn)); PQclear(res); return false; } + if (PQgetisnull(res, 0, 0)) + { + *primary_node_id = UNKNOWN_NODE_ID; + PQclear(res); + return false; + } + + new_primary_node_id = atoi(PQgetvalue(res, 0, 0)); + PQclear(res); *primary_node_id = new_primary_node_id; diff --git a/dbutils.h b/dbutils.h index fbd002cc..c4dc9843 100644 --- a/dbutils.h +++ b/dbutils.h @@ -474,9 +474,11 @@ bool delete_monitoring_records(PGconn *primary_conn, int keep_history); /* node voting functions */ +int get_current_term(PGconn *conn); +void increment_current_term(PGconn *conn); NodeVotingStatus get_voting_status(PGconn *conn); VoteRequestResult request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); -int set_voting_status_initiated(PGconn *conn); +void set_voting_status_initiated(PGconn *conn, int electoral_term); bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); void notify_follow_primary(PGconn *conn, int primary_node_id); bool get_new_primary(PGconn *conn, int *primary_node_id); diff --git a/repmgr--4.0.sql b/repmgr--4.0.sql index b84c87f3..c9811f70 100644 --- a/repmgr--4.0.sql +++ b/repmgr--4.0.sql @@ -79,6 +79,22 @@ LEFT JOIN repmgr.nodes un ON un.node_id = n.upstream_node_id; +/* XXX update upgrade scripts! */ +CREATE TABLE repmgr.voting_term ( + term INT NOT NULL +); + +CREATE UNIQUE INDEX voting_term_restrict +ON repmgr.voting_term ((TRUE)); + +CREATE RULE voting_term_delete AS + ON DELETE TO repmgr.voting_term + DO INSTEAD NOTHING; + +/* XXX do this in "repmgr primary register" */ +INSERT INTO repmgr.voting_term (term) VALUES (1); + + /* ================= */ /* repmgrd functions */ /* ================= */ @@ -135,8 +151,8 @@ CREATE FUNCTION get_voting_status() AS 'MODULE_PATHNAME', 'get_voting_status' LANGUAGE C STRICT; -CREATE FUNCTION set_voting_status_initiated() - RETURNS INT +CREATE FUNCTION set_voting_status_initiated(INT) + RETURNS VOID AS 'MODULE_PATHNAME', 'set_voting_status_initiated' LANGUAGE C STRICT; diff --git a/repmgr.c b/repmgr.c index 11632de4..bd25c520 100644 --- a/repmgr.c +++ b/repmgr.c @@ -427,6 +427,7 @@ get_voting_status(PG_FUNCTION_ARGS) #endif } + Datum set_voting_status_initiated(PG_FUNCTION_ARGS) { @@ -434,7 +435,12 @@ set_voting_status_initiated(PG_FUNCTION_ARGS) int electoral_term = -1; if (!shared_state) - PG_RETURN_NULL(); + PG_RETURN_VOID(); + + if (PG_ARGISNULL(0)) + PG_RETURN_VOID(); + + electoral_term = PG_GETARG_INT32(0); LWLockAcquire(shared_state->lock, LW_SHARED); @@ -445,21 +451,18 @@ set_voting_status_initiated(PG_FUNCTION_ARGS) LWLockAcquire(shared_state->lock, LW_EXCLUSIVE); shared_state->voting_status = VS_VOTE_INITIATED; - shared_state->current_electoral_term += 1; - - electoral_term = shared_state->current_electoral_term; + shared_state->current_electoral_term = electoral_term; elog(INFO, "setting voting term to %i", electoral_term); } LWLockRelease(shared_state->lock); - PG_RETURN_INT32(electoral_term); -#else - PG_RETURN_INT32(-1); #endif + PG_RETURN_VOID(); } + Datum other_node_is_candidate(PG_FUNCTION_ARGS) { diff --git a/repmgrd-physical.c b/repmgrd-physical.c index c963f222..4ee9063b 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -957,7 +957,14 @@ do_primary_failover(void) } else if (election_result == ELECTION_WON) { - log_notice("this node is the winner, will now promote self and inform other nodes"); + if (standby_nodes.node_count > 0) + { + log_notice("this node is the winner, will now promote itself and inform other nodes"); + } + else + { + log_notice("this node is the only available candidate and will now promote itself"); + } failover_state = promote_self(); } @@ -1034,7 +1041,7 @@ do_primary_failover(void) */ if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY) { - int new_primary_id; + int new_primary_id = UNKNOWN_NODE_ID; /* TODO: rerun election if new primary doesn't appear after timeout */ @@ -1563,6 +1570,8 @@ promote_self(void) return FAILOVER_STATE_PROMOTION_FAILED; } + /* bump the electoral term */ + increment_current_term(local_conn); initPQExpBuffer(&event_details); @@ -1604,10 +1613,11 @@ notify_followers(NodeInfoList *standby_nodes, int follow_node_id) { NodeInfoListCell *cell; - log_debug("notify_followers()"); + log_debug("notify_followers(): %i followers to notify", standby_nodes->node_count); + for (cell = standby_nodes->head; cell; cell = cell->next) { - log_debug("intending to notify node %i... ", cell->node_info->node_id); + log_verbose(LOG_DEBUG, "intending to notify node %i... ", cell->node_info->node_id); if (PQstatus(cell->node_info->conn) != CONNECTION_OK) { log_debug("reconnecting to node %i... ", cell->node_info->node_id); @@ -1925,7 +1935,6 @@ static ElectionResult do_election(void) { int electoral_term = -1; - int votes_for_me = 0; /* we're visible */ @@ -1960,6 +1969,17 @@ do_election(void) long unsigned rand_wait = (long) ((rand() % 35) + 10) * 10000; + electoral_term = get_current_term(local_conn); + + if (electoral_term == -1) + { + log_error(_("unable to determine electoral term")); + + return ELECTION_NOT_CANDIDATE; + } + + log_debug("do_election(): electoral term is %i", electoral_term); + /* get all active nodes attached to primary, excluding self */ get_active_sibling_node_records(local_conn, local_node_info.node_id, @@ -2007,7 +2027,7 @@ do_election(void) * so when announcing ourselves as candidate to the other nodes, we'll * check for that and withdraw our candidature. */ - electoral_term = set_voting_status_initiated(local_conn); + set_voting_status_initiated(local_conn, electoral_term); /* no other standbys - normally win by default */ if (standby_nodes.node_count == 0) @@ -2065,7 +2085,7 @@ do_election(void) /* * see if the node is in the primary's location (but skip the check if - * we've seen + * we've seen a node there already) */ if (primary_location_seen == false) {