repmgrd: fixes to failover handling

get_new_primary() returns NULL if no notification for the new primary has
been received, but the code was expecting it to return UNKNOWN_NODE_ID,
which was causing repmgrd to prematurely drop out of the new primary
detection loop if no notification had been received by the time the loop
started.

Also store the electoral term as a single row, single column table,
to ensure that all repmgrds see the same turn. It is then bumped
by the winning node after it gets promoted.

Various logging improvements.
This commit is contained in:
Ian Barwick
2017-11-08 14:19:52 +09:00
parent 7232187f4d
commit 79d21b516b
5 changed files with 134 additions and 29 deletions

View File

@@ -3578,9 +3578,52 @@ delete_monitoring_records(PGconn *primary_conn, int keep_history)
/*
* node voting functions
*
* These are intended to run under repmgrd and rely on shared memory
* These are intended to run under repmgrd and mainly rely on shared memory
*/
int
get_current_term(PGconn *conn)
{
PGresult *res = NULL;
int term = -1;
res = PQexec(conn, "SELECT term FROM repmgr.voting_term");
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_error(_("unable to query repmgr.voting_term:\n %s"),
PQerrorMessage(conn));
PQclear(res);
return -1;
}
term = atoi(PQgetvalue(res, 0, 0));
PQclear(res);
return term;
}
void
increment_current_term(PGconn *conn)
{
PGresult *res = NULL;
res = PQexec(conn, "UPDATE repmgr.voting_term SET term = term + 1");
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_error(_("unable to increment repmgr.voting_term:\n %s"),
PQerrorMessage(conn));
PQclear(res);
return;
}
PQclear(res);
return;
}
NodeVotingStatus
get_voting_status(PGconn *conn)
{
@@ -3700,19 +3743,26 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int
}
int
set_voting_status_initiated(PGconn *conn)
void
set_voting_status_initiated(PGconn *conn, int electoral_term)
{
PQExpBufferData query;
PGresult *res = NULL;
int electoral_term = 0;
res = PQexec(conn, "SELECT repmgr.set_voting_status_initiated()");
initPQExpBuffer(&query);
appendPQExpBuffer(&query,
"SELECT repmgr.set_voting_status_initiated(%i)",
electoral_term);
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
electoral_term = atoi(PQgetvalue(res, 0, 0));
PQclear(res);
return electoral_term;
return;
}
@@ -3748,7 +3798,6 @@ notify_follow_primary(PGconn *conn, int primary_node_id)
PQExpBufferData query;
PGresult *res = NULL;
initPQExpBuffer(&query);
appendPQExpBuffer(&query,
@@ -3756,10 +3805,17 @@ notify_follow_primary(PGconn *conn, int primary_node_id)
primary_node_id);
log_verbose(LOG_DEBUG, "notify_follow_primary():\n %s", query.data);
/* XXX handle failure */
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_error(_("unable to execute repmgr.notify_follow_primary():\n %s"),
PQerrorMessage(conn));
PQclear(res);
return;
}
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_error(_("unable to execute repmgr.notify_follow_primary():\n %s"),
@@ -3786,16 +3842,24 @@ get_new_primary(PGconn *conn, int *primary_node_id)
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
/* XXX handle error */
new_primary_node_id = atoi(PQgetvalue(res, 0, 0));
if (new_primary_node_id == UNKNOWN_NODE_ID)
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_error(_("unable to execute repmgr.reset_voting_status():\n %s"),
PQerrorMessage(conn));
PQclear(res);
return false;
}
if (PQgetisnull(res, 0, 0))
{
*primary_node_id = UNKNOWN_NODE_ID;
PQclear(res);
return false;
}
new_primary_node_id = atoi(PQgetvalue(res, 0, 0));
PQclear(res);
*primary_node_id = new_primary_node_id;

View File

@@ -474,9 +474,11 @@ bool delete_monitoring_records(PGconn *primary_conn, int keep_history);
/* node voting functions */
int get_current_term(PGconn *conn);
void increment_current_term(PGconn *conn);
NodeVotingStatus get_voting_status(PGconn *conn);
VoteRequestResult request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
int set_voting_status_initiated(PGconn *conn);
void set_voting_status_initiated(PGconn *conn, int electoral_term);
bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
void notify_follow_primary(PGconn *conn, int primary_node_id);
bool get_new_primary(PGconn *conn, int *primary_node_id);

View File

@@ -79,6 +79,22 @@ LEFT JOIN repmgr.nodes un
ON un.node_id = n.upstream_node_id;
/* XXX update upgrade scripts! */
CREATE TABLE repmgr.voting_term (
term INT NOT NULL
);
CREATE UNIQUE INDEX voting_term_restrict
ON repmgr.voting_term ((TRUE));
CREATE RULE voting_term_delete AS
ON DELETE TO repmgr.voting_term
DO INSTEAD NOTHING;
/* XXX do this in "repmgr primary register" */
INSERT INTO repmgr.voting_term (term) VALUES (1);
/* ================= */
/* repmgrd functions */
/* ================= */
@@ -135,8 +151,8 @@ CREATE FUNCTION get_voting_status()
AS 'MODULE_PATHNAME', 'get_voting_status'
LANGUAGE C STRICT;
CREATE FUNCTION set_voting_status_initiated()
RETURNS INT
CREATE FUNCTION set_voting_status_initiated(INT)
RETURNS VOID
AS 'MODULE_PATHNAME', 'set_voting_status_initiated'
LANGUAGE C STRICT;

View File

@@ -427,6 +427,7 @@ get_voting_status(PG_FUNCTION_ARGS)
#endif
}
Datum
set_voting_status_initiated(PG_FUNCTION_ARGS)
{
@@ -434,7 +435,12 @@ set_voting_status_initiated(PG_FUNCTION_ARGS)
int electoral_term = -1;
if (!shared_state)
PG_RETURN_NULL();
PG_RETURN_VOID();
if (PG_ARGISNULL(0))
PG_RETURN_VOID();
electoral_term = PG_GETARG_INT32(0);
LWLockAcquire(shared_state->lock, LW_SHARED);
@@ -445,21 +451,18 @@ set_voting_status_initiated(PG_FUNCTION_ARGS)
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
shared_state->voting_status = VS_VOTE_INITIATED;
shared_state->current_electoral_term += 1;
electoral_term = shared_state->current_electoral_term;
shared_state->current_electoral_term = electoral_term;
elog(INFO, "setting voting term to %i", electoral_term);
}
LWLockRelease(shared_state->lock);
PG_RETURN_INT32(electoral_term);
#else
PG_RETURN_INT32(-1);
#endif
PG_RETURN_VOID();
}
Datum
other_node_is_candidate(PG_FUNCTION_ARGS)
{

View File

@@ -957,7 +957,14 @@ do_primary_failover(void)
}
else if (election_result == ELECTION_WON)
{
log_notice("this node is the winner, will now promote self and inform other nodes");
if (standby_nodes.node_count > 0)
{
log_notice("this node is the winner, will now promote itself and inform other nodes");
}
else
{
log_notice("this node is the only available candidate and will now promote itself");
}
failover_state = promote_self();
}
@@ -1034,7 +1041,7 @@ do_primary_failover(void)
*/
if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
{
int new_primary_id;
int new_primary_id = UNKNOWN_NODE_ID;
/* TODO: rerun election if new primary doesn't appear after timeout */
@@ -1563,6 +1570,8 @@ promote_self(void)
return FAILOVER_STATE_PROMOTION_FAILED;
}
/* bump the electoral term */
increment_current_term(local_conn);
initPQExpBuffer(&event_details);
@@ -1604,10 +1613,11 @@ notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
{
NodeInfoListCell *cell;
log_debug("notify_followers()");
log_debug("notify_followers(): %i followers to notify", standby_nodes->node_count);
for (cell = standby_nodes->head; cell; cell = cell->next)
{
log_debug("intending to notify node %i... ", cell->node_info->node_id);
log_verbose(LOG_DEBUG, "intending to notify node %i... ", cell->node_info->node_id);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
log_debug("reconnecting to node %i... ", cell->node_info->node_id);
@@ -1925,7 +1935,6 @@ static ElectionResult
do_election(void)
{
int electoral_term = -1;
int votes_for_me = 0;
/* we're visible */
@@ -1960,6 +1969,17 @@ do_election(void)
long unsigned rand_wait = (long) ((rand() % 35) + 10) * 10000;
electoral_term = get_current_term(local_conn);
if (electoral_term == -1)
{
log_error(_("unable to determine electoral term"));
return ELECTION_NOT_CANDIDATE;
}
log_debug("do_election(): electoral term is %i", electoral_term);
/* get all active nodes attached to primary, excluding self */
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
@@ -2007,7 +2027,7 @@ do_election(void)
* so when announcing ourselves as candidate to the other nodes, we'll
* check for that and withdraw our candidature.
*/
electoral_term = set_voting_status_initiated(local_conn);
set_voting_status_initiated(local_conn, electoral_term);
/* no other standbys - normally win by default */
if (standby_nodes.node_count == 0)
@@ -2065,7 +2085,7 @@ do_election(void)
/*
* see if the node is in the primary's location (but skip the check if
* we've seen
* we've seen a node there already)
*/
if (primary_location_seen == false)
{