More failover fixes

2026-06-01 03:39:05 +00:00 · 2017-07-03 17:37:32 +09:00
parent 1bc4e968f6
commit 890b88d644
3 changed files with 122 additions and 11 deletions
@@ -2353,12 +2353,12 @@ request_vote(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int

 	lsn_diff = this_node->last_wal_receive_lsn - other_node->last_wal_receive_lsn;

-	log_debug("XXX lsn_diff %i", lsn_diff);
+	log_debug("lsn_diff %i", lsn_diff);

 	/* we're ahead */
 	if (lsn_diff > 0)
 	{
-		log_debug("this node is ahead");
+		log_debug("local node is ahead");
 		return 1;
 	}

@@ -2435,16 +2435,24 @@ notify_follow_primary(PGconn *conn, int primary_node_id)
 	PQExpBufferData	  query;
 	PGresult   *res;

+
 	initPQExpBuffer(&query);

 	appendPQExpBuffer(&query,
 					  "SELECT repmgr.notify_follow_primary(%i)",
 					  primary_node_id);
+	log_verbose(LOG_DEBUG, "notify_follow_primary():\n  %s", query.data);

 	// XXX handle failure
 	res = PQexec(conn, query.data);
 	termPQExpBuffer(&query);

+	if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute repmgr.notify_follow_primary():\n  %s"),
+				PQerrorMessage(conn));
+	}
+
 	PQclear(res);
 	return;
 }
@@ -80,6 +80,8 @@ PG_FUNCTION_INFO_V1(notify_follow_primary);
 Datum		get_new_primary(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(get_new_primary);

+//Datum		set_new_primary(PG_FUNCTION_ARGS);
+//PG_FUNCTION_INFO_V1(set_new_primary);

 /*
 * Module load callback
@@ -286,6 +288,8 @@ notify_follow_primary(PG_FUNCTION_ARGS)
 {
 	int primary_node_id = PG_GETARG_INT32(0);

+	elog(INFO, "received notification to follow node %i", primary_node_id);
+
 	LWLockAcquire(shared_state->lock, LW_SHARED);

 	/* Explicitly set the primary node id */
@@ -70,8 +70,8 @@ static ItemList	cli_errors = { NULL, NULL };
 static bool        startup_event_logged = false;

 /*
- * Record receipt SIGHUP; will cause configuration file to be reread at the
- * appropriate point in the main loop.
+ * Record receipt of SIGHUP; will cause configuration file to be reread
+ * at the appropriate point in the main loop.
 */
 static volatile sig_atomic_t got_SIGHUP = false;

@@ -98,6 +98,8 @@ static const char *_print_election_result(ElectionResult result);
 static FailoverState promote_self(void);
 static void notify_followers(NodeInfoList *standby_nodes);

+static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);
+
 static bool wait_primary_notification(int *new_primary_id);
 static FailoverState follow_new_primary(int new_primary_id);

@@ -614,23 +616,73 @@ monitor_streaming_standby(void)
 					}
 					else if (election_result == ELECTION_LOST)
 					{
+						t_node_info *best_candidate;
+
 						log_info("I am the candidate but did not get all votes; will now determine the best candidate");
-						
+
+						best_candidate = poll_best_candidate(&standby_nodes);
+
+						/* this can occur in a tie-break situation, after we establish this node has priority*/
+						if (best_candidate->node_id == local_node_info.node_id)
+						{
+							log_notice("I am the best candidate, will now promote self and inform other nodes");
+
+							failover_state = promote_self();
+						}
+						else
+						{
+							PGconn *candidate_conn = NULL;
+
+							log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
+									 best_candidate->node_id);
+
+							// notify candidate
+
+							// XXX check result
+							candidate_conn = establish_db_connection(best_candidate->conninfo, false);
+
+							notify_follow_primary(candidate_conn, best_candidate->node_id);
+
+							PQfinish(candidate_conn);
+							// we'll wait for the candidate to get back to us
+							failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
+						}
 					}
 					else
+					{
+						log_info("I am a follower and am waiting to be informed by the winner");
+						failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
+					}
+
+
+					if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
 					{
 						int new_primary_id;

 						//   --> need timeout in case new primary doesn't come up, then rerun election

-						log_info("I am a follower and am waiting to be informed by the winner");
-						failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
-
 						/* either follow or time out; either way resume monitoring */
 						if (wait_primary_notification(&new_primary_id) == true)
 						{
-							failover_state = follow_new_primary(new_primary_id);
+							// if new_primary_id is self, promote
+							if (new_primary_id == local_node_info.node_id)
+							{
+								log_notice("looks like I'm the promotion candidate, promoting");

+								failover_state = promote_self();
+
+								/* reset node list */
+								clear_node_info_list(&standby_nodes);
+								get_active_sibling_node_records(local_conn,
+																local_node_info.node_id,
+																upstream_node_info.node_id,
+																&standby_nodes);
+
+							}
+							else
+							{
+								failover_state = follow_new_primary(new_primary_id);
+							}
 						}
 						else
 						{
@@ -653,6 +705,7 @@ monitor_streaming_standby(void)
 							failover_state = FAILOVER_STATE_NONE;
 							return;
 						case FAILOVER_STATE_PROMOTION_FAILED:
+							log_debug("failover state is FAILED");
 							break;
 						case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
 							log_info(_("resuming standby monitoring mode"));
@@ -666,6 +719,8 @@ monitor_streaming_standby(void)
 						case FAILOVER_STATE_PRIMARY_REAPPEARED:
 						case FAILOVER_STATE_LOCAL_NODE_FAILURE:
 						case FAILOVER_STATE_UNKNOWN:
+						case FAILOVER_STATE_NONE:
+							log_debug("failover state is %i", failover_state);
 							break;
 					}
 				}
@@ -684,6 +739,7 @@ monitor_streaming_standby(void)
 			INSTR_TIME_SET_CURRENT(log_status_interval_current);
 			INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
 			log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
+
 			if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
 			{
 				log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
@@ -801,6 +857,7 @@ notify_followers(NodeInfoList *standby_nodes)
 {
 	NodeInfoListCell *cell;

+	log_debug("notify_followers()");
 	for (cell = standby_nodes->head; cell; cell = cell->next)
 	{
 		log_debug("intending to notify %i... ", cell->node_info->node_id);
@@ -823,6 +880,46 @@ notify_followers(NodeInfoList *standby_nodes)
 }


+static t_node_info *
+poll_best_candidate(NodeInfoList *standby_nodes)
+{
+	NodeInfoListCell *cell;
+	t_node_info *best_candidate = &local_node_info;
+
+	/*
+	 * we need to definitively decide the best candidate, as in some corner
+	 * cases we could end up with two candidate nodes, so they should each
+	 * come to the same conclusion
+	 */
+	for (cell = standby_nodes->head; cell; cell = cell->next)
+	{
+		if (cell->node_info->last_wal_receive_lsn > best_candidate->last_wal_receive_lsn)
+		{
+			log_debug("node %i has higher LSN, now best candidate", cell->node_info->node_id);
+			best_candidate = cell->node_info;
+		}
+		else if (cell->node_info->last_wal_receive_lsn == best_candidate->last_wal_receive_lsn)
+		{
+			if (cell->node_info->priority > best_candidate->priority)
+			{
+				log_debug("node %i has higher priority, now best candidate", cell->node_info->node_id);
+				best_candidate = cell->node_info;
+			}
+		}
+		/* if all else fails, we decide by node_id */
+		else if (cell->node_info->node_id < best_candidate->node_id)
+		{
+			log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
+			best_candidate = cell->node_info;
+		}
+	}
+
+	log_info(_("best candidate is %i"), best_candidate->node_id);
+
+	return best_candidate;
+}
+
+
 static bool
 wait_primary_notification(int *new_primary_id)
 {
@@ -835,7 +932,7 @@ wait_primary_notification(int *new_primary_id)
 		if (get_new_primary(local_conn, new_primary_id) == true)
 		{
 			log_debug("new primary is %i; elapsed: %i",
-					  *new_primary_id, wait_primary_timeout);
+					  *new_primary_id, i);
 			return true;
 		}
 		sleep(1);
@@ -1081,7 +1178,8 @@ do_election(void)

 		// XXX do this
 		// unset_voting_status_initiated(local_conn);
-		return VS_NO_VOTE;
+		log_debug("other node is candidate, returning NOT CANDIDATE XXX unset own status");
+		return ELECTION_NOT_CANDIDATE;
 	}

 	// XXX check if > 50% visible
@@ -1138,6 +1236,7 @@ do_election(void)
 	return ELECTION_LOST;
 }

+
 static void
 daemonize_process(void)
 {