repmgrd: during failover, check if a node was already promoted

Previously, repmgrd assumed that during a failover, there would not already be another primary node. However it's possible a node was promoted manually. While this is not a desirable situation, it's conceivable this could happen in the wild, so we should check for it and react accordingly. Also sanity-check that the follow target can actually be followed. Addresses issue raised in GitHub #420.
2026-05-31 19:39:04 +00:00 · 2019-03-21 16:16:59 +09:00
parent 6f0f338968
commit 539861cb58
4 changed files with 290 additions and 17 deletions
@@ -4895,6 +4895,7 @@ void
 init_replication_info(ReplInfo *replication_info)
 {
 	memset(replication_info->current_timestamp, 0, sizeof(replication_info->current_timestamp));
+	replication_info->in_recovery = false;
 	replication_info->last_wal_receive_lsn = InvalidXLogRecPtr;
 	replication_info->last_wal_replay_lsn = InvalidXLogRecPtr;
 	memset(replication_info->last_xact_replay_timestamp, 0, sizeof(replication_info->last_xact_replay_timestamp));
@@ -4915,6 +4916,7 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
 	initPQExpBuffer(&query);
 	appendPQExpBufferStr(&query,
 						 " SELECT ts, "
+						 "        in_recovery, "
 						 "        last_wal_receive_lsn, "
 						 "        last_wal_replay_lsn, "
 						 "        last_xact_replay_timestamp, "
@@ -4932,6 +4934,7 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
 						 "        upstream_last_seen "
 						 "   FROM ( "
 						 " SELECT CURRENT_TIMESTAMP AS ts, "
+						 "        pg_catalog.pg_is_in_recovery() AS in_recovery, "
 						 "        pg_catalog.pg_last_xact_replay_timestamp() AS last_xact_replay_timestamp, ");


@@ -4998,13 +5001,14 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
 	else
 	{
 		strncpy(replication_info->current_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
-		replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 1));
-		replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 2));
-		strncpy(replication_info->last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
-		replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 4));
-		replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 5));
-		replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 6));
-		replication_info->upstream_last_seen = atoi(PQgetvalue(res, 0, 7));
+		replication_info->in_recovery = atobool(PQgetvalue(res, 0, 1));
+		replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 2));
+		replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 3));
+		strncpy(replication_info->last_xact_replay_timestamp, PQgetvalue(res, 0, 4), MAXLEN);
+		replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 5));
+		replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 6));
+		replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 7));
+		replication_info->upstream_last_seen = atoi(PQgetvalue(res, 0, 8));
 	}

 	termPQExpBuffer(&query);
@@ -302,6 +302,7 @@ typedef struct BdrNodeInfoList
 typedef struct
 {
 	char		current_timestamp[MAXLEN];
+	bool		in_recovery;
 	XLogRecPtr	last_wal_receive_lsn;
 	XLogRecPtr	last_wal_replay_lsn;
 	char		last_xact_replay_timestamp[MAXLEN];
@@ -3144,6 +3144,8 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
 /*
 * Here we'll perform some timeline sanity checks to ensure the follow target
 * can actually be followed.
+ *
+ * See also comment for check_node_can_follow() in repmgrd-physical.c .
 */
 bool
 check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin)
@@ -3234,6 +3236,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
 		return false;
 	}

+	/* timelines are the same - check relative positions */
 	if (follow_target_identification.timeline == local_tli)
 	{
 		XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
@@ -3245,7 +3248,6 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
 			return false;
 		}

-		/* timeline is the same - check relative positions */
 		if (local_xlogpos <= follow_target_xlogpos)
 		{
 			log_info(_("timelines are same, this server is not ahead"));
@@ -32,6 +32,7 @@ typedef enum
 	FAILOVER_STATE_PRIMARY_REAPPEARED,
 	FAILOVER_STATE_LOCAL_NODE_FAILURE,
 	FAILOVER_STATE_WAITING_NEW_PRIMARY,
+	FAILOVER_STATE_FOLLOW_NEW_PRIMARY,
 	FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER,
 	FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
 	FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
@@ -63,7 +64,7 @@ static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
 static instr_time last_monitoring_update;


-static ElectionResult do_election(NodeInfoList *sibling_nodes);
+static ElectionResult do_election(NodeInfoList *sibling_nodes, int *new_primary_id);
 static const char *_print_election_result(ElectionResult result);

 static FailoverState promote_self(void);
@@ -91,6 +92,8 @@ static const char *format_failover_state(FailoverState failover_state);
 static const char * format_failover_state(FailoverState failover_state);
 static ElectionResult execute_failover_validation_command(t_node_info *node_info);
 static void parse_failover_validation_command(const char *template,  t_node_info *node_info, PQExpBufferData *out);
+static bool check_node_can_follow(PGconn *local_conn, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_info);
+

 void
 handle_sigint_physical(SIGNAL_ARGS)
@@ -2015,6 +2018,7 @@ do_primary_failover(void)
 	ElectionResult election_result;
 	bool final_result = false;
 	NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
+	int new_primary_id = UNKNOWN_NODE_ID;

 	/*
 	 * Double-check status of the local connection
@@ -2108,7 +2112,7 @@ do_primary_failover(void)
 	}

 	/* attempt to initiate voting process */
-	election_result = do_election(&sibling_nodes);
+	election_result = do_election(&sibling_nodes, &new_primary_id);

 	/* TODO add pre-event notification here */
 	failover_state = FAILOVER_STATE_UNKNOWN;
@@ -2122,10 +2126,19 @@ do_primary_failover(void)
 		enable_wal_receiver(local_conn, false);
 	}

+	/* election was cancelled and do_election() did not determine a new primary */
 	if (election_result == ELECTION_CANCELLED)
 	{
-		log_notice(_("election cancelled"));
-		return false;
+		if (new_primary_id == UNKNOWN_NODE_ID)
+		{
+			log_notice(_("election cancelled"));
+			clear_node_info_list(&sibling_nodes);
+			return false;
+		}
+
+		log_info(_("follower node intending to follow new primary %i"), new_primary_id);
+
+		failover_state = FAILOVER_STATE_FOLLOW_NEW_PRIMARY;
 	}
 	else if (election_result == ELECTION_RERUN)
 	{
@@ -2166,14 +2179,20 @@ do_primary_failover(void)
 		failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
 	}

+	/*
+	 * node has determined a new primary is already available
+	 */
+	if (failover_state == FAILOVER_STATE_FOLLOW_NEW_PRIMARY)
+	{
+		failover_state = follow_new_primary(new_primary_id);
+	}
+
 	/*
 	 * node has decided it is a follower, so will await notification from the
 	 * candidate that it has promoted itself and can be followed
 	 */
-	if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
+	else if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
 	{
-		int			new_primary_id = UNKNOWN_NODE_ID;
-
 		/* TODO: rerun election if new primary doesn't appear after timeout */

 		/* either follow, self-promote or time out; either way resume monitoring */
@@ -3287,7 +3306,7 @@ _print_election_result(ElectionResult result)
 * expects to be able to read this list
 */
 static ElectionResult
-do_election(NodeInfoList *sibling_nodes)
+do_election(NodeInfoList *sibling_nodes, int *new_primary_id)
 {
 	int			electoral_term = -1;

@@ -3329,7 +3348,6 @@ do_election(NodeInfoList *sibling_nodes)

 	log_debug("do_election(): electoral term is %i", electoral_term);

-
 	if (config_file_options.failover == FAILOVER_MANUAL)
 	{
 		log_notice(_("this node is not configured for automatic failover so will not be considered as promotion candidate, and will not follow the new primary"));
@@ -3507,6 +3525,41 @@ do_election(NodeInfoList *sibling_nodes)
 			continue;
 		}

+		/*
+		 * Check if node is not in recovery - it may have been promoted
+		 * outside of the failover mechanism, in which case we may be able
+		 * to follow it.
+		 */
+
+		if (sibling_replication_info.in_recovery == false)
+		{
+			bool can_follow;
+
+			log_warning(_("node \"%s\" (ID: %i) is not in recovery"),
+						cell->node_info->node_name,
+						cell->node_info->node_id);
+
+			can_follow = check_node_can_follow(local_conn,
+											   local_node_info.last_wal_receive_lsn,
+											   cell->node_info->conn,
+											   cell->node_info);
+
+			if (can_follow == true)
+			{
+				*new_primary_id = cell->node_info->node_id;
+				termPQExpBuffer(&nodes_with_primary_visible);
+				return ELECTION_CANCELLED;
+			}
+
+			/*
+			 * Tricky situation here - we'll assume the node is a rogue primary
+			 */
+			log_warning(_("not possible to attach to node \"%s\" (ID: %i), ignoring"),
+						cell->node_info->node_name,
+						cell->node_info->node_id);
+			continue;
+		}
+
 		/* check if WAL replay on node is paused */
 		if (sibling_replication_info.wal_replay_paused == true)
 		{
@@ -3875,6 +3928,8 @@ format_failover_state(FailoverState failover_state)
 			return "LOCAL_NODE_FAILURE";
 		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
 			return "WAITING_NEW_PRIMARY";
+		case FAILOVER_STATE_FOLLOW_NEW_PRIMARY:
+			return "FOLLOW_NEW_PRIMARY";
 		case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
 			return "REQUIRES_MANUAL_FAILOVER";
 		case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
@@ -4015,3 +4070,214 @@ parse_failover_validation_command(const char *template, t_node_info *node_info,

 	return;
 }
+
+
+/*
+ * Sanity-check whether the local node can follow the proposed upstream node.
+ *
+ * Note this function is very similar to check_node_can_attach() in
+ * repmgr-client.c, however the later is very focussed on client-side
+ * functionality (including log output related to --dry-run, pg_rewind etc.)
+ * which we don't want here.
+ */
+static bool
+check_node_can_follow(PGconn *local_conn, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_info)
+{
+	t_conninfo_param_list local_repl_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
+	PGconn	   *local_repl_conn = NULL;
+	t_system_identification local_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER;
+
+	t_conninfo_param_list follow_target_repl_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
+	PGconn	   *follow_target_repl_conn = NULL;
+	t_system_identification follow_target_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER;
+	TimeLineHistoryEntry *follow_target_history = NULL;
+
+	bool can_follow = true;
+	bool success;
+
+	/* Check local replication connection - we want to execute IDENTIFY_SYSTEM
+	 * to get the current timeline ID, which might not yet be written to
+	 * pg_control.
+	 *
+	 * TODO: from 9.6, query "pg_stat_wal_receiver" via the existing local connection
+	 */
+
+	initialize_conninfo_params(&local_repl_conninfo, false);
+
+	conn_to_param_list(local_conn, &local_repl_conninfo);
+
+	/* Set the replication user from the node record */
+	param_set(&local_repl_conninfo, "user", local_node_info.repluser);
+	param_set(&local_repl_conninfo, "replication", "1");
+
+	local_repl_conn = establish_db_connection_by_params(&local_repl_conninfo, false);
+	free_conninfo_params(&local_repl_conninfo);
+
+	if (PQstatus(local_repl_conn) != CONNECTION_OK)
+	{
+		log_error(_("unable to establish a replication connection to the local node"));
+		PQfinish(local_repl_conn);
+
+		return false;
+	}
+	success = identify_system(local_repl_conn, &local_identification);
+	PQfinish(local_repl_conn);
+
+	if (success == false)
+	{
+		log_error(_("unable to query the local node's system identification"));
+
+		return false;
+	}
+
+	/* check replication connection */
+	initialize_conninfo_params(&follow_target_repl_conninfo, false);
+
+	conn_to_param_list(follow_target_conn, &follow_target_repl_conninfo);
+
+	if (strcmp(param_get(&follow_target_repl_conninfo, "user"), follow_target_node_info->repluser) != 0)
+	{
+		param_set(&follow_target_repl_conninfo, "user", follow_target_node_info->repluser);
+		param_set(&follow_target_repl_conninfo, "dbname", "replication");
+	}
+
+	param_set(&follow_target_repl_conninfo, "replication", "1");
+
+	follow_target_repl_conn = establish_db_connection_by_params(&follow_target_repl_conninfo, false);
+
+	free_conninfo_params(&follow_target_repl_conninfo);
+
+	if (PQstatus(follow_target_repl_conn) != CONNECTION_OK)
+	{
+		log_error(_("unable to establish a replication connection to the follow target node"));
+		return false;
+	}
+
+	/* check system_identifiers match */
+	if (identify_system(follow_target_repl_conn, &follow_target_identification) == false)
+	{
+		log_error(_("unable to query the follow target node's system identification"));
+
+		PQfinish(follow_target_repl_conn);
+		return false;
+	}
+
+	/*
+	 * Check for thing that should never happen, but expect the unexpected anyway.
+	 */
+	if (follow_target_identification.system_identifier != local_identification.system_identifier)
+	{
+		log_error(_("this node is not part of the follow target node's replication cluster"));
+		log_detail(_("this node's system identifier is %lu, follow target node's system identifier is %lu"),
+				   local_identification.system_identifier,
+				   follow_target_identification.system_identifier);
+		PQfinish(follow_target_repl_conn);
+		return false;
+	}
+
+	/* check timelines */
+
+	log_verbose(LOG_DEBUG, "local timeline: %i; follow target timeline: %i",
+				local_identification.timeline,
+				follow_target_identification.timeline);
+
+	/* upstream's timeline is lower than ours - impossible case */
+	if (follow_target_identification.timeline < local_identification.timeline)
+	{
+		log_error(_("this node's timeline is ahead of the follow target node's timeline"));
+		log_detail(_("this node's timeline is %i, follow target node's timeline is %i"),
+				   local_identification.timeline,
+				   follow_target_identification.timeline);
+		PQfinish(follow_target_repl_conn);
+		return false;
+	}
+
+	/* timeline is the same - check relative positions */
+	if (follow_target_identification.timeline == local_identification.timeline)
+	{
+		XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
+
+		if (local_xlogpos == InvalidXLogRecPtr || follow_target_xlogpos == InvalidXLogRecPtr)
+		{
+			log_error(_("unable to compare LSN positions"));
+			PQfinish(follow_target_repl_conn);
+			return false;
+		}
+
+		if (local_xlogpos <= follow_target_xlogpos)
+		{
+			log_info(_("timelines are same, this server is not ahead"));
+			log_detail(_("local node lsn is %X/%X, follow target lsn is %X/%X"),
+					   format_lsn(local_xlogpos),
+					   format_lsn(follow_target_xlogpos));
+		}
+		else
+		{
+			log_error(_("this node is ahead of the follow target"));
+			log_detail(_("local node lsn is %X/%X, follow target lsn is %X/%X"),
+					   format_lsn(local_xlogpos),
+					   format_lsn(follow_target_xlogpos));
+
+			can_follow = false;
+		}
+	}
+	else
+	{
+		/*
+		 * upstream has higher timeline - check where it forked off from this node's timeline
+		 */
+		follow_target_history = get_timeline_history(follow_target_repl_conn,
+													 local_identification.timeline + 1);
+
+		if (follow_target_history == NULL)
+		{
+			/* get_timeline_history() will emit relevant error messages */
+			PQfinish(follow_target_repl_conn);
+			return false;
+		}
+
+		log_debug("local tli: %i; local_xlogpos: %X/%X; follow_target_history->tli: %i; follow_target_history->end: %X/%X",
+				  (int)local_identification.timeline,
+				  format_lsn(local_xlogpos),
+				  follow_target_history->tli,
+				  format_lsn(follow_target_history->end));
+
+		/*
+		 * Local node has proceeded beyond the follow target's fork, so we
+		 * definitely can't attach.
+		 *
+		 * This could be the case if the follow target was promoted, but does
+		 * not contain all changes which are being replayed to this standby.
+		 */
+		if (local_xlogpos > follow_target_history->end)
+		{
+			log_error(_("this node cannot attach to follow target node %i"),
+					  follow_target_node_info->node_id);
+			can_follow = false;
+
+			log_detail(_("follow target server's timeline %lu forked off current database system timeline %lu before current recovery point %X/%X"),
+					   local_identification.system_identifier + 1,
+					   local_identification.system_identifier,
+					   format_lsn(local_xlogpos));
+		}
+
+		if (can_follow == true)
+		{
+			log_info(_("local node %i can attach to follow target node %i"),
+					 config_file_options.node_id,
+					 follow_target_node_info->node_id);
+
+			log_detail(_("local node's recovery point: %X/%X; follow target node's fork point: %X/%X"),
+					   format_lsn(local_xlogpos),
+					   format_lsn(follow_target_history->end));
+		}
+	}
+
+	PQfinish(follow_target_repl_conn);
+
+	if (follow_target_history)
+		pfree(follow_target_history);
+
+
+	return can_follow;
+}