repmgrd: improve cascaded standby failover

Check that the last known primary is reachable and not in recovery before following it.
2026-06-01 19:59:05 +00:00 · 2017-08-29 15:29:17 +09:00
parent 6d02415d26
commit 154c76e5e7
1 changed files with 35 additions and 12 deletions
@@ -202,9 +202,10 @@ monitor_streaming_primary(void)

 	while (true)
 	{
-
-		// cache node list here, refresh at `node_list_refresh_interval`
-		// also return reason for inavailability so we can log it
+		/*
+		 * TODO: cache node list here, refresh at `node_list_refresh_interval`
+		 * also return reason for unavailability so we can log it
+		 */
 		if (is_server_available(local_node_info.conninfo) == false)
 		{

@@ -331,9 +332,11 @@ monitor_streaming_primary(void)
 			}


-			// possibly attempt to find another node from cached list
-			// check if there's a new primary - if so add hook for fencing?
-			// loop, if starts up check status, switch monitoring mode
+			/*
+			 * possibly attempt to find another node from cached list
+			 * check if there's a new primary - if so add hook for fencing?
+			 * loop, if starts up check status, switch monitoring mode
+			 */
 		}
 	loop:
 		/* emit "still alive" log message at regular intervals, if requested */
@@ -1004,7 +1007,7 @@ do_primary_failover(void)
 	{
 		int new_primary_id;

-		//   --> need timeout in case new primary doesn't come up, then rerun election
+		/*  TODO: rerun election if new primary doesn't appear after timeout */

 		/* either follow or time out; either way resume monitoring */
 		if (wait_primary_notification(&new_primary_id) == true)
@@ -1155,7 +1158,6 @@ do_primary_failover(void)
 		case FAILOVER_STATE_NO_NEW_PRIMARY:
 		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
 			/* pass control back down to start_monitoring() */
-			// -> should kick off new election
 			return false;

 		case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
@@ -1260,7 +1262,8 @@ do_upstream_standby_failover(void)
 {
 	PQExpBufferData event_details;
 	t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
-	RecordStatus record_status;
+	RecordStatus record_status = RECORD_NOT_FOUND;
+	RecoveryType primary_type = RECTYPE_UNKNOWN;
 	int r;

 	PQfinish(upstream_conn);
@@ -1278,10 +1281,30 @@ do_upstream_standby_failover(void)

 	check_connection(&primary_node_info, &primary_conn);

-	/* grandparent upstream is inactive  */
-	if (primary_node_info.active == false)
+	if (PQstatus(primary_conn) != CONNECTION_OK)
 	{
-		// XXX
+		log_error(_("unable to connect to last known primary \"%s\" (ID: %i)"),
+				  primary_node_info.node_name,
+				  primary_node_info.node_id);
+
+		PQfinish(primary_conn);
+		monitoring_state = MS_DEGRADED;
+		INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+		return false;
+	}
+
+	primary_type = get_recovery_type(primary_conn);
+
+	if (primary_type != RECTYPE_PRIMARY)
+	{
+		log_error(_("last known primary\"%s\" (ID: %i) is in recovery, not following"),
+				  primary_node_info.node_name,
+				  primary_node_info.node_id);
+
+		PQfinish(primary_conn);
+		monitoring_state = MS_DEGRADED;
+		INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+		return false;
 	}

 	/* Close the connection to this server */