repmgrd: improve cascaded standby failover handling

In particular, improve handling of the case where the standby follow command fails due to the primary not being available. GitHub #480.
2026-06-01 11:49:06 +00:00 · 2018-08-16 17:14:05 +09:00
parent 76f5bcf3cd
commit bc584d84f6
1 changed file with 43 additions and 22 deletions
@@ -1820,7 +1820,7 @@ do_upstream_standby_failover(void)
 	t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
 	RecordStatus record_status = RECORD_NOT_FOUND;
 	RecoveryType primary_type = RECTYPE_UNKNOWN;
-	int			i, r;
+	int			i, standby_follow_result;
 	char		parsed_follow_command[MAXPGPATH] = "";
 	close_connection(&upstream_conn);
@@ -1885,9 +1885,9 @@ do_upstream_standby_failover(void)
 	 */
 	parse_follow_command(parsed_follow_command, config_file_options.follow_command, primary_node_info.node_id);
-	r = system(parsed_follow_command);
+	standby_follow_result = system(parsed_follow_command);
-	if (r != 0)
+	if (standby_follow_result != 0)
 	{
 		initPQExpBuffer(&event_details);
@@ -1914,6 +1914,10 @@ do_upstream_standby_failover(void)
 	/*
 	 * It's possible that the standby is still starting up after the "follow_command"
 	 * completes, so poll for a while until we get a connection.
 	 *
 	 * NOTE: we've previously closed the local connection, so even if the follow command
 	 * failed for whatever reason and the local node remained up, we can re-open
 	 * the local connection.
 	 */
 	for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
@@ -1923,7 +1927,7 @@ do_upstream_standby_failover(void)
 		if (PQstatus(local_conn) == CONNECTION_OK)
 			break;
-		log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
+		log_debug("sleeping 1 second; %i of %i (\"repmgrd_standby_startup_timeout\") attempts to reconnect to local node",
 				  i + 1,
 				  config_file_options.repmgrd_standby_startup_timeout);
 		sleep(1);
@@ -1939,30 +1943,47 @@ do_upstream_standby_failover(void)
 	/* refresh shared memory settings which will have been zapped by the restart */
 	repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
-	if (update_node_record_set_upstream(primary_conn,
+	/*
-										local_node_info.node_id,
+	 *
-										primary_node_info.node_id) == false)
+	 */
 	if (standby_follow_result != 0)
 	{
-		initPQExpBuffer(&event_details);
+		monitoring_state = MS_DEGRADED;
 		INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
-		appendPQExpBuffer(&event_details,
+		return FAILOVER_STATE_FOLLOW_FAIL;
-						  _("unable to set node %i's new upstream ID to %i"),
+	}
 						  local_node_info.node_id,
 						  primary_node_info.node_id);
-		log_error("%s", event_details.data);
+	/*
 	 * update upstream_node_id to primary node (but only if follow command
 	 * was successful)
 	 */
-		create_event_notification(
+	{
-								  NULL,
+		if (update_node_record_set_upstream(primary_conn,
-								  &config_file_options,
+											local_node_info.node_id,
-								  local_node_info.node_id,
+											primary_node_info.node_id) == false)
-								  "repmgrd_failover_follow",
+		{
-								  false,
+			initPQExpBuffer(&event_details);
-								  event_details.data);
+			appendPQExpBuffer(&event_details,
 							  _("unable to set node %i's new upstream ID to %i"),
 							  local_node_info.node_id,
 							  primary_node_info.node_id);
-		termPQExpBuffer(&event_details);
+			log_error("%s", event_details.data);
-		terminate(ERR_BAD_CONFIG);
+			create_event_notification(NULL,
 									  &config_file_options,
 									  local_node_info.node_id,
 									  "repmgrd_failover_follow",
 									  false,
 									  event_details.data);
 			termPQExpBuffer(&event_details);
 			terminate(ERR_BAD_CONFIG);
 		}
 	}
 	/* refresh own internal node record */