repmgrd: retry standby connection after cascading standby failover

2026-06-01 03:39:05 +00:00 · 2018-02-28 21:15:11 +09:00
parent 60e63feaca
commit fe594c95ad
1 changed files with 45 additions and 28 deletions
@@ -1701,7 +1701,7 @@ do_upstream_standby_failover(void)
 	t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
 	RecordStatus record_status = RECORD_NOT_FOUND;
 	RecoveryType primary_type = RECTYPE_UNKNOWN;
-	int			r;
+	int			i, r;
 	char		parsed_follow_command[MAXPGPATH] = "";
 	PQfinish(upstream_conn);
@@ -1786,8 +1786,30 @@ do_upstream_standby_failover(void)
 		termPQExpBuffer(&event_details);
 	}
-	/* reconnect to local node */
+	/*
-	local_conn = establish_db_connection(config_file_options.conninfo, false);
+	 * It's possible that the standby is still starting up after the "follow_command"
 	 * completes, so poll for a while until we get a connection.
 	 */
 	for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
 	{
 		local_conn = establish_db_connection(local_node_info.conninfo, false);
 		if (PQstatus(local_conn) == CONNECTION_OK)
 			break;
 		log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
 				  i + 1,
 				  config_file_options.standby_reconnect_timeout);
 		sleep(1);
 	}
 	if (PQstatus(local_conn) != CONNECTION_OK)
 	{
 		log_error(_("unable to reconnect to local node %i"),
 				  local_node_info.node_id);
 		return FAILOVER_STATE_FOLLOW_FAIL;
 	}
 	/* refresh shared memory settings which will have been zapped by the restart */
 	repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
@@ -2067,7 +2089,7 @@ follow_new_primary(int new_primary_id)
 	char		parsed_follow_command[MAXPGPATH] = "";
 	PQExpBufferData event_details;
-	int			r;
+	int			i, r;
 	/* Store details of the failed node here */
 	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
@@ -2201,15 +2223,11 @@ follow_new_primary(int new_primary_id)
 		return FAILOVER_STATE_FOLLOW_FAIL;
 	}
 	{
 	/*
 	 * It's possible that the standby is still starting up after the "follow_command"
 	 * completes, so poll for a while until we get a connection.
 		 *
 		 * TODO:
 		 *  - implement for cascading standby follow too
 	 */
-		int i;
+
 	for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
 	{
 		local_conn = establish_db_connection(local_node_info.conninfo, false);
@@ -2229,7 +2247,6 @@ follow_new_primary(int new_primary_id)
 				  local_node_info.node_id);
 		return FAILOVER_STATE_FOLLOW_FAIL;
 	}
 	}
 	/* refresh shared memory settings which will have been zapped by the restart */
 	repmgrd_set_local_node_id(local_conn, config_file_options.node_id);