repmgrd: retry standby connection after cascading standby failover

This commit is contained in:
Ian Barwick
2018-02-28 21:15:11 +09:00
parent 60e63feaca
commit fe594c95ad

View File

@@ -1701,7 +1701,7 @@ do_upstream_standby_failover(void)
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER; t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND; RecordStatus record_status = RECORD_NOT_FOUND;
RecoveryType primary_type = RECTYPE_UNKNOWN; RecoveryType primary_type = RECTYPE_UNKNOWN;
int r; int i, r;
char parsed_follow_command[MAXPGPATH] = ""; char parsed_follow_command[MAXPGPATH] = "";
PQfinish(upstream_conn); PQfinish(upstream_conn);
@@ -1786,8 +1786,30 @@ do_upstream_standby_failover(void)
termPQExpBuffer(&event_details); termPQExpBuffer(&event_details);
} }
/* reconnect to local node */ /*
local_conn = establish_db_connection(config_file_options.conninfo, false); * It's possible that the standby is still starting up after the "follow_command"
* completes, so poll for a while until we get a connection.
*/
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
{
local_conn = establish_db_connection(local_node_info.conninfo, false);
if (PQstatus(local_conn) == CONNECTION_OK)
break;
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
i + 1,
config_file_options.standby_reconnect_timeout);
sleep(1);
}
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_error(_("unable to reconnect to local node %i"),
local_node_info.node_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* refresh shared memory settings which will have been zapped by the restart */ /* refresh shared memory settings which will have been zapped by the restart */
repmgrd_set_local_node_id(local_conn, config_file_options.node_id); repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
@@ -2067,7 +2089,7 @@ follow_new_primary(int new_primary_id)
char parsed_follow_command[MAXPGPATH] = ""; char parsed_follow_command[MAXPGPATH] = "";
PQExpBufferData event_details; PQExpBufferData event_details;
int r; int i, r;
/* Store details of the failed node here */ /* Store details of the failed node here */
t_node_info failed_primary = T_NODE_INFO_INITIALIZER; t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
@@ -2201,15 +2223,11 @@ follow_new_primary(int new_primary_id)
return FAILOVER_STATE_FOLLOW_FAIL; return FAILOVER_STATE_FOLLOW_FAIL;
} }
{
/* /*
* It's possible that the standby is still starting up after the "follow_command" * It's possible that the standby is still starting up after the "follow_command"
* completes, so poll for a while until we get a connection. * completes, so poll for a while until we get a connection.
*
* TODO:
* - implement for cascading standby follow too
*/ */
int i;
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++) for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
{ {
local_conn = establish_db_connection(local_node_info.conninfo, false); local_conn = establish_db_connection(local_node_info.conninfo, false);
@@ -2229,7 +2247,6 @@ follow_new_primary(int new_primary_id)
local_node_info.node_id); local_node_info.node_id);
return FAILOVER_STATE_FOLLOW_FAIL; return FAILOVER_STATE_FOLLOW_FAIL;
} }
}
/* refresh shared memory settings which will have been zapped by the restart */ /* refresh shared memory settings which will have been zapped by the restart */
repmgrd_set_local_node_id(local_conn, config_file_options.node_id); repmgrd_set_local_node_id(local_conn, config_file_options.node_id);