mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-28 01:16:29 +00:00
repmgrd: refactor primary failover code into separate function
This commit is contained in:
174
repmgrd.c
174
repmgrd.c
@@ -93,6 +93,10 @@ static void handle_sigint(SIGNAL_ARGS);
|
||||
#endif
|
||||
|
||||
static PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
|
||||
|
||||
static bool do_primary_failover(void);
|
||||
static bool do_upstream_standby_failover(void);
|
||||
|
||||
static ElectionResult do_election(void);
|
||||
static const char *_print_voting_status(NodeVotingStatus voting_status);
|
||||
static const char *_print_election_result(ElectionResult result);
|
||||
@@ -699,8 +703,94 @@ monitor_streaming_standby(void)
|
||||
goto loop;
|
||||
}
|
||||
|
||||
/* still down after reconnect attempt(s) - */
|
||||
/* still down after reconnect attempt(s) */
|
||||
if (upstream_node_status == NODE_STATUS_DOWN)
|
||||
{
|
||||
bool failover_done = false;
|
||||
|
||||
if (upstream_node_info.type == PRIMARY)
|
||||
{
|
||||
failover_done = do_primary_failover();
|
||||
}
|
||||
else if (upstream_node_info.type == STANDBY)
|
||||
{
|
||||
failover_done = do_upstream_standby_failover();
|
||||
}
|
||||
|
||||
// it's possible it will make sense to return in
|
||||
// all cases to restart monitoring
|
||||
if (failover_done == true)
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
loop:
|
||||
|
||||
/* emit "still alive" log message at regular intervals, if requested */
|
||||
if (config_file_options.log_status_interval > 0)
|
||||
{
|
||||
double log_status_interval_elapsed = 0;
|
||||
instr_time log_status_interval_current;
|
||||
|
||||
INSTR_TIME_SET_CURRENT(log_status_interval_current);
|
||||
INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
|
||||
log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
|
||||
|
||||
if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
|
||||
{
|
||||
log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id,
|
||||
upstream_node_info.node_name,
|
||||
upstream_node_info.node_id);
|
||||
|
||||
//log_debug(
|
||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* handle local node failure
|
||||
*
|
||||
* currently we'll just check the connection, and try to reconnect
|
||||
*
|
||||
* TODO: add timeout, after which we run in degraded state
|
||||
*/
|
||||
if (is_server_available(local_node_info.conninfo) == false)
|
||||
{
|
||||
log_warning(_("connection to local node %i lost"), local_node_info.node_id);
|
||||
|
||||
if (local_conn != NULL)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_info(_("attempting to reconnect"));
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("reconnection failed"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_info(_("reconnected"));
|
||||
}
|
||||
}
|
||||
sleep(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
do_primary_failover(void)
|
||||
{
|
||||
/* attempt to initiate voting process */
|
||||
ElectionResult election_result = do_election();
|
||||
@@ -834,7 +924,7 @@ monitor_streaming_standby(void)
|
||||
log_info(_("switching to primary monitoring mode"));
|
||||
|
||||
failover_state = FAILOVER_STATE_NONE;
|
||||
return;
|
||||
return true;
|
||||
|
||||
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
||||
log_debug("failover state is PRIMARY_REAPPEARED");
|
||||
@@ -851,11 +941,8 @@ monitor_streaming_standby(void)
|
||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||
|
||||
failover_state = FAILOVER_STATE_NONE;
|
||||
return;
|
||||
return true;
|
||||
|
||||
case FAILOVER_STATE_PROMOTION_FAILED:
|
||||
log_debug("failover state is PROMOTION FAILED");
|
||||
break;
|
||||
|
||||
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
|
||||
log_info(_("resuming standby monitoring mode"));
|
||||
@@ -863,7 +950,7 @@ monitor_streaming_standby(void)
|
||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||
failover_state = FAILOVER_STATE_NONE;
|
||||
|
||||
return;
|
||||
return true;
|
||||
|
||||
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
|
||||
log_info(_("resuming standby monitoring mode"));
|
||||
@@ -871,85 +958,36 @@ monitor_streaming_standby(void)
|
||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||
failover_state = FAILOVER_STATE_NONE;
|
||||
|
||||
return;
|
||||
return true;
|
||||
|
||||
case FAILOVER_STATE_PROMOTION_FAILED:
|
||||
log_debug("failover state is PROMOTION FAILED");
|
||||
return false;
|
||||
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
||||
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
||||
/* pass control back down to start_monitoring() */
|
||||
// -> should kick off new election
|
||||
return;
|
||||
return false;
|
||||
|
||||
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
||||
case FAILOVER_STATE_UNKNOWN:
|
||||
case FAILOVER_STATE_NONE:
|
||||
log_debug("failover state is %i", failover_state);
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
// should never reach here
|
||||
return false;
|
||||
}
|
||||
|
||||
loop:
|
||||
|
||||
/* emit "still alive" log message at regular intervals, if requested */
|
||||
if (config_file_options.log_status_interval > 0)
|
||||
static bool
|
||||
do_upstream_standby_failover(void)
|
||||
{
|
||||
double log_status_interval_elapsed = 0;
|
||||
instr_time log_status_interval_current;
|
||||
|
||||
INSTR_TIME_SET_CURRENT(log_status_interval_current);
|
||||
INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
|
||||
log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
|
||||
|
||||
if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
|
||||
{
|
||||
log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id,
|
||||
upstream_node_info.node_name,
|
||||
upstream_node_info.node_id);
|
||||
|
||||
//log_debug(
|
||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||
}
|
||||
// not implemented yet
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* handle local node failure
|
||||
*
|
||||
* currently we'll just check the connection, and try to reconnect
|
||||
*
|
||||
* TODO: add timeout, after which we run in degraded state
|
||||
*/
|
||||
if (is_server_available(local_node_info.conninfo) == false)
|
||||
{
|
||||
log_warning(_("connection to local node %i lost"), local_node_info.node_id);
|
||||
|
||||
if (local_conn != NULL)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_info(_("attempting to reconnect"));
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("reconnection failed"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_info(_("reconnected"));
|
||||
}
|
||||
}
|
||||
sleep(1);
|
||||
}
|
||||
}
|
||||
|
||||
static FailoverState
|
||||
promote_self(void)
|
||||
|
||||
Reference in New Issue
Block a user