mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 08:56:29 +00:00
repmgrd: refactor primary failover code into separate function
This commit is contained in:
184
repmgrd.c
184
repmgrd.c
@@ -93,6 +93,10 @@ static void handle_sigint(SIGNAL_ARGS);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
|
static PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
|
||||||
|
|
||||||
|
static bool do_primary_failover(void);
|
||||||
|
static bool do_upstream_standby_failover(void);
|
||||||
|
|
||||||
static ElectionResult do_election(void);
|
static ElectionResult do_election(void);
|
||||||
static const char *_print_voting_status(NodeVotingStatus voting_status);
|
static const char *_print_voting_status(NodeVotingStatus voting_status);
|
||||||
static const char *_print_election_result(ElectionResult result);
|
static const char *_print_election_result(ElectionResult result);
|
||||||
@@ -699,9 +703,95 @@ monitor_streaming_standby(void)
|
|||||||
goto loop;
|
goto loop;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* still down after reconnect attempt(s) - */
|
/* still down after reconnect attempt(s) */
|
||||||
if (upstream_node_status == NODE_STATUS_DOWN)
|
if (upstream_node_status == NODE_STATUS_DOWN)
|
||||||
{
|
{
|
||||||
|
bool failover_done = false;
|
||||||
|
|
||||||
|
if (upstream_node_info.type == PRIMARY)
|
||||||
|
{
|
||||||
|
failover_done = do_primary_failover();
|
||||||
|
}
|
||||||
|
else if (upstream_node_info.type == STANDBY)
|
||||||
|
{
|
||||||
|
failover_done = do_upstream_standby_failover();
|
||||||
|
}
|
||||||
|
|
||||||
|
// it's possible it will make sense to return in
|
||||||
|
// all cases to restart monitoring
|
||||||
|
if (failover_done == true)
|
||||||
|
return;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
loop:
|
||||||
|
|
||||||
|
/* emit "still alive" log message at regular intervals, if requested */
|
||||||
|
if (config_file_options.log_status_interval > 0)
|
||||||
|
{
|
||||||
|
double log_status_interval_elapsed = 0;
|
||||||
|
instr_time log_status_interval_current;
|
||||||
|
|
||||||
|
INSTR_TIME_SET_CURRENT(log_status_interval_current);
|
||||||
|
INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
|
||||||
|
log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
|
||||||
|
|
||||||
|
if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
|
||||||
|
{
|
||||||
|
log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
|
||||||
|
local_node_info.node_name,
|
||||||
|
local_node_info.node_id,
|
||||||
|
upstream_node_info.node_name,
|
||||||
|
upstream_node_info.node_id);
|
||||||
|
|
||||||
|
//log_debug(
|
||||||
|
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* handle local node failure
|
||||||
|
*
|
||||||
|
* currently we'll just check the connection, and try to reconnect
|
||||||
|
*
|
||||||
|
* TODO: add timeout, after which we run in degraded state
|
||||||
|
*/
|
||||||
|
if (is_server_available(local_node_info.conninfo) == false)
|
||||||
|
{
|
||||||
|
log_warning(_("connection to local node %i lost"), local_node_info.node_id);
|
||||||
|
|
||||||
|
if (local_conn != NULL)
|
||||||
|
{
|
||||||
|
PQfinish(local_conn);
|
||||||
|
local_conn = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_info(_("attempting to reconnect"));
|
||||||
|
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||||
|
|
||||||
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_warning(_("reconnection failed"));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_info(_("reconnected"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool
|
||||||
|
do_primary_failover(void)
|
||||||
|
{
|
||||||
/* attempt to initiate voting process */
|
/* attempt to initiate voting process */
|
||||||
ElectionResult election_result = do_election();
|
ElectionResult election_result = do_election();
|
||||||
|
|
||||||
@@ -834,7 +924,7 @@ monitor_streaming_standby(void)
|
|||||||
log_info(_("switching to primary monitoring mode"));
|
log_info(_("switching to primary monitoring mode"));
|
||||||
|
|
||||||
failover_state = FAILOVER_STATE_NONE;
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
return;
|
return true;
|
||||||
|
|
||||||
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
||||||
log_debug("failover state is PRIMARY_REAPPEARED");
|
log_debug("failover state is PRIMARY_REAPPEARED");
|
||||||
@@ -851,11 +941,8 @@ monitor_streaming_standby(void)
|
|||||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
|
||||||
failover_state = FAILOVER_STATE_NONE;
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
return;
|
return true;
|
||||||
|
|
||||||
case FAILOVER_STATE_PROMOTION_FAILED:
|
|
||||||
log_debug("failover state is PROMOTION FAILED");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
|
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
|
||||||
log_info(_("resuming standby monitoring mode"));
|
log_info(_("resuming standby monitoring mode"));
|
||||||
@@ -863,7 +950,7 @@ monitor_streaming_standby(void)
|
|||||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
failover_state = FAILOVER_STATE_NONE;
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
|
|
||||||
return;
|
return true;
|
||||||
|
|
||||||
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
|
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
|
||||||
log_info(_("resuming standby monitoring mode"));
|
log_info(_("resuming standby monitoring mode"));
|
||||||
@@ -871,86 +958,37 @@ monitor_streaming_standby(void)
|
|||||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
failover_state = FAILOVER_STATE_NONE;
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
|
|
||||||
return;
|
return true;
|
||||||
|
|
||||||
|
case FAILOVER_STATE_PROMOTION_FAILED:
|
||||||
|
log_debug("failover state is PROMOTION FAILED");
|
||||||
|
return false;
|
||||||
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
||||||
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
||||||
/* pass control back down to start_monitoring() */
|
/* pass control back down to start_monitoring() */
|
||||||
// -> should kick off new election
|
// -> should kick off new election
|
||||||
return;
|
return false;
|
||||||
|
|
||||||
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
||||||
case FAILOVER_STATE_UNKNOWN:
|
case FAILOVER_STATE_UNKNOWN:
|
||||||
case FAILOVER_STATE_NONE:
|
case FAILOVER_STATE_NONE:
|
||||||
log_debug("failover state is %i", failover_state);
|
log_debug("failover state is %i", failover_state);
|
||||||
break;
|
return false;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
// should never reach here
|
||||||
}
|
return false;
|
||||||
|
|
||||||
loop:
|
|
||||||
|
|
||||||
/* emit "still alive" log message at regular intervals, if requested */
|
|
||||||
if (config_file_options.log_status_interval > 0)
|
|
||||||
{
|
|
||||||
double log_status_interval_elapsed = 0;
|
|
||||||
instr_time log_status_interval_current;
|
|
||||||
|
|
||||||
INSTR_TIME_SET_CURRENT(log_status_interval_current);
|
|
||||||
INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
|
|
||||||
log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
|
|
||||||
|
|
||||||
if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
|
|
||||||
{
|
|
||||||
log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
|
|
||||||
local_node_info.node_name,
|
|
||||||
local_node_info.node_id,
|
|
||||||
upstream_node_info.node_name,
|
|
||||||
upstream_node_info.node_id);
|
|
||||||
|
|
||||||
//log_debug(
|
|
||||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* handle local node failure
|
|
||||||
*
|
|
||||||
* currently we'll just check the connection, and try to reconnect
|
|
||||||
*
|
|
||||||
* TODO: add timeout, after which we run in degraded state
|
|
||||||
*/
|
|
||||||
if (is_server_available(local_node_info.conninfo) == false)
|
|
||||||
{
|
|
||||||
log_warning(_("connection to local node %i lost"), local_node_info.node_id);
|
|
||||||
|
|
||||||
if (local_conn != NULL)
|
|
||||||
{
|
|
||||||
PQfinish(local_conn);
|
|
||||||
local_conn = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
|
||||||
{
|
|
||||||
log_info(_("attempting to reconnect"));
|
|
||||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
|
||||||
|
|
||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
|
||||||
{
|
|
||||||
log_warning(_("reconnection failed"));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_info(_("reconnected"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
sleep(1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool
|
||||||
|
do_upstream_standby_failover(void)
|
||||||
|
{
|
||||||
|
// not implemented yet
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static FailoverState
|
static FailoverState
|
||||||
promote_self(void)
|
promote_self(void)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user