repmgrd: refactor primary failover code into separate function

This commit is contained in:
Ian Barwick
2017-07-04 20:42:22 +09:00
parent f7f49ae85e
commit e1f4384f7e

184
repmgrd.c
View File

@@ -93,6 +93,10 @@ static void handle_sigint(SIGNAL_ARGS);
#endif #endif
static PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status); static PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
static bool do_primary_failover(void);
static bool do_upstream_standby_failover(void);
static ElectionResult do_election(void); static ElectionResult do_election(void);
static const char *_print_voting_status(NodeVotingStatus voting_status); static const char *_print_voting_status(NodeVotingStatus voting_status);
static const char *_print_election_result(ElectionResult result); static const char *_print_election_result(ElectionResult result);
@@ -699,9 +703,95 @@ monitor_streaming_standby(void)
goto loop; goto loop;
} }
/* still down after reconnect attempt(s) - */ /* still down after reconnect attempt(s) */
if (upstream_node_status == NODE_STATUS_DOWN) if (upstream_node_status == NODE_STATUS_DOWN)
{ {
bool failover_done = false;
if (upstream_node_info.type == PRIMARY)
{
failover_done = do_primary_failover();
}
else if (upstream_node_info.type == STANDBY)
{
failover_done = do_upstream_standby_failover();
}
// it's possible it will make sense to return in
// all cases to restart monitoring
if (failover_done == true)
return;
}
}
}
loop:
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
double log_status_interval_elapsed = 0;
instr_time log_status_interval_current;
INSTR_TIME_SET_CURRENT(log_status_interval_current);
INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
{
log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
local_node_info.node_name,
local_node_info.node_id,
upstream_node_info.node_name,
upstream_node_info.node_id);
//log_debug(
INSTR_TIME_SET_CURRENT(log_status_interval_start);
}
}
/*
* handle local node failure
*
* currently we'll just check the connection, and try to reconnect
*
* TODO: add timeout, after which we run in degraded state
*/
if (is_server_available(local_node_info.conninfo) == false)
{
log_warning(_("connection to local node %i lost"), local_node_info.node_id);
if (local_conn != NULL)
{
PQfinish(local_conn);
local_conn = NULL;
}
}
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_info(_("attempting to reconnect"));
local_conn = establish_db_connection(config_file_options.conninfo, false);
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_warning(_("reconnection failed"));
}
else
{
log_info(_("reconnected"));
}
}
sleep(1);
}
}
static bool
do_primary_failover(void)
{
/* attempt to initiate voting process */ /* attempt to initiate voting process */
ElectionResult election_result = do_election(); ElectionResult election_result = do_election();
@@ -834,7 +924,7 @@ monitor_streaming_standby(void)
log_info(_("switching to primary monitoring mode")); log_info(_("switching to primary monitoring mode"));
failover_state = FAILOVER_STATE_NONE; failover_state = FAILOVER_STATE_NONE;
return; return true;
case FAILOVER_STATE_PRIMARY_REAPPEARED: case FAILOVER_STATE_PRIMARY_REAPPEARED:
log_debug("failover state is PRIMARY_REAPPEARED"); log_debug("failover state is PRIMARY_REAPPEARED");
@@ -851,11 +941,8 @@ monitor_streaming_standby(void)
upstream_node_info.node_name, upstream_node_info.node_id); upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE; failover_state = FAILOVER_STATE_NONE;
return; return true;
case FAILOVER_STATE_PROMOTION_FAILED:
log_debug("failover state is PROMOTION FAILED");
break;
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY: case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
log_info(_("resuming standby monitoring mode")); log_info(_("resuming standby monitoring mode"));
@@ -863,7 +950,7 @@ monitor_streaming_standby(void)
upstream_node_info.node_name, upstream_node_info.node_id); upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE; failover_state = FAILOVER_STATE_NONE;
return; return true;
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY: case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
log_info(_("resuming standby monitoring mode")); log_info(_("resuming standby monitoring mode"));
@@ -871,86 +958,37 @@ monitor_streaming_standby(void)
upstream_node_info.node_name, upstream_node_info.node_id); upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE; failover_state = FAILOVER_STATE_NONE;
return; return true;
case FAILOVER_STATE_PROMOTION_FAILED:
log_debug("failover state is PROMOTION FAILED");
return false;
case FAILOVER_STATE_NO_NEW_PRIMARY: case FAILOVER_STATE_NO_NEW_PRIMARY:
case FAILOVER_STATE_WAITING_NEW_PRIMARY: case FAILOVER_STATE_WAITING_NEW_PRIMARY:
/* pass control back down to start_monitoring() */ /* pass control back down to start_monitoring() */
// -> should kick off new election // -> should kick off new election
return; return false;
case FAILOVER_STATE_LOCAL_NODE_FAILURE: case FAILOVER_STATE_LOCAL_NODE_FAILURE:
case FAILOVER_STATE_UNKNOWN: case FAILOVER_STATE_UNKNOWN:
case FAILOVER_STATE_NONE: case FAILOVER_STATE_NONE:
log_debug("failover state is %i", failover_state); log_debug("failover state is %i", failover_state);
break; return false;
}
} }
} // should never reach here
} return false;
loop:
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
double log_status_interval_elapsed = 0;
instr_time log_status_interval_current;
INSTR_TIME_SET_CURRENT(log_status_interval_current);
INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
{
log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
local_node_info.node_name,
local_node_info.node_id,
upstream_node_info.node_name,
upstream_node_info.node_id);
//log_debug(
INSTR_TIME_SET_CURRENT(log_status_interval_start);
}
}
/*
* handle local node failure
*
* currently we'll just check the connection, and try to reconnect
*
* TODO: add timeout, after which we run in degraded state
*/
if (is_server_available(local_node_info.conninfo) == false)
{
log_warning(_("connection to local node %i lost"), local_node_info.node_id);
if (local_conn != NULL)
{
PQfinish(local_conn);
local_conn = NULL;
}
}
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_info(_("attempting to reconnect"));
local_conn = establish_db_connection(config_file_options.conninfo, false);
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_warning(_("reconnection failed"));
}
else
{
log_info(_("reconnected"));
}
}
sleep(1);
}
} }
/*
 * do_upstream_standby_failover()
 *
 * Placeholder for handling the failure of an upstream node whose type is
 * STANDBY (the monitoring loop calls this when the upstream node is still
 * down after the reconnection attempts and is not a primary).
 *
 * Returns true if a failover was carried out, in which case the caller
 * leaves the monitoring function. Failover handling for an upstream
 * standby is not implemented yet, so this always returns false and the
 * caller carries on with its monitoring loop.
 */
static bool
do_upstream_standby_failover(void)
{
	/* not implemented yet - no failover action is ever taken here */
	const bool	failover_performed = false;

	return failover_performed;
}
static FailoverState static FailoverState
promote_self(void) promote_self(void)
{ {