repmgrd: don't fail over unless more than 50% of active nodes are visible.

This commit is contained in:
Ian Barwick
2017-11-15 13:46:03 +09:00
parent 3c557ebd8e
commit 9d432546bf
2 changed files with 22 additions and 6 deletions

View File

@@ -3545,6 +3545,7 @@ is_server_available(const char *conninfo)
{ {
PGPing status = PQping(conninfo); PGPing status = PQping(conninfo);
log_verbose(LOG_DEBUG, "ping status for %s is %i", conninfo, (int)status);
if (status == PQPING_OK) if (status == PQPING_OK)
return true; return true;

View File

@@ -630,9 +630,9 @@ monitor_streaming_standby(void)
while (true) while (true)
{ {
log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
if (is_server_available(upstream_node_info.conninfo) == false) if (is_server_available(upstream_node_info.conninfo) == false)
{ {
/* upstream node is down, we were expecting it to be up */ /* upstream node is down, we were expecting it to be up */
if (upstream_node_info.node_status == NODE_STATUS_UP) if (upstream_node_info.node_status == NODE_STATUS_UP)
{ {
@@ -994,7 +994,7 @@ loop:
} }
if (PQstatus(local_conn) == CONNECTION_OK && config_file_options.monitoring_history == true) if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
update_monitoring_history(); update_monitoring_history();
if (got_SIGHUP) if (got_SIGHUP)
@@ -1069,6 +1069,7 @@ monitor_streaming_witness(void)
/* synchronise local copy of "repmgr.nodes", in case it was stale */ /* synchronise local copy of "repmgr.nodes", in case it was stale */
witness_copy_node_records(primary_conn, local_conn); witness_copy_node_records(primary_conn, local_conn);
/* /*
* refresh upstream node record from primary, so it's as up-to-date * refresh upstream node record from primary, so it's as up-to-date
* as possible * as possible
@@ -1527,7 +1528,6 @@ do_primary_failover(void)
monitoring_state = MS_DEGRADED; monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start); INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
log_debug("failover state is PROMOTION FAILED");
return false; return false;
case FAILOVER_STATE_FOLLOW_FAIL: case FAILOVER_STATE_FOLLOW_FAIL:
@@ -1557,7 +1557,6 @@ do_primary_failover(void)
case FAILOVER_STATE_LOCAL_NODE_FAILURE: case FAILOVER_STATE_LOCAL_NODE_FAILURE:
case FAILOVER_STATE_UNKNOWN: case FAILOVER_STATE_UNKNOWN:
case FAILOVER_STATE_NONE: case FAILOVER_STATE_NONE:
log_debug("failover state is %i", failover_state);
return false; return false;
} }
@@ -2533,6 +2532,22 @@ do_election(void)
return ELECTION_CANCELLED; return ELECTION_CANCELLED;
} }
log_debug("visible nodes: %i; total nodes: %i",
visible_nodes,
standby_nodes.node_count);
if (visible_nodes <= (standby_nodes.node_count / 2.0))
{
log_notice(_("unable to reach a qualified majority of nodes"));
log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
reset_node_voting_status();
return ELECTION_CANCELLED;
}
log_debug("promotion candidate is %i", candidate_node->node_id); log_debug("promotion candidate is %i", candidate_node->node_id);
if (candidate_node->node_id == local_node_info.node_id) if (candidate_node->node_id == local_node_info.node_id)