mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 17:06:29 +00:00
repmgrd: don't fail over unless more than 50% of active nodes are visible.
This commit is contained in:
@@ -3545,6 +3545,7 @@ is_server_available(const char *conninfo)
|
|||||||
{
|
{
|
||||||
PGPing status = PQping(conninfo);
|
PGPing status = PQping(conninfo);
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "ping status for %s is %i", conninfo, (int)status);
|
||||||
if (status == PQPING_OK)
|
if (status == PQPING_OK)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
|||||||
@@ -630,9 +630,9 @@ monitor_streaming_standby(void)
|
|||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
|
log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
|
||||||
if (is_server_available(upstream_node_info.conninfo) == false)
|
if (is_server_available(upstream_node_info.conninfo) == false)
|
||||||
{
|
{
|
||||||
|
|
||||||
/* upstream node is down, we were expecting it to be up */
|
/* upstream node is down, we were expecting it to be up */
|
||||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
@@ -994,7 +994,7 @@ loop:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (PQstatus(local_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
|
if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
|
||||||
update_monitoring_history();
|
update_monitoring_history();
|
||||||
|
|
||||||
if (got_SIGHUP)
|
if (got_SIGHUP)
|
||||||
@@ -1069,6 +1069,7 @@ monitor_streaming_witness(void)
|
|||||||
|
|
||||||
/* synchronise local copy of "repmgr.nodes", in case it was stale */
|
/* synchronise local copy of "repmgr.nodes", in case it was stale */
|
||||||
witness_copy_node_records(primary_conn, local_conn);
|
witness_copy_node_records(primary_conn, local_conn);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* refresh upstream node record from primary, so it's as up-to-date
|
* refresh upstream node record from primary, so it's as up-to-date
|
||||||
* as possible
|
* as possible
|
||||||
@@ -1527,7 +1528,6 @@ do_primary_failover(void)
|
|||||||
monitoring_state = MS_DEGRADED;
|
monitoring_state = MS_DEGRADED;
|
||||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||||
|
|
||||||
log_debug("failover state is PROMOTION FAILED");
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
case FAILOVER_STATE_FOLLOW_FAIL:
|
case FAILOVER_STATE_FOLLOW_FAIL:
|
||||||
@@ -1557,7 +1557,6 @@ do_primary_failover(void)
|
|||||||
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
||||||
case FAILOVER_STATE_UNKNOWN:
|
case FAILOVER_STATE_UNKNOWN:
|
||||||
case FAILOVER_STATE_NONE:
|
case FAILOVER_STATE_NONE:
|
||||||
log_debug("failover state is %i", failover_state);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2533,6 +2532,22 @@ do_election(void)
|
|||||||
return ELECTION_CANCELLED;
|
return ELECTION_CANCELLED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log_debug("visible nodes: %i; total nodes: %i",
|
||||||
|
visible_nodes,
|
||||||
|
standby_nodes.node_count);
|
||||||
|
|
||||||
|
if (visible_nodes <= (standby_nodes.node_count / 2.0))
|
||||||
|
{
|
||||||
|
log_notice(_("unable to reach a qualified majority of nodes"));
|
||||||
|
log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
|
||||||
|
|
||||||
|
monitoring_state = MS_DEGRADED;
|
||||||
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||||
|
|
||||||
|
reset_node_voting_status();
|
||||||
|
|
||||||
|
return ELECTION_CANCELLED;
|
||||||
|
}
|
||||||
|
|
||||||
log_debug("promotion candidate is %i", candidate_node->node_id);
|
log_debug("promotion candidate is %i", candidate_node->node_id);
|
||||||
if (candidate_node->node_id == local_node_info.node_id)
|
if (candidate_node->node_id == local_node_info.node_id)
|
||||||
|
|||||||
Reference in New Issue
Block a user