repmgrd: don't fail over unless more than 50% of active nodes are visible.

2026-07-16 14:29:05 +00:00 · 2017-11-15 13:46:03 +09:00
parent 3c557ebd8e
commit 9d432546bf
2 changed files with 22 additions and 6 deletions
@@ -3545,6 +3545,7 @@ is_server_available(const char *conninfo)
 {
 	PGPing		status = PQping(conninfo);
 	log_verbose(LOG_DEBUG, "ping status for %s is %i", conninfo, (int)status);
 	if (status == PQPING_OK)
 		return true;
@@ -630,9 +630,9 @@ monitor_streaming_standby(void)
 	while (true)
 	{
 		log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
 		if (is_server_available(upstream_node_info.conninfo) == false)
 		{
 			/* upstream node is down, we were expecting it to be up */
 			if (upstream_node_info.node_status == NODE_STATUS_UP)
 			{
@@ -994,7 +994,7 @@ loop:
 		}
-		if (PQstatus(local_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
+		if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
 			update_monitoring_history();
 		if (got_SIGHUP)
@@ -1069,6 +1069,7 @@ monitor_streaming_witness(void)
 	/* synchronise local copy of "repmgr.nodes", in case it was stale */
 	witness_copy_node_records(primary_conn, local_conn);
 	/*
 	 * refresh upstream node record from primary, so it's as up-to-date
 	 * as possible
@@ -1527,7 +1528,6 @@ do_primary_failover(void)
 			monitoring_state = MS_DEGRADED;
 			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
 			log_debug("failover state is PROMOTION FAILED");
 			return false;
 		case FAILOVER_STATE_FOLLOW_FAIL:
@@ -1557,7 +1557,6 @@ do_primary_failover(void)
 		case FAILOVER_STATE_LOCAL_NODE_FAILURE:
 		case FAILOVER_STATE_UNKNOWN:
 		case FAILOVER_STATE_NONE:
 			log_debug("failover state is %i", failover_state);
 			return false;
 	}
@@ -2533,6 +2532,22 @@ do_election(void)
 		return ELECTION_CANCELLED;
 	}
 	log_debug("visible nodes: %i; total nodes: %i",
 			  visible_nodes,
 			  standby_nodes.node_count);
 	if (visible_nodes <= (standby_nodes.node_count / 2.0))
 	{
 		log_notice(_("unable to reach a qualified majority of nodes"));
 		log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
 		monitoring_state = MS_DEGRADED;
 		INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
 		reset_node_voting_status();
 		return ELECTION_CANCELLED;
 	}
 	log_debug("promotion candidate is %i", candidate_node->node_id);
 	if (candidate_node->node_id == local_node_info.node_id)