mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
1938 lines
52 KiB
C
1938 lines
52 KiB
C
/*
|
|
* repmgrd-physical.c - physical replication functionality for repmgrd
|
|
*
|
|
* Copyright (c) 2ndQuadrant, 2010-2017
|
|
*/
|
|
|
|
#include <signal.h>
|
|
|
|
#include "repmgr.h"
|
|
#include "repmgrd.h"
|
|
#include "repmgrd-physical.h"
|
|
|
|
|
|
/*
 * Failover state machine states, tracked in the file-scope variable
 * "failover_state" and driven by do_primary_failover().
 */
typedef enum {
	FAILOVER_STATE_UNKNOWN = -1,	/* initial / indeterminate state */
	FAILOVER_STATE_NONE,			/* no failover in progress */
	FAILOVER_STATE_PROMOTED,		/* this node promoted itself to primary */
	FAILOVER_STATE_PROMOTION_FAILED,	/* promotion was attempted but failed */
	FAILOVER_STATE_PRIMARY_REAPPEARED,	/* original primary came back before promotion */
	FAILOVER_STATE_LOCAL_NODE_FAILURE,	/* local node itself failed during failover */
	FAILOVER_STATE_WAITING_NEW_PRIMARY,	/* follower awaiting notification from the candidate */
	FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER,	/* failover=manual: operator must intervene */
	FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,	/* successfully repointed at the new primary */
	FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,	/* resumed following the original primary */
	FAILOVER_STATE_NO_NEW_PRIMARY,	/* no new primary emerged (e.g. notification timeout) */
	FAILOVER_STATE_FOLLOW_FAIL,		/* attempt to follow the new primary failed */
	FAILOVER_STATE_NODE_NOTIFICATION_ERROR	/* unable to notify the candidate node */
} FailoverState;
|
|
|
|
|
|
/*
 * Result of a promotion-candidate election, as returned by do_election().
 */
typedef enum {
	ELECTION_NOT_CANDIDATE = -1,	/* this node did not stand as a candidate */
	ELECTION_WON,					/* this node received all votes */
	ELECTION_LOST,					/* this node was a candidate but did not win outright */
	ELECTION_CANCELLED				/* election aborted (e.g. primary reappeared) */
} ElectionResult;
|
|
|
|
|
|
static PGconn *upstream_conn = NULL;
|
|
static PGconn *primary_conn = NULL;
|
|
|
|
#ifndef BDR_ONLY
|
|
static FailoverState failover_state = FAILOVER_STATE_UNKNOWN;
|
|
|
|
static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
|
|
static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
|
|
|
|
|
static ElectionResult do_election(void);
|
|
static const char *_print_voting_status(NodeVotingStatus voting_status);
|
|
static const char *_print_election_result(ElectionResult result);
|
|
|
|
static FailoverState promote_self(void);
|
|
static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
|
|
|
|
static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);
|
|
|
|
static void check_connection(t_node_info *node_info, PGconn *conn);
|
|
|
|
static bool wait_primary_notification(int *new_primary_id);
|
|
static FailoverState follow_new_primary(int new_primary_id);
|
|
|
|
static void reset_node_voting_status(void);
|
|
void close_connections_physical();
|
|
|
|
static bool do_primary_failover(void);
|
|
static bool do_upstream_standby_failover(void);
|
|
|
|
#endif
|
|
|
|
|
|
/* perform some sanity checks on the node's configuration */
|
|
|
|
void
|
|
do_physical_node_check(void)
|
|
{
|
|
#ifndef BDR_ONLY
|
|
/*
|
|
* Check if node record is active - if not, and `failover=automatic`, the node
|
|
* won't be considered as a promotion candidate; this often happens when
|
|
* a failed primary is recloned and the node was not re-registered, giving
|
|
* the impression failover capability is there when it's not. In this case
|
|
* abort with an error and a hint about registering.
|
|
*
|
|
* If `failover=manual`, repmgrd can continue to passively monitor the node, but
|
|
* we should nevertheless issue a warning and the same hint.
|
|
*/
|
|
|
|
if (local_node_info.active == false)
|
|
{
|
|
char *hint = "Check that 'repmgr (primary|standby) register' was executed for this node";
|
|
|
|
switch (config_file_options.failover)
|
|
{
|
|
/* "failover" is an enum, all values should be covered here */
|
|
|
|
case FAILOVER_AUTOMATIC:
|
|
log_error(_("this node is marked as inactive and cannot be used as a failover target"));
|
|
log_hint(_("%s"), hint);
|
|
PQfinish(local_conn);
|
|
terminate(ERR_BAD_CONFIG);
|
|
|
|
case FAILOVER_MANUAL:
|
|
log_warning(_("this node is marked as inactive and will be passively monitored only"));
|
|
log_hint(_("%s"), hint);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (config_file_options.failover == FAILOVER_AUTOMATIC)
|
|
{
|
|
/*
|
|
* check that promote/follow commands are defined, otherwise repmgrd
|
|
* won't be able to perform any useful action
|
|
*/
|
|
|
|
bool required_param_missing = false;
|
|
|
|
if (config_file_options.promote_command[0] == '\0')
|
|
{
|
|
log_error(_("\"promote_command\" must be defined in the configuration file"));
|
|
|
|
if (config_file_options.service_promote_command[0] != '\0')
|
|
{
|
|
/*
|
|
* if repmgrd executes "service_promote_command" directly, repmgr metadata
|
|
* won't get updated
|
|
*/
|
|
log_hint(_("\"service_promote_command\" is set, but can only be executed by \"repmgr standby promote\""));
|
|
}
|
|
|
|
required_param_missing = true;
|
|
}
|
|
if (config_file_options.follow_command[0] == '\0')
|
|
{
|
|
log_error(_("\"follow_command\" must be defined in the configuration file"));
|
|
required_param_missing = true;
|
|
}
|
|
|
|
if (required_param_missing == true)
|
|
{
|
|
log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
|
|
PQfinish(local_conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
* repmgrd running on the primary server
|
|
*/
|
|
/*
 * monitor_streaming_primary()
 *
 * Main monitoring loop executed when repmgrd is running on the cluster
 * primary. Logs a startup (or reload) event, then loops forever:
 * checks local node availability, attempts reconnection on failure,
 * switches to degraded monitoring mode if reconnection fails, and
 * terminates if the degraded monitoring timeout is exceeded.
 *
 * Never returns under normal operation (infinite loop); exits the
 * process via terminate() on degraded-monitoring timeout.
 */
void
monitor_streaming_primary(void)
{
#ifndef BDR_ONLY
	instr_time	log_status_interval_start;
	PQExpBufferData event_details;

	reset_node_voting_status();

	initPQExpBuffer(&event_details);

	appendPQExpBuffer(&event_details,
					  _("monitoring cluster primary \"%s\" (node ID: %i)"),
					  local_node_info.node_name,
					  local_node_info.node_id);

	/* Log startup event (or reload, if we've already logged startup once) */
	if (startup_event_logged == false)
	{
		create_event_notification(local_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_start",
								  true,
								  event_details.data);

		startup_event_logged = true;
	}
	else
	{
		create_event_notification(local_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_reload",
								  true,
								  event_details.data);
	}

	log_notice("%s", event_details.data);

	termPQExpBuffer(&event_details);

	INSTR_TIME_SET_CURRENT(log_status_interval_start);
	local_node_info.node_status = NODE_STATUS_UP;

	/* main monitoring loop - never exits except via terminate() */
	while (true)
	{
		// cache node list here, refresh at `node_list_refresh_interval`
		// also return reason for unavailability so we can log it
		if (is_server_available(local_node_info.conninfo) == false)
		{
			/* node is down, we were expecting it to be up */
			if (local_node_info.node_status == NODE_STATUS_UP)
			{
				/* NOTE: intentionally shadows the outer event_details */
				PQExpBufferData event_details;
				instr_time	local_node_unreachable_start;

				INSTR_TIME_SET_CURRENT(local_node_unreachable_start);

				initPQExpBuffer(&event_details);

				appendPQExpBuffer(&event_details,
								  _("unable to connect to local node"));

				log_warning("%s", event_details.data);

				local_node_info.node_status = NODE_STATUS_UNKNOWN;

				PQfinish(local_conn);

				/*
				 * as we're monitoring the primary, no point in trying to write
				 * the event to the database (hence NULL connection below)
				 *
				 * XXX possible pre-action event
				 */
				create_event_notification(NULL,
										  &config_file_options,
										  config_file_options.node_id,
										  "repmgrd_local_disconnect",
										  true,
										  event_details.data);

				termPQExpBuffer(&event_details);

				/*
				 * NOTE(review): try_reconnect() appears to update
				 * node_status as a side effect — the check below relies
				 * on that; confirm against its definition.
				 */
				local_conn = try_reconnect(&local_node_info);

				if (local_node_info.node_status == NODE_STATUS_UP)
				{
					int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to local node after %i seconds"),
									  local_node_unreachable_elapsed);
					log_notice("%s", event_details.data);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_local_reconnect",
											  true,
											  event_details.data);
					termPQExpBuffer(&event_details);

					/* skip straight to the status-logging/sleep section */
					goto loop;
				}

				/* reconnection failed - enter degraded monitoring mode */
				monitoring_state = MS_DEGRADED;
				INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
			}

		}

		if (monitoring_state == MS_DEGRADED)
		{
			int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

			/* give up entirely once the configured timeout (if any) is exceeded */
			if (config_file_options.degraded_monitoring_timeout > 0
				&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
			{
				initPQExpBuffer(&event_details);

				appendPQExpBuffer(&event_details,
								  _("degraded monitoring timeout (%i seconds) exceeded, terminating"),
								  degraded_monitoring_elapsed);

				log_notice("%s", event_details.data);

				create_event_notification(NULL,
										  &config_file_options,
										  config_file_options.node_id,
										  "repmgrd_terminate",
										  true,
										  event_details.data);

				termPQExpBuffer(&event_details);
				terminate(ERR_MONITORING_TIMEOUT);
			}

			log_debug("monitoring node in degraded state for %i seconds", degraded_monitoring_elapsed);

			/* poll for the local node's return; resume normal monitoring if it's back */
			if (is_server_available(local_node_info.conninfo) == true)
			{
				local_conn = establish_db_connection(local_node_info.conninfo, false);

				if (PQstatus(local_conn) == CONNECTION_OK)
				{
					local_node_info.node_status = NODE_STATUS_UP;
					monitoring_state = MS_NORMAL;

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to primary node after %i seconds, resuming monitoring"),
									  degraded_monitoring_elapsed);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_local_reconnect",
											  true,
											  event_details.data);

					log_notice("%s", event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}
			}

			// possibly attempt to find another node from cached list
			// check if there's a new primary - if so add hook for fencing?
			// loop, if starts up check status, switch monitoring mode
		}

loop:
		/* emit "still alive" log message at regular intervals, if requested */
		if (config_file_options.log_status_interval > 0)
		{
			int			log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);

			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
			{
				log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
						 local_node_info.node_name,
						 local_node_info.node_id,
						 print_monitoring_state(monitoring_state));

				if (monitoring_state == MS_DEGRADED)
				{
					log_detail(_("waiting for primary to reappear"));
				}

				INSTR_TIME_SET_CURRENT(log_status_interval_start);
			}
		}

		log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
					config_file_options.monitor_interval_secs);

		sleep(config_file_options.monitor_interval_secs);
	}
#endif
}
|
|
|
|
|
|
/*
 * monitor_streaming_standby()
 *
 * Main monitoring loop executed when repmgrd is running on a standby.
 * Determines and connects to the upstream node (and to the cluster
 * primary, if the upstream is a cascaded standby), logs a startup
 * event, then loops: checks upstream availability, attempts
 * reconnection, and initiates failover handling when the upstream is
 * confirmed down.
 *
 * Returns (rather than looping forever) when a failover completes or
 * when the local node is found to have become primary, so the caller
 * can restart monitoring in the appropriate mode. Exits the process
 * on unrecoverable configuration/connection errors.
 */
void
monitor_streaming_standby(void)
{
#ifndef BDR_ONLY
	RecordStatus record_status;
	instr_time	log_status_interval_start;
	PQExpBufferData event_details;

	reset_node_voting_status();

	log_debug("monitor_streaming_standby()");

	/*
	 * If no upstream node id is specified in the metadata, we'll try
	 * and determine the current cluster primary in the assumption we
	 * should connect to that by default.
	 */
	if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
	{
		local_node_info.upstream_node_id = get_primary_node_id(local_conn);

		/*
		 * Terminate if there doesn't appear to be an active cluster primary.
		 * There could be one or more nodes marked as inactive primaries, and one
		 * of them could actually be a primary, but we can't sensibly monitor
		 * in that state.
		 */
		if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
		{
			// XXX check if there's an inactive record(s) and log detail/hint
			log_error(_("unable to determine an active primary for this cluster, terminating"));
			PQfinish(local_conn);
			exit(ERR_BAD_CONFIG);
		}
	}

	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);

	/*
	 * Terminate if we can't find the record for the node we're supposed
	 * to monitor. This is a "fix-the-config" situation, not a lot else we
	 * can do.
	 */
	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
				  local_node_info.upstream_node_id);
		PQfinish(local_conn);
		exit(ERR_DB_CONN);
	}

	log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);

	upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);

	/*
	 * Upstream node must be running.
	 *
	 * We could possibly have repmgrd skip to degraded monitoring mode until it
	 * comes up, but there doesn't seem to be much point in doing that.
	 */
	if (PQstatus(upstream_conn) != CONNECTION_OK)
	{
		log_error(_("unable connect to upstream node (ID: %i), terminating"),
				  local_node_info.upstream_node_id);
		log_hint(_("upstream node must be running before repmgrd can start"));

		PQfinish(local_conn);
		exit(ERR_DB_CONN);
	}

	/* refresh upstream node record from upstream node, so it's as up-to-date as possible */
	record_status = get_node_record(upstream_conn, upstream_node_info.node_id, &upstream_node_info);

	if (upstream_node_info.type == STANDBY)
	{
		/*
		 * Currently cascaded standbys need to be able to connect to the primary.
		 * We could possibly add a limited connection mode for cases where this isn't
		 * possible.
		 */
		primary_conn = establish_primary_db_connection(upstream_conn, false);

		if (PQstatus(primary_conn) != CONNECTION_OK)
		{
			log_error(_("unable to connect to primary node"));
			log_hint(_("ensure the primary node is reachable from this node"));
			exit(ERR_DB_CONN);
		}

		log_verbose(LOG_DEBUG, "connected to primary");
	}
	else
	{
		/* upstream is the primary itself - share the connection */
		primary_conn = upstream_conn;
	}

	/* Log startup event */
	if (startup_event_logged == false)
	{
		/* NOTE: intentionally shadows the function-scope event_details */
		PQExpBufferData event_details;
		initPQExpBuffer(&event_details);

		appendPQExpBuffer(&event_details,
						  _("monitoring upstream node \"%s\" (node ID: %i)"),
						  upstream_node_info.node_name,
						  upstream_node_info.node_id);

		create_event_notification(primary_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_start",
								  true,
								  event_details.data);

		startup_event_logged = true;

		log_notice("%s", event_details.data);

		termPQExpBuffer(&event_details);
	}

	monitoring_state = MS_NORMAL;
	INSTR_TIME_SET_CURRENT(log_status_interval_start);
	upstream_node_info.node_status = NODE_STATUS_UP;

	/* main monitoring loop - returns only when failover has been handled */
	while (true)
	{
		if (is_server_available(upstream_node_info.conninfo) == false)
		{
			/* upstream node is down, we were expecting it to be up */
			if (upstream_node_info.node_status == NODE_STATUS_UP)
			{
				instr_time	upstream_node_unreachable_start;

				INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);

				initPQExpBuffer(&event_details);

				upstream_node_info.node_status = NODE_STATUS_UNKNOWN;

				appendPQExpBuffer(&event_details,
								  _("unable to connect to upstream node \"%s\" (node ID: %i)"),
								  upstream_node_info.node_name, upstream_node_info.node_id);

				if (upstream_node_info.type == STANDBY)
				{
					/* XXX possible pre-action event */
					/*
					 * NOTE(review): this calls create_event_record() while the
					 * rest of the file uses create_event_notification() —
					 * confirm whether these are distinct APIs or an old name.
					 */
					create_event_record(primary_conn,
										&config_file_options,
										config_file_options.node_id,
										"repmgrd_upstream_disconnect",
										true,
										event_details.data);
				}

				log_warning("%s", event_details.data);
				termPQExpBuffer(&event_details);

				PQfinish(upstream_conn);
				/* NOTE(review): try_reconnect() appears to update node_status; confirm */
				upstream_conn = try_reconnect(&upstream_node_info);

				/* Node has recovered - log and continue */
				if (upstream_node_info.node_status == NODE_STATUS_UP)
				{
					int			upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to upstream node after %i seconds"),
									  upstream_node_unreachable_elapsed);
					log_notice("%s", event_details.data);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_upstream_reconnect",
											  true,
											  event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}

				/* still down after reconnect attempt(s) */
				if (upstream_node_info.node_status == NODE_STATUS_DOWN)
				{
					bool		failover_done = false;

					if (upstream_node_info.type == PRIMARY)
					{
						failover_done = do_primary_failover();
					}
					else if (upstream_node_info.type == STANDBY)
					{
						failover_done = do_upstream_standby_failover();
					}

					// it's possible it will make sense to return in
					// all cases to restart monitoring
					if (failover_done == true)
						return;
				}
			}
		}

		if (monitoring_state == MS_DEGRADED)
		{
			int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

			log_debug("monitoring node %i in degraded state for %i seconds",
					  upstream_node_info.node_id,
					  degraded_monitoring_elapsed);

			if (is_server_available(upstream_node_info.conninfo) == true)
			{
				upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);

				if (PQstatus(upstream_conn) == CONNECTION_OK)
				{
					// XXX check here if upstream is still primary
					// -> will be a problem if another node was promoted in the meantime
					// and upstream is now former primary
					// XXX scan other nodes to see if any has become primary

					upstream_node_info.node_status = NODE_STATUS_UP;
					monitoring_state = MS_NORMAL;

					if (upstream_node_info.type == PRIMARY)
					{
						primary_conn = upstream_conn;
					}
					else
					{
						/* re-establish a primary connection if the old one has gone */
						if (primary_conn == NULL || PQstatus(primary_conn) != CONNECTION_OK)
						{
							primary_conn = establish_primary_db_connection(upstream_conn, false);
						}
					}

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to upstream node %i after %i seconds, resuming monitoring"),
									  upstream_node_info.node_id,
									  degraded_monitoring_elapsed);

					create_event_notification(primary_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_upstream_reconnect",
											  true,
											  event_details.data);

					log_notice("%s", event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}
			}
			else
			{
				/*
				 * unable to connect to former primary - check if another node has
				 * been promoted
				 */

				NodeInfoListCell *cell;
				int			follow_node_id = UNKNOWN_NODE_ID;

				/* local node has been promoted */
				if (get_recovery_type(local_conn) == RECTYPE_PRIMARY)
				{
					log_notice(_("local node is primary, checking local node record"));

					/*
					 * There may be a delay between the node being promoted and the local
					 * record being updated, so if the node record still shows it as a
					 * standby, do nothing, we'll catch the update during the next loop.
					 * (e.g. node was manually
					 * promoted) we'll do nothing, as the repmgr metadata is now out-of-sync.
					 * If it does get fixed, we'll catch it here on a future iteration.
					 */

					/* refresh own internal node record */
					record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);

					if (local_node_info.type == PRIMARY)
					{
						int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

						log_notice(_("resuming monitoring as primary node after %i seconds"),
								   degraded_monitoring_elapsed);

						/* this will restart monitoring in primary mode */
						monitoring_state = MS_NORMAL;
						return;
					}
				}

				if (config_file_options.failover == FAILOVER_AUTOMATIC)
				{
					get_active_sibling_node_records(local_conn,
													local_node_info.node_id,
													local_node_info.upstream_node_id,
													&standby_nodes);

					if (standby_nodes.node_count > 0)
					{
						log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
						for (cell = standby_nodes.head; cell; cell = cell->next)
						{
							/* skip local node check, we did that above */
							if (cell->node_info->node_id == local_node_info.node_id)
							{
								continue;
							}

							cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);

							if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
							{
								log_debug("unable to connect to %i ... ", cell->node_info->node_id);
								continue;
							}

							if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
							{
								follow_node_id = cell->node_info->node_id;
								PQfinish(cell->node_info->conn);
								cell->node_info->conn = NULL;
								break;
							}
							PQfinish(cell->node_info->conn);
							cell->node_info->conn = NULL;
						}

						if (follow_node_id != UNKNOWN_NODE_ID)
						{
							follow_new_primary(follow_node_id);
						}
					}
				}

			}
		}

loop:

		/* emit "still alive" log message at regular intervals, if requested */
		if (config_file_options.log_status_interval > 0)
		{
			int			log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);

			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
			{
				PQExpBufferData monitoring_summary;
				initPQExpBuffer(&monitoring_summary);

				appendPQExpBuffer(
					&monitoring_summary,
					_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i) in %s state"),
					local_node_info.node_name,
					local_node_info.node_id,
					upstream_node_info.node_name,
					upstream_node_info.node_id,
					print_monitoring_state(monitoring_state));

				if (config_file_options.failover == FAILOVER_MANUAL)
				{
					appendPQExpBuffer(
						&monitoring_summary,
						_(" (automatic failover disabled)"));
				}

				log_info("%s", monitoring_summary.data);
				termPQExpBuffer(&monitoring_summary);
				if (monitoring_state == MS_DEGRADED && config_file_options.failover == FAILOVER_AUTOMATIC)
				{
					log_detail(_("waiting for upstream or another primary to reappear"));
				}

				INSTR_TIME_SET_CURRENT(log_status_interval_start);
			}
		}

		/*
		 * handle local node failure
		 *
		 * currently we'll just check the connection, and try to reconnect
		 *
		 * TODO: add timeout, after which we run in degraded state
		 */

		check_connection(&local_node_info, local_conn);

		sleep(config_file_options.monitor_interval_secs);
	}
#endif
}
|
|
|
|
#ifndef BDR_ONLY
|
|
/*
 * do_primary_failover()
 *
 * Handle failover when the monitored primary is confirmed down.
 *
 * Runs an election among the standbys; depending on the result this
 * node either promotes itself, follows the winning candidate, waits
 * for the new primary's notification, or (with failover=manual) drops
 * into degraded monitoring for operator intervention. The resulting
 * FailoverState is stored in the file-scope "failover_state" and then
 * dispatched in the final switch.
 *
 * Returns true when failover completed and the caller should return
 * to restart monitoring in the appropriate mode; false when monitoring
 * should continue (possibly in degraded state).
 */
static bool
do_primary_failover(void)
{
	/* attempt to initiate voting process */
	ElectionResult election_result = do_election();

	/* TODO add pre-event notification here */
	failover_state = FAILOVER_STATE_UNKNOWN;

	log_debug("election result: %s", _print_election_result(election_result));

	if (election_result == ELECTION_CANCELLED)
	{
		log_notice(_("election cancelled"));
		return false;
	}
	else if (election_result == ELECTION_WON)
	{
		log_notice("I am the winner, will now promote self and inform other nodes");

		failover_state = promote_self();
	}
	else if (election_result == ELECTION_LOST)
	{
		t_node_info *best_candidate;

		log_info(_("I am the candidate but did not get all votes; will now determine the best candidate"));

		/* standby_nodes is in the state created by do_election() */
		best_candidate = poll_best_candidate(&standby_nodes);

		/*
		 * this can occur in a tie-break situation, where this node establishes
		 * it is the best candidate
		 */
		if (best_candidate->node_id == local_node_info.node_id)
		{
			log_notice("I am the best candidate, will now promote self and inform other nodes");

			failover_state = promote_self();
		}
		else
		{
			PGconn	   *candidate_conn = NULL;

			log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
					 best_candidate->node_id);

			/* notify the best candidate so it can promote itself */

			candidate_conn = establish_db_connection(best_candidate->conninfo, false);

			if (PQstatus(candidate_conn) == CONNECTION_OK)
			{
				notify_follow_primary(candidate_conn, best_candidate->node_id);

				/* we'll wait for the candidate to get back to us */
				failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
			}
			else
			{
				log_error(_("unable to connect to candidate node (ID: %i)"), best_candidate->node_id);
				failover_state = FAILOVER_STATE_NODE_NOTIFICATION_ERROR;
			}
			/* safe on a failed connection too: PQfinish() frees the PGconn in any state */
			PQfinish(candidate_conn);
		}
	}
	else
	{
		/* ELECTION_NOT_CANDIDATE: this node did not stand in the election */

		if (standby_nodes.node_count == 0)
		{
			/* Node is not a candidate but no other nodes are available */
			log_notice(_("no other nodes are available as promotion candidate"));
			log_hint(_("use \"repmgr standby promote\" to manually promote this node"));

			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);

			failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
		}
		else
		{
			log_info(_("follower node awaiting notification from the candidate node"));
			failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
		}
	}

	/*
	 * node has decided it is a follower, so will await notification
	 * from the candidate that it has promoted itself and can be followed
	 */
	if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
	{
		int			new_primary_id;

		// --> need timeout in case new primary doesn't come up, then rerun election

		/* either follow or time out; either way resume monitoring */
		if (wait_primary_notification(&new_primary_id) == true)
		{
			/* if primary has reappeared, no action needed */
			if (new_primary_id == upstream_node_info.node_id)
			{
				failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
			}
			/* if new_primary_id is self, promote */
			else if (new_primary_id == local_node_info.node_id)
			{
				log_notice(_("this node is promotion candidate, promoting"));

				failover_state = promote_self();

				/* refresh sibling list so the PROMOTED case below can notify them */
				get_active_sibling_node_records(local_conn,
												local_node_info.node_id,
												upstream_node_info.node_id,
												&standby_nodes);

			}
			else if (config_file_options.failover == FAILOVER_MANUAL)
			{
				/* automatic failover disabled */

				t_node_info new_primary = T_NODE_INFO_INITIALIZER;
				RecordStatus record_status = RECORD_NOT_FOUND;
				PGconn	   *new_primary_conn;

				record_status = get_node_record(local_conn, new_primary_id, &new_primary);

				if (record_status != RECORD_FOUND)
				{
					log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
							  new_primary_id);
				}
				else
				{
					PQExpBufferData event_details;
					initPQExpBuffer(&event_details);
					appendPQExpBuffer(&event_details,
									  _("node %i is in manual failover mode and is now disconnected from streaming replication"),
									  local_node_info.node_id);

					new_primary_conn = establish_db_connection(new_primary.conninfo, false);

					create_event_notification(
						new_primary_conn,
						&config_file_options,
						local_node_info.node_id,
						"standby_disconnect_manual",
						/* here "true" indicates the action has occurred as expected */
						true,
						event_details.data);
					PQfinish(new_primary_conn);
					termPQExpBuffer(&event_details);

				}
				failover_state = FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER;
			}
			else
			{
				failover_state = follow_new_primary(new_primary_id);
			}
		}
		else
		{
			failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
		}
	}

	/* dispatch on the final failover state */
	switch(failover_state)
	{
		case FAILOVER_STATE_PROMOTED:
			log_debug("failover state is PROMOTED");

			/* notify former siblings that they should now follow this node */
			notify_followers(&standby_nodes, local_node_info.node_id);

			/* we no longer care about our former siblings */
			clear_node_info_list(&standby_nodes);

			/* pass control back down to start_monitoring() */
			log_info(_("switching to primary monitoring mode"));

			failover_state = FAILOVER_STATE_NONE;
			return true;

		case FAILOVER_STATE_PRIMARY_REAPPEARED:
			log_debug("failover state is PRIMARY_REAPPEARED");

			/* notify siblings that they should resume following the original primary */
			notify_followers(&standby_nodes, upstream_node_info.node_id);

			/* we no longer care about our former siblings */
			clear_node_info_list(&standby_nodes);

			/* pass control back down to start_monitoring() */
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("original primary \"%s\" (node ID: %i) reappeared"),
					   upstream_node_info.node_name, upstream_node_info.node_id);

			failover_state = FAILOVER_STATE_NONE;
			return true;


		case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("following new primary \"%s\" (node id: %i)"),
					   upstream_node_info.node_name, upstream_node_info.node_id);
			failover_state = FAILOVER_STATE_NONE;

			return true;

		case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("following original primary \"%s\" (node id: %i)"),
					   upstream_node_info.node_name, upstream_node_info.node_id);
			failover_state = FAILOVER_STATE_NONE;

			return true;

		case FAILOVER_STATE_PROMOTION_FAILED:
			log_debug("failover state is PROMOTION FAILED");
			return false;

		case FAILOVER_STATE_FOLLOW_FAIL:
			/*
			 * for whatever reason we were unable to follow the new primary -
			 * continue monitoring in degraded state
			 */
			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);

			return false;

		case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
			log_info(_("automatic failover disabled for this node, manual intervention required"));

			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
			return false;

		case FAILOVER_STATE_NO_NEW_PRIMARY:
		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
			/* pass control back down to start_monitoring() */
			// -> should kick off new election
			return false;

		case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
		case FAILOVER_STATE_LOCAL_NODE_FAILURE:
		case FAILOVER_STATE_UNKNOWN:
		case FAILOVER_STATE_NONE:
			log_debug("failover state is %i", failover_state);
			return false;
	}

	/* should never reach here */
	return false;
}
|
|
|
|
|
|
/*
|
|
* do_upstream_standby_failover()
|
|
*
|
|
* Attach cascaded standby to primary
|
|
*
|
|
* Currently we will try to attach to the cluster primary, as "repmgr
|
|
* standby follow" doesn't support attaching to another node.
|
|
*
|
|
* If this becomes supported, it might be worth providing a selection
|
|
* of reconnection strategies as different behaviour might be desirable
|
|
* in different situations;
|
|
* or maybe the option not to reconnect might be required?
|
|
*
|
|
* XXX check this handles replication slots gracefully
|
|
*/
|
|
static bool
|
|
do_upstream_standby_failover(void)
|
|
{
|
|
PQExpBufferData event_details;
|
|
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
|
RecordStatus record_status;
|
|
int r;
|
|
|
|
PQfinish(upstream_conn);
|
|
upstream_conn = NULL;
|
|
|
|
if (get_primary_node_record(local_conn, &primary_node_info) == false)
|
|
{
|
|
log_error(_("unable to retrieve primary node record"));
|
|
return false;
|
|
}
|
|
/*
|
|
* Verify that we can still talk to the cluster primary, even though
|
|
* the node's upstream is not available
|
|
*/
|
|
|
|
check_connection(&primary_node_info, primary_conn);
|
|
|
|
|
|
/* grandparent upstream is inactive */
|
|
if (primary_node_info.active == false)
|
|
{
|
|
// XXX
|
|
}
|
|
|
|
/* Close the connection to this server */
|
|
PQfinish(local_conn);
|
|
local_conn = NULL;
|
|
|
|
initPQExpBuffer(&event_details);
|
|
|
|
log_debug(_("standby follow command is:\n \"%s\""),
|
|
config_file_options.follow_command);
|
|
|
|
r = system(config_file_options.follow_command);
|
|
|
|
if (r != 0)
|
|
{
|
|
appendPQExpBuffer(&event_details,
|
|
_("unable to execute follow command:\n %s"),
|
|
config_file_options.follow_command);
|
|
|
|
log_error("%s", event_details.data);
|
|
|
|
/* It may not possible to write to the event notification
|
|
* table but we should be able to generate an external notification
|
|
* if required.
|
|
*/
|
|
create_event_notification(
|
|
primary_conn,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
false,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
}
|
|
|
|
/* reconnect to local node */
|
|
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
|
|
|
if (update_node_record_set_upstream(primary_conn,
|
|
local_node_info.node_id,
|
|
primary_node_info.node_id) == false)
|
|
{
|
|
appendPQExpBuffer(&event_details,
|
|
_("unable to set node %i's new upstream ID to %i"),
|
|
local_node_info.node_id,
|
|
primary_node_info.node_id);
|
|
|
|
log_error("%s", event_details.data);
|
|
|
|
create_event_notification(
|
|
NULL,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
false,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
|
|
terminate(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/* refresh own internal node record */
|
|
record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
|
|
|
|
/*
|
|
* highly improbable this will happen, but in case we're unable to retrieve
|
|
* our node record from the primary, update it ourselves, and hope for the best
|
|
*/
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
local_node_info.upstream_node_id = primary_node_info.node_id;
|
|
}
|
|
|
|
appendPQExpBuffer(&event_details,
|
|
_("node %i is now following primary node %i"),
|
|
local_node_info.node_id,
|
|
primary_node_info.node_id);
|
|
|
|
log_notice("%s", event_details.data);
|
|
|
|
create_event_notification(
|
|
primary_conn,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
true,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
|
|
PQfinish(primary_conn);
|
|
primary_conn = NULL;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
 * promote_self()
 *
 * Attempt to promote this standby to primary by running the configured
 * promote_command, then generate a "repmgrd_failover_promote" event.
 *
 * Returns one of:
 *   FAILOVER_STATE_PROMOTED            - promotion succeeded
 *   FAILOVER_STATE_PROMOTION_FAILED    - failed node record unavailable, or
 *                                        promote command returned non-zero
 *   FAILOVER_STATE_PRIMARY_REAPPEARED  - promote command failed because the
 *                                        original primary came back
 *   FAILOVER_STATE_LOCAL_NODE_FAILURE  - local connection lost and could not
 *                                        be re-established
 */
static FailoverState
promote_self(void)
{
	PQExpBufferData event_details;
	char *promote_command;
	int r;

	/* Store details of the failed node here */
	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
	RecordStatus record_status;

	/*
	 * optionally add a delay before promoting the standby; this is mainly
	 * useful for testing (e.g. for reappearance of the original primary)
	 * and is not documented.
	 */
	if (config_file_options.promote_delay > 0)
	{
		log_debug("sleeping %i seconds before promoting standby",
				  config_file_options.promote_delay);
		sleep(config_file_options.promote_delay);
	}

	/* we need the failed upstream's record for event text and sanity checks below */
	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);

	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
				  local_node_info.upstream_node_id);
		return FAILOVER_STATE_PROMOTION_FAILED;
	}

	/* the presence of either of this command has been established already */
	promote_command = config_file_options.promote_command;

	log_debug("promote command is:\n \"%s\"",
			  promote_command);

	/* flush our own output before the child process writes to stderr */
	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
	{
		fflush(stderr);
	}

	r = system(promote_command);

	/* connection should stay up, but check just in case */
	if(PQstatus(local_conn) != CONNECTION_OK)
	{
		local_conn = establish_db_connection(local_node_info.conninfo, true);

		/* assume node failed */
		if(PQstatus(local_conn) != CONNECTION_OK)
		{
			log_error(_("unable to reconnect to local node"));
			// XXX handle this
			return FAILOVER_STATE_LOCAL_NODE_FAILURE;
		}
	}

	/* promote command failed - check whether the old primary reappeared */
	if (r != 0)
	{
		int primary_node_id;

		upstream_conn = get_primary_connection(local_conn,
											   &primary_node_id, NULL);

		/*
		 * If we can reach a primary and it is the same node we were
		 * previously attached to, the failover is aborted.
		 */
		if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
		{
			log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
					   failed_primary.node_id);

			initPQExpBuffer(&event_details);
			appendPQExpBuffer(&event_details,
							  _("original primary \"%s\" (node ID: %i) reappeared"),
							  failed_primary.node_name,
							  failed_primary.node_id);

			create_event_notification(upstream_conn,
									  &config_file_options,
									  local_node_info.node_id,
									  "repmgrd_failover_abort",
									  true,
									  event_details.data);

			termPQExpBuffer(&event_details);

			//primary_conn = NULL;

			// XXX handle this!
			// -> we'll need to let the other nodes know too....
			/* no failover occurred but we'll want to restart connections */
			//failover_done = true;
			return FAILOVER_STATE_PRIMARY_REAPPEARED;
		}

		// handle this
		// -> check if somehow primary; otherwise go for new election?
		log_error(_("promote command failed"));
		return FAILOVER_STATE_PROMOTION_FAILED;
	}

	initPQExpBuffer(&event_details);

	/* update own internal node record */
	/* NOTE(review): record_status is not checked here - TODO confirm intentional */
	record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);

	/*
	 * XXX here we're assuming the promote command updated metadata
	 */
	appendPQExpBuffer(&event_details,
					  _("node %i promoted to primary; old primary %i marked as failed"),
					  local_node_info.node_id,
					  failed_primary.node_id);

	/* local_conn is now the primary connection */
	create_event_notification(local_conn,
							  &config_file_options,
							  local_node_info.node_id,
							  "repmgrd_failover_promote",
							  true,
							  event_details.data);

	termPQExpBuffer(&event_details);

	return FAILOVER_STATE_PROMOTED;
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
* Notify follower nodes about which node to follow. Normally this
|
|
* will be the current node, however if the original primary reappeared
|
|
* before this node could be promoted, we'll inform the followers they
|
|
* should resume monitoring the original primary.
|
|
*/
|
|
static void
|
|
notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
|
|
{
|
|
NodeInfoListCell *cell;
|
|
|
|
log_debug("notify_followers()");
|
|
for (cell = standby_nodes->head; cell; cell = cell->next)
|
|
{
|
|
log_debug("intending to notify node %i... ", cell->node_info->node_id);
|
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
|
{
|
|
log_debug("reconnecting to node %i... ", cell->node_info->node_id);
|
|
|
|
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
|
}
|
|
|
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
|
{
|
|
log_debug("unable to reconnect to %i ... ", cell->node_info->node_id);
|
|
|
|
continue;
|
|
}
|
|
|
|
log_debug("notifying node %i to follow node %i",
|
|
cell->node_info->node_id, follow_node_id);
|
|
notify_follow_primary(cell->node_info->conn, follow_node_id);
|
|
}
|
|
}
|
|
|
|
|
|
static t_node_info *
|
|
poll_best_candidate(NodeInfoList *standby_nodes)
|
|
{
|
|
NodeInfoListCell *cell;
|
|
t_node_info *best_candidate = &local_node_info;
|
|
|
|
|
|
/*
|
|
* we need to definitively decide the best candidate, as in some corner
|
|
* cases we could end up with two candidate nodes, so they should each
|
|
* come to the same conclusion.
|
|
*
|
|
* XXX check there are no cases where the standby node's LSN is
|
|
* not set
|
|
*/
|
|
for (cell = standby_nodes->head; cell; cell = cell->next)
|
|
{
|
|
if (cell->node_info->last_wal_receive_lsn > best_candidate->last_wal_receive_lsn)
|
|
{
|
|
log_debug("node %i has higher LSN, now best candidate", cell->node_info->node_id);
|
|
best_candidate = cell->node_info;
|
|
}
|
|
else if (cell->node_info->last_wal_receive_lsn == best_candidate->last_wal_receive_lsn)
|
|
{
|
|
if (cell->node_info->priority > best_candidate->priority)
|
|
{
|
|
log_debug("node %i has higher priority, now best candidate", cell->node_info->node_id);
|
|
best_candidate = cell->node_info;
|
|
}
|
|
}
|
|
/* if all else fails, we decide by node_id */
|
|
else if (cell->node_info->node_id < best_candidate->node_id)
|
|
{
|
|
log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
|
|
best_candidate = cell->node_info;
|
|
}
|
|
|
|
if (cell->node_info->conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
|
|
{
|
|
PQfinish(cell->node_info->conn);
|
|
cell->node_info->conn = NULL;
|
|
}
|
|
}
|
|
|
|
log_info(_("best candidate is node %s (node ID: %i)"),
|
|
best_candidate->node_name,
|
|
best_candidate->node_id);
|
|
|
|
return best_candidate;
|
|
}
|
|
|
|
|
|
static bool
|
|
wait_primary_notification(int *new_primary_id)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < config_file_options.primary_notification_timeout; i++)
|
|
{
|
|
if (get_new_primary(local_conn, new_primary_id) == true)
|
|
{
|
|
log_debug("new primary is %i; elapsed: %i",
|
|
*new_primary_id, i);
|
|
return true;
|
|
}
|
|
sleep(1);
|
|
}
|
|
|
|
|
|
log_warning(_("no notification received from new primary after %i seconds"),
|
|
config_file_options.primary_notification_timeout);
|
|
|
|
monitoring_state = MS_DEGRADED;
|
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
static FailoverState
|
|
follow_new_primary(int new_primary_id)
|
|
{
|
|
PQExpBufferData event_details;
|
|
int r;
|
|
|
|
/* Store details of the failed node here */
|
|
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
|
|
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
|
|
RecordStatus record_status = RECORD_NOT_FOUND;
|
|
bool new_primary_ok = false;
|
|
|
|
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
|
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
|
|
new_primary_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
|
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record for failed primary (ID: %i)"),
|
|
local_node_info.upstream_node_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
// XXX check if new_primary_id == failed_primary.node_id?
|
|
|
|
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
|
|
{
|
|
fflush(stderr);
|
|
}
|
|
|
|
log_debug(_("standby follow command is:\n \"%s\""),
|
|
config_file_options.follow_command);
|
|
|
|
upstream_conn = establish_db_connection(new_primary.conninfo, false);
|
|
|
|
if (PQstatus(upstream_conn) == CONNECTION_OK)
|
|
{
|
|
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
|
|
if (primary_recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
new_primary_ok = true;
|
|
}
|
|
else
|
|
{
|
|
log_warning(_("new primary is not in recovery"));
|
|
PQfinish(upstream_conn);
|
|
}
|
|
}
|
|
|
|
if (new_primary_ok == false)
|
|
{
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
/*
|
|
* disconnect from local node, as follow operation will result in
|
|
* a server restart
|
|
*/
|
|
|
|
PQfinish(local_conn);
|
|
local_conn = NULL;
|
|
|
|
/* execute the follow command */
|
|
r = system(config_file_options.follow_command);
|
|
|
|
if (r != 0)
|
|
{
|
|
PGconn *old_primary_conn;
|
|
/*
|
|
* The follow action could still fail due to the original primary reappearing
|
|
* before the candidate could promote itself ("repmgr standby follow" will
|
|
* refuse to promote another node if the primary is available). However
|
|
* the new primary will only instruct use to follow it after it's successfully
|
|
* promoted itself, so that very likely won't be the reason for the failure.
|
|
*
|
|
*
|
|
* TODO: check the new primary too - we could have a split-brain
|
|
* situation where the old primary reappeared just after the new
|
|
* one promoted itself.
|
|
*/
|
|
old_primary_conn = establish_db_connection(failed_primary.conninfo, false);
|
|
|
|
if (PQstatus(old_primary_conn) == CONNECTION_OK)
|
|
{
|
|
// XXX add event notifications
|
|
RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
|
|
PQfinish(old_primary_conn);
|
|
|
|
if (upstream_recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
log_notice(_("original primary reappeared - no action taken"));
|
|
return FAILOVER_STATE_PRIMARY_REAPPEARED;
|
|
}
|
|
}
|
|
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
|
|
/*
|
|
* refresh local copy of local and primary node records - we get these
|
|
* directly from the primary to ensure they're the current version
|
|
*/
|
|
|
|
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
|
|
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record found for node %i"),
|
|
new_primary_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record found for node %i"),
|
|
local_node_info.node_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
|
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
|
initPQExpBuffer(&event_details);
|
|
appendPQExpBuffer(&event_details,
|
|
_("node %i now following new upstream node %i"),
|
|
local_node_info.node_id,
|
|
upstream_node_info.node_id);
|
|
|
|
log_notice("%s\n", event_details.data);
|
|
|
|
create_event_notification(
|
|
upstream_conn,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
true,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
|
|
return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
|
|
}
|
|
|
|
|
|
static const char *
|
|
_print_voting_status(NodeVotingStatus voting_status)
|
|
{
|
|
switch(voting_status)
|
|
{
|
|
case VS_NO_VOTE:
|
|
return "NO VOTE";
|
|
|
|
case VS_VOTE_REQUEST_RECEIVED:
|
|
return "VOTE REQUEST RECEIVED";
|
|
|
|
case VS_VOTE_INITIATED:
|
|
return "VOTE REQUEST INITIATED";
|
|
|
|
case VS_UNKNOWN:
|
|
return "VOTE REQUEST UNKNOWN";
|
|
}
|
|
|
|
return "UNKNOWN VOTE REQUEST STATE";
|
|
}
|
|
|
|
static const char *
|
|
_print_election_result(ElectionResult result)
|
|
{
|
|
switch(result)
|
|
{
|
|
case ELECTION_NOT_CANDIDATE:
|
|
return "NOT CANDIDATE";
|
|
|
|
case ELECTION_WON:
|
|
return "WON";
|
|
|
|
case ELECTION_LOST:
|
|
return "LOST";
|
|
|
|
case ELECTION_CANCELLED:
|
|
return "CANCELLED";
|
|
}
|
|
|
|
/* should never reach here */
|
|
return "UNKNOWN";
|
|
}
|
|
|
|
|
|
/*
|
|
* NB: this function sets standby_nodes; caller (do_primary_failover)
|
|
* expects to be able to read this list
|
|
*/
|
|
static ElectionResult
|
|
do_election(void)
|
|
{
|
|
int electoral_term = -1;
|
|
|
|
int votes_for_me = 0;
|
|
|
|
/* we're visible */
|
|
int visible_nodes = 1;
|
|
|
|
/*
|
|
* get voting status from shared memory - should be one of "VS_NO_VOTE"
|
|
* or "VS_VOTE_REQUEST_RECEIVED". If VS_NO_VOTE, we declare ourselves as
|
|
* candidate and initiate the voting process.
|
|
*/
|
|
NodeVotingStatus voting_status;
|
|
|
|
NodeInfoListCell *cell;
|
|
|
|
bool other_node_is_candidate = false;
|
|
bool other_node_is_ahead = false;
|
|
|
|
/*
|
|
* Check if at least one server in the primary's location is visible;
|
|
* if not we'll assume a network split between this node and the primary
|
|
* location, and not promote any standby.
|
|
*
|
|
* NOTE: this function is only ever called by standbys attached to the current
|
|
* (unreachable) primary, so "upstream_node_info" will always contain the
|
|
* primary node record.
|
|
*/
|
|
bool primary_location_seen = false;
|
|
|
|
/*
|
|
* sleep for a random period of 100 ~ 350 ms
|
|
*/
|
|
|
|
long unsigned rand_wait = (long) ((rand() % 35) + 10) * 10000;
|
|
|
|
/* get all active nodes attached to primary, excluding self */
|
|
get_active_sibling_node_records(local_conn,
|
|
local_node_info.node_id,
|
|
upstream_node_info.node_id,
|
|
&standby_nodes);
|
|
|
|
if (config_file_options.failover == FAILOVER_MANUAL)
|
|
{
|
|
log_notice(_("this node is not configured for automatic failover so will not be considered as promotion candidate"));
|
|
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
/* node priority is set to zero - don't ever become a candidate */
|
|
if (local_node_info.priority <= 0)
|
|
{
|
|
log_notice(_("this node's priority is %i so will not be considered as an automatic promotion candidate"),
|
|
local_node_info.priority);
|
|
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
|
|
log_debug("do_election(): sleeping %lu", rand_wait);
|
|
log_debug("do_election(): primary location is %s", upstream_node_info.location);
|
|
|
|
pg_usleep(rand_wait);
|
|
|
|
local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;
|
|
|
|
log_debug("do_election(): executing get_voting_status()");
|
|
voting_status = get_voting_status(local_conn);
|
|
log_debug("do_election(): node voting status is %s", _print_voting_status(voting_status));
|
|
|
|
if (voting_status == VS_VOTE_REQUEST_RECEIVED)
|
|
{
|
|
/* we've already been requested to vote, so can't become a candidate */
|
|
log_debug("vote request already received, not candidate");
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
/*
|
|
* Here we mark ourselves as candidate, so any further vote requests
|
|
* are rejected. However it's possible another node has done the
|
|
* same thing, so when announcing ourselves as candidate to the other
|
|
* nodes, we'll check for that and withdraw our candidature.
|
|
*/
|
|
electoral_term = set_voting_status_initiated(local_conn);
|
|
|
|
/* no other standbys - normally win by default */
|
|
if (standby_nodes.node_count == 0)
|
|
{
|
|
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
|
|
{
|
|
log_debug("no other nodes - we win by default");
|
|
return ELECTION_WON;
|
|
}
|
|
else
|
|
{
|
|
log_debug("no other nodes, but primary and standby locations differ");
|
|
|
|
monitoring_state = MS_DEGRADED;
|
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
|
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
}
|
|
|
|
for (cell = standby_nodes.head; cell; cell = cell->next)
|
|
{
|
|
/* assume the worst case */
|
|
cell->node_info->node_status = NODE_STATUS_UNKNOWN;
|
|
|
|
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
|
|
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* tell the other node we're candidate - if the node has already declared
|
|
* itself, we withdraw
|
|
*
|
|
* XXX check for situations where more than one node could end up as candidate?
|
|
*
|
|
* XXX note it's possible some nodes accepted our candidature before we
|
|
* found out about the other candidate, check what happens in that situation
|
|
* -> other node will have info from all the nodes, even if not the vote,
|
|
* so it should be able to determine the best node anyway
|
|
*/
|
|
|
|
if (announce_candidature(cell->node_info->conn, &local_node_info, cell->node_info, electoral_term) == false)
|
|
{
|
|
log_debug("node %i is candidate", cell->node_info->node_id);
|
|
other_node_is_candidate = true;
|
|
|
|
/* don't notify any further standbys */
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* see if the node is in the primary's location (but skip the check
|
|
* if we've seen
|
|
*/
|
|
if (primary_location_seen == false)
|
|
{
|
|
if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
|
|
{
|
|
primary_location_seen = true;
|
|
}
|
|
}
|
|
|
|
cell->node_info->node_status = NODE_STATUS_UP;
|
|
visible_nodes ++;
|
|
}
|
|
|
|
if (other_node_is_candidate == true)
|
|
{
|
|
reset_node_voting_status();
|
|
log_debug("other node is candidate, returning NOT CANDIDATE");
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
if (primary_location_seen == false)
|
|
{
|
|
log_notice(_("no nodes from the primary location \"%s\" visible - assuming network split"),
|
|
upstream_node_info.location);
|
|
log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
|
|
|
|
monitoring_state = MS_DEGRADED;
|
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
|
|
|
reset_node_voting_status();
|
|
|
|
return ELECTION_CANCELLED;
|
|
}
|
|
|
|
|
|
/* get our lsn */
|
|
local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);
|
|
|
|
log_debug("last receive lsn = %X/%X", format_lsn(local_node_info.last_wal_receive_lsn));
|
|
|
|
/* request vote from each node */
|
|
|
|
for (cell = standby_nodes.head; cell; cell = cell->next)
|
|
{
|
|
VoteRequestResult vote_result;
|
|
|
|
log_debug("checking node %i...", cell->node_info->node_id);
|
|
/* ignore unreachable nodes */
|
|
if (cell->node_info->node_status != NODE_STATUS_UP)
|
|
continue;
|
|
|
|
vote_result = request_vote(cell->node_info->conn,
|
|
&local_node_info,
|
|
cell->node_info,
|
|
electoral_term);
|
|
|
|
switch (vote_result)
|
|
{
|
|
case VR_VOTE_REFUSED:
|
|
if (cell->node_info->node_id < local_node_info.node_id)
|
|
{
|
|
log_debug(_("node %i refused vote, their ID is lower, yielding"),
|
|
cell->node_info->node_id);
|
|
PQfinish(cell->node_info->conn);
|
|
cell->node_info->conn = NULL;
|
|
|
|
reset_node_voting_status();
|
|
log_debug("other node is candidate, returning NOT CANDIDATE");
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
log_debug(_("no vote received from %i, our ID is lower, not yielding"),
|
|
cell->node_info->node_id);
|
|
break;
|
|
|
|
case VR_POSITIVE_VOTE:
|
|
votes_for_me += 1;
|
|
break;
|
|
case VR_NEGATIVE_VOTE:
|
|
break;
|
|
}
|
|
|
|
if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn)
|
|
{
|
|
/* register if another node is ahead of us */
|
|
other_node_is_ahead = true;
|
|
}
|
|
|
|
}
|
|
|
|
/* vote for myself, but only if I believe no-one else is ahead */
|
|
if (other_node_is_ahead == false)
|
|
{
|
|
votes_for_me += 1;
|
|
}
|
|
|
|
log_debug(_("%i of of %i votes"), votes_for_me, visible_nodes);
|
|
|
|
if (votes_for_me == visible_nodes)
|
|
return ELECTION_WON;
|
|
|
|
return ELECTION_LOST;
|
|
}
|
|
|
|
|
|
static void
|
|
reset_node_voting_status(void)
|
|
{
|
|
failover_state = FAILOVER_STATE_NONE;
|
|
|
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
|
{
|
|
log_error(_("reset_node_voting_status(): local_conn not set"));
|
|
return;
|
|
}
|
|
reset_voting_status(local_conn);
|
|
}
|
|
|
|
|
|
/*
 * check_connection()
 *
 * Check whether the server described by "node_info" is reachable; if not,
 * close the supplied connection and attempt one reconnect, logging the
 * outcome.
 *
 * NOTE(review): "conn" is passed by value, so the reassignments below
 * (setting it to NULL after PQfinish(), and storing the reconnect result)
 * are invisible to the caller - the caller's pointer may be left dangling
 * after PQfinish(), and any new connection established here is leaked.
 * Fixing this requires a PGconn ** parameter; confirm against all callers
 * before changing the signature.
 */
static void
check_connection(t_node_info *node_info, PGconn *conn)
{
	// consolidate below code
	if (is_server_available(node_info->conninfo) == false)
	{
		log_warning(_("connection to node %i lost"), node_info->node_id);

		if (conn != NULL)
		{
			/* caller's copy of the pointer is NOT cleared - see note above */
			PQfinish(conn);
			conn = NULL;
		}
	}

	if (PQstatus(conn) != CONNECTION_OK)
	{
		log_info(_("attempting to reconnect"));
		/* reconnect result is local only - see note above */
		conn = establish_db_connection(node_info->conninfo, false);

		if (PQstatus(conn) != CONNECTION_OK)
		{
			log_warning(_("reconnection failed"));
		}
		else
		{
			log_info(_("reconnected"));
		}
	}
}
|
|
|
|
|
|
#endif /* #ifndef BDR_ONLY */
|
|
|
|
void
|
|
close_connections_physical()
|
|
{
|
|
if (PQstatus(primary_conn) == CONNECTION_OK)
|
|
{
|
|
/* cancel any pending queries to the primary */
|
|
if (PQisBusy(primary_conn) == 1)
|
|
cancel_query(primary_conn, config_file_options.async_query_timeout);
|
|
PQfinish(primary_conn);
|
|
primary_conn = NULL;
|
|
}
|
|
|
|
if (upstream_conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
|
|
{
|
|
PQfinish(upstream_conn);
|
|
upstream_conn = NULL;
|
|
}
|
|
|
|
}
|