repmgr/repmgrd-physical.c

/*
 * repmgrd-physical.c - physical replication functionality for repmgrd
 *
 * Copyright (c) 2ndQuadrant, 2010-2017
 */

#include <signal.h>

#include "repmgr.h"
#include "repmgrd.h"
#include "repmgrd-physical.h"


/*
 * Outcome of a failover attempt. Stored in the file-level "failover_state"
 * variable and returned by promote_self() and follow_new_primary().
 */
typedef enum
{
	/* no outcome determined yet (initial value, reset before each attempt) */
	FAILOVER_STATE_UNKNOWN = -1,
	/* a previous outcome has been fully handled */
	FAILOVER_STATE_NONE,
	/* promote command succeeded on this node */
	FAILOVER_STATE_PROMOTED,
	/* promote command failed, or the failed upstream's record was missing */
	FAILOVER_STATE_PROMOTION_FAILED,
	/* promote command failed because the original primary is back */
	FAILOVER_STATE_PRIMARY_REAPPEARED,
	/* local node could not be reached after running the promote command */
	FAILOVER_STATE_LOCAL_NODE_FAILURE,
	/* this node is a follower waiting to be told which node to follow */
	FAILOVER_STATE_WAITING_NEW_PRIMARY,
	/* presumably set by follow_new_primary() on success - defined elsewhere */
	FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
	/* notification named the original upstream; keep following it */
	FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
	/* no notification about a new primary was received */
	FAILOVER_STATE_NO_NEW_PRIMARY,
	/* the new primary could not be followed */
	FAILOVER_STATE_FOLLOW_FAIL,
	/* the chosen candidate node could not be contacted */
	FAILOVER_STATE_NODE_NOTIFICATION_ERROR
} FailoverState;


/*
 * Result of do_election(), as seen from the local node.
 */
typedef enum
{
	/* local node is not the candidate; it waits for the candidate's notification */
	ELECTION_NOT_CANDIDATE = -1,
	/* local node is the candidate and won; it promotes itself */
	ELECTION_WON,
	/* local node is the candidate but did not get all votes */
	ELECTION_LOST,
	/* election was called off; no failover action is taken */
	ELECTION_CANCELLED
} ElectionResult;


/* outcome of the most recent failover attempt, see do_primary_failover() */
static FailoverState failover_state = FAILOVER_STATE_UNKNOWN;

/* record of the node the local standby is attached to */
static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
/* sibling standbys, notified once a failover has been decided */
static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER;

/*
 * Connections used while monitoring a standby. When the upstream node is
 * not itself a standby, primary_conn is set to the same handle as
 * upstream_conn.
 */
static PGconn *upstream_conn = NULL;
static PGconn *primary_conn = NULL;

/* election handling */
static ElectionResult do_election(void);
static const char *_print_voting_status(NodeVotingStatus voting_status);
static const char *_print_election_result(ElectionResult result);

/* actions taken by the election winner */
static FailoverState promote_self(void);
static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);

static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);

/* actions taken by follower nodes */
static bool wait_primary_notification(int *new_primary_id);
static FailoverState follow_new_primary(int new_primary_id);

static void reset_node_voting_status(void);
/* "(void)" makes this a real prototype rather than an unspecified-argument declaration */
void close_connections_physical(void);

/* failover entry points, chosen by the type of the failed upstream node */
static bool do_primary_failover(void);
static bool do_upstream_standby_failover(void);


/*
 * do_physical_node_check()
 *
 * Start-up sanity checks for physical replication monitoring:
 *
 *  - if the local node record is marked inactive, terminate when
 *    "failover_mode" is automatic, or warn when it is manual;
 *  - if "failover_mode" is automatic, exit with ERR_BAD_CONFIG unless a
 *    promote command and a follow command are configured.
 */
void
do_physical_node_check(void)
{
	/*
	 * Check if node record is active - if not, and `failover_mode=automatic`, the node
	 * won't be considered as a promotion candidate; this often happens when
	 * a failed primary is recloned and the node was not re-registered, giving
	 * the impression failover capability is there when it's not. In this case
	 * abort with an error and a hint about registering.
	 *
	 * If `failover_mode=manual`, repmgrd can continue to passively monitor the node, but
	 * we should nevertheless issue a warning and the same hint.
	 */
	if (local_node_info.active == false)
	{
		const char *hint = "Check that 'repmgr (primary|standby) register' was executed for this node";

		switch (config_file_options.failover_mode)
		{
			/* "failover_mode" is an enum, all values should be covered here */

			case FAILOVER_AUTOMATIC:
				log_error(_("this node is marked as inactive and cannot be used as a failover target"));
				log_hint(_("%s"), hint);
				PQfinish(local_conn);
				terminate(ERR_BAD_CONFIG);

				/*
				 * terminate() is not expected to return (defined elsewhere);
				 * the explicit break ensures that if it ever does, we do not
				 * fall through and also log the "passively monitored" warning.
				 */
				break;

			case FAILOVER_MANUAL:
				log_warning(_("this node is marked as inactive and will be passively monitored only"));
				log_hint(_("%s"), hint);
				break;
		}
	}

	if (config_file_options.failover_mode == FAILOVER_AUTOMATIC)
	{
		/*
		 * check that promote/follow commands are defined, otherwise repmgrd
		 * won't be able to perform any useful action
		 */
		bool		required_param_missing = false;

		if (config_file_options.promote_command[0] == '\0'
			&& config_file_options.service_promote_command[0] == '\0')
		{
			log_error(_("either \"promote_command\" or \"service_promote_command\" must be defined in the configuration file"));
			required_param_missing = true;
		}

		if (config_file_options.follow_command[0] == '\0')
		{
			log_error(_("\"follow_command\" must be defined in the configuration file"));
			required_param_missing = true;
		}

		/* report every missing parameter before giving up */
		if (required_param_missing == true)
		{
			log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
			PQfinish(local_conn);
			exit(ERR_BAD_CONFIG);
		}
	}
}


/*
 * monitor_streaming_primary()
 *
 * Monitoring loop used when the local node is the cluster primary.
 *
 * Each iteration checks whether the local node is reachable. If it goes
 * away and cannot be reached again by try_reconnect(), monitoring switches
 * to degraded state, in which a reconnection is attempted on every
 * iteration. If "degraded_monitoring_timeout" is set and exceeded, repmgrd
 * terminates with ERR_MONITORING_TIMEOUT.
 *
 * The loop itself never exits; the function does not return normally.
 */
void
monitor_streaming_primary(void)
{
	NodeStatus	node_status = NODE_STATUS_UP;
	instr_time	log_status_interval_start;
	PQExpBufferData event_details;

	reset_node_voting_status();

	/* Log startup event */
	if (startup_event_logged == false)
	{
		initPQExpBuffer(&event_details);

		appendPQExpBuffer(&event_details,
						  _("monitoring cluster primary \"%s\" (node ID: %i)"),
						  local_node_info.node_name,
						  local_node_info.node_id);

		create_event_notification(local_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_start",
								  true,
								  event_details.data);

		startup_event_logged = true;

		log_notice("%s", event_details.data);

		termPQExpBuffer(&event_details);
	}

	INSTR_TIME_SET_CURRENT(log_status_interval_start);

	while (true)
	{
		/*
		 * TODO: cache node list here, refresh at `node_list_refresh_interval`;
		 * also return the reason for unavailability so we can log it
		 */
		if (is_server_available(local_node_info.conninfo) == false)
		{
			/* node is down, we were expecting it to be up */
			if (node_status == NODE_STATUS_UP)
			{
				instr_time	local_node_unreachable_start;

				INSTR_TIME_SET_CURRENT(local_node_unreachable_start);

				initPQExpBuffer(&event_details);

				appendPQExpBuffer(&event_details,
								  _("unable to connect to local node"));

				log_warning("%s", event_details.data);

				node_status = NODE_STATUS_UNKNOWN;

				/* don't leave a freed handle in the global while we retry */
				PQfinish(local_conn);
				local_conn = NULL;

				/*
				 * as we're monitoring the primary, no point in trying to write
				 * the event to the database
				 *
				 * XXX possible pre-action event
				 */
				create_event_notification(NULL,
										  &config_file_options,
										  config_file_options.node_id,
										  "repmgrd_local_disconnect",
										  true,
										  event_details.data);

				termPQExpBuffer(&event_details);

				local_conn = try_reconnect(local_node_info.conninfo, &node_status);

				if (node_status == NODE_STATUS_UP)
				{
					int			local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to local node after %i seconds"),
									  local_node_unreachable_elapsed);
					log_notice("%s", event_details.data);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_local_reconnect",
											  true,
											  event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}

				/* reconnection attempts exhausted: keep polling in degraded state */
				monitoring_state = MS_DEGRADED;
				INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
			}
		}

		if (monitoring_state == MS_DEGRADED)
		{
			int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

			/* a timeout of 0 (or less) means "stay in degraded state indefinitely" */
			if (config_file_options.degraded_monitoring_timeout > 0
				&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
			{
				initPQExpBuffer(&event_details);

				appendPQExpBuffer(&event_details,
								  _("degraded monitoring timeout (%i seconds) exceeded, terminating"),
								  degraded_monitoring_elapsed);

				log_notice("%s", event_details.data);

				create_event_notification(NULL,
										  &config_file_options,
										  config_file_options.node_id,
										  "repmgrd_terminate",
										  true,
										  event_details.data);

				termPQExpBuffer(&event_details);
				terminate(ERR_MONITORING_TIMEOUT);
			}

			log_debug("monitoring node in degraded state for %i seconds", degraded_monitoring_elapsed);

			if (is_server_available(local_node_info.conninfo) == true)
			{
				local_conn = establish_db_connection(local_node_info.conninfo, false);

				if (PQstatus(local_conn) == CONNECTION_OK)
				{
					node_status = NODE_STATUS_UP;
					monitoring_state = MS_NORMAL;

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to primary node after %i seconds, resuming monitoring"),
									  degraded_monitoring_elapsed);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_local_reconnect",
											  true,
											  event_details.data);

					log_notice("%s", event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}

				/*
				 * The server answered but the connection attempt failed. A
				 * PGconn must be released with PQfinish() even when the
				 * attempt fails, otherwise one handle leaks per iteration.
				 */
				PQfinish(local_conn);
				local_conn = NULL;
			}

			/*
			 * TODO: possibly attempt to find another node from cached list;
			 * check if there's a new primary - if so add hook for fencing?
			 * loop, if starts up check status, switch monitoring mode
			 */
		}

loop:
		/* emit "still alive" log message at regular intervals, if requested */
		if (config_file_options.log_status_interval > 0)
		{
			int			log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);

			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
			{
				log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
						 local_node_info.node_name,
						 local_node_info.node_id,
						 print_monitoring_state(monitoring_state));

				if (monitoring_state == MS_DEGRADED)
				{
					log_detail(_("waiting primary to reappear"));
				}

				INSTR_TIME_SET_CURRENT(log_status_interval_start);
			}
		}

		log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
					config_file_options.monitor_interval_secs);

		sleep(config_file_options.monitor_interval_secs);
	}
}


void
monitor_streaming_standby(void)
{
	RecordStatus record_status;
	NodeStatus	upstream_node_status = NODE_STATUS_UP;
	instr_time	log_status_interval_start;
	PQExpBufferData event_details;

	reset_node_voting_status();

	log_debug("monitor_streaming_standby()");

	/*
	 * If no upstream node id is specified in the metadata, we'll try
	 * and determine the current cluster primary in the assumption we
	 * should connect to that by default.
	 */
	if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
	{
		local_node_info.upstream_node_id = get_primary_node_id(local_conn);

		/*
		 * Terminate if there doesn't appear to be an active cluster primary.
		 * There could be one or more nodes marked as inactive primaries, and one
		 * of them could actually be a primary, but we can't sensibly monitor
		 * in that state.
		 */
		if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
		{
			// XXX check if there's an inactive record(s) and log detail/hint
			log_error(_("unable to determine an active primary for this cluster, terminating"));
			PQfinish(local_conn);
			exit(ERR_BAD_CONFIG);
		}
	}

	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);

	/*
	 * Terminate if we can't find the record for the node we're supposed
	 * to monitor. This is a "fix-the-config" situation, not a lot else we
	 * can do.
	 */
	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
					local_node_info.upstream_node_id);
		PQfinish(local_conn);
		exit(ERR_DB_CONN);
	}

	log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);

	upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);

	/*
	 * Upstream node must be running.
	 *
	 * We could possibly have repmgrd skip to degraded monitoring mode until it
	 * comes up, but there doesn't seem to be much point in doint that.
	 */
	if (PQstatus(upstream_conn) != CONNECTION_OK)
	{
		log_error(_("unable connect to upstream node (ID: %i), terminating"),
				  local_node_info.upstream_node_id);
		log_hint(_("upstream node must be running before repmgrd can start"));

		PQfinish(local_conn);
		exit(ERR_DB_CONN);
	}

	/* refresh upstream node record from upstream node, so it's as up-to-date as possible */
	record_status = get_node_record(upstream_conn, upstream_node_info.node_id, &upstream_node_info);

	if (upstream_node_info.type == STANDBY)
	{
		/*
		 * Currently cascaded standbys need to be able to connect to the primary.
		 * We could possibly add a limited connection mode for cases where this isn't
		 * possible.
		 */
		primary_conn = establish_primary_db_connection(upstream_conn, false);

		if (PQstatus(primary_conn) != CONNECTION_OK)
		{
			log_error(_("unable to connect to primary node"));
			log_hint(_("ensure the primary node is reachable from this node"));
			exit(ERR_DB_CONN);
		}

		log_verbose(LOG_DEBUG, "connected to primary");
	}
	else
	{
		primary_conn = upstream_conn;
	}

	/* Log startup event */
	if (startup_event_logged == false)
	{
		PQExpBufferData event_details;
		initPQExpBuffer(&event_details);

		appendPQExpBuffer(&event_details,
						  _("monitoring upstream node \"%s\" (node ID: %i)"),
						  upstream_node_info.node_name,
						  upstream_node_info.node_id);

		create_event_notification(primary_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_start",
								  true,
								  event_details.data);

		startup_event_logged = true;

		log_notice("%s", event_details.data);

		termPQExpBuffer(&event_details);
	}

	monitoring_state = MS_NORMAL;
	INSTR_TIME_SET_CURRENT(log_status_interval_start);

	while (true)
	{
		if (is_server_available(upstream_node_info.conninfo) == false)
		{

			/* upstream node is down, we were expecting it to be up */
			if (upstream_node_status == NODE_STATUS_UP)
			{
				instr_time	upstream_node_unreachable_start;

				INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);

				initPQExpBuffer(&event_details);

				upstream_node_status = NODE_STATUS_UNKNOWN;

				appendPQExpBuffer(&event_details,
								  _("unable to connect to upstream node \"%s\" (node ID: %i)"),
								  upstream_node_info.node_name, upstream_node_info.node_id);

				if (upstream_node_info.type == STANDBY)
				{
					/* XXX possible pre-action event */
					create_event_record(primary_conn,
										&config_file_options,
										config_file_options.node_id,
										"repmgrd_upstream_disconnect",
										true,
										event_details.data);
				}

				log_warning("%s", event_details.data);
				termPQExpBuffer(&event_details);

				PQfinish(upstream_conn);
				upstream_conn = try_reconnect(upstream_node_info.conninfo, &upstream_node_status);

				if (upstream_node_status == NODE_STATUS_UP)
				{
					int		upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to upstream node after %i seconds"),
									  upstream_node_unreachable_elapsed);
					log_notice("%s", event_details.data);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_upstream_reconnect",
											  true,
											  event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}

				/* still down after reconnect attempt(s) */
				if (upstream_node_status == NODE_STATUS_DOWN)
				{
					bool failover_done = false;

					if (upstream_node_info.type == PRIMARY)
					{
						failover_done = do_primary_failover();
					}
					else if (upstream_node_info.type == STANDBY)
					{
						failover_done = do_upstream_standby_failover();
					}

					// it's possible it will make sense to return in
					// all cases to restart monitoring
					if (failover_done == true)
						return;
				}
			}
		}

		if (monitoring_state == MS_DEGRADED)
		{
			int		degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

			log_debug("monitoring node %i in degraded state for %i seconds",
					  upstream_node_info.node_id,
					  degraded_monitoring_elapsed);

			if (is_server_available(upstream_node_info.conninfo) == true)
			{
				upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);

				if (PQstatus(upstream_conn) == CONNECTION_OK)
				{
					// XXX check here if upstream is still primary
					// -> will be a problem if another node was promoted in the meantime
					// and upstream is now former primary
					// XXX scan other nodes to see if any has become primary

					upstream_node_status = NODE_STATUS_UP;
					monitoring_state = MS_NORMAL;

					if (upstream_node_info.type == PRIMARY)
					{
						primary_conn = upstream_conn;
					}
					else
					{

						if (primary_conn == NULL || PQstatus(primary_conn) != CONNECTION_OK)
						{
							primary_conn = establish_primary_db_connection(upstream_conn, false);
						}
					}

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to upstream node %i after %i seconds, resuming monitoring"),
									  upstream_node_info.node_id,
									  degraded_monitoring_elapsed);

					create_event_notification(primary_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_upstream_reconnect",
											  true,
											  event_details.data);

					log_notice("%s", event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}
			}
			else
			{
				// unable to connect to former primary - check if another node has
				// been promoted
			}

		}

	loop:

		/* emit "still alive" log message at regular intervals, if requested */
		if (config_file_options.log_status_interval > 0)
		{
			int		log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);

			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
			{
				log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i) in %s state"),
						 local_node_info.node_name,
						 local_node_info.node_id,
						 upstream_node_info.node_name,
						 upstream_node_info.node_id,
						 print_monitoring_state(monitoring_state));

				if (monitoring_state == MS_DEGRADED)
				{
					log_detail(_("waiting for upstream or another primary to reappear"));
				}

				INSTR_TIME_SET_CURRENT(log_status_interval_start);
			}
		}

		/*
		 * handle local node failure
		 *
		 * currently we'll just check the connection, and try to reconnect
		 *
		 * TODO: add timeout, after which we run in degraded state
		 */
		if (is_server_available(local_node_info.conninfo) == false)
		{
			log_warning(_("connection to local node %i lost"), local_node_info.node_id);

			if (local_conn != NULL)
			{
				PQfinish(local_conn);
				local_conn = NULL;
			}
		}

		if (PQstatus(local_conn) != CONNECTION_OK)
		{
			log_info(_("attempting to reconnect"));
			local_conn = establish_db_connection(config_file_options.conninfo, false);

			if (PQstatus(local_conn) != CONNECTION_OK)
			{
				log_warning(_("reconnection failed"));
			}
			else
			{
				log_info(_("reconnected"));
			}
		}
		sleep(1);
	}
}

/*
 * do_primary_failover()
 *
 * Called when the upstream primary is down. Runs an election and acts on
 * the result: the winner promotes itself; a losing candidate works out the
 * best candidate and either promotes itself or notifies that node; a
 * non-candidate waits to be told which node to follow.
 *
 * Returns true when the caller should leave its monitoring loop (this node
 * was promoted, or is following a new or the original primary), false
 * otherwise.
 */
static bool
do_primary_failover(void)
{
	/* attempt to initiate voting process */
	ElectionResult outcome = do_election();

	/* XXX add pre-event notification here */
	failover_state = FAILOVER_STATE_UNKNOWN;

	log_debug("election result: %s", _print_election_result(outcome));

	switch (outcome)
	{
		case ELECTION_CANCELLED:
			log_notice(_("election cancelled"));
			return false;

		case ELECTION_WON:
			log_notice("I am the winner, will now promote self and inform other nodes");

			failover_state = promote_self();
			break;

		case ELECTION_LOST:
			{
				t_node_info *winner;

				log_info(_("I am the candidate but did not get all votes; will now determine the best candidate"));

				/* reset node list */
				get_active_sibling_node_records(local_conn,
												local_node_info.node_id,
												upstream_node_info.node_id,
												&standby_nodes);

				winner = poll_best_candidate(&standby_nodes);

				if (winner->node_id != local_node_info.node_id)
				{
					PGconn	   *winner_conn = NULL;

					log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
							 winner->node_id);

					/* tell the best candidate that it has been selected */
					winner_conn = establish_db_connection(winner->conninfo, false);

					if (PQstatus(winner_conn) != CONNECTION_OK)
					{
						log_error(_("unable to connect to candidate node (ID: %i)"), winner->node_id);
						failover_state = FAILOVER_STATE_NODE_NOTIFICATION_ERROR;
					}
					else
					{
						notify_follow_primary(winner_conn, winner->node_id);

						/* we'll wait for the candidate to get back to us */
						failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
					}

					PQfinish(winner_conn);
				}
				else
				{
					/*
					 * this can occur in a tie-break situation, where this
					 * node establishes it is the best candidate
					 */
					log_notice("I am the best candidate, will now promote self and inform other nodes");

					failover_state = promote_self();
				}
			}
			break;

		default:
			/* not a candidate: the candidate node will tell us what to do */
			log_info(_("follower node awaiting notification from the candidate node"));
			failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
			break;
	}

	/*
	 * node has decided it is a follower, so will await notification
	 * from the candidate that it has promoted itself and can be followed
	 */
	if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
	{
		int			notified_node_id;

		/* XXX need timeout in case new primary doesn't come up, then rerun election */

		/* either follow or time out; either way resume monitoring */
		if (wait_primary_notification(&notified_node_id) != true)
		{
			failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
		}
		else if (notified_node_id == upstream_node_info.node_id)
		{
			/* the original primary is back, nothing to do */
			failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
		}
		else if (notified_node_id == local_node_info.node_id)
		{
			/* we were picked as the promotion candidate */
			log_notice(_("this node is promotion candidate, promoting"));

			failover_state = promote_self();

			/* refresh the sibling list used below to notify followers */
			get_active_sibling_node_records(local_conn,
											local_node_info.node_id,
											upstream_node_info.node_id,
											&standby_nodes);
		}
		else
		{
			failover_state = follow_new_primary(notified_node_id);
		}
	}

	switch (failover_state)
	{
		case FAILOVER_STATE_PROMOTED:
			log_debug("failover state is PROMOTED");

			/* former siblings should now follow this node */
			notify_followers(&standby_nodes, local_node_info.node_id);

			/* the sibling list is of no further use */
			clear_node_info_list(&standby_nodes);

			/* hand control back to start_monitoring() */
			log_info(_("switching to primary monitoring mode"));

			failover_state = FAILOVER_STATE_NONE;
			return true;

		case FAILOVER_STATE_PRIMARY_REAPPEARED:
			log_debug("failover state is PRIMARY_REAPPEARED");

			/* siblings should resume following the original primary */
			notify_followers(&standby_nodes, upstream_node_info.node_id);

			/* the sibling list is of no further use */
			clear_node_info_list(&standby_nodes);

			/* hand control back to start_monitoring() */
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("original primary \"%s\" (node ID: %i) reappeared"),
					   upstream_node_info.node_name, upstream_node_info.node_id);

			failover_state = FAILOVER_STATE_NONE;
			return true;

		case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("following new primary \"%s\" (node id: %i)"),
					   upstream_node_info.node_name, upstream_node_info.node_id);

			failover_state = FAILOVER_STATE_NONE;
			return true;

		case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("following original primary \"%s\" (node id: %i)"),
					   upstream_node_info.node_name, upstream_node_info.node_id);

			failover_state = FAILOVER_STATE_NONE;
			return true;

		case FAILOVER_STATE_PROMOTION_FAILED:
			log_debug("failover state is PROMOTION FAILED");
			return false;

		case FAILOVER_STATE_FOLLOW_FAIL:

			/*
			 * the new primary could not be followed, whatever the reason;
			 * carry on monitoring in degraded state
			 */
			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);

			return false;

		case FAILOVER_STATE_NO_NEW_PRIMARY:
		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
			/* hand control back to start_monitoring() */
			/* XXX should kick off new election */
			return false;

		case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
		case FAILOVER_STATE_LOCAL_NODE_FAILURE:
		case FAILOVER_STATE_UNKNOWN:
		case FAILOVER_STATE_NONE:
			log_debug("failover state is %i", failover_state);
			return false;
	}

	/* every enum value returns above; this is only a safety net */
	return false;
}

/*
 * do_upstream_standby_failover()
 *
 * Attach cascaded standby to primary
 *
 * Currently we will try to attach to the cluster primary, as "repmgr
 * standby follow" doesn't support attaching to another node.
 *
 * If this becomes supported, it might be worth providing a selection
 * of reconnection strategies as different behaviour might be desirable
 * in different situations;
 * or maybe the option not to reconnect might be required?
 *
 * Returns true once the follow command has succeeded and the node's
 * upstream has been updated in the metadata. Returns false if the primary
 * node record cannot be retrieved or the follow command fails. Terminates
 * repmgrd if the follow command succeeded but the metadata update failed.
 *
 * On a true return, primary_conn is closed and set to NULL. On every
 * return, upstream_conn is closed and set to NULL.
 *
 * XXX check this handles replication slots gracefully
 */
static bool
do_upstream_standby_failover(void)
{
	PQExpBufferData event_details;
	t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
	RecordStatus record_status;
	int			r;

	PQfinish(upstream_conn);
	upstream_conn = NULL;

	record_status = get_primary_node_record(local_conn, &primary_node_info);

	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve primary node record"));
		return false;
	}

	/*
	 * Verify that we can still talk to the cluster primary, even though
	 * the node's upstream is not available
	 */

	/* TODO: consolidate below code */
	if (is_server_available(primary_node_info.conninfo) == false)
	{
		log_warning(_("connection to primary %i lost"), primary_node_info.node_id);

		if (primary_conn != NULL)
		{
			PQfinish(primary_conn);
			primary_conn = NULL;
		}
	}

	if (PQstatus(primary_conn) != CONNECTION_OK)
	{
		log_info(_("attempting to reconnect"));

		/* release a stale (non-NULL but unusable) handle before replacing it */
		if (primary_conn != NULL)
		{
			PQfinish(primary_conn);
			primary_conn = NULL;
		}

		primary_conn = establish_db_connection(primary_node_info.conninfo, false);

		if (PQstatus(primary_conn) != CONNECTION_OK)
		{
			log_warning(_("reconnection failed"));
		}
		else
		{
			log_info(_("reconnected"));
		}
	}

	/* grandparent upstream is inactive  */
	if (primary_node_info.active == false)
	{
		/* XXX not handled yet */
	}

	/* Close the connection to this server */
	PQfinish(local_conn);
	local_conn = NULL;

	initPQExpBuffer(&event_details);

	log_debug(_("standby follow command is:\n  \"%s\""),
			  config_file_options.follow_command);

	r = system(config_file_options.follow_command);

	if (r != 0)
	{
		appendPQExpBuffer(&event_details,
						  _("unable to execute follow command:\n %s"),
						  config_file_options.follow_command);

		log_error("%s", event_details.data);

		/*
		 * It may not possible to write to the event notification
		 * table but we should be able to generate an external notification
		 * if required.
		 */
		create_event_notification(primary_conn,
								  &config_file_options,
								  local_node_info.node_id,
								  "repmgrd_failover_follow",
								  false,
								  event_details.data);

		termPQExpBuffer(&event_details);

		/*
		 * The node is not following the primary, so its metadata must not
		 * be updated and no success event may be reported. The caller
		 * still needs a local connection, so restore that before bailing
		 * out.
		 */
		local_conn = establish_db_connection(config_file_options.conninfo, false);

		return false;
	}

	/* reconnect to local node */
	local_conn = establish_db_connection(config_file_options.conninfo, false);

	if (update_node_record_set_upstream(primary_conn,
										local_node_info.node_id,
										primary_node_info.node_id) == false)
	{
		appendPQExpBuffer(&event_details,
						  _("unable to set node %i's new upstream ID to %i"),
						  local_node_info.node_id,
						  primary_node_info.node_id);

		log_error("%s", event_details.data);

		/* the primary could not be written to, so only notify externally */
		create_event_notification(NULL,
								  &config_file_options,
								  local_node_info.node_id,
								  "repmgrd_failover_follow",
								  false,
								  event_details.data);

		termPQExpBuffer(&event_details);

		terminate(ERR_BAD_CONFIG);
	}

	/* update own internal node record */
	record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);

	appendPQExpBuffer(&event_details,
					  _("node %i is now following primary node %i"),
					  local_node_info.node_id,
					  primary_node_info.node_id);

	log_notice("%s", event_details.data);

	create_event_notification(primary_conn,
							  &config_file_options,
							  local_node_info.node_id,
							  "repmgrd_failover_follow",
							  true,
							  event_details.data);

	termPQExpBuffer(&event_details);

	PQfinish(primary_conn);
	primary_conn = NULL;

	return true;
}


static FailoverState
promote_self(void)
{
	PQExpBufferData event_details;
	char *promote_command;
	int r;

	/* Store details of the failed node here */
	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
	RecordStatus record_status;

	/*
	 * optionally add a delay before promoting the standby; this is mainly
	 * useful for testing (e.g. for reappearance of the original primary)
	 * and is not documented.
	 */
	if (config_file_options.promote_delay > 0)
	{
		log_debug("sleeping %i seconds before promoting standby",
				  config_file_options.promote_delay);
		sleep(config_file_options.promote_delay);
	}

	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);

	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
				  local_node_info.upstream_node_id);
		return FAILOVER_STATE_PROMOTION_FAILED;
	}

	/* the presence of either of these commands has been established already */
	if (config_file_options.service_promote_command[0] != '\0')
		promote_command = config_file_options.service_promote_command;
	else
		promote_command = config_file_options.promote_command;

	log_debug("promote command is:\n  \"%s\"",
			  promote_command);

	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
	{
		fflush(stderr);
	}

	r = system(promote_command);

	/* connection should stay up, but check just in case */
	if(PQstatus(local_conn) != CONNECTION_OK)
	{
		local_conn = establish_db_connection(local_node_info.conninfo, true);

		/* assume node failed */
		if(PQstatus(local_conn) != CONNECTION_OK)
		{
			log_error(_("unable to reconnect to local node"));
			// XXX handle this
			return FAILOVER_STATE_LOCAL_NODE_FAILURE;
		}
	}

	if (r != 0)
	{
		int primary_node_id;

		upstream_conn = get_primary_connection(local_conn,
											  &primary_node_id, NULL);

		if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
		{
			log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
					   failed_primary.node_id);

			initPQExpBuffer(&event_details);
			appendPQExpBuffer(&event_details,
							  _("original primary \"%s\" (node ID: %i) reappeared"),
							  failed_primary.node_name,
							  failed_primary.node_id);

			create_event_notification(upstream_conn,
								&config_file_options,
								local_node_info.node_id,
								"repmgrd_failover_abort",
								true,
								event_details.data);

			termPQExpBuffer(&event_details);

			//primary_conn = NULL;

			// XXX handle this!
			// -> we'll need to let the other nodes know too....
			/* no failover occurred but we'll want to restart connections */
			//failover_done = true;
			return FAILOVER_STATE_PRIMARY_REAPPEARED;
		}

		// handle this
		//  -> check if somehow primary; otherwise go for new election?
		log_error(_("promote command failed"));
		return FAILOVER_STATE_PROMOTION_FAILED;
	}


	initPQExpBuffer(&event_details);

	/* update own internal node record */
	record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);

	/*
	 * XXX here we're assuming the promote command updated metadata
	 */
	appendPQExpBuffer(&event_details,
					  _("node %i promoted to primary; old primary %i marked as failed"),
					  local_node_info.node_id,
					  failed_primary.node_id);

	/* local_conn is now the primary connection */
	create_event_notification(local_conn,
						&config_file_options,
						local_node_info.node_id,
						"repmgrd_failover_promote",
						true,
						event_details.data);

	termPQExpBuffer(&event_details);

	return FAILOVER_STATE_PROMOTED;
}


/*
 * notify_followers()
 *
 * Tell every node in "standby_nodes" to follow node "follow_node_id".
 *
 * That is usually the local node, which has just been promoted. If the
 * original primary came back before promotion could take place, the
 * followers are instead told to resume following the original primary.
 *
 * A node whose connection is not usable gets one reconnection attempt;
 * if that fails the node is skipped.
 */
static void
notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
{
	NodeInfoListCell *cur = standby_nodes->head;

	log_debug("notify_followers()");

	while (cur != NULL)
	{
		log_debug("intending to notify node %i... ", cur->node_info->node_id);

		/* open the per-node connection again if it is not usable */
		if (PQstatus(cur->node_info->conn) != CONNECTION_OK)
		{
			log_debug("reconnecting to node %i... ", cur->node_info->node_id);

			cur->node_info->conn = establish_db_connection(cur->node_info->conninfo, false);
		}

		if (PQstatus(cur->node_info->conn) == CONNECTION_OK)
		{
			log_debug("notifying node %i to follow node %i",
					  cur->node_info->node_id, follow_node_id);
			notify_follow_primary(cur->node_info->conn, follow_node_id);
		}
		else
		{
			/* still unreachable: move on to the next node */
			log_debug("unable to reconnect to  %i ... ", cur->node_info->node_id);
		}

		cur = cur->next;
	}
}


/*
 * Decide which node is the best promotion candidate.
 *
 * The local node is the initial best candidate; each node in
 * "standby_nodes" is then compared against the current best using,
 * in order:
 *
 *   1. higher last_wal_receive_lsn wins
 *   2. if LSNs are equal, higher priority wins
 *   3. if LSN and priority are both equal, lower node_id wins
 *
 * The ordering must be deterministic, as in some corner cases more than
 * one node may end up as candidate and each must reach the same conclusion.
 *
 * Returns a pointer to the winning node record: either &local_node_info
 * or an entry owned by "standby_nodes".
 */
static t_node_info *
poll_best_candidate(NodeInfoList *standby_nodes)
{
	NodeInfoListCell *cell;
	t_node_info *best_candidate = &local_node_info;

	// XXX ensure standby_nodes is set correctly

	for (cell = standby_nodes->head; cell; cell = cell->next)
	{
		if (cell->node_info->last_wal_receive_lsn > best_candidate->last_wal_receive_lsn)
		{
			log_debug("node %i has higher LSN, now best candidate", cell->node_info->node_id);
			best_candidate = cell->node_info;
		}
		else if (cell->node_info->last_wal_receive_lsn == best_candidate->last_wal_receive_lsn)
		{
			if (cell->node_info->priority > best_candidate->priority)
			{
				log_debug("node %i has higher priority, now best candidate", cell->node_info->node_id);
				best_candidate = cell->node_info;
			}
			/*
			 * if all else fails, we decide by node_id - but only between
			 * nodes which are otherwise equal; a node with a lower LSN or
			 * lower priority must never displace the current best candidate
			 */
			else if (cell->node_info->priority == best_candidate->priority
					 && cell->node_info->node_id < best_candidate->node_id)
			{
				log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
				best_candidate = cell->node_info;
			}
		}
	}

	log_info(_("best candidate is %i"), best_candidate->node_id);

	return best_candidate;
}


/*
 * Poll the local node once per second until it reports that a new
 * primary has been announced, or until the timeout expires.
 *
 * On success the new primary's node ID is stored in "*new_primary_id"
 * (by get_new_primary()) and true is returned; if nothing was received
 * within the timeout, a warning is logged and false is returned.
 */
static bool
wait_primary_notification(int *new_primary_id)
{
	// XXX make this configurable
	int wait_primary_timeout = 60;
	int i;

	/* one poll per iteration followed by a one-second sleep */
	for (i = 0; i < wait_primary_timeout; i++)
	{
		if (get_new_primary(local_conn, new_primary_id) == true)
		{
			log_debug("new primary is %i; elapsed: %i",
					  *new_primary_id, i);
			return true;
		}
		sleep(1);
	}


	log_warning(_("no notification received from new primary after %i seconds"),
				wait_primary_timeout);

	return false;
}


static FailoverState
follow_new_primary(int new_primary_id)
{
	PQExpBufferData event_details;
	int r;

	/* Store details of the failed node here */
	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
	t_node_info new_primary = T_NODE_INFO_INITIALIZER;
	RecordStatus record_status;
	bool new_primary_ok = false;

	record_status = get_node_record(local_conn, new_primary_id, &new_primary);

	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve metadata record for upstream node (ID: %i)"),
				  new_primary_id);
		return FAILOVER_STATE_FOLLOW_FAIL;
	}

	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);

	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve metadata record for failed primary (ID: %i)"),
					local_node_info.upstream_node_id);
		return FAILOVER_STATE_FOLLOW_FAIL;
	}

	// XXX check if new_primary_id == failed_primary.node_id?

	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
	{
		fflush(stderr);
	}

	log_debug(_("standby follow command is:\n  \"%s\""),
			  config_file_options.follow_command);

	upstream_conn = establish_db_connection(new_primary.conninfo, false);

	if (PQstatus(upstream_conn) == CONNECTION_OK)
	{
		RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
		if (primary_recovery_type == RECTYPE_PRIMARY)
		{
			new_primary_ok = true;
		}
		else
		{
			log_warning(_("new primary is not in recovery"));
			PQfinish(upstream_conn);
		}
	}

	if (new_primary_ok == false)
	{
		return FAILOVER_STATE_FOLLOW_FAIL;
	}

	/*
	 * disconnect from local node, as follow operation will result in
	 * a server restart
	 */

	PQfinish(local_conn);
	local_conn = NULL;

	/* execute the follow command */
	r = system(config_file_options.follow_command);

	if (r != 0)
	{
		PGconn *old_primary_conn;
		/*
		 * The follow action could still fail due to the original primary reappearing
		 * before the candidate could promote itself ("repmgr standby follow" will
		 * refuse to promote another node if the primary is available). However
		 * the new primary will only instruct use to follow it after it's successfully
		 * promoted itself, so that very likely won't be the reason for the failure.
		 *
		 *
		 * TODO: check the new primary too - we could have a split-brain
		 * situation where the old primary reappeared just after the new
		 * one promoted itself.
		 */
		old_primary_conn = establish_db_connection(failed_primary.conninfo, false);

		if (PQstatus(old_primary_conn) == CONNECTION_OK)
		{
			// XXX add event notifications
			RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
			PQfinish(old_primary_conn);

			if (upstream_recovery_type == RECTYPE_PRIMARY)
			{
				log_notice(_("original primary reappeared - no action taken"));
				return FAILOVER_STATE_PRIMARY_REAPPEARED;
			}
		}

		return FAILOVER_STATE_FOLLOW_FAIL;
	}


	/*
	 * refresh local copy of local and primary node records - we get these
	 * directly from the primary to ensure they're the current version
	 */

	record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);

	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve metadata record found for node %i"),
				  new_primary_id);
		return FAILOVER_STATE_FOLLOW_FAIL;
	}

	record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve metadata record found for node %i"),
				  local_node_info.node_id);
		return FAILOVER_STATE_FOLLOW_FAIL;
	}


	local_conn = establish_db_connection(local_node_info.conninfo, false);
	initPQExpBuffer(&event_details);
	appendPQExpBuffer(&event_details,
					  _("node %i now following new upstream node %i"),
					  local_node_info.node_id,
					  upstream_node_info.node_id);

	log_notice("%s\n", event_details.data);

	create_event_notification(upstream_conn,
						&config_file_options,
						local_node_info.node_id,
						"repmgrd_failover_follow",
						true,
						event_details.data);

	termPQExpBuffer(&event_details);

	return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
}


/*
 * Return a static, human-readable label for a NodeVotingStatus value,
 * intended for log output. Values not handled by the switch yield
 * "UNKNOWN VOTE REQUEST STATE".
 */
static const char *
_print_voting_status(NodeVotingStatus voting_status)
{
	/* fallback for any value not covered below */
	const char *label = "UNKNOWN VOTE REQUEST STATE";

	switch(voting_status)
	{
		case VS_NO_VOTE:
			label = "NO VOTE";
			break;

		case VS_VOTE_REQUEST_RECEIVED:
			label = "VOTE REQUEST RECEIVED";
			break;

		case VS_VOTE_INITIATED:
			label = "VOTE REQUEST INITIATED";
			break;

		case VS_UNKNOWN:
			label = "VOTE REQUEST UNKNOWN";
			break;
	}

	return label;
}

/*
 * Return a static, human-readable label for an ElectionResult value,
 * intended for log output.
 */
static const char *
_print_election_result(ElectionResult result)
{
	/* only returned if "result" is not a known enum value */
	const char *label = "UNKNOWN";

	switch(result)
	{
		case ELECTION_NOT_CANDIDATE:
			label = "NOT CANDIDATE";
			break;

		case ELECTION_WON:
			label = "WON";
			break;

		case ELECTION_LOST:
			label = "LOST";
			break;

		case ELECTION_CANCELLED:
			label = "CANCELLED";
			break;
	}

	return label;
}


static ElectionResult
do_election(void)
{
	int electoral_term = -1;

	int votes_for_me = 0;

	/* we're visible */
	int visible_nodes = 1;

	/*
	 * get voting status from shared memory - should be one of "VS_NO_VOTE"
	 * or "VS_VOTE_REQUEST_RECEIVED". If VS_NO_VOTE, we declare ourselves as
	 * candidate and initiate the voting process.
	 */
	NodeVotingStatus voting_status;

	NodeInfoListCell *cell;

	bool other_node_is_candidate = false;
	bool other_node_is_ahead = false;

	/*
	 * Check if at least one server in the primary's location is visible;
	 * if not we'll assume a network split between this node and the primary
	 * location, and not promote any standby.
	 *
	 * NOTE: this function is only ever called by standbys attached to the current
	 * (unreachable) primary, so "upstream_node_info" will always contain the
	 * primary node record.
	 */
	bool primary_location_seen = false;

	/*
	 * sleep for a random period of 100 ~ 350 ms
	 */

	long unsigned rand_wait = (long) ((rand() % 35) + 10) * 10000;

	log_debug("do_election(): sleeping %lu", rand_wait);
	log_debug("do_election(): primary location is %s", upstream_node_info.location);

	pg_usleep(rand_wait);

	local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;

	log_debug("do_election(): executing get_voting_status()");
	voting_status = get_voting_status(local_conn);
	log_debug("do_election(): node voting status is %s", _print_voting_status(voting_status));

	if (voting_status == VS_VOTE_REQUEST_RECEIVED)
	{
		/* we've already been requested to vote, so can't become a candidate */
		log_debug("vote request already received, not candidate");
		return ELECTION_NOT_CANDIDATE;
	}

	/*
	 * Here we mark ourselves as candidate, so any further vote requests
	 * are rejected. However it's possible another node has done the
	 * same thing, so when announcing ourselves as candidate to the other
	 * nodes, we'll check for that and withdraw our candidature.
	 */
	electoral_term = set_voting_status_initiated(local_conn);

	/* get all active nodes attached to primary, excluding self */
	get_active_sibling_node_records(local_conn,
									local_node_info.node_id,
									upstream_node_info.node_id,
									&standby_nodes);

	/* no other standbys - win by default */

	if (standby_nodes.node_count == 0)
	{
		if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
		{
			log_debug("no other nodes - we win by default");
			return ELECTION_WON;
		}
		else
		{
			log_debug("no other nodes, but primary and standby locations differ");

			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);

			return ELECTION_NOT_CANDIDATE;
		}
	}

	for (cell = standby_nodes.head; cell; cell = cell->next)
	{
		/* assume the worst case */
		cell->node_info->is_visible = false;

		cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);

		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
		{
			continue;
		}

		/*
		 * tell the other node we're candidate - if the node has already declared
		 * itself, we withdraw
		 *
		 * XXX check for situations where more than one node could end up as candidate?
		 *
		 * XXX note it's possible some nodes accepted our candidature before we
		 * found out about the other candidate, check what happens in that situation
		 *  -> other node will have info from all the nodes, even if not the vote,
		 *     so it should be able to determine the best node anyway
		 */

		if (announce_candidature(cell->node_info->conn, &local_node_info, cell->node_info, electoral_term) == false)
		{
			log_debug("node %i is candidate",  cell->node_info->node_id);
			other_node_is_candidate = true;

			/* don't notify any further standbys */
			break;
		}

		/*
		 * see if the node is in the primary's location (but skip the check
		 * if we've seen
		 */
		if (primary_location_seen == false)
		{
			if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
			{
				primary_location_seen = true;
			}
		}

		cell->node_info->is_visible = true;
		visible_nodes ++;
	}

	if (other_node_is_candidate == true)
	{
		clear_node_info_list(&standby_nodes);

		reset_node_voting_status();
		log_debug("other node is candidate, returning NOT CANDIDATE");
		return ELECTION_NOT_CANDIDATE;
	}

	if (primary_location_seen == false)
	{
		log_notice(_("no nodes from the primary location \"%s\" visible - assuming network split"),
				   upstream_node_info.location);
		log_detail(_("node will enter degraded monitoring state waiting for reconnect"));

		monitoring_state = MS_DEGRADED;
		INSTR_TIME_SET_CURRENT(degraded_monitoring_start);

		reset_node_voting_status();

		return ELECTION_CANCELLED;
	}


	/* get our lsn */
	local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);

	log_debug("last receive lsn = %X/%X",
			  (uint32) (local_node_info.last_wal_receive_lsn >> 32),
			  (uint32)  local_node_info.last_wal_receive_lsn);

	/* request vote from each node */

	for (cell = standby_nodes.head; cell; cell = cell->next)
	{
		log_debug("checking node %i...", cell->node_info->node_id);
		/* ignore unreachable nodes */
		if (cell->node_info->is_visible == false)
			continue;
		votes_for_me += request_vote(cell->node_info->conn,
									 &local_node_info,
									 cell->node_info,
									 electoral_term);

		if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn)
		{
			/* register if another node is ahead of us */
			other_node_is_ahead = true;
		}
		PQfinish(cell->node_info->conn);
		cell->node_info->conn = NULL;
	}

	/* vote for myself, but only if I believe no-one else is ahead */
	if (other_node_is_ahead == false)
	{
		votes_for_me += 1;
	}

	log_debug(_("%i of of %i votes"), votes_for_me, visible_nodes);

	if (votes_for_me == visible_nodes)
		return ELECTION_WON;

	return ELECTION_LOST;
}


/*
 * Set the file-level failover state back to FAILOVER_STATE_NONE and, if
 * the local connection is usable, reset the node's voting status through
 * it. An error is logged if the local connection is not usable.
 */
static void
reset_node_voting_status(void)
{
	/* the failover state is cleared regardless of connection state */
	failover_state = FAILOVER_STATE_NONE;

	if (PQstatus(local_conn) == CONNECTION_OK)
	{
		reset_voting_status(local_conn);
		return;
	}

	log_error(_("reset_node_voting_status(): local_conn not set"));
}


/*
 * Close the primary and upstream connections, if they are currently open.
 *
 * A connection whose status is not CONNECTION_OK is left untouched.
 */
void
close_connections_physical()
{
	if (PQstatus(primary_conn) == CONNECTION_OK)
	{
		/* cancel any pending queries to the primary */
		if (PQisBusy(primary_conn) == 1)
		{
			cancel_query(primary_conn, config_file_options.primary_response_timeout);
		}

		PQfinish(primary_conn);
		primary_conn = NULL;
	}

	/* nothing further to do unless the upstream connection is open */
	if (upstream_conn == NULL)
	{
		return;
	}

	if (PQstatus(upstream_conn) != CONNECTION_OK)
	{
		return;
	}

	PQfinish(upstream_conn);
	upstream_conn = NULL;
}