diff --git a/Makefile.in b/Makefile.in
index bcd8eacc..132d90bb 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -29,7 +29,7 @@ $(info Building against PostgreSQL $(MAJORVERSION))
 REPMGR_CLIENT_OBJS = repmgr-client.o \
   repmgr-action-primary.o repmgr-action-standby.o repmgr-action-bdr.o repmgr-action-cluster.o \
   config.o log.o strutil.o dbutils.o dirutil.o compat.o controldata.o
-REPMGRD_OBJS = repmgrd.o config.o log.o dbutils.o strutil.o
+REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o config.o log.o dbutils.o strutil.o
 
 $(REPMGR_CLIENT_OBJS): repmgr-client.h
 
@@ -58,6 +58,8 @@ additional-clean:
 	rm -f repmgr-action-bdr.o
 	rm -f repmgr-action-cluster.o
 	rm -f repmgrd.o
+	rm -f repmgrd-physical.o
+	rm -f repmgrd-bdr.o
 	rm -f compat.o
 	rm -f config.o
 	rm -f controldata.o
diff --git a/repmgr-bdr.h b/repmgr-bdr.h
new file mode 100644
index 00000000..8b437602
--- /dev/null
+++ b/repmgr-bdr.h
@@ -0,0 +1,13 @@
+/*
+ * repmgr-bdr.h
+ * Copyright (c) 2ndQuadrant, 2010-2017
+ */
+
+#ifndef _REPMGR_BDR_H_
+#define _REPMGR_BDR_H_
+
+extern void do_bdr_node_check(void);
+extern void monitor_bdr(void);
+extern t_node_info *do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);
+
+#endif /* _REPMGR_BDR_H_ */
diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c
new file mode 100644
index 00000000..e6ec164c
--- /dev/null
+++ b/repmgrd-bdr.c
@@ -0,0 +1,385 @@
+/*
+ * repmgrd-bdr.c - BDR functionality for repmgrd
+ *
+ * Copyright (c) 2ndQuadrant, 2010-2017
+ */
+
+#include <signal.h>
+
+#include "repmgr.h"
+#include "repmgrd.h"
+#include "repmgrd-bdr.h"
+#include "config.h"
+
+
+static volatile sig_atomic_t got_SIGHUP = false;
+
+void
+do_bdr_node_check(void)
+{
+	/* placeholder: no BDR-specific node checks are performed at the moment */
+}
+
+
+/*
+ * monitor_bdr() - check the local BDR node (exiting on failure), select the
+ * node to monitor, then loop calling do_bdr_failover() when it is unreachable.
+ */
+void
+monitor_bdr(void)
+{
+	NodeInfoList  nodes = T_NODE_INFO_LIST_INITIALIZER;
+	PGconn		 *monitoring_conn = NULL;
+	t_node_info	 *monitored_node = NULL;
+	RecordStatus  record_status;
+
+	bool failover_done = false;
+
+	/* sanity check local database */
+	log_info(_("connecting to local database '%s'"),
+			 config_file_options.conninfo);
+
+	local_conn = establish_db_connection(config_file_options.conninfo, true);
+
+	/*
+	 * Local node must be running
+	 */
+	if (PQstatus(local_conn) != CONNECTION_OK)
+	{
+		log_error(_("unable connect to local node (ID: %i), terminating"),
+				  local_node_info.node_id);
+		log_hint(_("local node must be running before repmgrd can start"));
+		PQfinish(local_conn);
+		exit(ERR_DB_CONN);
+	}
+
+	/*
+	 * Verify that database is a BDR one
+	 * TODO: check if supported BDR version?
+	 */
+	log_info(_("connected to database, checking for BDR"));
+
+	if (!is_bdr_db(local_conn))
+	{
+		log_error(_("database is not BDR-enabled"));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/* the "nodes" table must be in the "repmgr" replication set; bail out if it is not */
+	if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false)
+	{
+		log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
+				  "nodes");
+
+		/* TODO: add `repmgr bdr sync` or similar for this situation, and hint here */
+
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/* Retrieve record for this node from the local database */
+	record_status = get_node_record(local_conn, config_file_options.node_id, &local_node_info);
+
+	/*
+	 * Terminate if we can't find the local node record. This is a "fix-the-config"
+	 * situation, not a lot else we can do.
+	 */
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve record for local node (ID: %i), terminating"),
+					local_node_info.node_id);
+		log_hint(_("check that 'repmgr bdr register' was executed for this node\n"));
+		PQfinish(local_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/* TODO: check if the local node is inactive, and decide what to do in that case */
+
+	/* Log startup event */
+	create_event_record(local_conn,
+						&config_file_options,
+						config_file_options.node_id,
+						"repmgrd_start",
+						true,
+						NULL);
+
+	/*
+	 * retrieve list of nodes - we'll need these if the DB connection goes away,
+	 * or if we're monitoring a non-local node
+	 */
+	get_node_records_by_priority(local_conn, &nodes);
+
+	/* decide which node to monitor */
+
+	if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_LOCAL)
+	{
+		/* local mode: monitor this node, reusing the record retrieved above */
+		monitored_node = &local_node_info;
+
+		monitoring_conn = establish_db_connection(monitored_node->conninfo, false);
+		log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id);
+	}
+	else
+	{
+		NodeInfoListCell *cell;
+
+		for (cell = nodes.head; cell; cell = cell->next)
+		{
+			log_debug("main_loop_bdr() checking node %s %i", cell->node_info->node_name, cell->node_info->priority);
+
+			monitoring_conn = establish_db_connection(cell->node_info->conninfo, false);
+			if (PQstatus(monitoring_conn) == CONNECTION_OK)
+			{
+				log_debug("main_loop_bdr() monitoring node '%s' (ID %i, priority %i)",
+						  cell->node_info->node_name, cell->node_info->node_id, cell->node_info->priority);
+				/* fetch the record again, as the node list is transient */
+				monitored_node = get_node_record_pointer(monitoring_conn, cell->node_info->node_id);
+
+				break;
+			}
+		}
+	}
+
+	/* everything below dereferences monitored_node, so give up if none was found */
+	if (monitored_node == NULL)
+	{
+		log_error(_("unable to determine a node to monitor, terminating"));
+		PQfinish(local_conn);
+		exit(ERR_DB_CONN);
+	}
+
+	while (true)
+	{
+		/* normal state - connection active */
+		if (PQstatus(monitoring_conn) == CONNECTION_OK)
+		{
+			log_info(_("starting continuous bdr node monitoring"));
+
+			/* monitoring loop */
+			do
+			{
+				log_verbose(LOG_DEBUG, "bdr check loop...");
+
+				{
+					NodeInfoListCell *cell;
+
+					for (cell = nodes.head; cell; cell = cell->next)
+					{
+						log_debug("bdr_monitor() %s", cell->node_info->node_name);
+					}
+				}
+
+				if (is_server_available(monitored_node->conninfo) == false)
+				{
+					t_node_info  *new_monitored_node;
+
+					/* XXX improve; NOTE(review): this path does not sleep before the next check */
+					log_warning("connection problem!");
+					new_monitored_node = do_bdr_failover(&nodes, monitored_node);
+
+					if (new_monitored_node != NULL)
+					{
+						pfree(monitored_node);
+						monitored_node = new_monitored_node;
+					}
+					log_notice(_("monitored_node->node_name is now '%s' \n"), monitored_node->node_name);
+				}
+				else
+				{
+					sleep(config_file_options.monitor_interval_secs);
+				}
+
+				if (got_SIGHUP)
+				{
+					/*
+					 * if we can reload, then could need to change
+					 * local_conn
+					 */
+					if (reload_config(&config_file_options))
+					{
+						PQfinish(local_conn);
+						local_conn = establish_db_connection(config_file_options.conninfo, true);
+						update_registration(local_conn);
+					}
+
+					/* reload node list */
+					get_node_records_by_priority(local_conn, &nodes);
+
+					got_SIGHUP = false;
+				}
+			} while (!failover_done);
+		}
+		/* connection inactive - drop it and periodically try to reconnect (TODO: make this an option) */
+		else
+		{
+			PQfinish(monitoring_conn);
+			monitoring_conn = establish_db_connection(monitored_node->conninfo, false);
+
+			if (PQstatus(monitoring_conn) == CONNECTION_OK)
+			{
+				/* XXX event bdr_node_recovered -> if monitored == local node */
+
+				if (monitored_node->node_id == config_file_options.node_id)
+				{
+					log_notice(_("local connection has returned, resuming monitoring"));
+				}
+				else
+				{
+					log_notice(_("connection to '%s' has returned, resuming monitoring"), monitored_node->node_name);
+				}
+			}
+			else
+			{
+				sleep(config_file_options.monitor_interval_secs);
+			}
+
+			if (got_SIGHUP)
+			{
+				/*
+				 * if we can reload, then could need to change
+				 * local_conn
+				 */
+				if (reload_config(&config_file_options))
+				{
+					if (PQstatus(local_conn) == CONNECTION_OK)
+					{
+						PQfinish(local_conn);
+						local_conn = establish_db_connection(config_file_options.conninfo, true);
+						update_registration(local_conn);
+					}
+				}
+
+				/* reload node list */
+				if (PQstatus(local_conn) == CONNECTION_OK)
+					get_node_records_by_priority(local_conn, &nodes);
+
+				got_SIGHUP = false;
+			}
+		}
+
+		failover_done = false;
+	}
+}
+
+/*
+ * do_bdr_failover()
+ *
+ * Here we attempt to perform a BDR "failover". As there's no equivalent of a
+ * physical replication failover, we'll do the following:
+ *
+ *  - attempt to find another node, to set our node record as inactive
+ *  - generate an event log record on that node
+ *  - optionally execute `bdr_failover_command`, passing the conninfo string
+ *    of that node to the command; this can be used for e.g. reconfiguring
+ *    pgbouncer.
+ *  - if mode is 'BDR_MONITORING_PRIORITY', redirect monitoring to that node.
+ *
+ * "nodes" is the list of known nodes, "monitored_node" the one which failed.
+ * In priority mode returns the record of the node to monitor next, or NULL if
+ * there is none; in local mode always returns NULL.
+ */
+t_node_info *
+do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
+{
+	PGconn	   *next_node_conn = NULL;
+	NodeInfoListCell *cell;
+	bool	    failover_success = false;
+	PQExpBufferData event_details;
+	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
+	t_node_info *new_monitored_node = NULL;
+
+	initPQExpBuffer(&event_details);
+
+	/* get next active priority node */
+	for (cell = nodes->head; cell; cell = cell->next)
+	{
+		log_debug("do_bdr_failover() %s", cell->node_info->node_name);
+
+		/* don't attempt to connect to the current monitored node, as that's the one which has failed  */
+		if (cell->node_info->node_id == monitored_node->node_id)
+			continue;
+
+		/* XXX skip inactive node? */
+
+		next_node_conn = establish_db_connection(cell->node_info->conninfo, false);
+
+		if (PQstatus(next_node_conn) == CONNECTION_OK)
+		{
+			new_monitored_node = get_node_record_pointer(next_node_conn, cell->node_info->node_id);
+			break;
+		}
+
+		/* could not connect: release the connection before trying the next node */
+		PQfinish(next_node_conn);
+		next_node_conn = NULL;
+	}
+
+	if (next_node_conn == NULL)
+	{
+		appendPQExpBuffer(&event_details,
+						  _("no other available node found"));
+		log_error("%s", event_details.data);
+
+		/* TODO: no other nodes found; continue degraded monitoring until node is restored? */
+	}
+	else
+	{
+		log_info(_("connecting to target node %s"), cell->node_info->node_name);
+
+		failover_success = true;
+
+		event_info.conninfo_str = cell->node_info->conninfo;
+		event_info.node_name = cell->node_info->node_name;
+
+		/* update our own record on the other node */
+		if (monitored_node->node_id == config_file_options.node_id)
+		{
+			update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
+		}
+
+		/* the node's record may be unavailable even though we were able to connect */
+		if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_PRIORITY && new_monitored_node != NULL)
+		{
+			log_notice(_("monitoring next available node by priority: %s (ID %i)"),
+					   new_monitored_node->node_name,
+					   new_monitored_node->node_id);
+		}
+
+		appendPQExpBuffer(&event_details,
+						  _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
+						  monitored_node->node_name,
+						  monitored_node->node_id,
+						  cell->node_info->node_name,
+						  cell->node_info->node_id);
+	}
+
+	/*
+	 * Create an event record
+	 *
+	 * If we were able to connect to another node, we'll update the
+	 * event log there.
+	 *
+	 * In any case the event notification command will be triggered
+	 * with the event "bdr_failover"
+	 */
+	create_event_notification_extended(
+		next_node_conn,
+		&config_file_options,
+		config_file_options.node_id,
+		"bdr_failover",
+		failover_success,
+		event_details.data,
+		&event_info);
+
+	termPQExpBuffer(&event_details);
+
+	/* the connection is not passed back to the caller, so close it here */
+	PQfinish(next_node_conn);
+
+	if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_PRIORITY)
+		return new_monitored_node;
+
+	/* local monitoring mode - there's no new node to monitor */
+	if (new_monitored_node != NULL)
+		pfree(new_monitored_node);
+	return NULL;
+}
diff --git a/repmgrd-bdr.h b/repmgrd-bdr.h
new file mode 100644
index 00000000..6bbe6257
--- /dev/null
+++ b/repmgrd-bdr.h
@@ -0,0 +1,13 @@
+/*
+ * repmgrd-bdr.h
+ * Copyright (c) 2ndQuadrant, 2010-2017
+ */
+
+#ifndef _REPMGRD_BDR_H_
+#define _REPMGRD_BDR_H_
+
+extern void do_bdr_node_check(void);
+extern void monitor_bdr(void);
+extern t_node_info *do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);
+
+#endif /* _REPMGRD_BDR_H_ */
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
new file mode 100644
index 00000000..1384a876
--- /dev/null
+++ b/repmgrd-physical.c
@@ -0,0 +1,1716 @@
+/*
+ * repmgrd-physical.c - physical replication functionality for repmgrd
+ *
+ * Copyright (c) 2ndQuadrant, 2010-2017
+ */
+
+#include <signal.h>
+
+#include "repmgr.h"
+#include "repmgrd.h"
+#include "repmgrd-physical.h"
+
+
+typedef enum {	/* values held by the file-level "failover_state" variable */
+	FAILOVER_STATE_UNKNOWN = -1,	/* initial value; also set at the start of do_primary_failover() */
+	FAILOVER_STATE_NONE,
+	FAILOVER_STATE_PROMOTED,
+	FAILOVER_STATE_PROMOTION_FAILED,
+	FAILOVER_STATE_PRIMARY_REAPPEARED,
+	FAILOVER_STATE_LOCAL_NODE_FAILURE,
+	FAILOVER_STATE_WAITING_NEW_PRIMARY,	/* node awaits notification from the candidate node */
+	FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
+    FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
+	FAILOVER_STATE_NO_NEW_PRIMARY,
+	FAILOVER_STATE_FOLLOW_FAIL,
+	FAILOVER_STATE_NODE_NOTIFICATION_ERROR	/* unable to connect to the candidate node */
+} FailoverState;
+
+
+typedef enum {	/* result of do_election(), acted on by do_primary_failover() */
+	ELECTION_NOT_CANDIDATE = -1,	/* presumably: node waits for notification from the candidate */
+	ELECTION_WON,	/* node promotes itself */
+	ELECTION_LOST,	/* node polls for the best candidate */
+	ELECTION_CANCELLED	/* do_primary_failover() returns without acting */
+} ElectionResult;
+
+
+
+static FailoverState failover_state = FAILOVER_STATE_UNKNOWN;
+
+static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
+static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER;
+
+static PGconn *upstream_conn = NULL;
+static PGconn *primary_conn = NULL;
+
+static ElectionResult do_election(void);
+static const char *_print_voting_status(NodeVotingStatus voting_status);
+static const char *_print_election_result(ElectionResult result);
+static const char *_print_monitoring_state(MonitoringState monitoring_state);
+
+static FailoverState promote_self(void);
+static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
+
+static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);
+
+static bool wait_primary_notification(int *new_primary_id);
+static FailoverState follow_new_primary(int new_primary_id);
+
+static void reset_node_voting_status(void);
+void close_connections_physical();
+
+static bool do_primary_failover(void);
+static bool do_upstream_standby_failover(void);
+
+
+
+
+void
+do_physical_node_check(void)
+{
+	bool		promote_undefined;
+	bool		follow_undefined;
+
+	/*
+	 * An inactive node record means this node won't be considered as a
+	 * promotion candidate; this often happens when a failed primary is recloned
+	 * without being re-registered, giving the impression failover capability
+	 * is there when it's not.
+	 *
+	 * With `failover_mode=automatic` we abort with an error and a hint about
+	 * registering. With `failover_mode=manual` repmgrd can still passively
+	 * monitor the node, so we only issue a warning and the same hint.
+	 */
+	if (!local_node_info.active)
+	{
+		char *register_hint = "Check that 'repmgr (primary|standby) register' was executed for this node";
+
+		/* "failover_mode" is an enum, all values should be covered here */
+		switch (config_file_options.failover_mode)
+		{
+			case FAILOVER_AUTOMATIC:
+				log_error(_("this node is marked as inactive and cannot be used as a failover target"));
+				log_hint(_("%s"), register_hint);
+				PQfinish(local_conn);
+				terminate(ERR_BAD_CONFIG);
+
+			case FAILOVER_MANUAL:
+				log_warning(_("this node is marked as inactive and will be passively monitored only"));
+				log_hint(_("%s"), register_hint);
+				break;
+		}
+	}
+
+	/* the remaining checks only apply to automatic failover */
+	if (config_file_options.failover_mode != FAILOVER_AUTOMATIC)
+		return;
+
+	/*
+	 * Promote and follow commands must be defined, otherwise repmgrd won't be
+	 * able to perform any useful action.
+	 */
+	promote_undefined = (config_file_options.promote_command[0] == '\0'
+						 && config_file_options.service_promote_command[0] == '\0');
+	follow_undefined = (config_file_options.follow_command[0] == '\0');
+
+	if (promote_undefined)
+	{
+		log_error(_("either \"promote_command\" or \"service_promote_command\" must be defined in the configuration file"));
+	}
+	if (follow_undefined)
+	{
+		log_error(_("\"follow_command\" must be defined in the configuration file"));
+	}
+
+	if (promote_undefined || follow_undefined)
+	{
+		log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
+		PQfinish(local_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+}
+
+
+
+
+/*
+ * monitor_streaming_primary() - monitor the local node as cluster primary.
+ * If it is still not up after try_reconnect(), monitoring continues in a
+ * degraded state until it reappears or "degraded_monitoring_timeout" expires.
+ */
+void
+monitor_streaming_primary(void)
+{
+	NodeStatus	node_status = NODE_STATUS_UP;
+	instr_time	log_status_interval_start;
+	PQExpBufferData event_details;
+
+	reset_node_voting_status();
+
+	/* Log startup event */
+	if (startup_event_logged == false)
+	{
+		initPQExpBuffer(&event_details);
+
+		appendPQExpBuffer(&event_details,
+						  _("monitoring cluster primary \"%s\" (node ID: %i)"),
+						  local_node_info.node_name,
+						  local_node_info.node_id);
+
+		create_event_notification(local_conn,
+								  &config_file_options,
+								  config_file_options.node_id,
+								  "repmgrd_start",
+								  true,
+								  event_details.data);
+
+		startup_event_logged = true;
+
+		log_notice("%s", event_details.data);
+
+		termPQExpBuffer(&event_details);
+	}
+
+	INSTR_TIME_SET_CURRENT(log_status_interval_start);
+
+	while (true)
+	{
+		/* TODO: cache node list here, refresh at `node_list_refresh_interval` */
+		/* TODO: also return reason for unavailability so we can log it */
+		if (is_server_available(local_node_info.conninfo) == false)
+		{
+			/* node is down, we were expecting it to be up */
+			if (node_status == NODE_STATUS_UP)
+			{
+				PQExpBufferData event_details;	/* shadows the function-level event_details */
+				instr_time	local_node_unreachable_start;
+
+				INSTR_TIME_SET_CURRENT(local_node_unreachable_start);
+
+				initPQExpBuffer(&event_details);
+
+				appendPQExpBuffer(&event_details,
+								  _("unable to connect to local node"));
+
+				log_warning("%s", event_details.data);
+
+				node_status = NODE_STATUS_UNKNOWN;
+
+				PQfinish(local_conn);
+
+				/*
+				 * as we're monitoring the primary, no point in trying to write
+				 * the event to the database
+				 *
+				 * XXX possible pre-action event
+				 */
+				create_event_notification(NULL,
+										  &config_file_options,
+										  config_file_options.node_id,
+										  "repmgrd_local_disconnect",
+										  true,
+										  event_details.data);
+
+				termPQExpBuffer(&event_details);
+
+				local_conn = try_reconnect(local_node_info.conninfo, &node_status);
+
+				if (node_status == NODE_STATUS_UP)
+				{
+					int		local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
+
+					initPQExpBuffer(&event_details);
+
+					appendPQExpBuffer(&event_details,
+									  _("reconnected to local node after %i seconds"),
+									  local_node_unreachable_elapsed);
+					log_notice("%s", event_details.data);
+
+					create_event_notification(local_conn,
+										&config_file_options,
+										config_file_options.node_id,
+										"repmgrd_local_reconnect",
+										true,
+										event_details.data);
+					termPQExpBuffer(&event_details);
+
+					goto loop;
+				}
+
+				monitoring_state = MS_DEGRADED;
+				INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+			}
+		}
+
+		if (monitoring_state == MS_DEGRADED)
+		{
+			int		degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
+
+			if (config_file_options.degraded_monitoring_timeout > 0
+				&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
+			{
+				initPQExpBuffer(&event_details);
+
+				appendPQExpBuffer(&event_details,
+								  _("degraded monitoring timeout (%i seconds) exceeded, terminating"),
+								  degraded_monitoring_elapsed);
+
+				log_notice("%s", event_details.data);
+
+				create_event_notification(NULL,
+										  &config_file_options,
+										  config_file_options.node_id,
+										  "repmgrd_terminate",
+										  true,
+										  event_details.data);
+
+				termPQExpBuffer(&event_details);
+				terminate(ERR_MONITORING_TIMEOUT);
+			}
+
+			log_debug("monitoring node in degraded state for %i seconds", degraded_monitoring_elapsed);
+
+			if (is_server_available(local_node_info.conninfo) == true)
+			{
+				local_conn = establish_db_connection(local_node_info.conninfo, false);
+
+				if (PQstatus(local_conn) == CONNECTION_OK)
+				{
+					node_status = NODE_STATUS_UP;
+					monitoring_state = MS_NORMAL;
+
+					initPQExpBuffer(&event_details);
+
+					appendPQExpBuffer(&event_details,
+									  _("reconnected to primary node after %i seconds, resuming monitoring"),
+									  degraded_monitoring_elapsed);
+
+					create_event_notification(local_conn,
+											  &config_file_options,
+											  config_file_options.node_id,
+											  "repmgrd_local_reconnect",
+											  true,
+											  event_details.data);
+
+					log_notice("%s", event_details.data);
+					termPQExpBuffer(&event_details);
+
+					goto loop;
+				}
+			}
+
+			/* TODO: possibly attempt to find another node from cached list */
+			/* TODO: check if there's a new primary - if so add hook for fencing? */
+			/* TODO: loop, if starts up check status, switch monitoring mode */
+		}
+	loop:
+		/* emit "still alive" log message at regular intervals, if requested */
+		if (config_file_options.log_status_interval > 0)
+		{
+			int		log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
+
+			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
+			{
+				log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
+						 local_node_info.node_name,
+						 local_node_info.node_id,
+						 _print_monitoring_state(monitoring_state));
+
+				if (monitoring_state == MS_DEGRADED)
+				{
+					log_detail(_("waiting primary to reappear"));
+				}
+
+				INSTR_TIME_SET_CURRENT(log_status_interval_start);
+			}
+		}
+		sleep(1);
+	}
+}
+
+
+/*
+ * monitor_streaming_standby() - monitor this standby's upstream node. If the
+ * upstream is still down after try_reconnect(), a failover is attempted and
+ * the function returns once that failover reports completion. The connection
+ * to the local node is also checked, and re-established if lost, on each pass.
+ */
+void
+monitor_streaming_standby(void)
+{
+	RecordStatus record_status;
+	NodeStatus	upstream_node_status = NODE_STATUS_UP;
+	instr_time	log_status_interval_start;
+	PQExpBufferData event_details;
+
+	reset_node_voting_status();
+	log_debug("monitor_streaming_standby()");
+
+	/*
+	 * If no upstream node id is specified in the metadata, we'll try
+	 * and determine the current cluster primary in the assumption we
+	 * should connect to that by default.
+	 */
+	if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
+	{
+		local_node_info.upstream_node_id = get_primary_node_id(local_conn);
+
+		/*
+		 * Terminate if there doesn't appear to be an active cluster primary.
+		 * There could be one or more nodes marked as inactive primaries, and one
+		 * of them could actually be a primary, but we can't sensibly monitor
+		 * in that state.
+		 */
+		if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
+		{
+			/* XXX check if there are inactive record(s) and log detail/hint */
+			log_error(_("unable to determine an active primary for this cluster, terminating"));
+			PQfinish(local_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+	}
+
+	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);
+
+	/*
+	 * Terminate if we can't find the record for the node we're supposed
+	 * to monitor. This is a "fix-the-config" situation, not a lot else we
+	 * can do.
+	 */
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
+					local_node_info.upstream_node_id);
+		PQfinish(local_conn);
+		exit(ERR_DB_CONN);
+	}
+
+	log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);
+
+	upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
+
+	/*
+	 * Upstream node must be running.
+	 *
+	 * We could possibly have repmgrd skip to degraded monitoring mode until it
+	 * comes up, but there doesn't seem to be much point in doing that.
+	 */
+	if (PQstatus(upstream_conn) != CONNECTION_OK)
+	{
+		log_error(_("unable connect to upstream node (ID: %i), terminating"),
+				  local_node_info.upstream_node_id);
+		log_hint(_("upstream node must be running before repmgrd can start"));
+		PQfinish(local_conn);
+		exit(ERR_DB_CONN);
+	}
+
+	/* refresh upstream node record from upstream node, so it's as up-to-date as possible */
+	record_status = get_node_record(upstream_conn, upstream_node_info.node_id, &upstream_node_info);	/* NOTE(review): result is not checked */
+
+	if (upstream_node_info.type == STANDBY)
+	{
+		/*
+		 * Currently cascaded standbys need to be able to connect to the primary.
+		 * We could possibly add a limited connection mode for cases where this isn't
+		 * possible.
+		 */
+		primary_conn = establish_primary_db_connection(upstream_conn, false);
+
+		if (PQstatus(primary_conn) != CONNECTION_OK)
+		{
+			log_error(_("unable to connect to primary node"));
+			log_hint(_("ensure the primary node is reachable from this node"));
+			exit(ERR_DB_CONN);
+		}
+
+		log_verbose(LOG_DEBUG, "connected to primary");
+	}
+	else
+	{
+		primary_conn = upstream_conn;	/* upstream is not a standby: share its connection */
+	}
+
+	/* Log startup event */
+	if (startup_event_logged == false)
+	{
+		PQExpBufferData event_details;	/* shadows the function-level event_details */
+		initPQExpBuffer(&event_details);
+
+		appendPQExpBuffer(&event_details,
+						  _("monitoring upstream node \"%s\" (node ID: %i)"),
+						  upstream_node_info.node_name,
+						  upstream_node_info.node_id);
+
+		create_event_notification(primary_conn,
+								  &config_file_options,
+								  config_file_options.node_id,
+								  "repmgrd_start",
+								  true,
+								  event_details.data);
+
+		startup_event_logged = true;
+
+		log_notice("%s", event_details.data);
+
+		termPQExpBuffer(&event_details);
+	}
+
+	monitoring_state = MS_NORMAL;
+	INSTR_TIME_SET_CURRENT(log_status_interval_start);
+
+	while (true)
+	{
+		if (is_server_available(upstream_node_info.conninfo) == false)
+		{
+			/* upstream node is down, we were expecting it to be up */
+			if (upstream_node_status == NODE_STATUS_UP)
+			{
+				instr_time	upstream_node_unreachable_start;
+
+				INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
+
+				initPQExpBuffer(&event_details);
+
+				upstream_node_status = NODE_STATUS_UNKNOWN;
+
+				appendPQExpBuffer(&event_details,
+								  _("unable to connect to upstream node \"%s\" (node ID: %i)"),
+								  upstream_node_info.node_name, upstream_node_info.node_id);
+
+				if (upstream_node_info.type == STANDBY)
+				{
+					/* XXX possible pre-action event */
+					create_event_record(primary_conn,
+										&config_file_options,
+										config_file_options.node_id,
+										"repmgrd_upstream_disconnect",
+										true,
+										event_details.data);
+				}
+
+				log_warning("%s", event_details.data);
+				termPQExpBuffer(&event_details);
+
+				PQfinish(upstream_conn);	/* NOTE(review): primary_conn may alias this connection (assigned above) */
+				upstream_conn = try_reconnect(upstream_node_info.conninfo, &upstream_node_status);
+
+				if (upstream_node_status == NODE_STATUS_UP)
+				{
+					int		upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
+
+					initPQExpBuffer(&event_details);
+
+					appendPQExpBuffer(&event_details,
+									  _("reconnected to upstream node after %i seconds"),
+									  upstream_node_unreachable_elapsed);
+					log_notice("%s", event_details.data);
+
+					create_event_notification(local_conn,
+											  &config_file_options,
+											  config_file_options.node_id,
+											  "repmgrd_upstream_reconnect",
+											  true,
+											  event_details.data);
+					termPQExpBuffer(&event_details);
+
+					goto loop;
+				}
+
+				/* still down after reconnect attempt(s) */
+				if (upstream_node_status == NODE_STATUS_DOWN)
+				{
+					bool failover_done = false;
+
+					if (upstream_node_info.type == PRIMARY)
+					{
+						failover_done = do_primary_failover();
+					}
+					else if (upstream_node_info.type == STANDBY)
+					{
+						failover_done = do_upstream_standby_failover();
+					}
+
+					/* XXX it's possible it will make sense to return in */
+					/* all cases to restart monitoring */
+					if (failover_done == true)
+						return;
+				}
+			}
+		}
+
+		if (monitoring_state == MS_DEGRADED)
+		{
+			int		degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
+
+			log_debug("monitoring node %i in degraded state for %i seconds",
+					  upstream_node_info.node_id,
+					  degraded_monitoring_elapsed);
+
+			if (is_server_available(upstream_node_info.conninfo) == true)
+			{
+				upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
+
+				if (PQstatus(upstream_conn) == CONNECTION_OK)
+				{
+					/* XXX check here if upstream is still primary */
+					/* -> will be a problem if another node was promoted in the meantime */
+					/* and upstream is now former primary */
+					/* XXX scan other nodes to see if any has become primary */
+
+					upstream_node_status = NODE_STATUS_UP;
+					monitoring_state = MS_NORMAL;
+
+					if (upstream_node_info.type == PRIMARY)
+					{
+						primary_conn = upstream_conn;
+					}
+					else
+					{
+						if (primary_conn == NULL || PQstatus(primary_conn) != CONNECTION_OK)
+						{
+							primary_conn = establish_primary_db_connection(upstream_conn, false);
+						}
+					}
+
+					initPQExpBuffer(&event_details);
+
+					appendPQExpBuffer(&event_details,
+									  _("reconnected to upstream node %i after %i seconds, resuming monitoring"),
+									  upstream_node_info.node_id,
+									  degraded_monitoring_elapsed);
+
+					create_event_notification(primary_conn,
+											  &config_file_options,
+											  config_file_options.node_id,
+											  "repmgrd_upstream_reconnect",
+											  true,
+											  event_details.data);
+
+					log_notice("%s", event_details.data);
+					termPQExpBuffer(&event_details);
+
+					goto loop;
+				}
+			}
+			else
+			{
+				/* TODO: unable to connect to former primary - check if another node has */
+				/* been promoted */
+			}
+		}
+
+	loop:
+		/* emit "still alive" log message at regular intervals, if requested */
+		if (config_file_options.log_status_interval > 0)
+		{
+			int		log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
+
+			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
+			{
+				log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i) in %s state"),
+						 local_node_info.node_name,
+						 local_node_info.node_id,
+						 upstream_node_info.node_name,
+						 upstream_node_info.node_id,
+						 _print_monitoring_state(monitoring_state));
+
+				if (monitoring_state == MS_DEGRADED)
+				{
+					log_detail(_("waiting for upstream or another primary to reappear"));
+				}
+
+				INSTR_TIME_SET_CURRENT(log_status_interval_start);
+			}
+		}
+
+		/*
+		 * handle local node failure
+		 *
+		 * currently we'll just check the connection, and try to reconnect
+		 *
+		 * TODO: add timeout, after which we run in degraded state
+		 */
+		if (is_server_available(local_node_info.conninfo) == false)
+		{
+			log_warning(_("connection to local node %i lost"), local_node_info.node_id);
+
+			if (local_conn != NULL)
+			{
+				PQfinish(local_conn);
+				local_conn = NULL;
+			}
+		}
+
+		if (PQstatus(local_conn) != CONNECTION_OK)
+		{
+			log_info(_("attempting to reconnect"));
+			local_conn = establish_db_connection(config_file_options.conninfo, false);
+
+			if (PQstatus(local_conn) != CONNECTION_OK)
+			{
+				log_warning(_("reconnection failed"));
+			}
+			else
+			{
+				log_info(_("reconnected"));
+			}
+		}
+		sleep(1);
+	}
+}
+
+/*
+ * Run do_election() and act on the outcome: promote this node, follow another
+ * node, or give up. Returns true only on paths that reset failover_state to NONE.
+ */
+static bool
+do_primary_failover(void)
+{
+	/* attempt to initiate voting process */
+	ElectionResult election_result = do_election();
+
+	/* XXX add pre-event notification here */
+	failover_state = FAILOVER_STATE_UNKNOWN;
+
+	log_debug("election result: %s", _print_election_result(election_result));
+
+	if (election_result == ELECTION_CANCELLED)
+	{
+		log_notice(_("election cancelled"));
+		return false;
+	}
+	else if (election_result == ELECTION_WON)
+	{
+		log_notice("I am the winner, will now promote self and inform other nodes");
+
+		failover_state = promote_self();
+	}
+	else if (election_result == ELECTION_LOST)
+	{
+		t_node_info *best_candidate;
+
+		log_info(_("I am the candidate but did not get all votes; will now determine the best candidate"));
+
+		/* reset node list */
+		get_active_sibling_node_records(local_conn,
+										local_node_info.node_id,
+										upstream_node_info.node_id,
+										&standby_nodes);
+
+		best_candidate = poll_best_candidate(&standby_nodes);
+
+		/*
+		 * this can occur in a tie-break situation, where this node establishes
+		 * it is the best candidate
+		 */
+		if (best_candidate->node_id == local_node_info.node_id)
+		{
+			log_notice("I am the best candidate, will now promote self and inform other nodes");
+
+			failover_state = promote_self();
+		}
+		else
+		{
+			PGconn *candidate_conn = NULL;
+
+			log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
+					 best_candidate->node_id);
+
+			/* notify the best candidate of its own ID (a node receiving its own ID promotes itself, see below) */
+			candidate_conn = establish_db_connection(best_candidate->conninfo, false);
+
+			if (PQstatus(candidate_conn) == CONNECTION_OK)
+			{
+				notify_follow_primary(candidate_conn, best_candidate->node_id);
+
+				/*  we'll wait for the candidate to get back to us */
+				failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
+			}
+			else
+			{
+				log_error(_("unable to connect to candidate node (ID: %i)"), best_candidate->node_id);
+				failover_state = FAILOVER_STATE_NODE_NOTIFICATION_ERROR;
+			}
+			PQfinish(candidate_conn);
+		}
+	}
+	else
+	{
+		log_info(_("follower node awaiting notification from the candidate node"));
+		failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
+	}
+
+	/*
+	 * node has decided it is a follower, so will await notification
+	 * from the candidate that it has promoted itself and can be followed
+	 */
+	if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
+	{
+		int new_primary_id;
+
+		/* TODO: rerun the election if the new primary doesn't come up before the timeout */
+
+		/* either follow or time out; either way resume monitoring */
+		if (wait_primary_notification(&new_primary_id) == true)
+		{
+			/* if primary has reappeared, no action needed */
+			if (new_primary_id == upstream_node_info.node_id)
+			{
+				failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
+			}
+			/* if new_primary_id is self, promote */
+			else if (new_primary_id == local_node_info.node_id)
+			{
+				log_notice(_("this node is promotion candidate, promoting"));
+
+				failover_state = promote_self();
+
+				get_active_sibling_node_records(local_conn,
+												local_node_info.node_id,
+												upstream_node_info.node_id,
+												&standby_nodes);
+
+			}
+			else
+			{
+				failover_state = follow_new_primary(new_primary_id);
+			}
+		}
+		else
+		{
+			failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
+		}
+	}
+
+	switch(failover_state)
+	{
+		case FAILOVER_STATE_PROMOTED:
+			log_debug("failover state is PROMOTED");
+
+			/* notify former siblings that they should now follow this node */
+			notify_followers(&standby_nodes, local_node_info.node_id);
+
+			/* we no longer care about our former siblings */
+			clear_node_info_list(&standby_nodes);
+
+			/* pass control back down to start_monitoring() */
+			log_info(_("switching to primary monitoring mode"));
+
+			failover_state = FAILOVER_STATE_NONE;
+			return true;
+
+		case FAILOVER_STATE_PRIMARY_REAPPEARED:
+			log_debug("failover state is PRIMARY_REAPPEARED");
+
+			/* notify siblings that they should resume following the original primary */
+			notify_followers(&standby_nodes, upstream_node_info.node_id);
+
+			/* we no longer care about our former siblings */
+			clear_node_info_list(&standby_nodes);
+
+			/* pass control back down to start_monitoring() */
+			log_info(_("resuming standby monitoring mode"));
+			log_detail(_("original primary \"%s\" (node ID: %i) reappeared"),
+					   upstream_node_info.node_name, upstream_node_info.node_id);
+
+			failover_state = FAILOVER_STATE_NONE;
+			return true;
+
+		case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
+			log_info(_("resuming standby monitoring mode"));
+			log_detail(_("following new primary \"%s\" (node id: %i)"),
+					   upstream_node_info.node_name, upstream_node_info.node_id);
+			failover_state = FAILOVER_STATE_NONE;
+
+			return true;
+
+		case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
+			log_info(_("resuming standby monitoring mode"));
+			log_detail(_("following original primary \"%s\" (node id: %i)"),
+					   upstream_node_info.node_name, upstream_node_info.node_id);
+			failover_state = FAILOVER_STATE_NONE;
+
+			return true;
+
+		case FAILOVER_STATE_PROMOTION_FAILED:
+			log_debug("failover state is PROMOTION FAILED");
+			return false;
+
+		case FAILOVER_STATE_FOLLOW_FAIL:
+			/*
+			 * for whatever reason we were unable to follow the new primary -
+			 * continue monitoring in degraded state
+			 */
+			monitoring_state = MS_DEGRADED;
+			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+
+			return false;
+
+		case FAILOVER_STATE_NO_NEW_PRIMARY:
+		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
+			/* pass control back down to start_monitoring() */
+			/* TODO: should kick off new election */
+			return false;
+
+		case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
+		case FAILOVER_STATE_LOCAL_NODE_FAILURE:
+		case FAILOVER_STATE_UNKNOWN:
+		case FAILOVER_STATE_NONE:
+			log_debug("failover state is %i", failover_state);
+			return false;
+	}
+
+	/* should never reach here */
+	return false;
+}
+
+/*
+ * do_upstream_standby_failover()
+ *
+ * Attach cascaded standby to primary
+ *
+ * Currently we will try to attach to the cluster primary, as "repmgr
+ * standby follow" doesn't support attaching to another node. If this
+ * becomes supported, a choice of reconnection strategies (including
+ * not reconnecting at all) might be worth providing.
+ *
+ * Returns false if the primary node record cannot be retrieved or the
+ * follow command fails, true once the new upstream has been recorded.
+ *
+ * XXX check this handles replication slots gracefully
+ */
+static bool
+do_upstream_standby_failover(void)
+{
+	PQExpBufferData event_details;
+	t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
+	RecordStatus record_status;
+	int r;
+
+	PQfinish(upstream_conn);
+	upstream_conn = NULL;
+
+	record_status = get_primary_node_record(local_conn, &primary_node_info);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve primary node record"));
+		return false;
+	}
+	/*
+	 * Verify that we can still talk to the cluster primary, even though
+	 * the node's upstream is not available
+	 */
+
+	/* TODO: consolidate below code */
+	if (is_server_available(primary_node_info.conninfo) == false)
+	{
+		log_warning(_("connection to primary %i lost"), primary_node_info.node_id);
+
+		if (primary_conn != NULL)
+		{
+			PQfinish(primary_conn);
+			primary_conn = NULL;
+		}
+	}
+
+	if (PQstatus(primary_conn) != CONNECTION_OK)
+	{
+		log_info(_("attempting to reconnect"));
+		primary_conn = establish_db_connection(primary_node_info.conninfo, false);
+
+		if (PQstatus(primary_conn) != CONNECTION_OK)
+		{
+			log_warning(_("reconnection failed"));
+		}
+		else
+		{
+			log_info(_("reconnected"));
+		}
+	}
+
+	/* grandparent upstream is inactive  */
+	if (primary_node_info.active == false)
+	{
+		// XXX
+	}
+
+	/* Close the connection to this server */
+	PQfinish(local_conn);
+	local_conn = NULL;
+
+	initPQExpBuffer(&event_details);
+
+	log_debug(_("standby follow command is:\n  \"%s\""),
+			  config_file_options.follow_command);
+
+	r = system(config_file_options.follow_command);
+
+	if (r != 0)
+	{
+		appendPQExpBuffer(&event_details,
+						  _("unable to execute follow command:\n %s"),
+						  config_file_options.follow_command);
+
+		log_error("%s", event_details.data);
+
+		/* It may not be possible to write to the event notification
+		 * table but we should be able to generate an external notification
+		 * if required.
+		 */
+		create_event_notification(primary_conn,
+							&config_file_options,
+							local_node_info.node_id,
+							"repmgrd_failover_follow",
+							false,
+							event_details.data);
+
+		termPQExpBuffer(&event_details);
+		/* nothing was followed: restore the local connection and stop here, the buffer is gone */
+		local_conn = establish_db_connection(config_file_options.conninfo, false);
+		return false;
+	}
+
+	/* reconnect to local node */
+	local_conn = establish_db_connection(config_file_options.conninfo, false);
+
+	if (update_node_record_set_upstream(primary_conn,
+										local_node_info.node_id,
+										primary_node_info.node_id) == false)
+	{
+		appendPQExpBuffer(&event_details,
+						  _("unable to set node %i's new upstream ID to %i"),
+						  local_node_info.node_id,
+						  primary_node_info.node_id);
+
+		log_error("%s", event_details.data);
+
+		create_event_notification(NULL,
+							&config_file_options,
+							local_node_info.node_id,
+							"repmgrd_failover_follow",
+							false,
+							event_details.data);
+
+		termPQExpBuffer(&event_details);
+
+		terminate(ERR_BAD_CONFIG);
+	}
+	/* update own internal node record */
+	record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
+
+	appendPQExpBuffer(&event_details,
+					  _("node %i is now following primary node %i"),
+					  local_node_info.node_id,
+					  primary_node_info.node_id);
+
+	log_notice("%s", event_details.data);
+
+	create_event_notification(primary_conn,
+							  &config_file_options,
+							  local_node_info.node_id,
+							  "repmgrd_failover_follow",
+							  true,
+							  event_details.data);
+
+	termPQExpBuffer(&event_details);
+
+	PQfinish(primary_conn);
+	primary_conn = NULL;
+
+	return true;
+}
+
+
+/*
+ * Run the configured promote command and report the outcome as a FailoverState:
+ * PROMOTED on success, PRIMARY_REAPPEARED/PROMOTION_FAILED/LOCAL_NODE_FAILURE otherwise.
+ */
+static FailoverState
+promote_self(void)
+{
+	PQExpBufferData event_details;
+	char *promote_command;
+	int r;
+
+	/* Store details of the failed node here */
+	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
+	RecordStatus record_status;
+
+	/*
+	 * optionally add a delay before promoting the standby; this is mainly
+	 * useful for testing (e.g. for reappearance of the original primary)
+	 * and is not documented.
+	 */
+	if (config_file_options.promote_delay > 0)
+	{
+		log_debug("sleeping %i seconds before promoting standby",
+				  config_file_options.promote_delay);
+		sleep(config_file_options.promote_delay);
+	}
+
+	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
+				  local_node_info.upstream_node_id);
+		return FAILOVER_STATE_PROMOTION_FAILED;
+	}
+
+	/* the presence of either of these commands has been established already */
+	if (config_file_options.service_promote_command[0] != '\0')
+		promote_command = config_file_options.service_promote_command;
+	else
+		promote_command = config_file_options.promote_command;
+
+	log_debug("promote command is:\n  \"%s\"",
+			  promote_command);
+
+	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
+	{
+		fflush(stderr);
+	}
+
+	r = system(promote_command);
+
+	/* connection should stay up, but check just in case */
+	if(PQstatus(local_conn) != CONNECTION_OK)
+	{
+		local_conn = establish_db_connection(local_node_info.conninfo, true);
+
+		/* assume node failed */
+		if(PQstatus(local_conn) != CONNECTION_OK)
+		{
+			log_error(_("unable to reconnect to local node"));
+			// XXX handle this
+			return FAILOVER_STATE_LOCAL_NODE_FAILURE;
+		}
+	}
+
+	if (r != 0)
+	{
+		int primary_node_id;
+
+		upstream_conn = get_primary_connection(local_conn,
+											  &primary_node_id, NULL);
+
+		if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
+		{
+			log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
+					   failed_primary.node_id);
+
+			initPQExpBuffer(&event_details);
+			appendPQExpBuffer(&event_details,
+							  _("original primary \"%s\" (node ID: %i) reappeared"),
+							  failed_primary.node_name,
+							  failed_primary.node_id);
+
+			create_event_notification(upstream_conn,
+								&config_file_options,
+								local_node_info.node_id,
+								"repmgrd_failover_abort",
+								true,
+								event_details.data);
+
+			termPQExpBuffer(&event_details);
+
+			//primary_conn = NULL;
+			// XXX handle this! -> we'll need to let the other nodes know too....
+			/* no failover occurred but we'll want to restart connections */
+			//failover_done = true;
+			return FAILOVER_STATE_PRIMARY_REAPPEARED;
+		}
+
+		// XXX handle this -> check if somehow primary; otherwise go for new election?
+		log_error(_("promote command failed"));
+		return FAILOVER_STATE_PROMOTION_FAILED;
+	}
+
+	initPQExpBuffer(&event_details);
+
+	/* update own internal node record */
+	record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
+
+	/*
+	 * XXX here we're assuming the promote command updated metadata
+	 */
+	appendPQExpBuffer(&event_details,
+					  _("node %i promoted to primary; old primary %i marked as failed"),
+					  local_node_info.node_id,
+					  failed_primary.node_id);
+
+	/* local_conn is now the primary connection */
+	create_event_notification(local_conn,
+						&config_file_options,
+						local_node_info.node_id,
+						"repmgrd_failover_promote",
+						true,
+						event_details.data);
+
+	termPQExpBuffer(&event_details);
+
+	return FAILOVER_STATE_PROMOTED;
+}
+
+
+
+
+/*
+ * Tell every node in "standby_nodes" to follow node "follow_node_id". Normally
+ * that is the current node; if the original primary reappeared before this
+ * node could be promoted, followers are told to resume following it instead.
+ * Nodes without a working connection are reconnected; unreachable ones are skipped.
+ */
+static void
+notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
+{
+	NodeInfoListCell *cell;
+
+	log_debug("notify_followers()");
+	for (cell = standby_nodes->head; cell; cell = cell->next)
+	{
+		log_debug("intending to notify node %i... ", cell->node_info->node_id);
+		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
+		{
+			log_debug("reconnecting to node %i... ", cell->node_info->node_id);
+			/* release the stale handle first - a failed PGconn still owns memory */
+			PQfinish(cell->node_info->conn);
+			cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
+		}
+
+		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
+		{
+			log_debug("unable to reconnect to  %i ... ", cell->node_info->node_id);
+			continue;
+		}
+
+		log_debug("notifying node %i to follow node %i",
+				  cell->node_info->node_id, follow_node_id);
+		notify_follow_primary(cell->node_info->conn, follow_node_id);
+	}
+}
+
+
+/* Pick the promotion candidate among the local node and "standby_nodes". */
+static t_node_info *
+poll_best_candidate(NodeInfoList *standby_nodes)
+{
+	NodeInfoListCell *cell;
+	t_node_info *best_candidate = &local_node_info;
+
+	/* XXX ensure standby_nodes is set correctly */
+	/*
+	 * Every node must reach the same verdict, so the ordering is strict:
+	 * highest receive LSN, then highest priority, then lowest node_id.
+	 */
+	for (cell = standby_nodes->head; cell; cell = cell->next)
+	{
+		if (cell->node_info->last_wal_receive_lsn > best_candidate->last_wal_receive_lsn)
+		{
+			log_debug("node %i has higher LSN, now best candidate", cell->node_info->node_id);
+			best_candidate = cell->node_info;
+		}
+		else if (cell->node_info->last_wal_receive_lsn == best_candidate->last_wal_receive_lsn)
+		{
+			if (cell->node_info->priority > best_candidate->priority)
+			{
+				log_debug("node %i has higher priority, now best candidate", cell->node_info->node_id);
+				best_candidate = cell->node_info;
+			}
+			/* LSN and priority both tie: decide by node_id (never for a node that is behind) */
+			else if (cell->node_info->priority == best_candidate->priority
+					 && cell->node_info->node_id < best_candidate->node_id)
+			{
+				log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
+				best_candidate = cell->node_info;
+			}
+		}
+	}
+
+	log_info(_("best candidate is %i"), best_candidate->node_id);
+
+	return best_candidate;
+}
+
+
+/*
+ * Poll get_new_primary() once a second until it stores a node ID in
+ * "new_primary_id" (returns true) or the timeout expires (returns false).
+ */
+static bool
+wait_primary_notification(int *new_primary_id)
+{
+	int wait_primary_timeout = 60;	/* seconds; XXX make this configurable */
+	int i;
+
+	for (i = 0; i < wait_primary_timeout; i++)
+	{
+		if (get_new_primary(local_conn, new_primary_id) == true)
+		{
+			log_debug("new primary is %i; elapsed: %i", *new_primary_id, i);
+			return true;
+		}
+		sleep(1);
+	}
+
+	log_warning(_("no notification received from new primary after %i seconds"),
+				wait_primary_timeout);
+	return false;
+}
+
+
+/*
+ * Repoint this standby at node "new_primary_id" by running the configured
+ * follow command; the returned FailoverState describes the outcome.
+ */
+static FailoverState
+follow_new_primary(int new_primary_id)
+{
+	PQExpBufferData event_details;
+	int r;
+	/* Store details of the failed node here */
+	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
+	t_node_info new_primary = T_NODE_INFO_INITIALIZER;
+	RecordStatus record_status;
+	bool new_primary_ok = false;
+
+	record_status = get_node_record(local_conn, new_primary_id, &new_primary);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve metadata record for upstream node (ID: %i)"),
+				  new_primary_id);
+		return FAILOVER_STATE_FOLLOW_FAIL;
+	}
+
+	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve metadata record for failed primary (ID: %i)"),
+					local_node_info.upstream_node_id);
+		return FAILOVER_STATE_FOLLOW_FAIL;
+	}
+
+	// XXX check if new_primary_id == failed_primary.node_id?
+	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
+	{
+		fflush(stderr);
+	}
+
+	log_debug(_("standby follow command is:\n  \"%s\""),
+			  config_file_options.follow_command);
+
+	upstream_conn = establish_db_connection(new_primary.conninfo, false);
+
+	if (PQstatus(upstream_conn) == CONNECTION_OK)
+	{
+		RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
+		if (primary_recovery_type == RECTYPE_PRIMARY)
+		{
+			new_primary_ok = true;
+		}
+		else
+		{
+			log_warning(_("new primary (node ID: %i) is not running as a primary"), new_primary_id);
+		}
+	}
+
+	if (new_primary_ok == false)
+	{
+		/* free the handle (needed even after a failed attempt) and leave no dangling pointer */
+		PQfinish(upstream_conn);
+		upstream_conn = NULL;
+		return FAILOVER_STATE_FOLLOW_FAIL;
+	}
+
+	/*
+	 * disconnect from local node, as follow operation will result in
+	 * a server restart
+	 */
+	PQfinish(local_conn);
+	local_conn = NULL;
+
+	/* execute the follow command */
+	r = system(config_file_options.follow_command);
+
+	if (r != 0)
+	{
+		PGconn *old_primary_conn;
+		/*
+		 * The follow action could still fail due to the original primary reappearing
+		 * before the candidate could promote itself ("repmgr standby follow" will
+		 * refuse to promote another node if the primary is available). However
+		 * the new primary will only instruct us to follow it after it's successfully
+		 * promoted itself, so that very likely won't be the reason for the failure.
+		 *
+		 * TODO: check the new primary too - we could have a split-brain
+		 * situation where the old primary reappeared just after the new
+		 * one promoted itself.
+		 */
+		old_primary_conn = establish_db_connection(failed_primary.conninfo, false);
+
+		if (PQstatus(old_primary_conn) == CONNECTION_OK)
+		{
+			// XXX add event notifications
+			RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
+			PQfinish(old_primary_conn);
+
+			if (upstream_recovery_type == RECTYPE_PRIMARY)
+			{
+				log_notice(_("original primary reappeared - no action taken"));
+				return FAILOVER_STATE_PRIMARY_REAPPEARED;
+			}
+		}
+
+		return FAILOVER_STATE_FOLLOW_FAIL;
+	}
+
+	/*
+	 * refresh local copy of local and primary node records - we get these
+	 * directly from the primary to ensure they're the current version
+	 */
+
+	record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve metadata record found for node %i"),
+				  new_primary_id);
+		return FAILOVER_STATE_FOLLOW_FAIL;
+	}
+
+	record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve metadata record found for node %i"),
+				  local_node_info.node_id);
+		return FAILOVER_STATE_FOLLOW_FAIL;
+	}
+
+	local_conn = establish_db_connection(local_node_info.conninfo, false);
+	initPQExpBuffer(&event_details);
+	appendPQExpBuffer(&event_details,
+					  _("node %i now following new upstream node %i"),
+					  local_node_info.node_id,
+					  upstream_node_info.node_id);
+
+	log_notice("%s", event_details.data);
+
+	create_event_notification(upstream_conn,
+						&config_file_options,
+						local_node_info.node_id,
+						"repmgrd_failover_follow",
+						true,
+						event_details.data);
+
+	termPQExpBuffer(&event_details);
+
+	return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
+}
+
+
+/*
+ * Return a fixed label for a NodeVotingStatus value, for use in log output.
+ */
+static const char *
+_print_voting_status(NodeVotingStatus voting_status)
+{
+	/* used if "voting_status" matches none of the known values */
+	const char *label = "UNKNOWN VOTE REQUEST STATE";
+
+	if (voting_status == VS_NO_VOTE)
+		label = "NO VOTE";
+	else if (voting_status == VS_VOTE_REQUEST_RECEIVED)
+		label = "VOTE REQUEST RECEIVED";
+	else if (voting_status == VS_VOTE_INITIATED)
+		label = "VOTE REQUEST INITIATED";
+	else if (voting_status == VS_UNKNOWN)
+		label = "VOTE REQUEST UNKNOWN";
+
+	return label;
+}
+
+/*
+ * Return a fixed label for an ElectionResult value, for use in log
+ * output.
+ */
+static const char *
+_print_election_result(ElectionResult result)
+{
+	/* should never be returned: covers values outside the enum */
+	const char *label = "UNKNOWN";
+
+	if (result == ELECTION_NOT_CANDIDATE)
+		label = "NOT CANDIDATE";
+	else if (result == ELECTION_WON)
+		label = "WON";
+	else if (result == ELECTION_LOST)
+		label = "LOST";
+	else if (result == ELECTION_CANCELLED)
+		label = "CANCELLED";
+
+	return label;
+}
+
+/*
+ * Return a fixed label for a MonitoringState value, for use in log output.
+ */
+static const char *
+_print_monitoring_state(MonitoringState monitoring_state)
+{
+	if (monitoring_state == MS_NORMAL)
+		return "normal";
+
+	if (monitoring_state == MS_DEGRADED)
+		return "degraded";
+
+	/* should never reach here */
+	return "UNKNOWN";
+}
+
+
+
+/*
+ * Stand as promotion candidate among the sibling standbys and, unless a rival
+ * candidate or a suspected network split is found, collect their votes.
+ */
+static ElectionResult
+do_election(void)
+{
+	int electoral_term = -1;
+	int votes_for_me = 0;
+
+	/* we're visible */
+	int visible_nodes = 1;
+
+	/*
+	 * get voting status from shared memory - should be one of "VS_NO_VOTE"
+	 * or "VS_VOTE_REQUEST_RECEIVED". If VS_NO_VOTE, we declare ourselves as
+	 * candidate and initiate the voting process.
+	 */
+	NodeVotingStatus voting_status;
+
+	NodeInfoListCell *cell;
+	bool other_node_is_candidate = false;
+	bool other_node_is_ahead = false;
+
+	/*
+	 * Check if at least one server in the primary's location is visible;
+	 * if not we'll assume a network split between this node and the primary
+	 * location, and not promote any standby.
+	 *
+	 * NOTE: this function is only ever called by standbys attached to the current
+	 * (unreachable) primary, so "upstream_node_info" will always contain the
+	 * primary node record.
+	 */
+	bool primary_location_seen = false;
+
+	/*
+	 * sleep for a random period of 100 ~ 440 ms
+	 */
+	long unsigned rand_wait = (long) ((rand() % 35) + 10) * 10000;
+
+	log_debug("do_election(): sleeping %lu", rand_wait);
+	log_debug("do_election(): primary location is %s", upstream_node_info.location);
+
+	pg_usleep(rand_wait);
+
+	local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;
+
+	log_debug("do_election(): executing get_voting_status()");
+	voting_status = get_voting_status(local_conn);
+	log_debug("do_election(): node voting status is %s", _print_voting_status(voting_status));
+
+	if (voting_status == VS_VOTE_REQUEST_RECEIVED)
+	{
+		/* we've already been requested to vote, so can't become a candidate */
+		log_debug("vote request already received, not candidate");
+		return ELECTION_NOT_CANDIDATE;
+	}
+
+	/*
+	 * Here we mark ourselves as candidate, so any further vote requests
+	 * are rejected. However it's possible another node has done the
+	 * same thing, so when announcing ourselves as candidate to the other
+	 * nodes, we'll check for that and withdraw our candidature.
+	 */
+	electoral_term = set_voting_status_initiated(local_conn);
+
+	/* get all active nodes attached to primary, excluding self */
+	get_active_sibling_node_records(local_conn,
+									local_node_info.node_id,
+									upstream_node_info.node_id,
+									&standby_nodes);
+
+	/* no other standbys - win by default */
+	if (standby_nodes.node_count == 0)
+	{
+		if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
+		{
+			log_debug("no other nodes - we win by default");
+			return ELECTION_WON;
+		}
+		else
+		{
+			log_debug("no other nodes, but primary and standby locations differ");
+
+			monitoring_state = MS_DEGRADED;
+			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+
+			return ELECTION_NOT_CANDIDATE;
+		}
+	}
+
+	for (cell = standby_nodes.head; cell; cell = cell->next)
+	{
+		/* assume the worst case */
+		cell->node_info->is_visible = false;
+
+		cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
+
+		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
+		{
+			/* a failed PGconn still owns memory - release it rather than leak it */
+			PQfinish(cell->node_info->conn);
+			cell->node_info->conn = NULL;
+			continue;
+		}
+
+		/*
+		 * tell the other node we're candidate - if the node has already declared
+		 * itself, we withdraw
+		 *
+		 * XXX check for situations where more than one node could end up as candidate?
+		 *
+		 * XXX note it's possible some nodes accepted our candidature before we
+		 * found out about the other candidate, check what happens in that situation
+		 *  -> other node will have info from all the nodes, even if not the vote,
+		 *     so it should be able to determine the best node anyway
+		 */
+		if (announce_candidature(cell->node_info->conn, &local_node_info, cell->node_info, electoral_term) == false)
+		{
+			log_debug("node %i is candidate",  cell->node_info->node_id);
+			other_node_is_candidate = true;
+
+			/* don't notify any further standbys */
+			break;
+		}
+
+		/*
+		 * see if the node is in the primary's location (but skip the check
+		 * if we've already seen one that is)
+		 */
+		if (primary_location_seen == false)
+		{
+			if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
+			{
+				primary_location_seen = true;
+			}
+		}
+
+		cell->node_info->is_visible = true;
+		visible_nodes ++;
+	}
+
+	if (other_node_is_candidate == true)
+	{
+		clear_node_info_list(&standby_nodes);
+
+		reset_node_voting_status();
+		log_debug("other node is candidate, returning NOT CANDIDATE");
+		return ELECTION_NOT_CANDIDATE;
+	}
+
+	if (primary_location_seen == false)
+	{
+		log_notice(_("no nodes from the primary location \"%s\" visible - assuming network split"),
+				   upstream_node_info.location);
+		log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
+
+		monitoring_state = MS_DEGRADED;
+		INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+
+		reset_node_voting_status();
+
+		return ELECTION_CANCELLED;
+	}
+
+	/* get our lsn */
+	local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);
+
+	log_debug("last receive lsn = %X/%X",
+			  (uint32) (local_node_info.last_wal_receive_lsn >> 32),
+			  (uint32)  local_node_info.last_wal_receive_lsn);
+
+	/* request vote from each node */
+	for (cell = standby_nodes.head; cell; cell = cell->next)
+	{
+		log_debug("checking node %i...", cell->node_info->node_id);
+		/* ignore unreachable nodes */
+		if (cell->node_info->is_visible == false)
+			continue;
+		votes_for_me += request_vote(cell->node_info->conn,
+									 &local_node_info,
+									 cell->node_info,
+									 electoral_term);
+
+		if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn)
+		{
+			/* register if another node is ahead of us */
+			other_node_is_ahead = true;
+		}
+		PQfinish(cell->node_info->conn);
+		cell->node_info->conn = NULL;
+	}
+
+	/* vote for myself, but only if I believe no-one else is ahead */
+	if (other_node_is_ahead == false)
+	{
+		votes_for_me += 1;
+	}
+
+	log_debug(_("%i of %i votes"), votes_for_me, visible_nodes);
+
+	if (votes_for_me == visible_nodes)
+		return ELECTION_WON;
+
+	return ELECTION_LOST;
+}
+
+
+/* Set failover_state back to NONE and clear this node's voting status. */
+static void
+reset_node_voting_status(void)
+{
+	failover_state = FAILOVER_STATE_NONE;
+
+	/* the voting status can only be cleared over a working local connection */
+	if (PQstatus(local_conn) == CONNECTION_OK)
+		reset_voting_status(local_conn);
+	else
+		log_error(_("reset_node_voting_status(): local_conn not set"));
+}
+
+
+/* Close the primary and upstream connections if they are currently open. */
+void
+close_connections_physical(void)
+{
+	if (PQstatus(primary_conn) == CONNECTION_OK)
+	{
+		/* cancel any pending queries to the primary */
+		if (PQisBusy(primary_conn) == 1)
+			cancel_query(primary_conn, config_file_options.primary_response_timeout);
+		PQfinish(primary_conn);
+		primary_conn = NULL;
+	}
+
+	if (upstream_conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
+	{
+		PQfinish(upstream_conn);
+		upstream_conn = NULL;
+	}
+}
diff --git a/repmgrd-physical.h b/repmgrd-physical.h
new file mode 100644
index 00000000..998ca659
--- /dev/null
+++ b/repmgrd-physical.h
@@ -0,0 +1,15 @@
+/*
+ * repmgrd-physical.h
+ * Copyright (c) 2ndQuadrant, 2010-2017
+ */
+
+#ifndef _REPMGRD_PHYSICAL_H_
+#define _REPMGRD_PHYSICAL_H_
+
+void do_physical_node_check(void);
+
+void monitor_streaming_primary(void);
+void monitor_streaming_standby(void);
+void close_connections_physical(void);
+
+#endif /* _REPMGRD_PHYSICAL_H_ */
diff --git a/repmgrd.c b/repmgrd.c
index 0543328e..0306377f 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -8,50 +8,19 @@
 #include <signal.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <time.h>
 
-#include "portability/instr_time.h"
+
+
 
 #include "repmgr.h"
+#include "repmgrd.h"
+#include "repmgrd-physical.h"
+#include "repmgrd-bdr.h"
 #include "config.h"
 #include "voting.h"
 
 #define OPT_HELP	1
 
-typedef enum {
-	NODE_STATUS_UNKNOWN = -1,
-	NODE_STATUS_UP,
-	NODE_STATUS_DOWN
-} NodeStatus;
-
-
-typedef enum {
-	FAILOVER_STATE_UNKNOWN = -1,
-	FAILOVER_STATE_NONE,
-	FAILOVER_STATE_PROMOTED,
-	FAILOVER_STATE_PROMOTION_FAILED,
-	FAILOVER_STATE_PRIMARY_REAPPEARED,
-	FAILOVER_STATE_LOCAL_NODE_FAILURE,
-	FAILOVER_STATE_WAITING_NEW_PRIMARY,
-	FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
-    FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
-	FAILOVER_STATE_NO_NEW_PRIMARY,
-	FAILOVER_STATE_FOLLOW_FAIL,
-	FAILOVER_STATE_NODE_NOTIFICATION_ERROR
-} FailoverState;
-
-
-typedef enum {
-	ELECTION_NOT_CANDIDATE = -1,
-	ELECTION_WON,
-	ELECTION_LOST,
-	ELECTION_CANCELLED
-} ElectionResult;
-
-typedef enum {
-	MS_NORMAL = 0,
-	MS_DEGRADED = 1
-} MonitoringState;
 
 static char	   *config_file = NULL;
 static bool		verbose = false;
@@ -60,24 +29,21 @@ static bool		daemonize = false;
 
 t_configuration_options config_file_options = T_CONFIGURATION_OPTIONS_INITIALIZER;
 
-static t_node_info local_node_info = T_NODE_INFO_INITIALIZER;
-static PGconn	   *local_conn = NULL;
+t_node_info local_node_info = T_NODE_INFO_INITIALIZER;
+PGconn	   *local_conn = NULL;
 
-static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
-static PGconn *upstream_conn = NULL;
-static PGconn *primary_conn = NULL;
 
-FailoverState failover_state = FAILOVER_STATE_UNKNOWN;
-
-static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER;
 
 /* Collate command line errors here for friendlier reporting */
 static ItemList	cli_errors = { NULL, NULL };
 
-static bool        startup_event_logged = false;
+bool        startup_event_logged = false;
 
-static MonitoringState monitoring_state = MS_NORMAL;
-static instr_time	degraded_monitoring_start;
+MonitoringState monitoring_state = MS_NORMAL;
+instr_time	degraded_monitoring_start;
+
+static void close_connections(void);
+void (*_close_connections)(void) = NULL;
 
 /*
  * Record receipt of SIGHUP; will cause configuration file to be reread
@@ -90,34 +56,9 @@ static void show_usage(void);
 static void daemonize_process(void);
 static void check_and_create_pid_file(const char *pid_file);
 
+
 static void start_monitoring(void);
-static void monitor_streaming_primary(void);
-static void monitor_streaming_standby(void);
-static void monitor_bdr(void);
 
-static PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
-
-static bool do_primary_failover(void);
-static bool do_upstream_standby_failover(void);
-static t_node_info *do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);
-
-static ElectionResult do_election(void);
-static const char *_print_voting_status(NodeVotingStatus voting_status);
-static const char *_print_election_result(ElectionResult result);
-static const char *_print_monitoring_state(MonitoringState monitoring_state);
-
-static FailoverState promote_self(void);
-static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
-
-static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);
-
-static bool wait_primary_notification(int *new_primary_id);
-static FailoverState follow_new_primary(int new_primary_id);
-
-static void reset_node_voting_status(void);
-
-static int calculate_elapsed(instr_time start_time);
-static void update_registration(PGconn *conn);
 
 #ifndef WIN32
 static void setup_event_handlers(void);
@@ -125,8 +66,12 @@ static void handle_sighup(SIGNAL_ARGS);
 static void handle_sigint(SIGNAL_ARGS);
 #endif
 
-static void close_connections();
-static void terminate(int retval);
+
+PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
+
+int calculate_elapsed(instr_time start_time);
+void update_registration(PGconn *conn);
+void terminate(int retval);
 
 int
 main(int argc, char **argv)
@@ -364,70 +309,20 @@ main(int argc, char **argv)
 		terminate(ERR_BAD_CONFIG);
 	}
 
-	log_debug("node id is %i, upstream is %i",
-			  local_node_info.node_id,
-			  local_node_info.upstream_node_id);
-
-    /*
-     * Check if node record is active - if not, and `failover_mode=automatic`, the node
-     * won't be considered as a promotion candidate; this often happens when
-     * a failed primary is recloned and the node was not re-registered, giving
-     * the impression failover capability is there when it's not. In this case
-     * abort with an error and a hint about registering.
-     *
-     * If `failover_mode=manual`, repmgrd can continue to passively monitor the node, but
-     * we should nevertheless issue a warning and the same hint.
-     */
-
-    if (local_node_info.active == false)
-    {
-        char *hint = "Check that 'repmgr (primary|standby) register' was executed for this node";
-
-        switch (config_file_options.failover_mode)
-        {
-			/* "failover_mode" is an enum, all values should be covered here */
-
-            case FAILOVER_AUTOMATIC:
-                log_error(_("this node is marked as inactive and cannot be used as a failover target"));
-                log_hint(_("%s"), hint);
-				PQfinish(local_conn);
-                terminate(ERR_BAD_CONFIG);
-
-            case FAILOVER_MANUAL:
-                log_warning(_("this node is marked as inactive and will be passively monitored only"));
-                log_hint(_("%s"), hint);
-                break;
-        }
-    }
-
-	if (config_file_options.failover_mode == FAILOVER_AUTOMATIC)
+	if (config_file_options.replication_type == REPLICATION_TYPE_BDR)
 	{
-		/*
-		 * check that promote/follow commands are defined, otherwise repmgrd
-		 * won't be able to perform any useful action
-		 */
-
-		bool required_param_missing = false;
-
-		if (config_file_options.promote_command[0] == '\0'
-			&& config_file_options.service_promote_command[0] == '\0')
-		{
-			log_error(_("either \"promote_command\" or \"service_promote_command\" must be defined in the configuration file"));
-			required_param_missing = true;
-		}
-		if (config_file_options.follow_command[0] == '\0')
-		{
-			log_error(_("\"follow_command\" must be defined in the configuration file"));
-			required_param_missing = true;
-		}
-
-		if (required_param_missing == true)
-		{
-			log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
-			PQfinish(local_conn);
-			exit(ERR_BAD_CONFIG);
-		}
+		log_debug("node id is %i", local_node_info.node_id);
+		do_bdr_node_check();
 	}
+	else
+	{
+		_close_connections = close_connections_physical;
+		log_debug("node id is %i, upstream node id is %i",
+				  local_node_info.node_id,
+				  local_node_info.upstream_node_id);
+		do_physical_node_check();
+	}
+
 
 
 	if (daemonize == true)
@@ -452,6 +347,7 @@ main(int argc, char **argv)
 }
 
 
+
 static void
 start_monitoring(void)
 {
@@ -461,8 +357,6 @@ start_monitoring(void)
 
 	while(true)
 	{
-		reset_node_voting_status();
-
 		switch (local_node_info.type)
 		{
 			case PRIMARY:
@@ -482,1926 +376,11 @@ start_monitoring(void)
 }
 
 
-static void
-monitor_streaming_primary(void)
-{
-	NodeStatus	node_status = NODE_STATUS_UP;
-	instr_time	log_status_interval_start;
-	PQExpBufferData event_details;
 
-	/* Log startup event */
-	if (startup_event_logged == false)
-	{
-		initPQExpBuffer(&event_details);
 
-		appendPQExpBuffer(&event_details,
-						  _("monitoring cluster primary \"%s\" (node ID: %i)"),
-						  local_node_info.node_name,
-						  local_node_info.node_id);
 
-		create_event_notification(local_conn,
-								  &config_file_options,
-								  config_file_options.node_id,
-								  "repmgrd_start",
-								  true,
-								  event_details.data);
 
-		startup_event_logged = true;
-
-		log_notice("%s", event_details.data);
-
-		termPQExpBuffer(&event_details);
-	}
-
-	INSTR_TIME_SET_CURRENT(log_status_interval_start);
-
-	while (true)
-	{
-
-		// cache node list here, refresh at `node_list_refresh_interval`
-		// also return reason for inavailability so we can log it
-		if (is_server_available(local_node_info.conninfo) == false)
-		{
-
-			/* node is down, we were expecting it to be up */
-			if (node_status == NODE_STATUS_UP)
-			{
-				PQExpBufferData event_details;
-				instr_time	local_node_unreachable_start;
-
-				INSTR_TIME_SET_CURRENT(local_node_unreachable_start);
-
-				initPQExpBuffer(&event_details);
-
-				appendPQExpBuffer(&event_details,
-								  _("unable to connect to local node"));
-
-				log_warning("%s", event_details.data);
-
-				node_status = NODE_STATUS_UNKNOWN;
-
-				PQfinish(local_conn);
-
-				/*
-				 * as we're monitoring the primary, no point in trying to write
-				 * the event to the database
-				 *
-				 * XXX possible pre-action event
-				 */
-				create_event_notification(NULL,
-										  &config_file_options,
-										  config_file_options.node_id,
-										  "repmgrd_local_disconnect",
-										  true,
-										  event_details.data);
-
-				termPQExpBuffer(&event_details);
-
-				local_conn = try_reconnect(local_node_info.conninfo, &node_status);
-
-				if (node_status == NODE_STATUS_UP)
-				{
-					int		local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
-
-					initPQExpBuffer(&event_details);
-
-					appendPQExpBuffer(&event_details,
-									  _("reconnected to local node after %i seconds"),
-									  local_node_unreachable_elapsed);
-					log_notice("%s", event_details.data);
-
-					create_event_notification(local_conn,
-										&config_file_options,
-										config_file_options.node_id,
-										"repmgrd_local_reconnect",
-										true,
-										event_details.data);
-					termPQExpBuffer(&event_details);
-
-					goto loop;
-				}
-
-				monitoring_state = MS_DEGRADED;
-				INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
-			}
-
-		}
-
-
-		if (monitoring_state == MS_DEGRADED)
-		{
-			int		degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
-
-			if (config_file_options.degraded_monitoring_timeout > 0
-				&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
-			{
-				initPQExpBuffer(&event_details);
-
-				appendPQExpBuffer(&event_details,
-								  _("degraded monitoring timeout (%i seconds) exceeded, terminating"),
-								  degraded_monitoring_elapsed);
-
-				log_notice("%s", event_details.data);
-
-				create_event_notification(NULL,
-										  &config_file_options,
-										  config_file_options.node_id,
-										  "repmgrd_terminate",
-										  true,
-										  event_details.data);
-
-				termPQExpBuffer(&event_details);
-				terminate(ERR_MONITORING_TIMEOUT);
-			}
-
-			log_debug("monitoring node in degraded state for %i seconds", degraded_monitoring_elapsed);
-
-			if (is_server_available(local_node_info.conninfo) == true)
-			{
-				local_conn = establish_db_connection(local_node_info.conninfo, false);
-
-				if (PQstatus(local_conn) == CONNECTION_OK)
-				{
-					node_status = NODE_STATUS_UP;
-					monitoring_state = MS_NORMAL;
-
-					initPQExpBuffer(&event_details);
-
-					appendPQExpBuffer(&event_details,
-									  _("reconnected to primary node after %i seconds, resuming monitoring"),
-									  degraded_monitoring_elapsed);
-
-					create_event_notification(local_conn,
-											  &config_file_options,
-											  config_file_options.node_id,
-											  "repmgrd_local_reconnect",
-											  true,
-											  event_details.data);
-
-					log_notice("%s", event_details.data);
-					termPQExpBuffer(&event_details);
-
-					goto loop;
-				}
-			}
-
-
-			// possibly attempt to find another node from cached list
-			// check if there's a new primary - if so add hook for fencing?
-			// loop, if starts up check status, switch monitoring mode
-		}
-	loop:
-		/* emit "still alive" log message at regular intervals, if requested */
-		if (config_file_options.log_status_interval > 0)
-		{
-			int		log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
-
-			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
-			{
-				log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
-						 local_node_info.node_name,
-						 local_node_info.node_id,
-						 _print_monitoring_state(monitoring_state));
-
-				if (monitoring_state == MS_DEGRADED)
-				{
-					log_detail(_("waiting primary to reappear"));
-				}
-
-				INSTR_TIME_SET_CURRENT(log_status_interval_start);
-			}
-		}
-		sleep(1);
-	}
-}
-
-
-static void
-monitor_streaming_standby(void)
-{
-	RecordStatus record_status;
-	NodeStatus	upstream_node_status = NODE_STATUS_UP;
-	instr_time	log_status_interval_start;
-	PQExpBufferData event_details;
-
-	log_debug("monitor_streaming_standby()");
-
-	/*
-	 * If no upstream node id is specified in the metadata, we'll try
-	 * and determine the current cluster primary in the assumption we
-	 * should connect to that by default.
-	 */
-	if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
-	{
-		local_node_info.upstream_node_id = get_primary_node_id(local_conn);
-
-		/*
-		 * Terminate if there doesn't appear to be an active cluster primary.
-		 * There could be one or more nodes marked as inactive primaries, and one
-		 * of them could actually be a primary, but we can't sensibly monitor
-		 * in that state.
-		 */
-		if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
-		{
-			// XXX check if there's an inactive record(s) and log detail/hint
-			log_error(_("unable to determine an active primary for this cluster, terminating"));
-			PQfinish(local_conn);
-			exit(ERR_BAD_CONFIG);
-		}
-	}
-
-	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);
-
-	/*
-	 * Terminate if we can't find the record for the node we're supposed
-	 * to monitor. This is a "fix-the-config" situation, not a lot else we
-	 * can do.
-	 */
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
-					local_node_info.upstream_node_id);
-		PQfinish(local_conn);
-		exit(ERR_DB_CONN);
-	}
-
-	log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);
-
-	upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
-
-	/*
-	 * Upstream node must be running.
-	 *
-	 * We could possibly have repmgrd skip to degraded monitoring mode until it
-	 * comes up, but there doesn't seem to be much point in doint that.
-	 */
-	if (PQstatus(upstream_conn) != CONNECTION_OK)
-	{
-		log_error(_("unable connect to upstream node (ID: %i), terminating"),
-				  local_node_info.upstream_node_id);
-		log_hint(_("upstream node must be running before repmgrd can start"));
-
-		PQfinish(local_conn);
-		exit(ERR_DB_CONN);
-	}
-
-	/* refresh upstream node record from upstream node, so it's as up-to-date as possible */
-	record_status = get_node_record(upstream_conn, upstream_node_info.node_id, &upstream_node_info);
-
-	if (upstream_node_info.type == STANDBY)
-	{
-		/*
-		 * Currently cascaded standbys need to be able to connect to the primary.
-		 * We could possibly add a limited connection mode for cases where this isn't
-		 * possible.
-		 */
-		primary_conn = establish_primary_db_connection(upstream_conn, false);
-
-		if (PQstatus(primary_conn) != CONNECTION_OK)
-		{
-			log_error(_("unable to connect to primary node"));
-			log_hint(_("ensure the primary node is reachable from this node"));
-			exit(ERR_DB_CONN);
-		}
-
-		log_verbose(LOG_DEBUG, "connected to primary");
-	}
-	else
-	{
-		primary_conn = upstream_conn;
-	}
-
-	/* Log startup event */
-	if (startup_event_logged == false)
-	{
-		PQExpBufferData event_details;
-		initPQExpBuffer(&event_details);
-
-		appendPQExpBuffer(&event_details,
-						  _("monitoring upstream node \"%s\" (node ID: %i)"),
-						  upstream_node_info.node_name,
-						  upstream_node_info.node_id);
-
-		create_event_notification(primary_conn,
-								  &config_file_options,
-								  config_file_options.node_id,
-								  "repmgrd_start",
-								  true,
-								  event_details.data);
-
-		startup_event_logged = true;
-
-		log_notice("%s", event_details.data);
-
-		termPQExpBuffer(&event_details);
-	}
-
-	monitoring_state = MS_NORMAL;
-	INSTR_TIME_SET_CURRENT(log_status_interval_start);
-
-	while (true)
-	{
-		if (is_server_available(upstream_node_info.conninfo) == false)
-		{
-
-			/* upstream node is down, we were expecting it to be up */
-			if (upstream_node_status == NODE_STATUS_UP)
-			{
-				instr_time	upstream_node_unreachable_start;
-
-				INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
-
-				initPQExpBuffer(&event_details);
-
-				upstream_node_status = NODE_STATUS_UNKNOWN;
-
-				appendPQExpBuffer(&event_details,
-								  _("unable to connect to upstream node \"%s\" (node ID: %i)"),
-								  upstream_node_info.node_name, upstream_node_info.node_id);
-
-				if (upstream_node_info.type == STANDBY)
-				{
-					/* XXX possible pre-action event */
-					create_event_record(primary_conn,
-										&config_file_options,
-										config_file_options.node_id,
-										"repmgrd_upstream_disconnect",
-										true,
-										event_details.data);
-				}
-
-				log_warning("%s", event_details.data);
-				termPQExpBuffer(&event_details);
-
-				PQfinish(upstream_conn);
-				upstream_conn = try_reconnect(upstream_node_info.conninfo, &upstream_node_status);
-
-				if (upstream_node_status == NODE_STATUS_UP)
-				{
-					int		upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
-
-					initPQExpBuffer(&event_details);
-
-					appendPQExpBuffer(&event_details,
-									  _("reconnected to upstream node after %i seconds"),
-									  upstream_node_unreachable_elapsed);
-					log_notice("%s", event_details.data);
-
-					create_event_notification(local_conn,
-											  &config_file_options,
-											  config_file_options.node_id,
-											  "repmgrd_upstream_reconnect",
-											  true,
-											  event_details.data);
-					termPQExpBuffer(&event_details);
-
-					goto loop;
-				}
-
-				/* still down after reconnect attempt(s) */
-				if (upstream_node_status == NODE_STATUS_DOWN)
-				{
-					bool failover_done = false;
-
-					if (upstream_node_info.type == PRIMARY)
-					{
-						failover_done = do_primary_failover();
-					}
-					else if (upstream_node_info.type == STANDBY)
-					{
-						failover_done = do_upstream_standby_failover();
-					}
-
-					// it's possible it will make sense to return in
-					// all cases to restart monitoring
-					if (failover_done == true)
-						return;
-				}
-			}
-		}
-
-		if (monitoring_state == MS_DEGRADED)
-		{
-			int		degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
-
-			log_debug("monitoring node %i in degraded state for %i seconds",
-					  upstream_node_info.node_id,
-					  degraded_monitoring_elapsed);
-
-			if (is_server_available(upstream_node_info.conninfo) == true)
-			{
-				upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
-
-				if (PQstatus(upstream_conn) == CONNECTION_OK)
-				{
-					// XXX check here if upstream is still primary
-					// -> will be a problem if another node was promoted in the meantime
-					// and upstream is now former primary
-					// XXX scan other nodes to see if any has become primary
-
-					upstream_node_status = NODE_STATUS_UP;
-					monitoring_state = MS_NORMAL;
-
-					if (upstream_node_info.type == PRIMARY)
-					{
-						primary_conn = upstream_conn;
-					}
-					else
-					{
-
-						if (primary_conn == NULL || PQstatus(primary_conn) != CONNECTION_OK)
-						{
-							primary_conn = establish_primary_db_connection(upstream_conn, false);
-						}
-					}
-
-					initPQExpBuffer(&event_details);
-
-					appendPQExpBuffer(&event_details,
-									  _("reconnected to upstream node %i after %i seconds, resuming monitoring"),
-									  upstream_node_info.node_id,
-									  degraded_monitoring_elapsed);
-
-					create_event_notification(primary_conn,
-											  &config_file_options,
-											  config_file_options.node_id,
-											  "repmgrd_upstream_reconnect",
-											  true,
-											  event_details.data);
-
-					log_notice("%s", event_details.data);
-					termPQExpBuffer(&event_details);
-
-					goto loop;
-				}
-			}
-			else
-			{
-				// unable to connect to former primary - check if another node has
-				// been promoted
-			}
-
-		}
-
-	loop:
-
-		/* emit "still alive" log message at regular intervals, if requested */
-		if (config_file_options.log_status_interval > 0)
-		{
-			int		log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
-
-			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
-			{
-				log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i) in %s state"),
-						 local_node_info.node_name,
-						 local_node_info.node_id,
-						 upstream_node_info.node_name,
-						 upstream_node_info.node_id,
-						 _print_monitoring_state(monitoring_state));
-
-				if (monitoring_state == MS_DEGRADED)
-				{
-					log_detail(_("waiting for upstream or another primary to reappear"));
-				}
-
-				INSTR_TIME_SET_CURRENT(log_status_interval_start);
-			}
-		}
-
-		/*
-		 * handle local node failure
-		 *
-		 * currently we'll just check the connection, and try to reconnect
-		 *
-		 * TODO: add timeout, after which we run in degraded state
-		 */
-		if (is_server_available(local_node_info.conninfo) == false)
-		{
-			log_warning(_("connection to local node %i lost"), local_node_info.node_id);
-
-			if (local_conn != NULL)
-			{
-				PQfinish(local_conn);
-				local_conn = NULL;
-			}
-		}
-
-		if (PQstatus(local_conn) != CONNECTION_OK)
-		{
-			log_info(_("attempting to reconnect"));
-			local_conn = establish_db_connection(config_file_options.conninfo, false);
-
-			if (PQstatus(local_conn) != CONNECTION_OK)
-			{
-				log_warning(_("reconnection failed"));
-			}
-			else
-			{
-				log_info(_("reconnected"));
-			}
-		}
-		sleep(1);
-	}
-}
-
-
-static bool
-do_primary_failover(void)
-{
-	/* attempt to initiate voting process */
-	ElectionResult election_result = do_election();
-
-	/* XXX add pre-event notification here */
-	failover_state = FAILOVER_STATE_UNKNOWN;
-
-	log_debug("election result: %s", _print_election_result(election_result));
-
-	if (election_result == ELECTION_CANCELLED)
-	{
-		log_notice(_("election cancelled"));
-		return false;
-	}
-	else if (election_result == ELECTION_WON)
-	{
-		log_notice("I am the winner, will now promote self and inform other nodes");
-
-		failover_state = promote_self();
-	}
-	else if (election_result == ELECTION_LOST)
-	{
-		t_node_info *best_candidate;
-
-		log_info(_("I am the candidate but did not get all votes; will now determine the best candidate"));
-
-
-		/* reset node list */
-		get_active_sibling_node_records(local_conn,
-										local_node_info.node_id,
-										upstream_node_info.node_id,
-										&standby_nodes);
-
-		best_candidate = poll_best_candidate(&standby_nodes);
-
-		/*
-		 * this can occur in a tie-break situation, where this node establishes
-		 * it is the best candidate
-		 */
-		if (best_candidate->node_id == local_node_info.node_id)
-		{
-			log_notice("I am the best candidate, will now promote self and inform other nodes");
-
-			failover_state = promote_self();
-		}
-		else
-		{
-			PGconn *candidate_conn = NULL;
-
-			log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
-					 best_candidate->node_id);
-
-			/* notify the best candidate so it */
-
-			candidate_conn = establish_db_connection(best_candidate->conninfo, false);
-
-			if (PQstatus(candidate_conn) == CONNECTION_OK)
-			{
-				notify_follow_primary(candidate_conn, best_candidate->node_id);
-
-				/*  we'll wait for the candidate to get back to us */
-				failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
-			}
-			else
-			{
-				log_error(_("unable to connect to candidate node (ID: %i)"), best_candidate->node_id);
-				failover_state = FAILOVER_STATE_NODE_NOTIFICATION_ERROR;
-			}
-			PQfinish(candidate_conn);
-		}
-	}
-	else
-	{
-		log_info(_("follower node awaiting notification from the candidate node"));
-		failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
-	}
-
-
-	/*
-	 * node has decided it is a follower, so will await notification
-	 * from the candidate that it has promoted itself and can be followed
-	 */
-	if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
-	{
-		int new_primary_id;
-
-		//   --> need timeout in case new primary doesn't come up, then rerun election
-
-		/* either follow or time out; either way resume monitoring */
-		if (wait_primary_notification(&new_primary_id) == true)
-		{
-			/* if primary has reappeared, no action needed */
-			if (new_primary_id == upstream_node_info.node_id)
-			{
-				failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
-			}
-			/* if new_primary_id is self, promote */
-			else if (new_primary_id == local_node_info.node_id)
-			{
-				log_notice(_("this node is promotion candidate, promoting"));
-
-				failover_state = promote_self();
-
-				get_active_sibling_node_records(local_conn,
-												local_node_info.node_id,
-												upstream_node_info.node_id,
-												&standby_nodes);
-
-			}
-			else
-			{
-				failover_state = follow_new_primary(new_primary_id);
-			}
-		}
-		else
-		{
-			failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
-		}
-	}
-
-	switch(failover_state)
-	{
-		case FAILOVER_STATE_PROMOTED:
-			log_debug("failover state is PROMOTED");
-
-			/* notify former siblings that they should now follow this node */
-			notify_followers(&standby_nodes, local_node_info.node_id);
-
-			/* we no longer care about our former siblings */
-			clear_node_info_list(&standby_nodes);
-
-			/* pass control back down to start_monitoring() */
-			log_info(_("switching to primary monitoring mode"));
-
-			failover_state = FAILOVER_STATE_NONE;
-			return true;
-
-		case FAILOVER_STATE_PRIMARY_REAPPEARED:
-			log_debug("failover state is PRIMARY_REAPPEARED");
-
-			/* notify siblings that they should resume following the original primary */
-			notify_followers(&standby_nodes, upstream_node_info.node_id);
-
-			/* we no longer care about our former siblings */
-			clear_node_info_list(&standby_nodes);
-
-			/* pass control back down to start_monitoring() */
-			log_info(_("resuming standby monitoring mode"));
-			log_detail(_("original primary \"%s\" (node ID: %i) reappeared"),
-					   upstream_node_info.node_name, upstream_node_info.node_id);
-
-			failover_state = FAILOVER_STATE_NONE;
-			return true;
-
-
-		case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
-			log_info(_("resuming standby monitoring mode"));
-			log_detail(_("following new primary \"%s\" (node id: %i)"),
-					   upstream_node_info.node_name, upstream_node_info.node_id);
-			failover_state = FAILOVER_STATE_NONE;
-
-			return true;
-
-		case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
-			log_info(_("resuming standby monitoring mode"));
-			log_detail(_("following original primary \"%s\" (node id: %i)"),
-					   upstream_node_info.node_name, upstream_node_info.node_id);
-			failover_state = FAILOVER_STATE_NONE;
-
-			return true;
-
-		case FAILOVER_STATE_PROMOTION_FAILED:
-			log_debug("failover state is PROMOTION FAILED");
-			return false;
-
-		case FAILOVER_STATE_FOLLOW_FAIL:
-			/*
-			 * for whatever reason we were unable to follow the new primary -
-			 * continue monitoring in degraded state
-			 */
-			monitoring_state = MS_DEGRADED;
-			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
-
-			return false;
-
-		case FAILOVER_STATE_NO_NEW_PRIMARY:
-		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
-			/* pass control back down to start_monitoring() */
-			// -> should kick off new election
-			return false;
-
-		case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
-		case FAILOVER_STATE_LOCAL_NODE_FAILURE:
-		case FAILOVER_STATE_UNKNOWN:
-		case FAILOVER_STATE_NONE:
-			log_debug("failover state is %i", failover_state);
-			return false;
-	}
-
-	/* should never reach here */
-	return false;
-}
-
-/*
- * do_upstream_standby_failover()
- *
- * Attach cascaded standby to primary
- *
- * Currently we will try to attach to the cluster primary, as "repmgr
- * standby follow" doesn't support attaching to another node.
- *
- * If this becomes supported, it might be worth providing a selection
- * of reconnection strategies as different behaviour might be desirable
- * in different situations;
- * or maybe the option not to reconnect might be required?
- *
- * XXX check this handles replication slots gracefully
- */
-static bool
-do_upstream_standby_failover(void)
-{
-	PQExpBufferData event_details;
-	t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
-	RecordStatus record_status;
-	int r;
-
-	PQfinish(upstream_conn);
-	upstream_conn = NULL;
-
-	record_status = get_primary_node_record(local_conn, &primary_node_info);
-
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve primary node record"));
-		return false;
-	}
-	/*
-	 * Verify that we can still talk to the cluster primary, even though
-	 * the node's upstream is not available
-	 */
-
-	// consolidate below code
-	if (is_server_available(primary_node_info.conninfo) == false)
-	{
-		log_warning(_("connection to primary %i lost"), primary_node_info.node_id);
-
-		if (primary_conn != NULL)
-		{
-			PQfinish(primary_conn);
-			primary_conn = NULL;
-		}
-	}
-
-	if (PQstatus(primary_conn) != CONNECTION_OK)
-	{
-		log_info(_("attempting to reconnect"));
-		primary_conn = establish_db_connection(primary_node_info.conninfo, false);
-
-		if (PQstatus(primary_conn) != CONNECTION_OK)
-		{
-			log_warning(_("reconnection failed"));
-		}
-		else
-		{
-			log_info(_("reconnected"));
-		}
-	}
-
-	/* grandparent upstream is inactive  */
-	if (primary_node_info.active == false)
-	{
-		// XXX
-	}
-
-	/* Close the connection to this server */
-	PQfinish(local_conn);
-	local_conn = NULL;
-
-	initPQExpBuffer(&event_details);
-
-	log_debug(_("standby follow command is:\n  \"%s\""),
-			  config_file_options.follow_command);
-
-	r = system(config_file_options.follow_command);
-
-	if (r != 0)
-	{
-		appendPQExpBuffer(&event_details,
-						  _("unable to execute follow command:\n %s"),
-						  config_file_options.follow_command);
-
-		log_error("%s", event_details.data);
-
-		/* It may not possible to write to the event notification
-		 * table but we should be able to generate an external notification
-		 * if required.
-		 */
-		create_event_notification(primary_conn,
-							&config_file_options,
-							local_node_info.node_id,
-							"repmgrd_failover_follow",
-							false,
-							event_details.data);
-
-		termPQExpBuffer(&event_details);
-	}
-
-	/* reconnect to local node */
-	local_conn = establish_db_connection(config_file_options.conninfo, false);
-
-	if (update_node_record_set_upstream(primary_conn,
-										local_node_info.node_id,
-										primary_node_info.node_id) == false)
-	{
-		appendPQExpBuffer(&event_details,
-						  _("unable to set node %i's new upstream ID to %i"),
-						  local_node_info.node_id,
-						  primary_node_info.node_id);
-
-		log_error("%s", event_details.data);
-
-		create_event_notification(NULL,
-							&config_file_options,
-							local_node_info.node_id,
-							"repmgrd_failover_follow",
-							false,
-							event_details.data);
-
-		termPQExpBuffer(&event_details);
-
-		terminate(ERR_BAD_CONFIG);
-	}
-	/* update own internal node record */
-    record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
-
-
-	appendPQExpBuffer(&event_details,
-					  _("node %i is now following primary node %i"),
-					  local_node_info.node_id,
-					  primary_node_info.node_id);
-
-	log_notice("%s", event_details.data);
-
-	create_event_notification(primary_conn,
-							  &config_file_options,
-							  local_node_info.node_id,
-							  "repmgrd_failover_follow",
-							  true,
-							  event_details.data);
-
-	termPQExpBuffer(&event_details);
-
-
-	PQfinish(primary_conn);
-	primary_conn = NULL;
-
-
-	return true;
-}
-
-
-static FailoverState
-promote_self(void)
-{
-	PQExpBufferData event_details;
-	char *promote_command;
-	int r;
-
-	/* Store details of the failed node here */
-	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
-	RecordStatus record_status;
-
-	/*
-	 * optionally add a delay before promoting the standby; this is mainly
-	 * useful for testing (e.g. for reappearance of the original primary)
-	 * and is not documented.
-	 */
-	if (config_file_options.promote_delay > 0)
-	{
-		log_debug("sleeping %i seconds before promoting standby",
-				  config_file_options.promote_delay);
-		sleep(config_file_options.promote_delay);
-	}
-
-	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
-
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
-				  local_node_info.upstream_node_id);
-		return FAILOVER_STATE_PROMOTION_FAILED;
-	}
-
-	/* the presence of either of these commands has been established already */
-	if (config_file_options.service_promote_command[0] != '\0')
-		promote_command = config_file_options.service_promote_command;
-	else
-		promote_command = config_file_options.promote_command;
-
-	log_debug("promote command is:\n  \"%s\"",
-			  promote_command);
-
-	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
-	{
-		fflush(stderr);
-	}
-
-	r = system(promote_command);
-
-	/* connection should stay up, but check just in case */
-	if(PQstatus(local_conn) != CONNECTION_OK)
-	{
-		local_conn = establish_db_connection(local_node_info.conninfo, true);
-
-		/* assume node failed */
-		if(PQstatus(local_conn) != CONNECTION_OK)
-		{
-			log_error(_("unable to reconnect to local node"));
-			// XXX handle this
-			return FAILOVER_STATE_LOCAL_NODE_FAILURE;
-		}
-	}
-
-	if (r != 0)
-	{
-		int primary_node_id;
-
-		upstream_conn = get_primary_connection(local_conn,
-											  &primary_node_id, NULL);
-
-		if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
-		{
-			log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
-					   failed_primary.node_id);
-
-			initPQExpBuffer(&event_details);
-			appendPQExpBuffer(&event_details,
-							  _("original primary \"%s\" (node ID: %i) reappeared"),
-							  failed_primary.node_name,
-							  failed_primary.node_id);
-
-			create_event_notification(upstream_conn,
-								&config_file_options,
-								local_node_info.node_id,
-								"repmgrd_failover_abort",
-								true,
-								event_details.data);
-
-			termPQExpBuffer(&event_details);
-
-			//primary_conn = NULL;
-
-			// XXX handle this!
-			// -> we'll need to let the other nodes know too....
-			/* no failover occurred but we'll want to restart connections */
-			//failover_done = true;
-			return FAILOVER_STATE_PRIMARY_REAPPEARED;
-		}
-
-		// handle this
-		//  -> check if somehow primary; otherwise go for new election?
-		log_error(_("promote command failed"));
-		return FAILOVER_STATE_PROMOTION_FAILED;
-	}
-
-
-	initPQExpBuffer(&event_details);
-
-	/* update own internal node record */
-	record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
-
-	/*
-	 * XXX here we're assuming the promote command updated metadata
-	 */
-	appendPQExpBuffer(&event_details,
-					  _("node %i promoted to primary; old primary %i marked as failed"),
-					  local_node_info.node_id,
-					  failed_primary.node_id);
-
-	/* local_conn is now the primary connection */
-	create_event_notification(local_conn,
-						&config_file_options,
-						local_node_info.node_id,
-						"repmgrd_failover_promote",
-						true,
-						event_details.data);
-
-	termPQExpBuffer(&event_details);
-
-	return FAILOVER_STATE_PROMOTED;
-}
-
-
-/*
- * Notify follower nodes about which node to follow. Normally this
- * will be the current node, however if the original primary reappeared
- * before this node could be promoted, we'll inform the followers they
- * should resume monitoring the original primary.
- */
-static void
-notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
-{
-	NodeInfoListCell *cell;
-
-	log_debug("notify_followers()");
-	for (cell = standby_nodes->head; cell; cell = cell->next)
-	{
-		log_debug("intending to notify node %i... ", cell->node_info->node_id);
-		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
-		{
-			log_debug("reconnecting to node %i... ", cell->node_info->node_id);
-
-			cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
-		}
-
-		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
-		{
-			log_debug("unable to reconnect to  %i ... ", cell->node_info->node_id);
-
-			continue;
-		}
-
-		log_debug("notifying node %i to follow node %i",
-				  cell->node_info->node_id, follow_node_id);
-		notify_follow_primary(cell->node_info->conn, follow_node_id);
-	}
-}
-
-
-static t_node_info *
-poll_best_candidate(NodeInfoList *standby_nodes)
-{
-	NodeInfoListCell *cell;
-	t_node_info *best_candidate = &local_node_info;
-
-	// XXX ensure standby_nodes is set correctly
-
-	/*
-	 * we need to definitively decide the best candidate, as in some corner
-	 * cases we could end up with two candidate nodes, so they should each
-	 * come to the same conclusion
-	 */
-	for (cell = standby_nodes->head; cell; cell = cell->next)
-	{
-		if (cell->node_info->last_wal_receive_lsn > best_candidate->last_wal_receive_lsn)
-		{
-			log_debug("node %i has higher LSN, now best candidate", cell->node_info->node_id);
-			best_candidate = cell->node_info;
-		}
-		else if (cell->node_info->last_wal_receive_lsn == best_candidate->last_wal_receive_lsn)
-		{
-			if (cell->node_info->priority > best_candidate->priority)
-			{
-				log_debug("node %i has higher priority, now best candidate", cell->node_info->node_id);
-				best_candidate = cell->node_info;
-			}
-		}
-		/* if all else fails, we decide by node_id */
-		else if (cell->node_info->node_id < best_candidate->node_id)
-		{
-			log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
-			best_candidate = cell->node_info;
-		}
-	}
-
-	log_info(_("best candidate is %i"), best_candidate->node_id);
-
-	return best_candidate;
-}
-
-
-static bool
-wait_primary_notification(int *new_primary_id)
-{
-	// XXX make this configurable
-	int wait_primary_timeout = 60;
-	int i;
-
-	for (i = 0; i < wait_primary_timeout; i++)
-	{
-		if (get_new_primary(local_conn, new_primary_id) == true)
-		{
-			log_debug("new primary is %i; elapsed: %i",
-					  *new_primary_id, i);
-			return true;
-		}
-		sleep(1);
-	}
-
-
-	log_warning(_("no notifcation received from new primary after %i seconds"),
-				wait_primary_timeout);
-
-	return false;
-}
-
-
-static FailoverState
-follow_new_primary(int new_primary_id)
-{
-	PQExpBufferData event_details;
-	int r;
-
-	/* Store details of the failed node here */
-	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
-	t_node_info new_primary = T_NODE_INFO_INITIALIZER;
-	RecordStatus record_status;
-	bool new_primary_ok = false;
-
-	record_status = get_node_record(local_conn, new_primary_id, &new_primary);
-
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve metadata record for upstream node (ID: %i)"),
-				  new_primary_id);
-		return FAILOVER_STATE_FOLLOW_FAIL;
-	}
-
-	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
-
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve metadata record for failed primary (ID: %i)"),
-					local_node_info.upstream_node_id);
-		return FAILOVER_STATE_FOLLOW_FAIL;
-	}
-
-	// XXX check if new_primary_id == failed_primary.node_id?
-
-	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
-	{
-		fflush(stderr);
-	}
-
-	log_debug(_("standby follow command is:\n  \"%s\""),
-			  config_file_options.follow_command);
-
-	upstream_conn = establish_db_connection(new_primary.conninfo, false);
-
-	if (PQstatus(upstream_conn) == CONNECTION_OK)
-	{
-		RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
-		if (primary_recovery_type == RECTYPE_PRIMARY)
-		{
-			new_primary_ok = true;
-		}
-		else
-		{
-			log_warning(_("new primary is not in recovery"));
-			PQfinish(upstream_conn);
-		}
-	}
-
-	if (new_primary_ok == false)
-	{
-		return FAILOVER_STATE_FOLLOW_FAIL;
-	}
-
-	/*
-	 * disconnect from local node, as follow operation will result in
-	 * a server restart
-	 */
-
-	PQfinish(local_conn);
-	local_conn = NULL;
-
-	/* execute the follow command */
-	r = system(config_file_options.follow_command);
-
-	if (r != 0)
-	{
-		PGconn *old_primary_conn;
-		/*
-		 * The follow action could still fail due to the original primary reappearing
-		 * before the candidate could promote itself ("repmgr standby follow" will
-		 * refuse to promote another node if the primary is available). However
-		 * the new primary will only instruct use to follow it after it's successfully
-		 * promoted itself, so that very likely won't be the reason for the failure.
-		 *
-		 *
-		 * TODO: check the new primary too - we could have a split-brain
-		 * situation where the old primary reappeared just after the new
-		 * one promoted itself.
-		 */
-		old_primary_conn = establish_db_connection(failed_primary.conninfo, false);
-
-		if (PQstatus(old_primary_conn) == CONNECTION_OK)
-		{
-			// XXX add event notifications
-			RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
-			PQfinish(old_primary_conn);
-
-			if (upstream_recovery_type == RECTYPE_PRIMARY)
-			{
-				log_notice(_("original primary reappeared - no action taken"));
-				return FAILOVER_STATE_PRIMARY_REAPPEARED;
-			}
-		}
-
-		return FAILOVER_STATE_FOLLOW_FAIL;
-	}
-
-
-	/*
-	 * refresh local copy of local and primary node records - we get these
-	 * directly from the primary to ensure they're the current version
-	 */
-
-	record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
-
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve metadata record found for node %i"),
-				  new_primary_id);
-		return FAILOVER_STATE_FOLLOW_FAIL;
-	}
-
-	record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve metadata record found for node %i"),
-				  local_node_info.node_id);
-		return FAILOVER_STATE_FOLLOW_FAIL;
-	}
-
-
-	local_conn = establish_db_connection(local_node_info.conninfo, false);
-	initPQExpBuffer(&event_details);
-	appendPQExpBuffer(&event_details,
-					  _("node %i now following new upstream node %i"),
-					  local_node_info.node_id,
-					  upstream_node_info.node_id);
-
-	log_notice("%s\n", event_details.data);
-
-	create_event_notification(upstream_conn,
-						&config_file_options,
-						local_node_info.node_id,
-						"repmgrd_failover_follow",
-						true,
-						event_details.data);
-
-	termPQExpBuffer(&event_details);
-
-	return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
-}
-
-
-static const char *
-_print_voting_status(NodeVotingStatus voting_status)
-{
-	switch(voting_status)
-	{
-		case VS_NO_VOTE:
-			return "NO VOTE";
-
-		case VS_VOTE_REQUEST_RECEIVED:
-			return "VOTE REQUEST RECEIVED";
-
-		case VS_VOTE_INITIATED:
-			return "VOTE REQUEST INITIATED";
-
-		case VS_UNKNOWN:
-			return "VOTE REQUEST UNKNOWN";
-	}
-
-	return "UNKNOWN VOTE REQUEST STATE";
-}
-
-static const char *
-_print_election_result(ElectionResult result)
-{
-	switch(result)
-	{
-		case ELECTION_NOT_CANDIDATE:
-			return "NOT CANDIDATE";
-
-		case ELECTION_WON:
-			return "WON";
-
-		case ELECTION_LOST:
-			return "LOST";
-
-		case ELECTION_CANCELLED:
-			return "CANCELLED";
-	}
-
-	/* should never reach here */
-	return "UNKNOWN";
-}
-
-static const char *
-_print_monitoring_state(MonitoringState monitoring_state)
-{
-	switch(monitoring_state)
-	{
-		case MS_NORMAL:
-			return "normal";
-
-		case MS_DEGRADED:
-			return "degraded";
-	}
-
-	/* should never reach here */
-	return "UNKNOWN";
-}
-
-
-
-static ElectionResult
-do_election(void)
-{
-	int electoral_term = -1;
-
-	int votes_for_me = 0;
-
-	/* we're visible */
-	int visible_nodes = 1;
-
-	/*
-	 * get voting status from shared memory - should be one of "VS_NO_VOTE"
-	 * or "VS_VOTE_REQUEST_RECEIVED". If VS_NO_VOTE, we declare ourselves as
-	 * candidate and initiate the voting process.
-	 */
-	NodeVotingStatus voting_status;
-
-	NodeInfoListCell *cell;
-
-	bool other_node_is_candidate = false;
-	bool other_node_is_ahead = false;
-
-	/*
-	 * Check if at least one server in the primary's location is visible;
-	 * if not we'll assume a network split between this node and the primary
-	 * location, and not promote any standby.
-	 *
-	 * NOTE: this function is only ever called by standbys attached to the current
-	 * (unreachable) primary, so "upstream_node_info" will always contain the
-	 * primary node record.
-	 */
-	bool primary_location_seen = false;
-
-	/*
-	 * sleep for a random period of 100 ~ 350 ms
-	 */
-
-	long unsigned rand_wait = (long) ((rand() % 35) + 10) * 10000;
-
-	log_debug("do_election(): sleeping %lu", rand_wait);
-	log_debug("do_election(): primary location is %s", upstream_node_info.location);
-
-	pg_usleep(rand_wait);
-
-	local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;
-
-	log_debug("do_election(): executing get_voting_status()");
-	voting_status = get_voting_status(local_conn);
-	log_debug("do_election(): node voting status is %s", _print_voting_status(voting_status));
-
-	if (voting_status == VS_VOTE_REQUEST_RECEIVED)
-	{
-		/* we've already been requested to vote, so can't become a candidate */
-		log_debug("vote request already received, not candidate");
-		return ELECTION_NOT_CANDIDATE;
-	}
-
-	/*
-	 * Here we mark ourselves as candidate, so any further vote requests
-	 * are rejected. However it's possible another node has done the
-	 * same thing, so when announcing ourselves as candidate to the other
-	 * nodes, we'll check for that and withdraw our candidature.
-	 */
-	electoral_term = set_voting_status_initiated(local_conn);
-
-	/* get all active nodes attached to primary, excluding self */
-	get_active_sibling_node_records(local_conn,
-									local_node_info.node_id,
-									upstream_node_info.node_id,
-									&standby_nodes);
-
-	/* no other standbys - win by default */
-
-	if (standby_nodes.node_count == 0)
-	{
-		if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
-		{
-			log_debug("no other nodes - we win by default");
-			return ELECTION_WON;
-		}
-		else
-		{
-			log_debug("no other nodes, but primary and standby locations differ");
-
-			monitoring_state = MS_DEGRADED;
-			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
-
-			return ELECTION_NOT_CANDIDATE;
-		}
-	}
-
-	for (cell = standby_nodes.head; cell; cell = cell->next)
-	{
-		/* assume the worst case */
-		cell->node_info->is_visible = false;
-
-		cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
-
-		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
-		{
-			continue;
-		}
-
-		/*
-		 * tell the other node we're candidate - if the node has already declared
-		 * itself, we withdraw
-		 *
-		 * XXX check for situations where more than one node could end up as candidate?
-		 *
-		 * XXX note it's possible some nodes accepted our candidature before we
-		 * found out about the other candidate, check what happens in that situation
-		 *  -> other node will have info from all the nodes, even if not the vote,
-		 *     so it should be able to determine the best node anyway
-		 */
-
-		if (announce_candidature(cell->node_info->conn, &local_node_info, cell->node_info, electoral_term) == false)
-		{
-			log_debug("node %i is candidate",  cell->node_info->node_id);
-			other_node_is_candidate = true;
-
-			/* don't notify any further standbys */
-			break;
-		}
-
-		/*
-		 * see if the node is in the primary's location (but skip the check
-		 * if we've seen
-		 */
-		if (primary_location_seen == false)
-		{
-			if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
-			{
-				primary_location_seen = true;
-			}
-		}
-
-		cell->node_info->is_visible = true;
-		visible_nodes ++;
-	}
-
-	if (other_node_is_candidate == true)
-	{
-		clear_node_info_list(&standby_nodes);
-
-		reset_node_voting_status();
-		log_debug("other node is candidate, returning NOT CANDIDATE");
-		return ELECTION_NOT_CANDIDATE;
-	}
-
-	if (primary_location_seen == false)
-	{
-		log_notice(_("no nodes from the primary location \"%s\" visible - assuming network split"),
-				   upstream_node_info.location);
-		log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
-
-		monitoring_state = MS_DEGRADED;
-		INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
-
-		reset_node_voting_status();
-
-		return ELECTION_CANCELLED;
-	}
-
-
-	/* get our lsn */
-	local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);
-
-	log_debug("last receive lsn = %X/%X",
-			  (uint32) (local_node_info.last_wal_receive_lsn >> 32),
-			  (uint32)  local_node_info.last_wal_receive_lsn);
-
-	/* request vote from each node */
-
-	for (cell = standby_nodes.head; cell; cell = cell->next)
-	{
-		log_debug("checking node %i...", cell->node_info->node_id);
-		/* ignore unreachable nodes */
-		if (cell->node_info->is_visible == false)
-			continue;
-		votes_for_me += request_vote(cell->node_info->conn,
-									 &local_node_info,
-									 cell->node_info,
-									 electoral_term);
-
-		if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn)
-		{
-			/* register if another node is ahead of us */
-			other_node_is_ahead = true;
-		}
-		PQfinish(cell->node_info->conn);
-		cell->node_info->conn = NULL;
-	}
-
-	/* vote for myself, but only if I believe no-one else is ahead */
-	if (other_node_is_ahead == false)
-	{
-		votes_for_me += 1;
-	}
-
-	log_debug(_("%i of of %i votes"), votes_for_me, visible_nodes);
-
-	if (votes_for_me == visible_nodes)
-		return ELECTION_WON;
-
-	return ELECTION_LOST;
-}
-
-
-static void
-reset_node_voting_status(void)
-{
-	failover_state = FAILOVER_STATE_NONE;
-
-	if (PQstatus(local_conn) != CONNECTION_OK)
-	{
-		log_error(_("reset_node_voting_status(): local_conn not set"));
-		return;
-	}
-	reset_voting_status(local_conn);
-}
-
-
-static void
-monitor_bdr(void)
-{
-	NodeInfoList  nodes = T_NODE_INFO_LIST_INITIALIZER;
-	PGconn		 *monitoring_conn = NULL;
-	t_node_info	 *monitored_node = NULL;
-	RecordStatus  record_status;
-
-	bool failover_done = false;
-
-	/* sanity check local database */
-	log_info(_("connecting to local database '%s'"),
-			 config_file_options.conninfo);
-
-	local_conn = establish_db_connection(config_file_options.conninfo, true);
-
-	/*
-	 * Local node must be running
-	 */
-	if (PQstatus(local_conn) != CONNECTION_OK)
-	{
-		log_error(_("unable connect to local node (ID: %i), terminating"),
-				  local_node_info.node_id);
-		log_hint(_("local node must be running before repmgrd can start"));
-		PQfinish(local_conn);
-		exit(ERR_DB_CONN);
-	}
-
-	/*
-	 * Verify that database is a BDR one
-	 * TODO: check if supported BDR version?
-	 */
-	log_info(_("connected to database, checking for BDR"));
-
-	if (!is_bdr_db(local_conn))
-	{
-		log_error(_("database is not BDR-enabled"));
-		exit(ERR_BAD_CONFIG);
-	}
-
-
-	if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr"))
-	{
-		log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
-				  "nodes");
-
-		/* TODO: add `repmgr bdr sync` or similar for this situation, and hint here */
-
-		exit(ERR_BAD_CONFIG);
-	}
-
-	/* Retrieve record for this node from the local database */
-	record_status = get_node_record(local_conn, config_file_options.node_id, &local_node_info);
-
-	/*
-	 * Terminate if we can't find the local node record. This is a "fix-the-config"
-	 * situation, not a lot else we can do.
-	 */
-	if (record_status != RECORD_FOUND)
-	{
-		log_error(_("unable to retrieve record for local node (ID: %i), terminating"),
-					local_node_info.node_id);
-		log_hint(_("check that 'repmgr bdr register' was executed for this node\n"));
-		PQfinish(local_conn);
-		exit(ERR_BAD_CONFIG);
-	}
-
-
-	// check if inactive node
-	// -> what to do?
-
-	/* Log startup event */
-
-	create_event_record(local_conn,
-						&config_file_options,
-						config_file_options.node_id,
-						"repmgrd_start",
-						true,
-						NULL);
-
-	/*
-	 * retrieve list of nodes - we'll need these if the DB connection goes away,
-	 * or if we're monitoring a non-local node
-	 */
-	get_node_records_by_priority(local_conn, &nodes);
-
-		/* decided which node to monitor */
-
-	if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_LOCAL)
-	{
-		// if local, reuse local_conn and node info
-		//record_status = get_node_record(local_conn, config_file_options.node_id, &monitored_node);
-		monitored_node = &local_node_info;
-
-		monitoring_conn = establish_db_connection(monitored_node->conninfo, false);
-		log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id);
-	}
-	else
-	{
-		NodeInfoListCell *cell;
-
-		for (cell = nodes.head; cell; cell = cell->next)
-		{
-			log_debug("main_loop_bdr() checking node %s %i", cell->node_info->node_name, cell->node_info->priority);
-
-			monitoring_conn = establish_db_connection(cell->node_info->conninfo, false);
-			if (PQstatus(monitoring_conn) == CONNECTION_OK)
-			{
-				log_debug("main_loop_bdr() monitoring node '%s' (ID %i, priority %i)",
-						  cell->node_info->node_name, cell->node_info->node_id, cell->node_info->priority);
-				/* fetch the record again, as the node list is transient */
-				monitored_node = get_node_record_pointer(monitoring_conn, cell->node_info->node_id);
-
-				break;
-			}
-		}
-	}
-
-	// check monitored_node not null!
-
-	while (true)
-	{
-		/* normal state - connection active */
-		if (PQstatus(monitoring_conn) == CONNECTION_OK)
-		{
-			// XXX detail
-			log_info(_("starting continuous bdr node monitoring"));
-
-			/* monitoring loop */
-			do
-			{
-				log_verbose(LOG_DEBUG, "bdr check loop...");
-
-				{
-					NodeInfoListCell *cell;
-
-					for (cell = nodes.head; cell; cell = cell->next)
-					{
-						log_debug("bdr_monitor() %s", cell->node_info->node_name);
-					}
-				}
-
-				if (is_server_available(monitored_node->conninfo) == false)
-				{
-					t_node_info  *new_monitored_node;
-
-					// XXX improve
-					log_warning("connection problem!");
-					new_monitored_node = do_bdr_failover(&nodes, monitored_node);
-
-					if (new_monitored_node != NULL)
-					{
-						pfree(monitored_node);
-						monitored_node = new_monitored_node;
-					}
-					log_notice(_("monitored_node->node_name is now '%s' \n"), monitored_node->node_name);
-				}
-				else
-				{
-					sleep(config_file_options.monitor_interval_secs);
-				}
-
-				if (got_SIGHUP)
-				{
-					/*
-					 * if we can reload, then could need to change
-					 * local_conn
-					 */
-					if (reload_config(&config_file_options))
-					{
-						PQfinish(local_conn);
-						local_conn = establish_db_connection(config_file_options.conninfo, true);
-						update_registration(local_conn);
-					}
-
-					/* reload node list */
-					get_node_records_by_priority(local_conn, &nodes);
-
-					got_SIGHUP = false;
-				}
-
-			} while (!failover_done);
-		}
-		/* local connection inactive - periodically try and connect */
-		/* TODO: make this an option */
-		else
-		{
-
-			monitoring_conn = establish_db_connection(monitored_node->conninfo, false);
-
-			if (PQstatus(monitoring_conn) == CONNECTION_OK)
-			{
-				// XXX event bdr_node_recovered -> if monitored == local node
-
-				if (monitored_node->node_id == config_file_options.node_id)
-				{
-					log_notice(_("local connection has returned, resuming monitoring"));
-				}
-				else
-				{
-					log_notice(_("connection to '%s' has returned, resuming monitoring"), monitored_node->node_name);
-				}
-			}
-			else
-			{
-				sleep(config_file_options.monitor_interval_secs);
-			}
-
-
-			if (got_SIGHUP)
-			{
-				/*
-				 * if we can reload, then could need to change
-				 * local_conn
-				 */
-				if (reload_config(&config_file_options))
-				{
-					if (PQstatus(local_conn) == CONNECTION_OK)
-					{
-						PQfinish(local_conn);
-						local_conn = establish_db_connection(config_file_options.conninfo, true);
-						update_registration(local_conn);
-					}
-				}
-
-				/* reload node list */
-				if (PQstatus(local_conn) == CONNECTION_OK)
-					get_node_records_by_priority(local_conn, &nodes);
-
-				got_SIGHUP = false;
-			}
-		}
-
-		failover_done = false;
-	}
-
-	return;
-}
-
-/*
- * do_bdr_failover()
- *
- * Here we attempt to perform a BDR "failover".
- *
- * As there's no equivalent of a physical replication failover,
- * we'll do the following:
- *
- *  - attempt to find another node, to set our node record as inactive
- *  - generate an event log record on that node
- *  - optionally execute `bdr_failover_command`, passing the conninfo string
- *    of that node to the command; this can be used for e.g. reconfiguring
- *    pgbouncer.
- *  - if mode is 'BDR_MONITORING_PRIORITY', redirect monitoring to that node.
- *
- */
-static t_node_info *
-do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
-{
-	PGconn	   *next_node_conn = NULL;
-	NodeInfoListCell *cell;
-	bool	    failover_success = false;
-	PQExpBufferData event_details;
-	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
-	t_node_info *new_monitored_node = NULL;
-
-	initPQExpBuffer(&event_details);
-
-	/* get next active priority node */
-
-	for (cell = nodes->head; cell; cell = cell->next)
-	{
-		log_debug("do_bdr_failover() %s", cell->node_info->node_name);
-
-		/* don't attempt to connect to the current monitored node, as that's the one which has failed  */
-		if (cell->node_info->node_id == monitored_node->node_id)
-			continue;
-
-		/* XXX skip inactive node? */
-
-		next_node_conn = establish_db_connection(cell->node_info->conninfo, false);
-
-		if (PQstatus(next_node_conn) == CONNECTION_OK)
-		{
-			// XXX check if record returned
-			new_monitored_node = get_node_record_pointer(next_node_conn, cell->node_info->node_id);
-
-			break;
-		}
-
-		next_node_conn = NULL;
-	}
-
-	if (next_node_conn == NULL)
-	{
-		appendPQExpBuffer(&event_details,
-						  _("no other available node found"));
-
-		log_error("%s", event_details.data);
-
-
-		// no other nodes found
-		// continue degraded monitoring until node is restored?
-	}
-	else
-	{
-		log_info(_("connecting to target node %s"), cell->node_info->node_name);
-
-		failover_success = true;
-
-		event_info.conninfo_str = cell->node_info->conninfo;
-		event_info.node_name = cell->node_info->node_name;
-
-		/* update our own record on the other node */
-		if (monitored_node->node_id == config_file_options.node_id)
-		{
-			update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
-		}
-
-		if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_PRIORITY)
-		{
-			log_notice(_("monitoring next available node by prioriy: %s (ID %i)"),
-					   new_monitored_node->node_name,
-					   new_monitored_node->node_id);
-		}
-
-		appendPQExpBuffer(&event_details,
-						  _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
-						  monitored_node->node_name,
-						  monitored_node->node_id,
-						  cell->node_info->node_name,
-						  cell->node_info->node_id);
-	}
-
-	/*
-	 * Create an event record
-	 *
-	 * If we were able to connect to another node, we'll update the
-	 * event log there.
-	 *
-	 * In any case the event notification command will be triggered
-	 * with the event "bdr_failover"
-	 */
-
-	create_event_notification_extended(
-		next_node_conn,
-		&config_file_options,
-		config_file_options.node_id,
-		"bdr_failover",
-		failover_success,
-		event_details.data,
-		&event_info);
-
-	termPQExpBuffer(&event_details);
-
-	//failover_done = true;
-
-	if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_PRIORITY)
-		return new_monitored_node;
-
-	/* local monitoring mode - there's no new node to monitor */
-	return NULL;
-}
-
-static void
+void
 update_registration(PGconn *conn)
 {
 	bool success = update_node_record_conn_priority(local_conn,
@@ -2622,7 +601,8 @@ show_help(void)
 	printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname());
 }
 
-static PGconn *
+
+PGconn *
 try_reconnect(const char *conninfo, NodeStatus *node_status)
 {
 	PGconn *conn;
@@ -2666,33 +646,8 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
 }
 
 
-static void
-close_connections()
-{
-	if (PQstatus(primary_conn) == CONNECTION_OK)
-	{
-		/* cancel any pending queries to the primary */
-		if (PQisBusy(primary_conn) == 1)
-			cancel_query(primary_conn, config_file_options.primary_response_timeout);
-		PQfinish(primary_conn);
-		primary_conn = NULL;
-	}
 
-	if (upstream_conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
-	{
-		PQfinish(upstream_conn);
-		upstream_conn = NULL;
-	}
-
-	if (PQstatus(local_conn) == CONNECTION_OK)
-	{
-		PQfinish(local_conn);
-		local_conn = NULL;
-	}
-}
-
-
-static int
+int
 calculate_elapsed(instr_time start_time)
 {
 	instr_time	current_time;
@@ -2706,6 +661,20 @@ calculate_elapsed(instr_time start_time)
 
 
 static void
+close_connections(void)
+{
+	/* run the optional _close_connections hook first, if one has been set */
+	if (_close_connections != NULL)
+		_close_connections();
+	/* always PQfinish() a non-NULL handle: libpq requires it even after a failed connection */
+	if (local_conn != NULL)
+	{
+		PQfinish(local_conn);
+		local_conn = NULL;
+	}
+}
+
+void
 terminate(int retval)
 {
 	close_connections();
diff --git a/repmgrd.h b/repmgrd.h
new file mode 100644
index 00000000..4c9443f7
--- /dev/null
+++ b/repmgrd.h
@@ -0,0 +1,36 @@
+/*
+ * repmgrd.h - declarations shared between the repmgrd source files
+ * Copyright (c) 2ndQuadrant, 2010-2017
+ */
+
+#ifndef _REPMGRD_H_
+#define _REPMGRD_H_
+
+#include <time.h>
+#include "portability/instr_time.h"	/* instr_time */
+
+typedef enum {
+	NODE_STATUS_UNKNOWN = -1,
+	NODE_STATUS_UP,
+	NODE_STATUS_DOWN
+} NodeStatus;	/* passed by pointer to try_reconnect() */
+
+typedef enum {
+	MS_NORMAL = 0,
+	MS_DEGRADED = 1
+} MonitoringState;	/* type of the monitoring_state global below */
+
+extern MonitoringState monitoring_state;
+extern instr_time	degraded_monitoring_start;	/* set with INSTR_TIME_SET_CURRENT() when monitoring_state becomes MS_DEGRADED */
+/* NOTE(review): PGconn, bool, t_node_info and t_configuration_options are not declared by the includes above; presumably "repmgr.h" must be included first - confirm */
+extern t_configuration_options config_file_options;
+extern t_node_info local_node_info;
+extern PGconn	   *local_conn;
+extern bool startup_event_logged;
+
+PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
+
+int calculate_elapsed(instr_time start_time);
+void update_registration(PGconn *conn);
+void terminate(int retval);	/* calls close_connections() first */
+#endif /* _REPMGRD_H_ */