mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
Compare commits
138 Commits
dev/drop_r
...
v4.3.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef382dfede | ||
|
|
bc93d2996c | ||
|
|
0946073406 | ||
|
|
129c8782a4 | ||
|
|
5493055a1d | ||
|
|
a50f0e7cc0 | ||
|
|
adfde1b681 | ||
|
|
d4b17635fe | ||
|
|
e4c573a7f6 | ||
|
|
492665e34c | ||
|
|
2d7c38e2ef | ||
|
|
9ee2448583 | ||
|
|
cf9458161f | ||
|
|
67dc42d2ad | ||
|
|
3b96b2afce | ||
|
|
216f274c15 | ||
|
|
8cb101be1d | ||
|
|
03b29908e2 | ||
|
|
99be03f000 | ||
|
|
7aaac343f8 | ||
|
|
68470a9167 | ||
|
|
35320c27bd | ||
|
|
b7b9db7e9c | ||
|
|
01e11950a5 | ||
|
|
fcaee6e6e8 | ||
|
|
538d5f9df0 | ||
|
|
4e8b94c105 | ||
|
|
9ee51bb0cb | ||
|
|
bab07cdda1 | ||
|
|
b03f07ca8f | ||
|
|
39fbe02c48 | ||
|
|
2249b79811 | ||
|
|
bb0fd944ae | ||
|
|
b4ca6851ab | ||
|
|
347948b79f | ||
|
|
83e492d4ef | ||
|
|
1906ea89bd | ||
|
|
eab4fd2795 | ||
|
|
3f1fe9b6c2 | ||
|
|
e672f7e3ee | ||
|
|
fd86160dff | ||
|
|
f19cf62f09 | ||
|
|
8018ba97d6 | ||
|
|
73554c6e16 | ||
|
|
f23a93e12d | ||
|
|
d9947a46e8 | ||
|
|
e3a632e29d | ||
|
|
939cbd0721 | ||
|
|
c45c5abfb8 | ||
|
|
1953ec7459 | ||
|
|
a6eacca6e4 | ||
|
|
948e076ad9 | ||
|
|
a3bd9d33ff | ||
|
|
9dc928a7d5 | ||
|
|
9acf7bdfea | ||
|
|
29acd10f37 | ||
|
|
9df511eee3 | ||
|
|
6441db23ff | ||
|
|
7792de3543 | ||
|
|
94fe3e395e | ||
|
|
ff26173b1e | ||
|
|
4c11a57334 | ||
|
|
1d2d6e3587 | ||
|
|
c03913d32a | ||
|
|
37a41a66f9 | ||
|
|
4c2c8ecbab | ||
|
|
b84b6180ee | ||
|
|
58f55222d9 | ||
|
|
5cbaff8d0a | ||
|
|
a38e229e61 | ||
|
|
272abdd483 | ||
|
|
b4f6043abc | ||
|
|
a7f3f899ff | ||
|
|
3ec43eda36 | ||
|
|
ce8e1cccc4 | ||
|
|
70bfa4c8e1 | ||
|
|
f0d5ad503d | ||
|
|
b9ee57ee0f | ||
|
|
d5d6ed4be7 | ||
|
|
f4655074ae | ||
|
|
67d26ab7e2 | ||
|
|
70a7b45a03 | ||
|
|
4251590833 | ||
|
|
9347d34ce0 | ||
|
|
feb90ee50c | ||
|
|
0a6486bb7f | ||
|
|
39443bbcee | ||
|
|
fc636b1bd2 | ||
|
|
048bad1c88 | ||
|
|
4528eb1796 | ||
|
|
169c9ccd32 | ||
|
|
5f92fbddf2 | ||
|
|
617e466f72 | ||
|
|
435fac297b | ||
|
|
4bc12b4c94 | ||
|
|
91234994e2 | ||
|
|
ee9da30f20 | ||
|
|
2e67bc1341 | ||
|
|
18ab5cab4e | ||
|
|
60bb4e9fc8 | ||
|
|
52bee6b98d | ||
|
|
ecb1f379f5 | ||
|
|
e1cd2c22d4 | ||
|
|
1dea6b76d9 | ||
|
|
702f90fc9d | ||
|
|
c4d1eec6f3 | ||
|
|
b241c606c0 | ||
|
|
45c896d716 | ||
|
|
514595ea10 | ||
|
|
531194fa27 | ||
|
|
2aa67c992c | ||
|
|
37892afcfc | ||
|
|
e4e5e35552 | ||
|
|
b320c1f0ae | ||
|
|
280654bed6 | ||
|
|
ae675059c0 | ||
|
|
454ebabe89 | ||
|
|
d1d6ef8d12 | ||
|
|
5d6eab74f6 | ||
|
|
59b7453bbf | ||
|
|
bde8c7e29c | ||
|
|
bc6584a90d | ||
|
|
074d79b44f | ||
|
|
2eeb288573 | ||
|
|
48a2274b11 | ||
|
|
19bcfa7264 | ||
|
|
486877c3d5 | ||
|
|
9753bcc8c3 | ||
|
|
bd35b450da | ||
|
|
1f256d4d73 | ||
|
|
1524e2449f | ||
|
|
0cd2bd2e91 | ||
|
|
98b78df16c | ||
|
|
b946dce2f0 | ||
|
|
39234afcbf | ||
|
|
23569a19b1 | ||
|
|
c650fd3412 | ||
|
|
c30e65b3f2 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -47,6 +47,9 @@ lib*.pc
|
||||
# other
|
||||
/.lineno
|
||||
*.dSYM
|
||||
*.orig
|
||||
*.rej
|
||||
|
||||
# generated binaries
|
||||
repmgr
|
||||
repmgrd
|
||||
|
||||
41
HISTORY
41
HISTORY
@@ -1,9 +1,13 @@
|
||||
4.3 2019-??
|
||||
4.3.1 2019-12-??
|
||||
repmgr: ensure an existing replication slot is not deleted if the
|
||||
follow target is the node's current upstream (Ian)
|
||||
|
||||
4.3 2019-04-02
|
||||
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
|
||||
repmgr: add --version-number command line option (Ian)
|
||||
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
|
||||
repmgr: cluster show - differentiate between unreachable nodes
|
||||
and nodes which are running but rejecting connections (Ian)
|
||||
repmgr: cluster show - differentiate between unreachable nodes
|
||||
and nodes which are running but rejecting connections (Ian)
|
||||
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
|
||||
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
||||
repmgr: prevent potential race condition in "standby switchover"
|
||||
@@ -11,29 +15,22 @@
|
||||
repmgr: ensure "standby switchover" verifies repmgr can read the
|
||||
data directory on the demotion candidate; GitHub #523 (Ian)
|
||||
repmgr: ensure "standby switchover" verifies replication connection
|
||||
exists; GitHub #519 (Ian)
|
||||
repmgr: ensure "primary unregister" behaves correctly when executed
|
||||
on a witness server; GitHub #548 (Ian)
|
||||
repmgr: when executing "standby follow" and "node rejoin", check that
|
||||
it will actually be possible to stream from the target node (Ian)
|
||||
repmgr: "standby switchover": improve handling of connection URIs when
|
||||
executing "node rejoin" on the demotion candidate; GitHub #525 (Ian)
|
||||
repmgr: fix long node ID display in "cluster show" (Ian)
|
||||
repmgr: check for primary server before executing "witness register";
|
||||
GitHub #538 (Ian)
|
||||
repmgr: show "upstream last seen" interval in "daemon status" output (Ian)
|
||||
repmgr: "node check" will only consider physical replication slots (Ian)
|
||||
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
||||
repmgrd: on a cascaded standby, don't fail over if "failover=manual";
|
||||
GitHub #531 (Ian)
|
||||
repmgrd: don't consider nodes where repmgrd is not running as promotion
|
||||
candidates (Ian)
|
||||
|
||||
4.2.1 2018-??-??
|
||||
exists; GitHub #519 (Ian)
|
||||
repmgr: add sanity check for correct extension version (Ian)
|
||||
repmgr: ensure "witness register --dry-run" does not attempt to read node
|
||||
tables if repmgr extension not installed; GitHub #513 (Ian)
|
||||
repmgr: ensure "standby register" fails when --upstream-node-id is the
|
||||
same as the local node ID (Ian)
|
||||
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
||||
repmgrd: on a cascaded standby, don't fail over if "failover=manual";
|
||||
GitHub #531 (Ian)
|
||||
repmgrd: don't consider nodes where repmgrd is not running as promotion
|
||||
candidates (Ian)
|
||||
repmgrd: add option "connection_check_type" (Ian)
|
||||
repmgrd: improve witness monitoring when primary node not available (Ian)
|
||||
repmgrd: handle situation where a primary has unexpectedly appeared
|
||||
during failover; GitHub #420 (Ian)
|
||||
general: fix Makefile (John)
|
||||
|
||||
4.2 2018-10-24
|
||||
repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover";
|
||||
|
||||
30
Makefile.in
30
Makefile.in
@@ -50,8 +50,8 @@ $(info Building against PostgreSQL $(MAJORVERSION))
|
||||
REPMGR_CLIENT_OBJS = repmgr-client.o \
|
||||
repmgr-action-primary.o repmgr-action-standby.o repmgr-action-witness.o \
|
||||
repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o repmgr-action-daemon.o \
|
||||
configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o
|
||||
REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o
|
||||
configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o sysutils.o
|
||||
REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o sysutils.o
|
||||
DATE=$(shell date "+%Y-%m-%d")
|
||||
|
||||
repmgr_version.h: repmgr_version.h.in
|
||||
@@ -86,29 +86,15 @@ clean: additional-clean
|
||||
maintainer-clean: additional-maintainer-clean
|
||||
|
||||
additional-clean:
|
||||
rm -f repmgr-client.o
|
||||
rm -f repmgr-action-primary.o
|
||||
rm -f repmgr-action-standby.o
|
||||
rm -f repmgr-action-witness.o
|
||||
rm -f repmgr-action-bdr.o
|
||||
rm -f repmgr-action-node.o
|
||||
rm -f repmgr-action-cluster.o
|
||||
rm -f repmgr-action-daemon.o
|
||||
rm -f repmgrd.o
|
||||
rm -f repmgrd-physical.o
|
||||
rm -f repmgrd-bdr.o
|
||||
rm -f compat.o
|
||||
rm -f configfile.o
|
||||
rm -f controldata.o
|
||||
rm -f dbutils.o
|
||||
rm -f dirutil.o
|
||||
rm -f log.o
|
||||
rm -f strutil.o
|
||||
rm -f *.o
|
||||
|
||||
maintainer-additional-clean: clean
|
||||
rm -f configure
|
||||
additional-maintainer-clean: clean
|
||||
$(MAKE) -C doc maintainer-clean
|
||||
rm -f config.status config.log
|
||||
rm -f config.h
|
||||
rm -f repmgr_version.h
|
||||
rm -f Makefile
|
||||
rm -f Makefile.global
|
||||
@rm -rf autom4te.cache/
|
||||
|
||||
ifeq ($(MAJORVERSION),$(filter $(MAJORVERSION),9.3 9.4))
|
||||
|
||||
@@ -27,7 +27,7 @@ Documentation
|
||||
|
||||
The main `repmgr` documentation is available here:
|
||||
|
||||
> [repmgr 4 documentation](https://repmgr.org/docs/4.2/index.html)
|
||||
> [repmgr documentation](https://repmgr.org/docs/current/index.html)
|
||||
|
||||
The `README` file for `repmgr` 3.x is available here:
|
||||
|
||||
@@ -72,7 +72,7 @@ Please report bugs and other issues to:
|
||||
|
||||
* https://github.com/2ndQuadrant/repmgr
|
||||
|
||||
Further information is available at https://www.repmgr.org/
|
||||
Further information is available at https://repmgr.org/
|
||||
|
||||
We'd love to hear from you about how you use repmgr. Case studies and
|
||||
news are always welcome. Send us an email at info@2ndQuadrant.com, or
|
||||
@@ -97,6 +97,7 @@ Thanks from the repmgr core team.
|
||||
Further reading
|
||||
---------------
|
||||
|
||||
* [repmgr documentation](https://repmgr.org/docs/current/index.html)
|
||||
* https://blog.2ndquadrant.com/repmgr-3-2-is-here-barman-support-brand-new-high-availability-features/
|
||||
* https://blog.2ndquadrant.com/improvements-in-repmgr-3-1-4/
|
||||
* https://blog.2ndquadrant.com/managing-useful-clusters-repmgr/
|
||||
|
||||
153
configfile.c
153
configfile.c
@@ -358,6 +358,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
|
||||
memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
|
||||
options->standby_disconnect_on_failover = false;
|
||||
options->sibling_nodes_disconnect_timeout = DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT;
|
||||
options->connection_check_type = CHECK_PING;
|
||||
options->primary_visibility_consensus = false;
|
||||
memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
|
||||
options->election_rerun_interval = DEFAULT_ELECTION_RERUN_INTERVAL;
|
||||
|
||||
/*-------------
|
||||
* witness settings
|
||||
@@ -478,7 +484,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
node_id_found = true;
|
||||
}
|
||||
else if (strcmp(name, "node_name") == 0)
|
||||
strncpy(options->node_name, value, MAXLEN);
|
||||
{
|
||||
if (strlen(value) < sizeof(options->node_name))
|
||||
strncpy(options->node_name, value, sizeof(options->node_name));
|
||||
else
|
||||
item_list_append_format(error_list,
|
||||
_("value for \"node_name\" must contain fewer than %lu characters"),
|
||||
sizeof(options->node_name));
|
||||
}
|
||||
else if (strcmp(name, "conninfo") == 0)
|
||||
strncpy(options->conninfo, value, MAXLEN);
|
||||
else if (strcmp(name, "data_directory") == 0)
|
||||
@@ -488,11 +501,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
|
||||
else if (strcmp(name, "replication_user") == 0)
|
||||
{
|
||||
if (strlen(value) < NAMEDATALEN)
|
||||
strncpy(options->replication_user, value, NAMEDATALEN);
|
||||
if (strlen(value) < sizeof(options->replication_user))
|
||||
strncpy(options->replication_user, value, sizeof(options->replication_user));
|
||||
else
|
||||
item_list_append(error_list,
|
||||
_("value for \"replication_user\" must contain fewer than " STR(NAMEDATALEN) " characters"));
|
||||
item_list_append_format(error_list,
|
||||
_("value for \"replication_user\" must contain fewer than %lu characters"),
|
||||
sizeof(options->replication_user));
|
||||
}
|
||||
else if (strcmp(name, "pg_bindir") == 0)
|
||||
strncpy(options->pg_bindir, value, MAXPGPATH);
|
||||
@@ -618,6 +632,36 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "repmgrd_pid_file") == 0)
|
||||
strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
|
||||
else if (strcmp(name, "standby_disconnect_on_failover") == 0)
|
||||
options->standby_disconnect_on_failover = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "sibling_nodes_disconnect_timeout") == 0)
|
||||
options->sibling_nodes_disconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "connection_check_type") == 0)
|
||||
{
|
||||
if (strcasecmp(value, "ping") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_PING;
|
||||
}
|
||||
else if (strcasecmp(value, "connection") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_CONNECTION;
|
||||
}
|
||||
else if (strcasecmp(value, "query") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_QUERY;
|
||||
}
|
||||
else
|
||||
{
|
||||
item_list_append(error_list,
|
||||
_("value for \"connection_check_type\" must be \"ping\", \"connection\" or \"query\"\n"));
|
||||
}
|
||||
}
|
||||
else if (strcmp(name, "primary_visibility_consensus") == 0)
|
||||
options->primary_visibility_consensus = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "failover_validation_command") == 0)
|
||||
strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
|
||||
else if (strcmp(name, "election_rerun_interval") == 0)
|
||||
options->election_rerun_interval = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
/* witness settings */
|
||||
else if (strcmp(name, "witness_sync_interval") == 0)
|
||||
@@ -792,15 +836,16 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
|
||||
if (conninfo_options == NULL)
|
||||
{
|
||||
char error_message_buf[MAXLEN] = "";
|
||||
PQExpBufferData error_message_buf;
|
||||
initPQExpBuffer(&error_message_buf);
|
||||
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("\"conninfo\": %s (provided: \"%s\")"),
|
||||
conninfo_errmsg,
|
||||
options->conninfo);
|
||||
appendPQExpBuffer(&error_message_buf,
|
||||
_("\"conninfo\": %s (provided: \"%s\")"),
|
||||
conninfo_errmsg,
|
||||
options->conninfo);
|
||||
|
||||
item_list_append(error_list, error_message_buf);
|
||||
item_list_append(error_list, error_message_buf.data);
|
||||
termPQExpBuffer(&error_message_buf);
|
||||
}
|
||||
|
||||
PQconninfoFree(conninfo_options);
|
||||
@@ -1049,15 +1094,19 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
* loop is started up; it therefore only needs to reload options required
|
||||
* by repmgrd, which are as follows:
|
||||
*
|
||||
* changeable options:
|
||||
* changeable options (keep the list in "doc/repmgrd-configuration.sgml" in sync
|
||||
* with these):
|
||||
*
|
||||
* - async_query_timeout
|
||||
* - bdr_local_monitoring_only
|
||||
* - bdr_recovery_timeout
|
||||
* - connection_check_type
|
||||
* - conninfo
|
||||
* - degraded_monitoring_timeout
|
||||
* - event_notification_command
|
||||
* - event_notifications
|
||||
* - failover
|
||||
* - failover_validation_command
|
||||
* - follow_command
|
||||
* - log_facility
|
||||
* - log_file
|
||||
@@ -1065,12 +1114,19 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
* - log_status_interval
|
||||
* - monitor_interval_secs
|
||||
* - monitoring_history
|
||||
* - primary_notification_timeout
|
||||
* - primary_visibility_consensus
|
||||
* - promote_command
|
||||
* - promote_delay
|
||||
* - reconnect_attempts
|
||||
* - reconnect_interval
|
||||
* - repmgrd_standby_startup_timeout
|
||||
* - retry_promote_interval_secs
|
||||
* - sibling_nodes_disconnect_timeout
|
||||
* - standby_disconnect_on_failover
|
||||
*
|
||||
*
|
||||
* Not publicly documented:
|
||||
* - promote_delay
|
||||
*
|
||||
* non-changeable options (repmgrd references these from the "repmgr.nodes"
|
||||
* table, not the configuration file)
|
||||
@@ -1149,13 +1205,12 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (strncmp(new_options.node_name, orig_options->node_name, MAXLEN) != 0)
|
||||
if (strncmp(new_options.node_name, orig_options->node_name, sizeof(orig_options->node_name)) != 0)
|
||||
{
|
||||
log_warning(_("\"node_name\" cannot be changed, keeping current configuration"));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* No configuration problems detected - copy any changed values
|
||||
*
|
||||
@@ -1205,8 +1260,8 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
{
|
||||
strncpy(orig_options->conninfo, new_options.conninfo, MAXLEN);
|
||||
log_info(_("\"conninfo\" is now \"%s\""), new_options.conninfo);
|
||||
|
||||
}
|
||||
|
||||
PQfinish(conn);
|
||||
}
|
||||
|
||||
@@ -1284,7 +1339,6 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
|
||||
/* promote_command */
|
||||
if (strncmp(orig_options->promote_command, new_options.promote_command, MAXLEN) != 0)
|
||||
{
|
||||
@@ -1330,6 +1384,51 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* standby_disconnect_on_failover */
|
||||
if (orig_options->standby_disconnect_on_failover != new_options.standby_disconnect_on_failover)
|
||||
{
|
||||
orig_options->standby_disconnect_on_failover = new_options.standby_disconnect_on_failover;
|
||||
log_info(_("\"standby_disconnect_on_failover\" is now \"%s\""),
|
||||
new_options.standby_disconnect_on_failover == true ? "TRUE" : "FALSE");
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* sibling_nodes_disconnect_timeout */
|
||||
if (orig_options->sibling_nodes_disconnect_timeout != new_options.sibling_nodes_disconnect_timeout)
|
||||
{
|
||||
orig_options->sibling_nodes_disconnect_timeout = new_options.sibling_nodes_disconnect_timeout;
|
||||
log_info(_("\"sibling_nodes_disconnect_timeout\" is now \"%i\""),
|
||||
new_options.sibling_nodes_disconnect_timeout);
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* connection_check_type */
|
||||
if (orig_options->connection_check_type != new_options.connection_check_type)
|
||||
{
|
||||
orig_options->connection_check_type = new_options.connection_check_type;
|
||||
log_info(_("\"connection_check_type\" is now \"%s\""),
|
||||
print_connection_check_type(new_options.connection_check_type));
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* primary_visibility_consensus */
|
||||
if (orig_options->primary_visibility_consensus != new_options.primary_visibility_consensus)
|
||||
{
|
||||
orig_options->primary_visibility_consensus = new_options.primary_visibility_consensus;
|
||||
log_info(_("\"primary_visibility_consensus\" is now \"%s\""),
|
||||
new_options.primary_visibility_consensus == true ? "TRUE" : "FALSE");
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* failover_validation_command */
|
||||
if (strncmp(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH) != 0)
|
||||
{
|
||||
strncpy(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH);
|
||||
log_info(_("\"failover_validation_command\" is now \"%s\""), new_options.failover_validation_command);
|
||||
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle changes to logging configuration
|
||||
*/
|
||||
@@ -1927,3 +2026,21 @@ parse_pg_basebackup_options(const char *pg_basebackup_options, t_basebackup_opti
|
||||
|
||||
return backup_options_ok;
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
print_connection_check_type(ConnectionCheckType type)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
case CHECK_PING:
|
||||
return "ping";
|
||||
case CHECK_QUERY:
|
||||
return "query";
|
||||
case CHECK_CONNECTION:
|
||||
return "connection";
|
||||
}
|
||||
|
||||
/* should never reach here */
|
||||
return "UNKNOWN";
|
||||
}
|
||||
|
||||
23
configfile.h
23
configfile.h
@@ -37,6 +37,13 @@ typedef enum
|
||||
FAILOVER_AUTOMATIC
|
||||
} failover_mode_opt;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
CHECK_PING,
|
||||
CHECK_QUERY,
|
||||
CHECK_CONNECTION
|
||||
} ConnectionCheckType;
|
||||
|
||||
typedef struct EventNotificationListCell
|
||||
{
|
||||
struct EventNotificationListCell *next;
|
||||
@@ -69,7 +76,7 @@ typedef struct
|
||||
{
|
||||
/* node information */
|
||||
int node_id;
|
||||
char node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
char conninfo[MAXLEN];
|
||||
char replication_user[NAMEDATALEN];
|
||||
char data_directory[MAXPGPATH];
|
||||
@@ -135,6 +142,12 @@ typedef struct
|
||||
int primary_notification_timeout;
|
||||
int repmgrd_standby_startup_timeout;
|
||||
char repmgrd_pid_file[MAXPGPATH];
|
||||
bool standby_disconnect_on_failover;
|
||||
int sibling_nodes_disconnect_timeout;
|
||||
ConnectionCheckType connection_check_type;
|
||||
bool primary_visibility_consensus;
|
||||
char failover_validation_command[MAXPGPATH];
|
||||
int election_rerun_interval;
|
||||
|
||||
/* BDR settings */
|
||||
bool bdr_local_monitoring_only;
|
||||
@@ -179,7 +192,7 @@ typedef struct
|
||||
/* node information */ \
|
||||
UNKNOWN_NODE_ID, "", "", "", "", "", "", "", REPLICATION_TYPE_PHYSICAL, \
|
||||
/* log settings */ \
|
||||
"", "", "", DEFAULT_LOG_STATUS_INTERVAL, \
|
||||
"", "", "", DEFAULT_LOG_STATUS_INTERVAL, \
|
||||
/* standby clone settings */ \
|
||||
false, "", "", { NULL, NULL }, "", false, "", false, "", \
|
||||
/* standby promote settings */ \
|
||||
@@ -205,8 +218,9 @@ typedef struct
|
||||
DEFAULT_RECONNECTION_INTERVAL, \
|
||||
false, -1, \
|
||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
-1, "", \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, \
|
||||
CHECK_PING, true, "", DEFAULT_ELECTION_RERUN_INTERVAL, \
|
||||
/* BDR settings */ \
|
||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||
/* service settings */ \
|
||||
@@ -315,5 +329,6 @@ void free_parsed_argv(char ***argv_array);
|
||||
/* called by repmgr-client and repmgrd */
|
||||
void exit_with_cli_errors(ItemList *error_list, const char *repmgr_command);
|
||||
void print_item_list(ItemList *item_list);
|
||||
const char *print_connection_check_type(ConnectionCheckType type);
|
||||
|
||||
#endif /* _REPMGR_CONFIGFILE_H_ */
|
||||
|
||||
@@ -301,6 +301,8 @@ get_controlfile(const char *DataDir)
|
||||
ControlFilePath);
|
||||
log_detail("%s", strerror(errno));
|
||||
|
||||
close(fd);
|
||||
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
|
||||
392
dbutils.c
392
dbutils.c
@@ -43,6 +43,8 @@ int bdr_version_num = UNKNOWN_BDR_VERSION_NUM;
|
||||
static void log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
|
||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||
|
||||
static bool _is_server_available(const char *conninfo, bool quiet);
|
||||
|
||||
static PGconn *_establish_db_connection(const char *conninfo,
|
||||
const bool exit_on_error,
|
||||
const bool log_notice,
|
||||
@@ -67,16 +69,19 @@ void
|
||||
log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
|
||||
{
|
||||
va_list ap;
|
||||
char buf[MAXLEN];
|
||||
int retval;
|
||||
|
||||
va_start(ap, fmt);
|
||||
|
||||
log_error(fmt, ap);
|
||||
|
||||
retval = vsnprintf(buf, MAXLEN, fmt, ap);
|
||||
va_end(ap);
|
||||
|
||||
if (conn != NULL && PQstatus(conn) == CONNECTION_OK)
|
||||
if (retval < MAXLEN)
|
||||
log_error("%s", buf);
|
||||
|
||||
if (conn != NULL)
|
||||
{
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
log_detail("\n%s", PQerrorMessage(conn));
|
||||
}
|
||||
|
||||
if (query_text != NULL)
|
||||
@@ -190,13 +195,13 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
|
||||
{
|
||||
if (log_notice)
|
||||
{
|
||||
log_notice(_("connection to database failed:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_notice(_("connection to database failed"));
|
||||
log_detail("\n%s", PQerrorMessage(conn));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("connection to database failed:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_error(_("connection to database failed"));
|
||||
log_detail("\n%s", PQerrorMessage(conn));
|
||||
}
|
||||
log_detail(_("attempted to connect using:\n %s"),
|
||||
connection_string);
|
||||
@@ -287,8 +292,9 @@ establish_db_connection_by_params(t_conninfo_param_list *param_list,
|
||||
/* Check to see that the backend connection was successfully made */
|
||||
if ((PQstatus(conn) != CONNECTION_OK))
|
||||
{
|
||||
log_error(_("connection to database failed:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_error(_("connection to database failed"));
|
||||
log_detail("\n%s", PQerrorMessage(conn));
|
||||
|
||||
if (exit_on_error)
|
||||
{
|
||||
PQfinish(conn);
|
||||
@@ -338,7 +344,9 @@ is_superuser_connection(PGconn *conn, t_connection_user *userinfo)
|
||||
|
||||
if (userinfo != NULL)
|
||||
{
|
||||
strncpy(userinfo->username, current_user, MAXLEN);
|
||||
snprintf(userinfo->username,
|
||||
sizeof(userinfo->username),
|
||||
"%s", current_user);
|
||||
userinfo->is_superuser = is_superuser;
|
||||
}
|
||||
|
||||
@@ -821,8 +829,8 @@ begin_transaction(PGconn *conn)
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_error(_("unable to begin transaction:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_error(_("unable to begin transaction"));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
|
||||
PQclear(res);
|
||||
return false;
|
||||
@@ -845,8 +853,8 @@ commit_transaction(PGconn *conn)
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_error(_("unable to commit transaction:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_error(_("unable to commit transaction"));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
@@ -869,8 +877,8 @@ rollback_transaction(PGconn *conn)
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_error(_("unable to rollback transaction:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_error(_("unable to rollback transaction"));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
@@ -1073,13 +1081,13 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
||||
{
|
||||
if (strcmp(PQgetvalue(res, i, 0), setting) == 0)
|
||||
{
|
||||
strncpy(output, PQgetvalue(res, i, 1), MAXLEN);
|
||||
snprintf(output, MAXLEN, "%s", PQgetvalue(res, i, 1));
|
||||
success = true;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* XXX highly unlikely this would ever happen */
|
||||
/* highly unlikely this would ever happen */
|
||||
log_error(_("get_pg_setting(): unknown parameter \"%s\""), PQgetvalue(res, i, 0));
|
||||
}
|
||||
}
|
||||
@@ -1096,6 +1104,55 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
alter_system_int(PGconn *conn, const char *name, int value)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res = NULL;
|
||||
bool success = true;
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
appendPQExpBuffer(&query,
|
||||
"ALTER SYSTEM SET %s = %i",
|
||||
name, value);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_db_error(conn, query.data, _("alter_system_int() - unable to execute query"));
|
||||
|
||||
success = false;
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
PQclear(res);
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
pg_reload_conf(PGconn *conn)
|
||||
{
|
||||
PGresult *res = NULL;
|
||||
bool success = false;
|
||||
|
||||
res = PQexec(conn, "SELECT pg_catalog.pg_reload_conf()");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_db_error(conn, NULL, _("pg_reload_conf() - unable to execute query"));
|
||||
|
||||
success = false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
|
||||
/* ============================ */
|
||||
/* Server information functions */
|
||||
/* ============================ */
|
||||
@@ -1124,7 +1181,7 @@ get_cluster_size(PGconn *conn, char *size)
|
||||
}
|
||||
else
|
||||
{
|
||||
strncpy(size, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
snprintf(size, MAXLEN, "%s", PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
@@ -1172,7 +1229,7 @@ get_server_version(PGconn *conn, char *server_version_buf)
|
||||
* first space.
|
||||
*/
|
||||
|
||||
strncpy(_server_version_buf, PQgetvalue(res, 0, 1), MAXVERSIONSTR);
|
||||
snprintf(_server_version_buf, MAXVERSIONSTR, "%s", PQgetvalue(res, 0, 1));
|
||||
|
||||
for (i = 0; i < MAXVERSIONSTR; i++)
|
||||
{
|
||||
@@ -1299,7 +1356,8 @@ _get_primary_connection(PGconn *conn,
|
||||
|
||||
/* initialize with the values of the current node being processed */
|
||||
node_id = atoi(PQgetvalue(res, i, 0));
|
||||
strncpy(remote_conninfo, PQgetvalue(res, i, 1), MAXCONNINFO);
|
||||
snprintf(remote_conninfo, MAXCONNINFO, "%s", PQgetvalue(res, i, 1));
|
||||
|
||||
log_verbose(LOG_INFO,
|
||||
_("checking if node %i is primary"),
|
||||
node_id);
|
||||
@@ -1463,10 +1521,10 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
|
||||
while ((arcdir_ent = readdir(arcdir)) != NULL)
|
||||
{
|
||||
struct stat statbuf;
|
||||
char file_path[MAXPGPATH] = "";
|
||||
char file_path[MAXPGPATH + sizeof(arcdir_ent->d_name)];
|
||||
int basenamelen = 0;
|
||||
|
||||
snprintf(file_path, MAXPGPATH,
|
||||
snprintf(file_path, sizeof(file_path),
|
||||
"%s/%s",
|
||||
archive_status_dir,
|
||||
arcdir_ent->d_name);
|
||||
@@ -1503,6 +1561,8 @@ identify_system(PGconn *repl_conn, t_system_identification *identification)
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res))
|
||||
{
|
||||
log_db_error(repl_conn, NULL, _("unable to execute IDENTIFY_SYSTEM"));
|
||||
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
@@ -1621,6 +1681,7 @@ repmgrd_set_local_node_id(PGconn *conn, int local_node_id)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res = NULL;
|
||||
bool success = true;
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
@@ -1629,16 +1690,18 @@ repmgrd_set_local_node_id(PGconn *conn, int local_node_id)
|
||||
local_node_id);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
PQclear(res);
|
||||
return false;
|
||||
log_db_error(conn, query.data, _("repmgrd_set_local_node_id(): unable to execute query"));
|
||||
|
||||
success = false;
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
PQclear(res);
|
||||
return true;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
|
||||
@@ -1854,6 +1917,29 @@ repmgrd_pause(PGconn *conn, bool pause)
|
||||
return success;
|
||||
}
|
||||
|
||||
pid_t
|
||||
get_wal_receiver_pid(PGconn *conn)
|
||||
{
|
||||
PGresult *res = NULL;
|
||||
pid_t wal_receiver_pid = UNKNOWN_PID;
|
||||
|
||||
res = PQexec(conn, "SELECT repmgr.get_wal_receiver_pid()");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_error(_("unable to execute \"SELECT repmgr.get_wal_receiver_pid()\""));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
}
|
||||
else if (!PQgetisnull(res, 0, 0))
|
||||
{
|
||||
wal_receiver_pid = atoi(PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return wal_receiver_pid;
|
||||
}
|
||||
|
||||
/* ================ */
|
||||
/* result functions */
|
||||
/* ================ */
|
||||
@@ -1916,9 +2002,13 @@ get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions)
|
||||
/* caller wants to know which versions are installed/available */
|
||||
if (extversions != NULL)
|
||||
{
|
||||
strncpy(extversions->default_version, PQgetvalue(res, 0, 2), 7);
|
||||
snprintf(extversions->default_version,
|
||||
sizeof(extversions->default_version),
|
||||
"%s", PQgetvalue(res, 0, 2));
|
||||
extversions->default_version_num = available_version;
|
||||
strncpy(extversions->installed_version, PQgetvalue(res, 0, 4), 7);
|
||||
snprintf(extversions->installed_version,
|
||||
sizeof(extversions->installed_version),
|
||||
"%s", PQgetvalue(res, 0, 4));
|
||||
extversions->installed_version_num = installed_version;
|
||||
}
|
||||
|
||||
@@ -2082,6 +2172,8 @@ _get_node_record(PGconn *conn, char *sqlquery, t_node_info *node_info, bool init
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_db_error(conn, sqlquery, _("_get_node_record(): unable to execute query"));
|
||||
|
||||
PQclear(res);
|
||||
return RECORD_ERROR;
|
||||
}
|
||||
@@ -2117,17 +2209,17 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row, bool init_
|
||||
node_info->upstream_node_id = atoi(PQgetvalue(res, row, 2));
|
||||
}
|
||||
|
||||
strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN);
|
||||
strncpy(node_info->conninfo, PQgetvalue(res, row, 4), MAXLEN);
|
||||
strncpy(node_info->repluser, PQgetvalue(res, row, 5), NAMEDATALEN);
|
||||
strncpy(node_info->slot_name, PQgetvalue(res, row, 6), MAXLEN);
|
||||
strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN);
|
||||
snprintf(node_info->node_name, sizeof(node_info->node_name), "%s", PQgetvalue(res, row, 3));
|
||||
snprintf(node_info->conninfo, sizeof(node_info->conninfo), "%s", PQgetvalue(res, row, 4));
|
||||
snprintf(node_info->repluser, sizeof(node_info->repluser), "%s", PQgetvalue(res, row, 5));
|
||||
snprintf(node_info->slot_name, sizeof(node_info->slot_name), "%s", PQgetvalue(res, row, 6));
|
||||
snprintf(node_info->location, sizeof(node_info->location), "%s", PQgetvalue(res, row, 7));
|
||||
node_info->priority = atoi(PQgetvalue(res, row, 8));
|
||||
node_info->active = atobool(PQgetvalue(res, row, 9));
|
||||
strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXPGPATH);
|
||||
snprintf(node_info->config_file, sizeof(node_info->config_file), "%s", PQgetvalue(res, row, 10));
|
||||
|
||||
/* This won't normally be set */
|
||||
strncpy(node_info->upstream_node_name, PQgetvalue(res, row, 11), MAXLEN);
|
||||
snprintf(node_info->upstream_node_name, sizeof(node_info->upstream_node_name), "%s", PQgetvalue(res, row, 11));
|
||||
|
||||
/* Set remaining struct fields with default values */
|
||||
|
||||
@@ -2991,13 +3083,15 @@ update_node_record_conn_priority(PGconn *conn, t_configuration_options *options)
|
||||
options->node_id);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_db_error(conn, query.data, _("update_node_record_conn_priority(): unable to execute query"));
|
||||
success = false;
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return success;
|
||||
@@ -3379,11 +3473,15 @@ config_file_list_add(t_configfile_list *list, const char *file, const char *file
|
||||
}
|
||||
|
||||
|
||||
strncpy(list->files[list->entries]->filepath, file, MAXPGPATH);
|
||||
snprintf(list->files[list->entries]->filepath,
|
||||
sizeof(list->files[list->entries]->filepath),
|
||||
"%s", file);
|
||||
canonicalize_path(list->files[list->entries]->filepath);
|
||||
|
||||
snprintf(list->files[list->entries]->filename,
|
||||
sizeof(list->files[list->entries]->filename),
|
||||
"%s", filename);
|
||||
|
||||
strncpy(list->files[list->entries]->filename, filename, MAXPGPATH);
|
||||
list->files[list->entries]->in_data_directory = in_data_dir;
|
||||
|
||||
list->entries++;
|
||||
@@ -3463,13 +3561,10 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
|
||||
log_verbose(LOG_DEBUG, "_create_event(): event is \"%s\" for node %i", event, node_id);
|
||||
|
||||
/*
|
||||
* Only attempt to write a record if a connection handle was provided.
|
||||
* Also check that the repmgr schema has been properly initialised - if
|
||||
* not it means no configuration file was provided, which can happen with
|
||||
* e.g. `repmgr standby clone`, and we won't know which schema to write
|
||||
* to.
|
||||
* Only attempt to write a record if a connection handle was provided,
|
||||
* and the connection handle points to a node which is not in recovery.
|
||||
*/
|
||||
if (conn != NULL && PQstatus(conn) == CONNECTION_OK)
|
||||
if (conn != NULL && PQstatus(conn) == CONNECTION_OK && get_recovery_type(conn) == RECTYPE_PRIMARY)
|
||||
{
|
||||
int n_node_id = htonl(node_id);
|
||||
char *t_successful = successful ? "TRUE" : "FALSE";
|
||||
@@ -3523,7 +3618,7 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
|
||||
else
|
||||
{
|
||||
/* Store timestamp to send to the notification command */
|
||||
strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
snprintf(event_timestamp, MAXLEN, "%s", PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
@@ -3958,8 +4053,12 @@ get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record)
|
||||
}
|
||||
else
|
||||
{
|
||||
strncpy(record->slot_name, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
strncpy(record->slot_type, PQgetvalue(res, 0, 1), MAXLEN);
|
||||
snprintf(record->slot_name,
|
||||
sizeof(record->slot_name),
|
||||
"%s", PQgetvalue(res, 0, 0));
|
||||
snprintf(record->slot_type,
|
||||
sizeof(record->slot_type),
|
||||
"%s", PQgetvalue(res, 0, 1));
|
||||
record->active = atobool(PQgetvalue(res, 0, 2));
|
||||
}
|
||||
|
||||
@@ -4090,7 +4189,8 @@ get_tablespace_name_by_location(PGconn *conn, const char *location, char *name)
|
||||
}
|
||||
else
|
||||
{
|
||||
strncpy(name, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
snprintf(name, MAXLEN,
|
||||
"%s", PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
@@ -4123,7 +4223,8 @@ cancel_query(PGconn *conn, int timeout)
|
||||
*/
|
||||
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
||||
{
|
||||
log_warning(_("unable to stop current query:\n %s"), errbuf);
|
||||
log_warning(_("unable to cancel current query"));
|
||||
log_detail("\n%s", errbuf);
|
||||
PQfreeCancel(pgcancel);
|
||||
return false;
|
||||
}
|
||||
@@ -4141,7 +4242,7 @@ cancel_query(PGconn *conn, int timeout)
|
||||
* Returns 1 for success; 0 if any error ocurred; -1 if timeout reached.
|
||||
*/
|
||||
int
|
||||
wait_connection_availability(PGconn *conn, long long timeout)
|
||||
wait_connection_availability(PGconn *conn, int timeout)
|
||||
{
|
||||
PGresult *res = NULL;
|
||||
fd_set read_set;
|
||||
@@ -4150,16 +4251,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
before,
|
||||
after;
|
||||
struct timezone tz;
|
||||
long long timeout_ms;
|
||||
|
||||
/* recalc to microseconds */
|
||||
timeout *= 1000000;
|
||||
/* calculate timeout in microseconds */
|
||||
timeout_ms = (long long) timeout * 1000000;
|
||||
|
||||
while (timeout > 0)
|
||||
while (timeout_ms > 0)
|
||||
{
|
||||
if (PQconsumeInput(conn) == 0)
|
||||
{
|
||||
log_warning(_("wait_connection_availability(): could not receive data from connection:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_warning(_("wait_connection_availability(): unable to receive data from connection"));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -4190,17 +4292,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
|
||||
gettimeofday(&after, &tz);
|
||||
|
||||
timeout -= (after.tv_sec * 1000000 + after.tv_usec) -
|
||||
timeout_ms -= (after.tv_sec * 1000000 + after.tv_usec) -
|
||||
(before.tv_sec * 1000000 + before.tv_usec);
|
||||
}
|
||||
|
||||
|
||||
if (timeout >= 0)
|
||||
if (timeout_ms >= 0)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
log_warning(_("wait_connection_availability(): timeout reached"));
|
||||
log_warning(_("wait_connection_availability(): timeout (%i secs) reached"), timeout);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -4211,13 +4313,33 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
|
||||
bool
|
||||
is_server_available(const char *conninfo)
|
||||
{
|
||||
return _is_server_available(conninfo, false);
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
is_server_available_quiet(const char *conninfo)
|
||||
{
|
||||
return _is_server_available(conninfo, true);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
_is_server_available(const char *conninfo, bool quiet)
|
||||
{
|
||||
PGPing status = PQping(conninfo);
|
||||
|
||||
log_verbose(LOG_DEBUG, "is_server_available(): ping status for %s is %i", conninfo, (int)status);
|
||||
log_verbose(LOG_DEBUG, "is_server_available(): ping status for \"%s\" is %s", conninfo, print_pqping_status(status));
|
||||
if (status == PQPING_OK)
|
||||
return true;
|
||||
|
||||
if (quiet == false)
|
||||
{
|
||||
log_warning(_("unable to ping \"%s\""), conninfo);
|
||||
log_detail(_("PQping() returned \"%s\""), print_pqping_status(status));
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -4230,10 +4352,17 @@ is_server_available_params(t_conninfo_param_list *param_list)
|
||||
false);
|
||||
|
||||
/* deparsing the param_list adds overhead, so only do it if needed */
|
||||
if (log_level == LOG_DEBUG)
|
||||
if (log_level == LOG_DEBUG || status != PQPING_OK)
|
||||
{
|
||||
char *conninfo_str = param_list_to_string(param_list);
|
||||
log_verbose(LOG_DEBUG, "is_server_available_params(): ping status for %s is %i", conninfo_str, (int)status);
|
||||
log_verbose(LOG_DEBUG, "is_server_available_params(): ping status for \"%s\" is %s", conninfo_str, print_pqping_status(status));
|
||||
|
||||
if (status != PQPING_OK)
|
||||
{
|
||||
log_warning(_("unable to ping \"%s\""), conninfo_str);
|
||||
log_detail(_("PQping() returned \"%s\""), print_pqping_status(status));
|
||||
}
|
||||
|
||||
pfree(conninfo_str);
|
||||
}
|
||||
|
||||
@@ -4263,6 +4392,25 @@ connection_ping(PGconn *conn)
|
||||
}
|
||||
|
||||
|
||||
ExecStatusType
|
||||
connection_ping_reconnect(PGconn *conn)
|
||||
{
|
||||
ExecStatusType ping_result = connection_ping(conn);
|
||||
|
||||
if (PQstatus(conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("connection error, attempting to reset"));
|
||||
log_detail("\n%s", PQerrorMessage(conn));
|
||||
PQreset(conn);
|
||||
ping_result = connection_ping(conn);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "connection_ping_reconnect(): result is %s", PQresStatus(ping_result));
|
||||
|
||||
return ping_result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* ==================== */
|
||||
/* monitoring functions */
|
||||
@@ -4647,6 +4795,11 @@ get_primary_current_lsn(PGconn *conn)
|
||||
{
|
||||
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_db_error(conn, NULL, _("unable to execute get_primary_current_lsn()"));
|
||||
}
|
||||
|
||||
|
||||
PQclear(res);
|
||||
|
||||
@@ -4673,6 +4826,10 @@ get_last_wal_receive_location(PGconn *conn)
|
||||
{
|
||||
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_db_error(conn, NULL, _("unable to execute get_last_wal_receive_location()"));
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
@@ -4775,17 +4932,19 @@ void
|
||||
init_replication_info(ReplInfo *replication_info)
|
||||
{
|
||||
memset(replication_info->current_timestamp, 0, sizeof(replication_info->current_timestamp));
|
||||
replication_info->in_recovery = false;
|
||||
replication_info->last_wal_receive_lsn = InvalidXLogRecPtr;
|
||||
replication_info->last_wal_replay_lsn = InvalidXLogRecPtr;
|
||||
memset(replication_info->last_xact_replay_timestamp, 0, sizeof(replication_info->last_xact_replay_timestamp));
|
||||
replication_info->replication_lag_time = 0;
|
||||
replication_info->receiving_streamed_wal = true;
|
||||
replication_info->wal_replay_paused = false;
|
||||
replication_info->upstream_last_seen = -1;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
||||
get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res = NULL;
|
||||
@@ -4794,6 +4953,7 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
||||
initPQExpBuffer(&query);
|
||||
appendPQExpBufferStr(&query,
|
||||
" SELECT ts, "
|
||||
" in_recovery, "
|
||||
" last_wal_receive_lsn, "
|
||||
" last_wal_replay_lsn, "
|
||||
" last_xact_replay_timestamp, "
|
||||
@@ -4807,9 +4967,11 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
||||
" END "
|
||||
" END AS replication_lag_time, "
|
||||
" last_wal_receive_lsn >= last_wal_replay_lsn AS receiving_streamed_wal, "
|
||||
" wal_replay_paused "
|
||||
" wal_replay_paused, "
|
||||
" upstream_last_seen "
|
||||
" FROM ( "
|
||||
" SELECT CURRENT_TIMESTAMP AS ts, "
|
||||
" pg_catalog.pg_is_in_recovery() AS in_recovery, "
|
||||
" pg_catalog.pg_last_xact_replay_timestamp() AS last_xact_replay_timestamp, ");
|
||||
|
||||
|
||||
@@ -4821,7 +4983,7 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
||||
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||
" THEN FALSE "
|
||||
" ELSE pg_catalog.pg_is_wal_replay_paused() "
|
||||
" END AS wal_replay_paused ");
|
||||
" END AS wal_replay_paused, ");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -4843,7 +5005,21 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
||||
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||
" THEN FALSE "
|
||||
" ELSE pg_catalog.pg_is_xlog_replay_paused() "
|
||||
" END AS wal_replay_paused ");
|
||||
" END AS wal_replay_paused, ");
|
||||
}
|
||||
|
||||
if (node_type == WITNESS)
|
||||
{
|
||||
appendPQExpBufferStr(&query,
|
||||
" repmgr.get_upstream_last_seen() AS upstream_last_seen");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&query,
|
||||
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||
" THEN -1 "
|
||||
" ELSE repmgr.get_upstream_last_seen() "
|
||||
" END AS upstream_last_seen ");
|
||||
}
|
||||
|
||||
appendPQExpBufferStr(&query,
|
||||
@@ -4861,13 +5037,19 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
||||
}
|
||||
else
|
||||
{
|
||||
strncpy(replication_info->current_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 1));
|
||||
replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 2));
|
||||
strncpy(replication_info->last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
||||
replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 4));
|
||||
replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 5));
|
||||
replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 6));
|
||||
snprintf(replication_info->current_timestamp,
|
||||
sizeof(replication_info->current_timestamp),
|
||||
"%s", PQgetvalue(res, 0, 0));
|
||||
replication_info->in_recovery = atobool(PQgetvalue(res, 0, 1));
|
||||
replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 2));
|
||||
replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 3));
|
||||
snprintf(replication_info->last_xact_replay_timestamp,
|
||||
sizeof(replication_info->last_xact_replay_timestamp),
|
||||
"%s", PQgetvalue(res, 0, 4));
|
||||
replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 5));
|
||||
replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 6));
|
||||
replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 7));
|
||||
replication_info->upstream_last_seen = atoi(PQgetvalue(res, 0, 8));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
@@ -4913,13 +5095,12 @@ get_replication_lag_seconds(PGconn *conn)
|
||||
log_warning("%s", PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
/* XXX magic number */
|
||||
return -1;
|
||||
return UNKNOWN_REPLICATION_LAG;
|
||||
}
|
||||
|
||||
if (!PQntuples(res))
|
||||
{
|
||||
return -1;
|
||||
return UNKNOWN_REPLICATION_LAG;
|
||||
}
|
||||
|
||||
lag_seconds = atoi(PQgetvalue(res, 0, 0));
|
||||
@@ -5053,7 +5234,7 @@ is_downstream_node_attached(PGconn *conn, char *node_name)
|
||||
|
||||
|
||||
void
|
||||
set_primary_last_seen(PGconn *conn)
|
||||
set_upstream_last_seen(PGconn *conn)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res = NULL;
|
||||
@@ -5061,51 +5242,58 @@ set_primary_last_seen(PGconn *conn)
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBufferStr(&query,
|
||||
"SELECT repmgr.set_primary_last_seen()");
|
||||
"SELECT repmgr.set_upstream_last_seen()");
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_db_error(conn, query.data, _("unable to execute repmgr.set_primary_last_seen()"));
|
||||
log_db_error(conn, query.data, _("unable to execute repmgr.set_upstream_last_seen()"));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
PQclear(res);
|
||||
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
get_primary_last_seen(PGconn *conn)
|
||||
get_upstream_last_seen(PGconn *conn, t_server_type node_type)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res = NULL;
|
||||
int primary_last_seen = -1;
|
||||
int upstream_last_seen = -1;
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBufferStr(&query,
|
||||
"SELECT CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||
" THEN -1 "
|
||||
" ELSE repmgr.get_primary_last_seen() "
|
||||
" END AS primary_last_seen ");
|
||||
if (node_type == WITNESS)
|
||||
{
|
||||
appendPQExpBufferStr(&query,
|
||||
"SELECT repmgr.get_upstream_last_seen()");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&query,
|
||||
"SELECT CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||
" THEN -1 "
|
||||
" ELSE repmgr.get_upstream_last_seen() "
|
||||
" END AS upstream_last_seen ");
|
||||
}
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_db_error(conn, query.data, _("unable to execute repmgr.get_primary_last_seen()"));
|
||||
log_db_error(conn, query.data, _("unable to execute repmgr.get_upstream_last_seen()"));
|
||||
}
|
||||
else
|
||||
{
|
||||
primary_last_seen = atoi(PQgetvalue(res, 0, 0));
|
||||
upstream_last_seen = atoi(PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&query);
|
||||
PQclear(res);
|
||||
|
||||
return primary_last_seen;
|
||||
return upstream_last_seen;
|
||||
}
|
||||
|
||||
|
||||
@@ -5370,7 +5558,9 @@ get_default_bdr_replication_set(PGconn *conn)
|
||||
/* For BDR2, we use a custom replication set */
|
||||
namelen = strlen(BDR2_REPLICATION_SET_NAME);
|
||||
default_replication_set = pg_malloc0(namelen + 1);
|
||||
strncpy(default_replication_set, BDR2_REPLICATION_SET_NAME, namelen);
|
||||
snprintf(default_replication_set,
|
||||
namelen + 1,
|
||||
"%s", BDR2_REPLICATION_SET_NAME);
|
||||
|
||||
return default_replication_set;
|
||||
}
|
||||
@@ -5400,7 +5590,9 @@ get_default_bdr_replication_set(PGconn *conn)
|
||||
namelen = strlen(PQgetvalue(res, 0, 0));
|
||||
default_replication_set = pg_malloc0(namelen + 1);
|
||||
|
||||
strncpy(default_replication_set, PQgetvalue(res, 0, 0), namelen);
|
||||
snprintf(default_replication_set,
|
||||
namelen,
|
||||
"%s", PQgetvalue(res, 0, 0));
|
||||
|
||||
PQclear(res);
|
||||
|
||||
@@ -5621,7 +5813,9 @@ get_bdr_other_node_name(PGconn *conn, int node_id, char *node_name)
|
||||
|
||||
if (PQresultStatus(res) == PGRES_TUPLES_OK)
|
||||
{
|
||||
strncpy(node_name, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
snprintf(node_name,
|
||||
NAMEDATALEN,
|
||||
"%s", PQgetvalue(res, 0, 0));
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -5804,12 +5998,12 @@ _populate_bdr_node_records(PGresult *res, BdrNodeInfoList *node_list)
|
||||
static void
|
||||
_populate_bdr_node_record(PGresult *res, t_bdr_node_info *node_info, int row)
|
||||
{
|
||||
strncpy(node_info->node_sysid, PQgetvalue(res, row, 0), MAXLEN);
|
||||
snprintf(node_info->node_sysid, sizeof(node_info->node_sysid), "%s", PQgetvalue(res, row, 0));
|
||||
node_info->node_timeline = atoi(PQgetvalue(res, row, 1));
|
||||
node_info->node_dboid = atoi(PQgetvalue(res, row, 2));
|
||||
strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN);
|
||||
strncpy(node_info->node_local_dsn, PQgetvalue(res, row, 4), MAXLEN);
|
||||
strncpy(node_info->peer_state_name, PQgetvalue(res, row, 5), MAXLEN);
|
||||
snprintf(node_info->node_name, sizeof(node_info->node_name), "%s", PQgetvalue(res, row, 3));
|
||||
snprintf(node_info->node_local_dsn, sizeof(node_info->node_local_dsn), "%s", PQgetvalue(res, row, 4));
|
||||
snprintf(node_info->peer_state_name, sizeof(node_info->peer_state_name), "%s", PQgetvalue(res, row, 5));
|
||||
}
|
||||
|
||||
|
||||
|
||||
19
dbutils.h
19
dbutils.h
@@ -134,8 +134,8 @@ typedef struct s_node_info
|
||||
int node_id;
|
||||
int upstream_node_id;
|
||||
t_server_type type;
|
||||
char node_name[MAXLEN];
|
||||
char upstream_node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
char upstream_node_name[NAMEDATALEN];
|
||||
char conninfo[MAXLEN];
|
||||
char repluser[NAMEDATALEN];
|
||||
char location[MAXLEN];
|
||||
@@ -302,12 +302,14 @@ typedef struct BdrNodeInfoList
|
||||
typedef struct
|
||||
{
|
||||
char current_timestamp[MAXLEN];
|
||||
bool in_recovery;
|
||||
XLogRecPtr last_wal_receive_lsn;
|
||||
XLogRecPtr last_wal_replay_lsn;
|
||||
char last_xact_replay_timestamp[MAXLEN];
|
||||
int replication_lag_time;
|
||||
bool receiving_streamed_wal;
|
||||
bool wal_replay_paused;
|
||||
int upstream_last_seen;
|
||||
} ReplInfo;
|
||||
|
||||
typedef struct
|
||||
@@ -414,6 +416,8 @@ bool set_config_bool(PGconn *conn, const char *config_param, bool state);
|
||||
int guc_set(PGconn *conn, const char *parameter, const char *op, const char *value);
|
||||
int guc_set_typed(PGconn *conn, const char *parameter, const char *op, const char *value, const char *datatype);
|
||||
bool get_pg_setting(PGconn *conn, const char *setting, char *output);
|
||||
bool alter_system_int(PGconn *conn, const char *name, int value);
|
||||
bool pg_reload_conf(PGconn *conn);
|
||||
|
||||
/* server information functions */
|
||||
bool get_cluster_size(PGconn *conn, char *size);
|
||||
@@ -435,6 +439,7 @@ pid_t repmgrd_get_pid(PGconn *conn);
|
||||
bool repmgrd_is_running(PGconn *conn);
|
||||
bool repmgrd_is_paused(PGconn *conn);
|
||||
bool repmgrd_pause(PGconn *conn, bool pause);
|
||||
pid_t get_wal_receiver_pid(PGconn *conn);
|
||||
|
||||
/* extension functions */
|
||||
ExtensionStatus get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions);
|
||||
@@ -509,12 +514,14 @@ bool get_tablespace_name_by_location(PGconn *conn, const char *location, char *
|
||||
|
||||
/* asynchronous query functions */
|
||||
bool cancel_query(PGconn *conn, int timeout);
|
||||
int wait_connection_availability(PGconn *conn, long long timeout);
|
||||
int wait_connection_availability(PGconn *conn, int timeout);
|
||||
|
||||
/* node availability functions */
|
||||
bool is_server_available(const char *conninfo);
|
||||
bool is_server_available_quiet(const char *conninfo);
|
||||
bool is_server_available_params(t_conninfo_param_list *param_list);
|
||||
ExecStatusType connection_ping(PGconn *conn);
|
||||
ExecStatusType connection_ping_reconnect(PGconn *conn);
|
||||
|
||||
/* monitoring functions */
|
||||
void
|
||||
@@ -549,12 +556,12 @@ XLogRecPtr get_primary_current_lsn(PGconn *conn);
|
||||
XLogRecPtr get_node_current_lsn(PGconn *conn);
|
||||
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
|
||||
void init_replication_info(ReplInfo *replication_info);
|
||||
bool get_replication_info(PGconn *conn, ReplInfo *replication_info);
|
||||
bool get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info);
|
||||
int get_replication_lag_seconds(PGconn *conn);
|
||||
void get_node_replication_stats(PGconn *conn, t_node_info *node_info);
|
||||
bool is_downstream_node_attached(PGconn *conn, char *node_name);
|
||||
void set_primary_last_seen(PGconn *conn);
|
||||
int get_primary_last_seen(PGconn *conn);
|
||||
void set_upstream_last_seen(PGconn *conn);
|
||||
int get_upstream_last_seen(PGconn *conn, t_server_type node_type);
|
||||
bool is_wal_replay_paused(PGconn *conn, bool check_pending_wal);
|
||||
|
||||
/* BDR functions */
|
||||
|
||||
20
dirutil.c
20
dirutil.c
@@ -276,6 +276,8 @@ is_pg_running(const char *path)
|
||||
log_warning(_("invalid data in PostgreSQL PID file \"%s\""), path);
|
||||
}
|
||||
|
||||
fclose(pidf);
|
||||
|
||||
return PG_DIR_NOT_RUNNING;
|
||||
}
|
||||
|
||||
@@ -334,6 +336,15 @@ create_pg_dir(const char *path, bool force)
|
||||
{
|
||||
log_notice(_("-F/--force provided - deleting existing data directory \"%s\""), path);
|
||||
nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
||||
|
||||
/* recreate the directory ourselves to ensure permissions are correct */
|
||||
if (!create_dir(path))
|
||||
{
|
||||
log_error(_("unable to create directory \"%s\"..."),
|
||||
path);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -345,6 +356,15 @@ create_pg_dir(const char *path, bool force)
|
||||
{
|
||||
log_notice(_("deleting existing directory \"%s\""), path);
|
||||
nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
||||
|
||||
/* recreate the directory ourselves to ensure permissions are correct */
|
||||
if (!create_dir(path))
|
||||
{
|
||||
log_error(_("unable to create directory \"%s\"..."),
|
||||
path);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
||||
@@ -61,7 +61,7 @@ clean:
|
||||
|
||||
maintainer-clean:
|
||||
rm -rf html
|
||||
rm -rf Makefile
|
||||
rm -f Makefile
|
||||
|
||||
zip: html
|
||||
cp -r html repmgr-docs-$(REPMGR_VERSION)
|
||||
|
||||
@@ -100,8 +100,7 @@
|
||||
and recloning standbys from this.
|
||||
</para>
|
||||
<para>
|
||||
To minimize downtime during major upgrades, for more recent PostgreSQL
|
||||
versions (PostgreSQL 9.4 and later),
|
||||
To minimize downtime during major upgrades from PostgreSQL 9.4 and later,
|
||||
<ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
|
||||
can be used to set up a parallel cluster using the newer PostgreSQL version,
|
||||
which can be kept in sync with the existing production cluster until the
|
||||
|
||||
@@ -481,28 +481,6 @@ repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
|
||||
|
||||
<sect2 id="packages-old-versions-rhel-centos" xreflabel="old RHEL/CentOS package versions">
|
||||
<title>RHEL/CentOS</title>
|
||||
<para>
|
||||
Old RPM packages (<literal>3.2</literal> and later) can be retrieved from the
|
||||
(deprecated) 2ndQuadrant repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/">http://packages.2ndquadrant.com/</ulink>
|
||||
by installing the appropriate repository RPM:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
Old versions can be located with e.g.:
|
||||
@@ -520,6 +498,32 @@ repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
|
||||
yum install repmgr96-4.0.6-1.rhel6</programlisting>
|
||||
</para>
|
||||
|
||||
<sect3 id="packages-old-versions-rhel-centos-repmgr3">
|
||||
<title>repmgr 3 packages</title>
|
||||
<para>
|
||||
Old &repmgr; 3 RPM packages (<literal>3.2</literal> and later) can be retrieved from the
|
||||
(deprecated) 2ndQuadrant repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum/">http://packages.2ndquadrant.com/repmgr/yum/</ulink>
|
||||
by installing the appropriate repository RPM:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</sect3>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
|
||||
@@ -14,30 +14,64 @@
|
||||
<para>
|
||||
See also: <xref linkend="upgrading-repmgr">
|
||||
</para>
|
||||
<sect1 id="release-4.3.1">
|
||||
<title>Release 4.3.1</title>
|
||||
<para><emphasis>??? December ??, 2019</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.3.1 is a minor release.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>:
|
||||
ensure an existing replication slot is not deleted if the
|
||||
follow target is the node's current upstream.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="release-4.3">
|
||||
<title>Release 4.3</title>
|
||||
<para><emphasis>Mar ???, 2019</emphasis></para>
|
||||
<para><emphasis>Tue April 2, 2019</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.3 is a major release.
|
||||
</para>
|
||||
<para>
|
||||
For details on how to upgrade an existing &repmgr; instrallation, see
|
||||
documentation section <link linkend="upgrading-major-version">Upgrading a major version release</link>.
|
||||
</para>
|
||||
<para>
|
||||
If <application>repmgrd</application> is in use, a PostgreSQL restart <emphasis>is</emphasis> required;
|
||||
in that case we suggest combining this &repmgr; upgrade with the next PostgreSQL
|
||||
minor release, which will require a PostgreSQL restart in any case.
|
||||
</para>
|
||||
|
||||
|
||||
<important>
|
||||
<para>
|
||||
On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
|
||||
please ensure that in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
|
||||
<varname>REPMGRD_OPTS</varname> contains "<literal>--daemonize=false</literal>", e.g.:
|
||||
<programlisting>
|
||||
On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
|
||||
please ensure that in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
|
||||
<varname>REPMGRD_OPTS</varname> contains "<literal>--daemonize=false</literal>", e.g.:
|
||||
<programlisting>
|
||||
# additional options
|
||||
REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd daemon configuration on Debian/Ubuntu</link>.
|
||||
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd configuration on Debian/Ubuntu</link>.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<sect2>
|
||||
<title>repmgr enhancements</title>
|
||||
<title>repmgr client enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
@@ -72,9 +106,9 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
For these commands to work reliably, the configuration file settings
|
||||
These commands require the configuration file settings
|
||||
<varname>repmgrd_service_start_command</varname> and <varname>repmgrd_service_stop_command</varname>
|
||||
should be set in <filename>repmgr.conf</filename>.
|
||||
in <filename>repmgr.conf</filename> to be set.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
@@ -82,8 +116,8 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>
|
||||
displays the interval (in seconds) since the <application>repmgrd</application> instance
|
||||
last verified its upstream node was available.
|
||||
additionally displays the node priority and the interval (in seconds) since the
|
||||
<application>repmgrd</application> instance last verified its upstream node was available.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
@@ -99,11 +133,11 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||
differentiate between unreachable nodes and nodes which are running but rejecting connections.
|
||||
differentiate between unreachable nodes and nodes which are running but rejecting connections.
|
||||
</para>
|
||||
<para>
|
||||
This makes it possible to see whether a node is unreachable at network level,
|
||||
or if it is running but rejecting connections for some reason.
|
||||
This makes it possible to see whether a node is unreachable at network level,
|
||||
or if it is running but rejecting connections for some reason.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
@@ -132,7 +166,7 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add check <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>
|
||||
Add check to <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>
|
||||
to ensure the data directory on the demotion candidate is configured correctly in <filename>repmgr.conf</filename>.
|
||||
This is to ensure that &repmgr;, when remotely executed on the demotion candidate, can correctly verify
|
||||
that PostgreSQL on the demotion candidate was shut down cleanly. GitHub #523.
|
||||
@@ -154,10 +188,45 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
<application>repmgrd</application> will no longer consider nodes where <application>repmgrd</application>
|
||||
is not running as promotion candidates.
|
||||
</para>
|
||||
<para>
|
||||
Previously, if <application>repmgrd</application> was not running on a node, but
|
||||
that node qualified as the promotion candidate, it would never be promoted due to
|
||||
the absence of a running <application>repmgrd</application>.
|
||||
<para>
|
||||
Previously, if <application>repmgrd</application> was not running on a node, but
|
||||
that node qualified as the promotion candidate, it would never be promoted due to
|
||||
the absence of a running <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add option <option>connection_check_type</option> to enable selection of the method
|
||||
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||
</para>
|
||||
<para>
|
||||
Possible values are <literal>ping</literal> (default; uses <command>PQping()</command> to
|
||||
determine server availability), <literal>connection</literal> (attempts to make a new connection to
|
||||
the upstream node), and <literal>query</literal> (determines server availability
|
||||
by executing an SQL statement on the node via the existing connection).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
New configuration option <link linkend="repmgrd-failover-validation"><option>failover_validation_command</option></link>
|
||||
to allow an external mechanism to validate the failover decision made by <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
New configuration option <link linkend="repmgrd-standby-disconnection-on-failover"><option>standby_disconnect_on_failover</option></link>
|
||||
to force standbys to disconnect their WAL receivers before making a failover decision.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
In a failover situation, <application>repmgrd</application> will not attempt to promote a
|
||||
node if another primary has already appeared (e.g. by being promoted manually).
|
||||
GitHub #420.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
@@ -170,6 +239,35 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||
fix display of node IDs with multiple digits.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
ensure <command><link linkend="repmgr-primary-unregister">repmgr primary unregister</link></command>
|
||||
behaves correctly when executed on a witness server. GitHub #548.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
ensure <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>
|
||||
fails when <option>--upstream-node-id</option> is the same as the local node ID.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <link linkend="repmgr-standby-clone"><command>repmgr standby clone</command></link>,
|
||||
recheck primary/upstream connection(s) after the data copy operation is complete, as these may
|
||||
have gone away.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>,
|
||||
@@ -180,8 +278,8 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>,
|
||||
check that the node connected to is actually the primary (i.e. not the witness server). GitHub #528.
|
||||
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
verify the standby (promotion candidate) is currently attached to the primary (demotion candidate). GitHub #519.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
@@ -189,18 +287,26 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
<para>
|
||||
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
avoid a potential race condition when comparing received WAL on the standby to the primary's shutdown location,
|
||||
as the standby's walreceiver may not have yet flushed all received WAL to disk. GitHub #518.
|
||||
as the standby's walreceiver may not have yet flushed all received WAL to disk. GitHub #518.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
verify the standby (promotion candidate) is currently attached to the primary (demotion candidate). GitHub #519.
|
||||
&repmgr;: when executing <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>,
|
||||
check that the node connected to is actually the primary (i.e. not the witness server). GitHub #528.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-node-check">repmgr node check</link></command>
|
||||
will only consider physical replication slots, as the purpose
|
||||
of slot checks is to warn about potential issues with
|
||||
streaming replication standbys which are no longer attached.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: on a cascaded standby, don't fail over if
|
||||
@@ -208,29 +314,6 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||
fix display of node IDs with multiple digits.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
ensure <command><link linkend="repmgr-primary-unregister">repmgr primary unregister</link></command>
|
||||
behaves correctly when executed on a witness server. GitHub #548.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-node-check">repmgr node check</link></command>
|
||||
will only consider physical replication slots, as the purpose
|
||||
of slot checks is to warn about potential issues with
|
||||
streaming replication standbys which are no longer attached.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
@@ -255,15 +338,15 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
|
||||
please ensure that in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
|
||||
<varname>REPMGRD_OPTS</varname> contains "<literal>--daemonize=false</literal>", e.g.:
|
||||
<programlisting>
|
||||
On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
|
||||
please ensure that in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
|
||||
<varname>REPMGRD_OPTS</varname> contains "<literal>--daemonize=false</literal>", e.g.:
|
||||
<programlisting>
|
||||
# additional options
|
||||
REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd daemon configuration on Debian/Ubuntu</link>.
|
||||
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd daemon configuration on Debian/Ubuntu</link>.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
|
||||
@@ -39,6 +39,10 @@
|
||||
called <varname>standby1</varname> (for example), things will be confusing
|
||||
to say the least.
|
||||
</para>
|
||||
<para>
|
||||
The string's maximum length is 63 characters and it should
|
||||
contain only printable ASCII characters.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
<sect1 id="configuration-file" xreflabel="configuration file location">
|
||||
<sect1 id="configuration-file" xreflabel="configuration file">
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>location</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>configuration</primary>
|
||||
<secondary>repmgr.conf location</secondary>
|
||||
<secondary>repmgr.conf</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Configuration file location</title>
|
||||
<title>Configuration file</title>
|
||||
|
||||
<para>
|
||||
<application>repmgr</application> and <application>repmgrd</application>
|
||||
use a common configuration file, by default called
|
||||
@@ -21,6 +21,55 @@
|
||||
for more details.
|
||||
</para>
|
||||
|
||||
<sect2 id="configuration-file-format" xreflabel="configuration file format">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>format</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Configuration file format</title>
|
||||
|
||||
<para>
|
||||
<filename>repmgr.conf</filename> is a plain text file with one parameter/value
|
||||
combination per line.
|
||||
</para>
|
||||
<para>
|
||||
Whitespace is insignificant (except within a quoted parameter value) and blank lines are ignored.
|
||||
Hash marks (<literal>#</literal>) designate the remainder of the line as a comment.
|
||||
Parameter values that are not simple identifiers or numbers should be single-quoted.
|
||||
Note that single quote cannot be embedded in a parameter value.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
&repmgr; will interpret double-quotes as being part of a string value; only use single quotes
|
||||
to quote parameter values.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<para>
|
||||
Example of a valid <filename>repmgr.conf</filename> file:
|
||||
<programlisting>
|
||||
# repmgr.conf
|
||||
|
||||
node_id=1
|
||||
node_name= node1
|
||||
conninfo ='host=node1 dbname=repmgr user=repmgr connect_timeout=2'
|
||||
data_directory = /var/lib/pgsql/11/data</programlisting>
|
||||
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
|
||||
<sect2 id="configuration-file-location" xreflabel="configuration file location">
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>location</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Configuration file location</title>
|
||||
|
||||
<para>
|
||||
The configuration file will be searched for in the following locations:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
@@ -50,7 +99,7 @@
|
||||
Note that if a file is explicitly specified with <literal>-f/--config-file</literal>,
|
||||
an error will be raised if it is not found or not readable, and no attempt will be made to
|
||||
check default locations; this is to prevent <application>repmgr</application> unexpectedly
|
||||
reading the wrong configuration file.
|
||||
reading the wrong configuration file.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
@@ -65,5 +114,7 @@
|
||||
to <filename>/path/to/./repmgr.conf</filename>, whereas you'd normally write
|
||||
<filename>/path/to/repmgr.conf</filename>).
|
||||
</para>
|
||||
</note>
|
||||
</sect1>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
@@ -1,93 +0,0 @@
|
||||
<chapter id="using-witness-server">
|
||||
<indexterm>
|
||||
<primary>witness server</primary>
|
||||
<seealso>Using a witness server with repmgrd</seealso>
|
||||
</indexterm>
|
||||
|
||||
|
||||
<title>Using a witness server</title>
|
||||
<para>
|
||||
A <xref linkend="witness-server"> is a normal PostgreSQL instance which
|
||||
is not part of the streaming replication cluster; its purpose is, if a
|
||||
failover situation occurs, to provide proof that the primary server
|
||||
itself is unavailable.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A typical use case for a witness server is a two-node streaming replication
|
||||
setup, where the primary and standby are in different locations (data centres).
|
||||
By creating a witness server in the same location (data centre) as the primary,
|
||||
if the primary becomes unavailable it's possible for the standby to decide whether
|
||||
it can promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
witness or the primary server, it's likely there's a network-level interruption
|
||||
and it should not promote itself. If it can see the witness but not the primary,
|
||||
this proves there is no network interruption and the primary itself is unavailable,
|
||||
and it can therefore promote itself (and ideally take action to fence the
|
||||
former primary).
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Never</emphasis> install a witness server on the same physical host
|
||||
as another node in the replication cluster managed by &repmgr; - it's essential
|
||||
the witness is not affected in any way by failure of another node.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
For more complex replication scenarios, e.g. with multiple data centres, it may
|
||||
be preferable to use location-based failover, which ensures that only nodes
|
||||
in the same location as the primary will ever be promotion candidates;
|
||||
see <xref linkend="repmgrd-network-split"> for more details.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
A witness server will only be useful if <application>repmgrd</application>
|
||||
is in use.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<sect1 id="creating-witness-server">
|
||||
<title>Creating a witness server</title>
|
||||
<para>
|
||||
To create a witness server, set up a normal PostgreSQL instance on a server
|
||||
in the same physical location as the cluster's primary server.
|
||||
</para>
|
||||
<para>
|
||||
This instance should *not* be on the same physical host as the primary server,
|
||||
as otherwise if the primary server fails due to hardware issues, the witness
|
||||
server will be lost too.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
&repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
|
||||
command, which would automatically create a PostgreSQL instance. However
|
||||
this often resulted in an unsatisfactory, hard-to-customise instance.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
The witness server should be configured in the same way as a normal
|
||||
&repmgr; node; see section <xref linkend="configuration">.
|
||||
</para>
|
||||
<para>
|
||||
Register the witness server with <xref linkend="repmgr-witness-register">.
|
||||
This will create the &repmgr; extension on the witness server, and make
|
||||
a copy of the &repmgr; metadata.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
As the witness server is not part of the replication cluster, further
|
||||
changes to the &repmgr; metadata will be synchronised by
|
||||
<application>repmgrd</application>.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
Once the witness server has been configured, <application>repmgrd</application>
|
||||
should be started; for more details see <xref linkend="repmgrd-witness-server">.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
</chapter>
|
||||
@@ -88,7 +88,7 @@
|
||||
|
||||
<para>
|
||||
The values provided for <literal>%t</literal> and <literal>%d</literal>
|
||||
will probably contain spaces, so should be quoted in the provided command
|
||||
may contain spaces, so should be quoted in the provided command
|
||||
configuration, e.g.:
|
||||
<programlisting>
|
||||
event_notification_command='/path/to/some/script %n %e %s "%t" "%d"'
|
||||
|
||||
@@ -45,21 +45,14 @@
|
||||
<!ENTITY promoting-standby SYSTEM "promoting-standby.sgml">
|
||||
<!ENTITY follow-new-primary SYSTEM "follow-new-primary.sgml">
|
||||
<!ENTITY switchover SYSTEM "switchover.sgml">
|
||||
<!ENTITY configuring-witness-server SYSTEM "configuring-witness-server.sgml">
|
||||
|
||||
<!ENTITY event-notifications SYSTEM "event-notifications.sgml">
|
||||
<!ENTITY upgrading-repmgr SYSTEM "upgrading-repmgr.sgml">
|
||||
|
||||
<!ENTITY repmgrd-overview SYSTEM "repmgrd-overview.sgml">
|
||||
<!ENTITY repmgrd-automatic-failover SYSTEM "repmgrd-automatic-failover.sgml">
|
||||
<!ENTITY repmgrd-configuration SYSTEM "repmgrd-configuration.sgml">
|
||||
<!ENTITY repmgrd-demonstration SYSTEM "repmgrd-demonstration.sgml">
|
||||
<!ENTITY repmgrd-monitoring SYSTEM "repmgrd-monitoring.sgml">
|
||||
<!ENTITY repmgrd-degraded-monitoring SYSTEM "repmgrd-degraded-monitoring.sgml">
|
||||
<!ENTITY repmgrd-cascading-replication SYSTEM "repmgrd-cascading-replication.sgml">
|
||||
<!ENTITY repmgrd-network-split SYSTEM "repmgrd-network-split.sgml">
|
||||
<!ENTITY repmgrd-witness-server SYSTEM "repmgrd-witness-server.sgml">
|
||||
<!ENTITY repmgrd-pausing SYSTEM "repmgrd-pausing.sgml">
|
||||
<!ENTITY repmgrd-notes SYSTEM "repmgrd-notes.sgml">
|
||||
<!ENTITY repmgrd-operation SYSTEM "repmgrd-operation.sgml">
|
||||
<!ENTITY repmgrd-bdr SYSTEM "repmgrd-bdr.sgml">
|
||||
|
||||
<!ENTITY repmgr-primary-register SYSTEM "repmgr-primary-register.sgml">
|
||||
|
||||
@@ -61,28 +61,28 @@ deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisti
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>llibedit-dev</literal></simpara>
|
||||
<simpara><literal>libedit-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibkrb5-dev</literal></simpara>
|
||||
<simpara><literal>libkrb5-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibpam0g-dev</literal></simpara>
|
||||
<simpara><literal>libpam0g-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibreadline-dev</literal></simpara>
|
||||
<simpara><literal>libreadline-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibselinux1-dev</literal></simpara>
|
||||
<simpara><literal>libselinux1-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibssl-dev</literal></simpara>
|
||||
<simpara><literal>libssl-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibxml2-dev</literal></simpara>
|
||||
<simpara><literal>libxml2-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibxslt1-dev</literal></simpara>
|
||||
<simpara><literal>libxslt1-dev</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
@@ -136,6 +136,16 @@ deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisti
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<tip>
|
||||
<para>
|
||||
If building against PostgreSQL 11 or later configured with the <option>--with-llvm</option> option
|
||||
(this is the case with the PGDG-provided packages) you'll also need to install the
|
||||
<literal>llvm-toolset-7-clang</literal> package. This is available via the
|
||||
<ulink url="https://wiki.centos.org/AdditionalResources/Repositories/SCL">Software Collections (SCL) Repository</ulink>.
|
||||
</para>
|
||||
</tip>
|
||||
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
@@ -76,19 +76,25 @@
|
||||
</para>
|
||||
<programlisting>
|
||||
|
||||
# Enable replication connections; set this figure to at least one more
|
||||
# Enable replication connections; set this value to at least one more
|
||||
# than the number of standbys which will connect to this server
|
||||
# (note that repmgr will execute `pg_basebackup` in WAL streaming mode,
|
||||
# which requires two free WAL senders)
|
||||
# (note that repmgr will execute "pg_basebackup" in WAL streaming mode,
|
||||
# which requires two free WAL senders).
|
||||
#
|
||||
# See: https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-MAX-WAL-SENDERS
|
||||
|
||||
max_wal_senders = 10
|
||||
|
||||
# Enable replication slots; set this figure to at least one more
|
||||
# If using replication slots, set this value to at least one more
|
||||
# than the number of standbys which will connect to this server.
|
||||
# Note that repmgr will only make use of replication slots if
|
||||
# "use_replication_slots" is set to "true" in repmgr.conf
|
||||
# "use_replication_slots" is set to "true" in "repmgr.conf".
|
||||
# (If you are not intending to use replication slots, this value
|
||||
# can be set to "0").
|
||||
#
|
||||
# See: https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-MAX-REPLICATION-SLOTS
|
||||
|
||||
max_replication_slots = 0
|
||||
max_replication_slots = 10
|
||||
|
||||
# Ensure WAL files contain enough information to enable read-only queries
|
||||
# on the standby.
|
||||
@@ -103,24 +109,31 @@
|
||||
|
||||
# Enable read-only queries on a standby
|
||||
# (Note: this will be ignored on a primary but we recommend including
|
||||
# it anyway)
|
||||
# it anyway, in case the primary later becomes a standby)
|
||||
#
|
||||
# See: https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY
|
||||
|
||||
hot_standby = on
|
||||
|
||||
# Enable WAL file archiving
|
||||
#
|
||||
# See: https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-ARCHIVE-MODE
|
||||
|
||||
archive_mode = on
|
||||
|
||||
# Set archive command to a script or application that will safely store
|
||||
# your WALs in a secure place. /bin/true is an example of a command that
|
||||
# ignores archiving. Use something more sensible.
|
||||
# Set archive command to a dummy command; this can later be changed without
|
||||
# needing to restart the PostgreSQL instance.
|
||||
#
|
||||
# See: https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-ARCHIVE-COMMAND
|
||||
|
||||
archive_command = '/bin/true'
|
||||
</programlisting>
|
||||
<tip>
|
||||
<simpara>
|
||||
Rather than editing these settings in the default <filename>postgresql.conf</filename>
|
||||
file, create a separate file such as <filename>postgresql.replication.conf</filename> and
|
||||
file, create a separate file such as <filename>postgresql.replication.conf</filename> and
|
||||
include it from the end of the main configuration file with:
|
||||
<command>include 'postgresql.replication.conf'</command>.
|
||||
<command>include 'postgresql.replication.conf'</command>.
|
||||
</simpara>
|
||||
</tip>
|
||||
<para>
|
||||
@@ -129,7 +142,8 @@
|
||||
<varname>wal_log_hints</varname>; for more details see <xref linkend="repmgr-node-rejoin-pg-rewind">.
|
||||
</para>
|
||||
<para>
|
||||
See also the <link linkend="configuration-postgresql">PostgreSQL configuration</link> section in the <link linkend="configuration">repmgr configuration guide</link>.
|
||||
See also the <link linkend="configuration-postgresql">PostgreSQL configuration</link> section in the
|
||||
<link linkend="configuration">repmgr configuration guide</link>.
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
|
||||
@@ -196,11 +196,31 @@
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_CONFIG (1)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
An issue was encountered while attempting to retrieve
|
||||
&repmgr; metadata.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_DB_CONN (6)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr; was unable to connect to the local PostgreSQL instance.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_NODE_STATUS (25)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more issues were detected.
|
||||
One or more issues were detected with the replication configuration,
|
||||
e.g. a node was not in its expected state.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
@@ -33,7 +33,10 @@
|
||||
<command>repmgr daemon status</command> can be executed on any active node in the
|
||||
replication cluster. A valid <filename>repmgr.conf</filename> file is required.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
If PostgreSQL is not running on a node, &repmgr; will not be able to determine the
|
||||
status of that node's <application>repmgrd</application> instance.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
After restarting PostgreSQL on any node, the <application>repmgrd</application> instance
|
||||
@@ -87,60 +90,60 @@
|
||||
<varlistentry>
|
||||
<term><option>--csv</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr daemon status</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts, e.g.:
|
||||
<programlisting>
|
||||
<para>
|
||||
<command>repmgr daemon status</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon status --csv
|
||||
1,node1,primary,1,1,5722,1,100,-1
|
||||
2,node2,standby,1,0,-1,1,100,1
|
||||
3,node3,standby,1,1,5779,1,100,1</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<listitem>
|
||||
<simpara>
|
||||
node name
|
||||
</simpara>
|
||||
</listitem>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<listitem>
|
||||
<simpara>
|
||||
node type (primary or standby)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<listitem>
|
||||
<simpara>
|
||||
PostgreSQL server running (1 = running, 0 = not running)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> running (1 = running, 0 = not running)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> running (1 = running, 0 = not running, -1 = unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> PID (-1 if not running)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> PID (-1 if not running or status unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> paused (1 = paused, 0 = not paused)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> paused (1 = paused, 0 = not paused, -1 = unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
@@ -150,25 +153,25 @@
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
interval in seconds since the node's upstream was last seen
|
||||
interval in seconds since the node's upstream was last seen (this will be -1 if the value could not be retrieved, or the node is primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--verbose</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Display the full text of any database connection error messages
|
||||
Display the full text of any database connection error messages
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -99,7 +99,7 @@
|
||||
</indexterm>
|
||||
<simpara>
|
||||
<literal>promote_check_interval</literal>:
|
||||
interval (in seconds, default: 1 seconds) to wait between each check
|
||||
interval (in seconds, default: 1 second) to wait between each check
|
||||
to determine whether the standby has been promoted.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
@@ -22,10 +22,10 @@
|
||||
passwordless SSH connection to the current primary.
|
||||
</para>
|
||||
<para>
|
||||
If other standbys are connected to the demotion candidate, &repmgr; can instruct
|
||||
If other nodes are connected to the demotion candidate, &repmgr; can instruct
|
||||
these to follow the new primary if the option <literal>--siblings-follow</literal>
|
||||
is specified. This requires a passwordless SSH connection between the promotion
|
||||
candidate (new primary) and the standbys attached to the demotion candidate
|
||||
candidate (new primary) and the nodes attached to the demotion candidate
|
||||
(existing primary).
|
||||
</para>
|
||||
<note>
|
||||
@@ -150,8 +150,18 @@
|
||||
<term><option>--siblings-follow</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Have standbys attached to the old primary follow the new primary.
|
||||
Have nodes attached to the old primary follow the new primary.
|
||||
</para>
|
||||
<para>
|
||||
This will also ensure that a witness node, if in use, is updated
|
||||
with the new primary's data.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
In a future &repmgr; release, <option>--siblings-follow</option> will be applied
|
||||
by default.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
%filelist;
|
||||
|
||||
<!ENTITY repmgr "<productname>repmgr</productname>">
|
||||
<!ENTITY repmgrd "<productname>repmgrd</productname>">
|
||||
<!ENTITY postgres "<productname>PostgreSQL</productname>">
|
||||
]>
|
||||
|
||||
@@ -25,25 +26,31 @@
|
||||
<para>
|
||||
This is the official documentation of &repmgr; &repmgrversion; for
|
||||
use with PostgreSQL 9.3 - PostgreSQL 11.
|
||||
It describes the functionality supported by the current version of &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; is being continually developed and we strongly recommend using the
|
||||
latest version. Please check the
|
||||
<ulink url="https://repmgr.org/">repmgr website</ulink> for details
|
||||
about the current &repmgr; version as well as the
|
||||
<ulink url="https://repmgr.org/docs/current/index.html">current repmgr documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
&repmgr; was developed by
|
||||
&repmgr; is developed by
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
||||
along with contributions from other individuals and companies.
|
||||
Contributions from the community are appreciated and welcome - get
|
||||
in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</>
|
||||
or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</>.
|
||||
in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</ulink>
|
||||
or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</ulink>.
|
||||
Multiple 2ndQuadrant customers contribute funding
|
||||
to make repmgr development possible.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
2ndQuadrant, a Platinum sponsor of the PostgreSQL project,
|
||||
continues to develop repmgr to meet internal needs and those of customers.
|
||||
Other companies as well as individual developers
|
||||
are welcome to participate in the efforts.
|
||||
&repmgr; is fully supported by 2ndQuadrant's
|
||||
<ulink url="https://www.2ndquadrant.com/en/support/support-postgresql/">24/7 Production Support</ulink>.
|
||||
2ndQuadrant, a Major Sponsor of the PostgreSQL project, continues to develop and maintain &repmgr;.
|
||||
Other companies as well as individual developers are welcome to participate in the efforts.
|
||||
</para>
|
||||
</abstract>
|
||||
|
||||
@@ -73,23 +80,16 @@
|
||||
&promoting-standby;
|
||||
&follow-new-primary;
|
||||
&switchover;
|
||||
&configuring-witness-server;
|
||||
&event-notifications;
|
||||
&upgrading-repmgr;
|
||||
</part>
|
||||
|
||||
<part id="using-repmgrd">
|
||||
<title>Using repmgrd</title>
|
||||
&repmgrd-overview;
|
||||
&repmgrd-automatic-failover;
|
||||
&repmgrd-configuration;
|
||||
&repmgrd-demonstration;
|
||||
&repmgrd-cascading-replication;
|
||||
&repmgrd-network-split;
|
||||
&repmgrd-witness-server;
|
||||
&repmgrd-pausing;
|
||||
&repmgrd-degraded-monitoring;
|
||||
&repmgrd-monitoring;
|
||||
&repmgrd-notes;
|
||||
&repmgrd-operation;
|
||||
&repmgrd-bdr;
|
||||
</part>
|
||||
|
||||
|
||||
@@ -13,5 +13,285 @@
|
||||
providing monitoring information about the state of each standby.
|
||||
</para>
|
||||
|
||||
<sect1 id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>witness server</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>witness server</primary>
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
<title>Using a witness server</title>
|
||||
<para>
|
||||
A <xref linkend="witness-server"> is a normal PostgreSQL instance which
|
||||
is not part of the streaming replication cluster; its purpose is, if a
|
||||
failover situation occurs, to provide proof that it is the primary server
|
||||
itself which is unavailable, rather than e.g. a network split between
|
||||
different physical locations.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A typical use case for a witness server is a two-node streaming replication
|
||||
setup, where the primary and standby are in different locations (data centres).
|
||||
By creating a witness server in the same location (data centre) as the primary,
|
||||
if the primary becomes unavailable it's possible for the standby to decide whether
|
||||
it can promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
witness or the primary server, it's likely there's a network-level interruption
|
||||
and it should not promote itself. If it can see the witness but not the primary,
|
||||
this proves there is no network interruption and the primary itself is unavailable,
|
||||
and it can therefore promote itself (and ideally take action to fence the
|
||||
former primary).
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Never</emphasis> install a witness server on the same physical host
|
||||
as another node in the replication cluster managed by &repmgr; - it's essential
|
||||
the witness is not affected in any way by failure of another node.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
For more complex replication scenarios, e.g. with multiple datacentres, it may
|
||||
be preferable to use location-based failover, which ensures that only nodes
|
||||
in the same location as the primary will ever be promotion candidates;
|
||||
see <xref linkend="repmgrd-network-split"> for more details.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
A witness server will only be useful if <application>repmgrd</application>
|
||||
is in use.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<sect2 id="creating-witness-server">
|
||||
<title>Creating a witness server</title>
|
||||
<para>
|
||||
To create a witness server, set up a normal PostgreSQL instance on a server
|
||||
in the same physical location as the cluster's primary server.
|
||||
</para>
|
||||
<para>
|
||||
This instance should <emphasis>not</emphasis> be on the same physical host as the primary server,
|
||||
as otherwise if the primary server fails due to hardware issues, the witness
|
||||
server will be lost too.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
&repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
|
||||
command, which would automatically create a PostgreSQL instance. However
|
||||
this often resulted in an unsatisfactory, hard-to-customise instance.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
The witness server should be configured in the same way as a normal
|
||||
&repmgr; node; see section <xref linkend="configuration">.
|
||||
</para>
|
||||
<para>
|
||||
Register the witness server with <xref linkend="repmgr-witness-register">.
|
||||
This will create the &repmgr; extension on the witness server, and make
|
||||
a copy of the &repmgr; metadata.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
As the witness server is not part of the replication cluster, further
|
||||
changes to the &repmgr; metadata will be synchronised by
|
||||
<application>repmgrd</application>.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
Once the witness server has been configured, <application>repmgrd</application>
|
||||
should be started.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>network splits</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>network splits</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Handling network splits with repmgrd</title>
|
||||
<para>
|
||||
A common pattern for replication cluster setups is to spread servers over
|
||||
more than one datacentre. This can provide benefits such as geographically-
|
||||
distributed read replicas and DR (disaster recovery capability). However
|
||||
this also means there is a risk of disconnection at network level between
|
||||
datacentre locations, which would result in a split-brain scenario if
|
||||
servers in a secondary data centre were no longer able to see the primary
|
||||
in the main data centre and promoted a standby among themselves.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; enables provision of "<xref linkend="witness-server">" to
|
||||
artificially create a quorum of servers in a particular location, ensuring
|
||||
that nodes in another location will not elect a new primary if they
|
||||
are unable to see the majority of nodes. However this approach does not
|
||||
scale well, particularly with more complex replication setups, e.g.
|
||||
where the majority of nodes are located outside of the primary datacentre.
|
||||
It also means the <literal>witness</literal> node needs to be managed as an
|
||||
extra PostgreSQL instance outside of the main replication cluster, which
|
||||
adds administrative and programming complexity.
|
||||
</para>
|
||||
<para>
|
||||
<literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
|
||||
each node is associated with an arbitrary location string (default is
|
||||
<literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
|
||||
<programlisting>
|
||||
node_id=1
|
||||
node_name=node1
|
||||
conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
|
||||
data_directory='/var/lib/postgresql/data'
|
||||
location='dc1'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
||||
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
||||
will assume a network interruption and not promote any node in any
|
||||
other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
|
||||
mode until a primary becomes visible).
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-standby-disconnection-on-failover" xreflabel="Standby disconnection on failover">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>standby disconnection on failover</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>standby disconnection on failover</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Standby disconnection on failover</title>
|
||||
<para>
|
||||
If <option>standby_disconnect_on_failover</option> is set to <literal>true</literal> in
|
||||
<filename>repmgr.conf</filename>, in a failover situation <application>repmgrd</application> will forcibly disconnect
|
||||
the local node's WAL receiver before making a failover decision.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<option>standby_disconnect_on_failover</option> is available from PostgreSQL 9.5 and later.
|
||||
Additionally this requires that the <literal>repmgr</literal> database user is a superuser.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
By doing this, it's possible to ensure that, at the point the failover decision is made, no nodes
|
||||
are receiving data from the primary and their LSN location will be static.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
<option>standby_disconnect_on_failover</option> <emphasis>must</emphasis> be set to the same value on
|
||||
all nodes.
|
||||
</para>
|
||||
</important>
|
||||
<para>
|
||||
Note that when using <option>standby_disconnect_on_failover</option> there will be a delay of 5 seconds
|
||||
plus however many seconds it takes to confirm the WAL receiver is disconnected before
|
||||
<application>repmgrd</application> proceeds with the failover decision.
|
||||
</para>
|
||||
<para>
|
||||
Following the failover operation, no matter what the outcome, each node will reconnect its WAL receiver.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-failover-validation" xreflabel="Failover validation">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>failover validation</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>failover validation</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Failover validation</title>
|
||||
<para>
|
||||
From <link linkend="release-4.3">repmgr 4.3</link>, &repmgr; makes it possible to provide a script
|
||||
to <application>repmgrd</application> which, in a failover situation,
|
||||
will be executed by the promotion candidate (the node which has been selected
|
||||
to be the new primary) to confirm whether the node should actually be promoted.
|
||||
</para>
|
||||
<para>
|
||||
To use this, set <option>failover_validation_command</option> in <filename>repmgr.conf</filename>
|
||||
to a script executable by the <literal>postgres</literal> system user, e.g.:
|
||||
<programlisting>
|
||||
failover_validation_command=/path/to/script.sh %n %a</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The <literal>%n</literal> parameter will be replaced with the node ID, and the
|
||||
<literal>%a</literal> parameter will be replaced by the node name when the script is executed.
|
||||
</para>
|
||||
<para>
|
||||
This script must return an exit code of <literal>0</literal> to indicate the node should promote itself.
|
||||
Any other value will result in the promotion being aborted and the election rerun.
|
||||
There is a pause of <option>election_rerun_interval</option> seconds before the election is rerun.
|
||||
</para>
|
||||
<para>
|
||||
Sample <application>repmgrd</application> log file output during which the failover validation
|
||||
script rejects the proposed promotion candidate:
|
||||
<programlisting>
|
||||
[2019-03-13 21:01:30] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
|
||||
[2019-03-13 21:01:30] [NOTICE] promotion candidate is "node2" (ID: 2)
|
||||
[2019-03-13 21:01:30] [NOTICE] executing "failover_validation_command"
|
||||
[2019-03-13 21:01:30] [DETAIL] /usr/local/bin/failover-validation.sh 2
|
||||
[2019-03-13 21:01:30] [INFO] output returned by failover validation command:
|
||||
Node ID: 2
|
||||
|
||||
[2019-03-13 21:01:30] [NOTICE] failover validation command returned a non-zero value: "1"
|
||||
[2019-03-13 21:01:30] [NOTICE] promotion candidate election will be rerun
|
||||
[2019-03-13 21:01:30] [INFO] 1 followers to notify
|
||||
[2019-03-13 21:01:30] [NOTICE] notifying node "node3" (node ID: 3) to rerun promotion candidate selection
|
||||
INFO: node 3 received notification to rerun promotion candidate election
|
||||
[2019-03-13 21:01:30] [NOTICE] rerunning election after 15 seconds ("election_rerun_interval")</programlisting>
|
||||
</para>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="cascading-replication" xreflabel="Cascading replication">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>cascading replication</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>cascading replication</primary>
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd and cascading replication</title>
|
||||
<para>
|
||||
Cascading replication - where a standby can connect to an upstream node and not
|
||||
the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
|
||||
<application>repmgrd</application> support cascading replication by keeping track of the relationship
|
||||
between standby servers - each node record is stored with the node id of its
|
||||
upstream ("parent") server (except of course the primary server).
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation where the primary node fails and a top-level standby
|
||||
is promoted, a standby connected to another standby will not be affected
|
||||
and continue working as normal (even if the upstream standby it's connected
|
||||
to becomes the primary node). If however the node's direct upstream fails,
|
||||
the "cascaded standby" will attempt to reconnect to that node's parent
|
||||
(unless <varname>failover</varname> is set to <literal>manual</literal> in
|
||||
<filename>repmgr.conf</filename>).
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
</chapter>
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
|
||||
<title>BDR failover with repmgrd</title>
|
||||
<para>
|
||||
&repmgr; 4.x provides support for monitoring BDR nodes and taking action in
|
||||
&repmgr; 4.x provides support for monitoring a pair of BDR 2.x nodes and taking action in
|
||||
case one of the nodes fails.
|
||||
</para>
|
||||
<note>
|
||||
@@ -31,8 +31,21 @@
|
||||
reconfigure a proxy server/connection pooler such as <application>PgBouncer</application>.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
This &repmgr; functionality is for BDR 2.x only running on PostgreSQL 9.4/9.6.
|
||||
It is <emphasis>not</emphasis> required for later BDR versions.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<sect1 id="bdr-prerequisites" xreflabel="BDR prerequisites">
|
||||
<title>Prerequisites</title>
|
||||
<important>
|
||||
<para>
|
||||
This &repmgr; functionality is for BDR 2.x only running on PostgreSQL 9.4/9.6.
|
||||
It is <emphasis>not</emphasis> required for later BDR versions.
|
||||
</para>
|
||||
</important>
|
||||
<para>
|
||||
&repmgr; 4 requires PostgreSQL 9.4 or 9.6 with the BDR 2 extension
|
||||
enabled and configured for a two-node BDR network. &repmgr; 4 packages
|
||||
|
||||
@@ -1,24 +0,0 @@
|
||||
<chapter id="repmgrd-cascading-replication">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>cascading replication</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd and cascading replication</title>
|
||||
<para>
|
||||
Cascading replication - where a standby can connect to an upstream node and not
|
||||
the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
|
||||
<application>repmgrd</application> support cascading replication by keeping track of the relationship
|
||||
between standby servers - each node record is stored with the node id of its
|
||||
upstream ("parent") server (except of course the primary server).
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation where the primary node fails and a top-level standby
|
||||
is promoted, a standby connected to another standby will not be affected
|
||||
and continue working as normal (even if the upstream standby it's connected
|
||||
to becomes the primary node). If however the node's direct upstream fails,
|
||||
the "cascaded standby" will attempt to reconnect to that node's parent
|
||||
(unless <varname>failover</varname> is set to <literal>manual</literal> in
|
||||
<filename>repmgr.conf</filename>).
|
||||
</para>
|
||||
</chapter>
|
||||
@@ -5,7 +5,7 @@
|
||||
<secondary>configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd configuration</title>
|
||||
<title>repmgrd setup and configuration</title>
|
||||
|
||||
<para>
|
||||
<application>repmgrd</application> is a daemon which runs on each PostgreSQL node,
|
||||
@@ -20,7 +20,7 @@
|
||||
</para>
|
||||
|
||||
<sect1 id="repmgrd-basic-configuration">
|
||||
<title>repmgrd basic configuration</title>
|
||||
<title>repmgrd configuration</title>
|
||||
|
||||
<para>
|
||||
To use <application>repmgrd</application>, its associated function library <emphasis>must</emphasis> be
|
||||
@@ -34,73 +34,411 @@
|
||||
the <ulink url="https://www.postgresql.org/docs/current/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The following configuration options apply to <application>repmgrd</application> in all circumstances:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>monitor_interval_secs</primary>
|
||||
</indexterm>
|
||||
<term><option>monitor_interval_secs</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The interval (in seconds, default: <literal>2</literal>) to check the availability of the upstream node.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="connection-check-type">
|
||||
|
||||
<indexterm>
|
||||
<primary>connection_check_type</primary>
|
||||
</indexterm>
|
||||
<term><option>connection_check_type</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The option <option>connection_check_type</option> is used to select the method
|
||||
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||
</para>
|
||||
<para>
|
||||
Possible values are:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>ping</literal> (default) - uses <command>PQping()</command> to
|
||||
determine server availability
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>connection</literal> - determines server availability
|
||||
by attempting to make a new connection to the upstream node
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>query</literal> - determines server availability
|
||||
by executing an SQL statement on the node via the existing connection
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>reconnect_attempts</primary>
|
||||
</indexterm>
|
||||
<term><option>reconnect_attempts</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The number of attempts (default: <literal>6</literal>) which will be made to reconnect to an unreachable
|
||||
upstream node before initiating a failover.
|
||||
</para>
|
||||
<para>
|
||||
There will be an interval of <option>reconnect_interval</option> seconds between each reconnection
|
||||
attempt.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>reconnect_interval</primary>
|
||||
</indexterm>
|
||||
<term><option>reconnect_interval</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Interval (in seconds, default: <literal>10</literal>) between attempts to reconnect to an unreachable
|
||||
upstream node.
|
||||
</para>
|
||||
<para>
|
||||
The number of reconnection attempts is defined by the parameter <option>reconnect_attempts</option>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>degraded_monitoring_timeout</primary>
|
||||
</indexterm>
|
||||
<term><option>degraded_monitoring_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Interval (in seconds) after which <application>repmgrd</application> will terminate if
|
||||
either of the servers (local node and or upstream node) being monitored is no longer available
|
||||
(<link linkend="repmgrd-degraded-monitoring">degraded monitoring mode</link>).
|
||||
</para>
|
||||
<para>
|
||||
<literal>-1</literal> (default) disables this timeout completely.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
See also <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename> for an annotated sample configuration file.
|
||||
</para>
|
||||
|
||||
<sect2 id="repmgrd-automatic-failover-configuration">
|
||||
<title>Automatic failover configuration</title>
|
||||
<title>Required configuration for automatic failover</title>
|
||||
|
||||
<para>
|
||||
If using automatic failover, the following <application>repmgrd</application> options *must* be set in
|
||||
<filename>repmgr.conf</filename> :
|
||||
The following <application>repmgrd</application> options <emphasis>must</emphasis> be set in
|
||||
<filename>repmgr.conf</filename>:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><option>failover</option></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><option>promote_command</option></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><option>follow_command</option></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
|
||||
<para>
|
||||
Example:
|
||||
<programlisting>
|
||||
failover=automatic
|
||||
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'
|
||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Adjust file paths as appropriate; always specify the full path to the &repmgr; binary.
|
||||
Details of each option are as follows:
|
||||
</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||
specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
<indexterm>
|
||||
<primary>failover</primary>
|
||||
</indexterm>
|
||||
<term><option>failover</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<option>failover</option> can be one of <literal>automatic</literal> or <literal>manual</literal>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
If <option>failover</option> is set to <literal>manual</literal>, <application>repmgrd</application>
|
||||
will not take any action if a failover situation is detected, and the node may need to
|
||||
be modified manually (e.g. by executing <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>).
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>promote_command</primary>
|
||||
</indexterm>
|
||||
<term><option>promote_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The program or script defined in <option>promote_command</option> will be executed
|
||||
in a failover situation when <application>repmgrd</application> determines that
|
||||
the current node is to become the new primary node.
|
||||
</para>
|
||||
<para>
|
||||
Normally <option>promote_command</option> is set as &repmgr;'s
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> command.
|
||||
</para>
|
||||
<para>
|
||||
It is also possible to provide a shell script to e.g. perform user-defined tasks
|
||||
before promoting the current node. In this case the script <emphasis>must</emphasis>
|
||||
at some point execute <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
||||
to promote the node; if this is not done, &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
<para>
|
||||
Example:
|
||||
<programlisting>
|
||||
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||
specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>follow_command</primary>
|
||||
</indexterm>
|
||||
<term><option>follow_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The program or script defined in <option>follow_command</option> will be executed
|
||||
in a failover situation when <application>repmgrd</application> determines that
|
||||
the current node is to follow the new primary node.
|
||||
</para>
|
||||
<para>
|
||||
Normally <option>follow_command</option> is set as &repmgr;'s
|
||||
<command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command> command.
|
||||
</para>
|
||||
<para>
|
||||
The <option>follow_command</option> parameter
|
||||
should provide the <literal>--upstream-node-id=%n</literal>
|
||||
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
||||
<application>repmgrd</application> with the ID of the new primary node. If this is not provided,
|
||||
<command>repmgr standby follow</command> will attempt to determine the new primary by itself, but if the
|
||||
original primary comes back online after the new primary is promoted, there is a risk that
|
||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||
the original primary.
|
||||
</para>
|
||||
<para>
|
||||
It is also possible to provide a shell script to e.g. perform user-defined tasks
|
||||
before the node follows the new primary. In this case the script <emphasis>must</emphasis>
|
||||
at some point execute <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>
|
||||
to follow the new primary; if this is not done, &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
<para>
|
||||
Example:
|
||||
<programlisting>
|
||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||
specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-automatic-failover-configuration-optional">
|
||||
<title>Optional configuration for automatic failover</title>
|
||||
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
See <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename>
|
||||
for further <application>repmgrd</application>-specific settings.
|
||||
The following configuration options can be used to fine-tune automatic failover:
|
||||
</para>
|
||||
<para>
|
||||
When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure
|
||||
of the current primary, <application>repmgrd</application> will execute one of:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>promote_command</varname> (if the current server is to become the new primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>follow_command</varname> (if the current server needs to follow another server which has
|
||||
become the new primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<note>
|
||||
<para>
|
||||
These commands can be any valid shell script which results in one of these
|
||||
two actions happening, but if &repmgr;'s <command>standby follow</command> or
|
||||
<command>standby promote</command>
|
||||
commands are not executed (either directly as shown here, or from a script which
|
||||
performs other actions), the &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
</note>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>priority</primary>
|
||||
</indexterm>
|
||||
<term><option>priority</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Indicates a preferred priority (default: <literal>100</literal>) for promoting nodes;
|
||||
a value of zero prevents the node being promoted to primary.
|
||||
</para>
|
||||
<para>
|
||||
Note that the priority setting is only applied if two or more nodes are
|
||||
determined as promotion candidates; in that case the node with the
|
||||
higher priority is selected.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>failover_validation_command</primary>
|
||||
</indexterm>
|
||||
<term><option>failover_validation_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
User-defined script to execute for an external mechanism to validate the failover
|
||||
decision made by <application>repmgrd</application>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
This option <emphasis>must</emphasis> be identically configured
|
||||
on all nodes.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
One or both of the following parameter placeholders
|
||||
should be provided, which will be replaced by <application>repmgrd</application> with the appropriate
|
||||
value:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>%n</literal>: node ID</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>%a</literal>: node name</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>
|
||||
See also: <link linkend="repmgrd-failover-validation">Failover validation</link>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>standby_disconnect_on_failover</primary>
|
||||
</indexterm>
|
||||
<term><option>standby_disconnect_on_failover</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
In a failover situation, disconnect the local node's WAL receiver.
|
||||
</para>
|
||||
<para>
|
||||
This option is available from PostgreSQL 9.5 and later.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
This option <emphasis>must</emphasis> be identically configured
|
||||
on all nodes.
|
||||
</para>
|
||||
<para>
|
||||
Additionally the &repmgr; user <emphasis>must</emphasis> be a superuser
|
||||
for this option.
|
||||
</para>
|
||||
<para>
|
||||
<application>repmgrd</application> will refuse to start if this option is set
|
||||
but either of these prerequisites is not met.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
See also: <link linkend="repmgrd-standby-disconnection-on-failover">Standby disconnection on failover</link>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal>
|
||||
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
||||
<application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr;
|
||||
will attempt to determine the new primary by itself, but if the
|
||||
original primary comes back online after the new primary is promoted, there is a risk that
|
||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||
the original primary.
|
||||
The following options can be used to further fine-tune failover behaviour.
|
||||
In practice it's unlikely these will need to be changed from their default
|
||||
values, but are available as configuration options should the need arise.
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>election_rerun_interval</primary>
|
||||
</indexterm>
|
||||
<term><option>election_rerun_interval</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If <option>failover_validation_command</option> is set, and the command returns
|
||||
an error, pause the specified amount of seconds (default: 15) before rerunning the election.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>sibling_nodes_disconnect_timeout</primary>
|
||||
</indexterm>
|
||||
<term><option>sibling_nodes_disconnect_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If <option>standby_disconnect_on_failover</option> is <literal>true</literal>, the
|
||||
maximum length of time (in seconds, default: <literal>30</literal>)
|
||||
to wait for other standbys to confirm they have disconnected their
|
||||
WAL receivers.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="postgresql-service-configuration">
|
||||
@@ -175,10 +513,8 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
||||
in <filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
<para>
|
||||
The default monitoring interval is 2 seconds; this value can be explicitly set using:
|
||||
<programlisting>
|
||||
monitor_interval_secs=<seconds></programlisting>
|
||||
in <filename>repmgr.conf</filename>.
|
||||
Monitoring data is written at the interval defined by
|
||||
the option <option>monitor_interval_secs</option> (see above).
|
||||
</para>
|
||||
<para>
|
||||
For more details on monitoring, see <xref linkend="repmgrd-monitoring">.
|
||||
@@ -228,6 +564,13 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>connection_check_type</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>conninfo</varname>
|
||||
@@ -252,6 +595,12 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>failover_validation_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>failover</varname>
|
||||
@@ -324,12 +673,30 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>retry_promote_interval_secs</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>repmgrd_standby_startup_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>sibling_nodes_disconnect_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>standby_disconnect_on_failover</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
|
||||
@@ -1,83 +0,0 @@
|
||||
<chapter id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>degraded monitoring</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>"degraded monitoring" mode</title>
|
||||
<para>
|
||||
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
||||
of monitoring the node's upstream server. In these cases it enters "degraded monitoring"
|
||||
mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
to be resolved.
|
||||
</para>
|
||||
<para>
|
||||
Situations where this happens are:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, no nodes in the primary node's location are visible</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no primary has become available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
|
||||
and the primary node is unavailable (but is later restarted):
|
||||
<programlisting>
|
||||
[2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
|
||||
[2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
(...)
|
||||
[2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
[2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
|
||||
[2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
|
||||
[2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
|
||||
[2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
|
||||
[2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
|
||||
However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
|
||||
after which <application>repmgrd</application> will terminate.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <application>repmgrd</application> is monitoring a primary node which has been stopped
|
||||
and manually restarted as a standby attached to a new primary, it will automatically detect
|
||||
the status change and update the node record to reflect the node's new status
|
||||
as an active standby. It will then resume monitoring the node as a standby.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</chapter>
|
||||
@@ -1,96 +0,0 @@
|
||||
<chapter id="repmgrd-demonstration">
|
||||
<title>repmgrd demonstration</title>
|
||||
<para>
|
||||
To demonstrate automatic failover, set up a 3-node replication cluster (one primary
|
||||
and two standbys streaming directly from the primary) so that the cluster looks
|
||||
something like this:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show
|
||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
||||
----+-------+---------+-----------+----------+----------+--------------------------------------
|
||||
1 | node1 | primary | * running | | default | host=node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | running | node1 | default | host=node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | host=node3 dbname=repmgr user=repmgr</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Start <application>repmgrd</application> on each standby and verify that it's running by examining the
|
||||
log output, which at log level <literal>INFO</literal> will look like this:
|
||||
<programlisting>
|
||||
[2017-08-24 17:31:00] [NOTICE] using configuration file "/etc/repmgr.conf"
|
||||
[2017-08-24 17:31:00] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr"
|
||||
[2017-08-24 17:31:00] [NOTICE] starting monitoring of node <literal>node2</literal> (ID: 2)
|
||||
[2017-08-24 17:31:00] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Each <application>repmgrd</application> should also have recorded its successful startup as an event:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+---------------+----+---------------------+-------------------------------------------------------------
|
||||
3 | node3 | repmgrd_start | t | 2017-08-24 17:35:54 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
2 | node2 | repmgrd_start | t | 2017-08-24 17:35:50 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
1 | node1 | repmgrd_start | t | 2017-08-24 17:35:46 | monitoring cluster primary "node1" (node ID: 1) </programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Now stop the current primary server with e.g.:
|
||||
<programlisting>
|
||||
pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will force the primary to shut down straight away, aborting all processes
|
||||
and transactions. This will cause a flurry of activity in the <application>repmgrd</application> log
|
||||
files as each <application>repmgrd</application> detects the failure of the primary and a failover
|
||||
decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
|
||||
which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
|
||||
<programlisting>
|
||||
[2017-08-24 23:32:01] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state
|
||||
[2017-08-24 23:32:08] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2017-08-24 23:32:08] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2017-08-24 23:32:08] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:09] [INFO] checking state of node 1, 2 of 5 attempts
|
||||
[2017-08-24 23:32:09] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:10] [INFO] checking state of node 1, 3 of 5 attempts
|
||||
[2017-08-24 23:32:10] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:11] [INFO] checking state of node 1, 4 of 5 attempts
|
||||
[2017-08-24 23:32:11] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:12] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2017-08-24 23:32:12] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
INFO: setting voting term to 1
|
||||
INFO: node 2 is candidate
|
||||
INFO: node 3 has received request from node 2 for electoral term 1 (our term: 0)
|
||||
[2017-08-24 23:32:12] [NOTICE] this node is the winner, will now promote self and inform other nodes
|
||||
INFO: connecting to standby database
|
||||
NOTICE: promoting standby
|
||||
DETAIL: promoting server using 'pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' promote'
|
||||
INFO: reconnecting to promoted server
|
||||
NOTICE: STANDBY PROMOTE successful
|
||||
DETAIL: node 2 was successfully promoted to primary
|
||||
INFO: node 3 received notification to follow node 2
|
||||
[2017-08-24 23:32:13] [INFO] switching to primary monitoring mode</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The cluster status will now look like this, with the original primary (<literal>node1</literal>)
|
||||
marked as inactive, and standby <literal>node3</literal> now following the new primary
|
||||
(<literal>node2</literal>):
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show
|
||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
||||
----+-------+---------+-----------+----------+----------+----------------------------------------------------
|
||||
1 | node1 | primary | - failed | | default | host=node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | primary | * running | | default | host=node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node2 | default | host=node3 dbname=repmgr user=repmgr</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
<command>repmgr cluster event</command> will display a summary of what happened to each server
|
||||
during the failover:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+--------------------------+----+---------------------+-----------------------------------------------------------------------------------
|
||||
3 | node3 | repmgrd_failover_follow | t | 2017-08-24 23:32:16 | node 3 now following new upstream node 2
|
||||
3 | node3 | standby_follow | t | 2017-08-24 23:32:16 | node 3 is now attached to node 2
|
||||
2 | node2 | repmgrd_failover_promote | t | 2017-08-24 23:32:13 | node 2 promoted to primary; old primary 1 marked as failed
|
||||
2 | node2 | standby_promote | t | 2017-08-24 23:32:13 | node 2 was successfully promoted to primary</programlisting>
|
||||
</para>
|
||||
</chapter>
|
||||
@@ -1,80 +0,0 @@
|
||||
<chapter id="repmgrd-monitoring" xreflabel="Monitoring with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>monitoring</primary>
|
||||
<secondary>with repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Monitoring with repmgrd</title>
|
||||
<para>
|
||||
When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
|
||||
it will constantly write standby node status information to the
|
||||
<varname>monitoring_history</varname> table, providing a near-real time
|
||||
overview of replication status on all nodes
|
||||
in the cluster.
|
||||
</para>
|
||||
<para>
|
||||
The view <literal>replication_status</literal> shows the most recent state
|
||||
for each node, e.g.:
|
||||
<programlisting>
|
||||
repmgr=# select * from repmgr.replication_status;
|
||||
-[ RECORD 1 ]-------------+------------------------------
|
||||
primary_node_id | 1
|
||||
standby_node_id | 2
|
||||
standby_name | node2
|
||||
node_type | standby
|
||||
active | t
|
||||
last_monitor_time | 2017-08-24 16:28:41.260478+09
|
||||
last_wal_primary_location | 0/6D57A00
|
||||
last_wal_standby_location | 0/5000000
|
||||
replication_lag | 29 MB
|
||||
replication_time_lag | 00:00:11.736163
|
||||
apply_lag | 15 MB
|
||||
communication_time_lag | 00:00:01.365643</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The interval in which monitoring history is written is controlled by the
|
||||
configuration parameter <varname>monitor_interval_secs</varname>;
|
||||
default is 2.
|
||||
</para>
|
||||
<para>
|
||||
As this can generate a large amount of monitoring data in the table
|
||||
<literal>repmgr.monitoring_history</literal>, it's advisable to regularly
|
||||
purge historical data using the <xref linkend="repmgr-cluster-cleanup">
|
||||
command; use the <literal>-k/--keep-history</literal> option to
|
||||
specify how many days' worth of data should be retained.
|
||||
</para>
|
||||
<para>
|
||||
It's possible to use <application>repmgrd</application> to run in monitoring
|
||||
mode only (without automatic failover capability) for some or all
|
||||
nodes by setting <literal>failover=manual</literal> in the node's
|
||||
<filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
|
||||
no failover action will be taken and the node will require manual intervention to
|
||||
be reattached to replication. If this occurs, an
|
||||
<link linkend="event-notifications">event notification</link>
|
||||
<varname>standby_disconnect_manual</varname> will be created.
|
||||
</para>
|
||||
<para>
|
||||
Note that when a standby node is not streaming directly from its upstream
|
||||
node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
|
||||
<literal>0 bytes</literal>.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
|
||||
table will be replicated to attached standbys. This means there will be a small but
|
||||
constant stream of replication activity which may not be desirable. To prevent
|
||||
this, convert the table to an <literal>UNLOGGED</literal> one with:
|
||||
<programlisting>
|
||||
ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will however mean that monitoring history will not be available on
|
||||
another node following a failover, and the view <literal>repmgr.replication_status</literal>
|
||||
will not work on standbys.
|
||||
</para>
|
||||
</tip>
|
||||
</chapter>
|
||||
@@ -1,48 +0,0 @@
|
||||
<chapter id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>network splits</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Handling network splits with repmgrd</title>
|
||||
<para>
|
||||
A common pattern for replication cluster setups is to spread servers over
|
||||
more than one datacentre. This can provide benefits such as geographically-
|
||||
distributed read replicas and DR (disaster recovery capability). However
|
||||
this also means there is a risk of disconnection at network level between
|
||||
datacentre locations, which would result in a split-brain scenario if
|
||||
servers in a secondary data centre were no longer able to see the primary
|
||||
in the main data centre and promoted a standby among themselves.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; enables provision of "<xref linkend="witness-server">" to
|
||||
artificially create a quorum of servers in a particular location, ensuring
|
||||
that nodes in another location will not elect a new primary if they
|
||||
are unable to see the majority of nodes. However this approach does not
|
||||
scale well, particularly with more complex replication setups, e.g.
|
||||
where the majority of nodes are located outside of the primary datacentre.
|
||||
It also means the <literal>witness</literal> node needs to be managed as an
|
||||
extra PostgreSQL instance outside of the main replication cluster, which
|
||||
adds administrative and programming complexity.
|
||||
</para>
|
||||
<para>
|
||||
<literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
|
||||
each node is associated with an arbitrary location string (default is
|
||||
<literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
|
||||
<programlisting>
|
||||
node_id=1
|
||||
node_name=node1
|
||||
conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
|
||||
data_directory='/var/lib/postgresql/data'
|
||||
location='dc1'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
||||
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
||||
will assume a network interruption and not promote any node in any
|
||||
other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
|
||||
mode until a primary becomes visible).
|
||||
</para>
|
||||
|
||||
</chapter>
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
<chapter id="repmgrd-notes" xreflabel="repmgrd notes">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>notes</secondary>
|
||||
</indexterm>
|
||||
<title>repmgrd notes</title>
|
||||
|
||||
<sect1 id="repmgrd-wal-replay-pause">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>paused WAL replay</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd and paused WAL replay</title>
|
||||
<para>
|
||||
If WAL replay has been paused (using <command>pg_wal_replay_pause()</command>,
|
||||
on PostgreSQL 9.6 and earlier <command>pg_xlog_replay_pause()</command>),
|
||||
in a failover situation <application>repmgrd</application> will
|
||||
automatically resume WAL replay.
|
||||
</para>
|
||||
<para>
|
||||
This is because if WAL replay is paused, but WAL is pending replay,
|
||||
PostgreSQL cannot be promoted until WAL replay is resumed.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
||||
will refuse to promote a node in this state, as the PostgreSQL
|
||||
<command>promote</command> command will not be acted on until
|
||||
WAL replay is resumed, leaving the cluster in a potentially
|
||||
unstable state. In this case it is up to the user to
|
||||
decide whether to resume WAL replay.
|
||||
</para>
|
||||
</note>
|
||||
</sect1>
|
||||
|
||||
</chapter>
|
||||
386
doc/repmgrd-operation.sgml
Normal file
386
doc/repmgrd-operation.sgml
Normal file
@@ -0,0 +1,386 @@
|
||||
<chapter id="repmgrd-operation" xreflabel="repmgrd operation">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>operation</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd operation</title>
|
||||
|
||||
|
||||
<sect1 id="repmgrd-pausing">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>pausing</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>pausing repmgrd</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Pausing repmgrd</title>
|
||||
|
||||
<para>
|
||||
In normal operation, <application>repmgrd</application> monitors the state of the
|
||||
PostgreSQL node it is running on, and will take appropriate action if problems
|
||||
are detected, e.g. (if so configured) promote the node to primary, if the existing
|
||||
primary has been determined as failed.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
However, <application>repmgrd</application> is unable to distinguish between
|
||||
planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
|
||||
or installing PostgreSQL maintenance releases), and an actual server outage. In versions prior to
|
||||
&repmgr; 4.2 it was necessary to stop <application>repmgrd</application> on all nodes (or at least
|
||||
on all nodes where <application>repmgrd</application> is
|
||||
<link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
|
||||
to prevent <application>repmgrd</application> from making unintentional changes to the
|
||||
replication cluster.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
|
||||
can now be "paused", i.e. instructed not to take any action such as performing a failover.
|
||||
This can be done from any node in the cluster, removing the need to stop/restart
|
||||
each <application>repmgrd</application> individually.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
For major PostgreSQL upgrades, e.g. from PostgreSQL 10 to PostgreSQL 11,
|
||||
<application>repmgrd</application> should be shut down completely and only started up
|
||||
once the &repmgr; packages for the new PostgreSQL major version have been installed.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2 id="repmgrd-pausing-prerequisites">
|
||||
<title>Prerequisites for pausing <application>repmgrd</application></title>
|
||||
<para>
|
||||
In order to be able to pause/unpause <application>repmgrd</application>, the following
|
||||
prerequisites must be met:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
PostgreSQL on all nodes must be accessible from the node where the
|
||||
<literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
|
||||
<varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
These conditions are required for normal &repmgr; operation in any case.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-pausing-execution">
|
||||
<title>Pausing/unpausing <application>repmgrd</application></title>
|
||||
<para>
|
||||
To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon pause
|
||||
NOTICE: node 1 (node1) paused
|
||||
NOTICE: node 2 (node2) paused
|
||||
NOTICE: node 3 (node3) paused</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The state of <application>repmgrd</application> on each node can be checked with
|
||||
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
|
||||
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
||||
ID | Name | Role | Status | repmgrd | PID | Paused?
|
||||
----+-------+---------+---------+---------+------+---------
|
||||
1 | node1 | primary | running | running | 7851 | yes
|
||||
2 | node2 | standby | running | running | 7889 | yes
|
||||
3 | node3 | standby | running | running | 7918 | yes</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If executing a switchover with <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
|
||||
running on one of the standbys (here: <literal>node2</literal>) will react like this:
|
||||
<programlisting>
|
||||
[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
...
|
||||
[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
[2018-09-20 12:22:25] [NOTICE] node is paused
|
||||
[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
|
||||
[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
|
||||
[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
|
||||
will automatically reconnect, e.g.:
|
||||
<programlisting>
|
||||
[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon unpause
|
||||
NOTICE: node 1 (node1) unpaused
|
||||
NOTICE: node 2 (node2) unpaused
|
||||
NOTICE: node 3 (node3) unpaused</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If the previous primary is no longer accessible when <application>repmgrd</application>
|
||||
is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
|
||||
<link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
|
||||
and any standbys attached to the new primary with
|
||||
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
|
||||
</para>
|
||||
<para>
|
||||
This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||
resulting in the automatic promotion of a new primary, which may be a problem particularly
|
||||
in larger clusters, where <application>repmgrd</application> could select a different promotion
|
||||
candidate to the one intended by the administrator.
|
||||
</para>
|
||||
</note>
|
||||
</sect2>
|
||||
<sect2 id="repmgrd-pausing-details">
|
||||
<title>Details on the <application>repmgrd</application> pausing mechanism</title>
|
||||
|
||||
<para>
|
||||
The pause state of each node will be stored over a PostgreSQL restart.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
|
||||
executed even if <application>repmgrd</application> is not running; in this case,
|
||||
<application>repmgrd</application> will start up in whichever pause state has been set.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||
<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
|
||||
</para>
|
||||
</note>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-wal-replay-pause">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>paused WAL replay</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd and paused WAL replay</title>
|
||||
<para>
|
||||
If WAL replay has been paused (using <command>pg_wal_replay_pause()</command>,
|
||||
on PostgreSQL 9.6 and earlier <command>pg_xlog_replay_pause()</command>),
|
||||
in a failover situation <application>repmgrd</application> will
|
||||
automatically resume WAL replay.
|
||||
</para>
|
||||
<para>
|
||||
This is because if WAL replay is paused, but WAL is pending replay,
|
||||
PostgreSQL cannot be promoted until WAL replay is resumed.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
||||
will refuse to promote a node in this state, as the PostgreSQL
|
||||
<command>promote</command> command will not be acted on until
|
||||
WAL replay is resumed, leaving the cluster in a potentially
|
||||
unstable state. In this case it is up to the user to
|
||||
decide whether to resume WAL replay.
|
||||
</para>
|
||||
</note>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>degraded monitoring</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>degraded monitoring</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>"degraded monitoring" mode</title>
|
||||
<para>
|
||||
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
||||
of monitoring the node's upstream server. In these cases it enters "degraded monitoring"
|
||||
mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
to be resolved.
|
||||
</para>
|
||||
<para>
|
||||
Situations where this happens are:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, no nodes in the primary node's location are visible</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no primary has become available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
|
||||
and the primary node is unavailable (but is later restarted):
|
||||
<programlisting>
|
||||
[2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
|
||||
[2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
(...)
|
||||
[2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
[2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
|
||||
[2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
|
||||
[2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
|
||||
[2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
|
||||
[2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
By default, <application>repmgrd</application> will continue in degraded monitoring mode indefinitely.
|
||||
However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
|
||||
after which <application>repmgrd</application> will terminate.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <application>repmgrd</application> is monitoring a primary node which has been stopped
|
||||
and manually restarted as a standby attached to a new primary, it will automatically detect
|
||||
the status change and update the node record to reflect the node's new status
|
||||
as an active standby. It will then resume monitoring the node as a standby.
|
||||
</para>
|
||||
</note>
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="repmgrd-monitoring" xreflabel="Storing monitoring data">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>monitoring</primary>
|
||||
<secondary>with repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Storing monitoring data</title>
|
||||
<para>
|
||||
When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
|
||||
it will constantly write standby node status information to the
|
||||
<varname>monitoring_history</varname> table, providing a near-real time
|
||||
overview of replication status on all nodes
|
||||
in the cluster.
|
||||
</para>
|
||||
<para>
|
||||
The view <literal>replication_status</literal> shows the most recent state
|
||||
for each node, e.g.:
|
||||
<programlisting>
|
||||
repmgr=# select * from repmgr.replication_status;
|
||||
-[ RECORD 1 ]-------------+------------------------------
|
||||
primary_node_id | 1
|
||||
standby_node_id | 2
|
||||
standby_name | node2
|
||||
node_type | standby
|
||||
active | t
|
||||
last_monitor_time | 2017-08-24 16:28:41.260478+09
|
||||
last_wal_primary_location | 0/6D57A00
|
||||
last_wal_standby_location | 0/5000000
|
||||
replication_lag | 29 MB
|
||||
replication_time_lag | 00:00:11.736163
|
||||
apply_lag | 15 MB
|
||||
communication_time_lag | 00:00:01.365643</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The interval in which monitoring history is written is controlled by the
|
||||
configuration parameter <varname>monitor_interval_secs</varname>;
|
||||
default is 2.
|
||||
</para>
|
||||
<para>
|
||||
As this can generate a large amount of monitoring data in the table
|
||||
<literal>repmgr.monitoring_history</literal>, it's advisable to regularly
|
||||
purge historical data using the <xref linkend="repmgr-cluster-cleanup">
|
||||
command; use the <literal>-k/--keep-history</literal> option to
|
||||
specify how many days' worth of data should be retained.
|
||||
</para>
|
||||
<para>
|
||||
It's possible to use <application>repmgrd</application> to run in monitoring
|
||||
mode only (without automatic failover capability) for some or all
|
||||
nodes by setting <literal>failover=manual</literal> in the node's
|
||||
<filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
|
||||
no failover action will be taken and the node will require manual intervention to
|
||||
be reattached to replication. If this occurs, an
|
||||
<link linkend="event-notifications">event notification</link>
|
||||
<varname>standby_disconnect_manual</varname> will be created.
|
||||
</para>
|
||||
<para>
|
||||
Note that when a standby node is not streaming directly from its upstream
|
||||
node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
|
||||
<literal>0 bytes</literal>.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
|
||||
table will be replicated to attached standbys. This means there will be a small but
|
||||
constant stream of replication activity which may not be desirable. To prevent
|
||||
this, convert the table to an <literal>UNLOGGED</literal> one with:
|
||||
<programlisting>
|
||||
ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will however mean that monitoring history will not be available on
|
||||
another node following a failover, and the view <literal>repmgr.replication_status</literal>
|
||||
will not work on standbys.
|
||||
</para>
|
||||
</tip>
|
||||
</sect1>
|
||||
|
||||
|
||||
</chapter>
|
||||
187
doc/repmgrd-overview.sgml
Normal file
187
doc/repmgrd-overview.sgml
Normal file
@@ -0,0 +1,187 @@
|
||||
<chapter id="repmgrd-overview" xreflabel="repmgrd overview">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>overview</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd overview</title>
|
||||
|
||||
<para>
|
||||
<application>repmgrd</application> ("<literal>replication manager daemon</literal>")
|
||||
is a management and monitoring daemon which runs
|
||||
on each node in a replication cluster. It can automate actions such as
|
||||
failover and updating standbys to follow the new primary, as well as
|
||||
providing monitoring information about the state of each standby.
|
||||
</para>
|
||||
<para>
|
||||
<application>repmgrd</application> is designed to be straightforward to set up
|
||||
and does not require additional external infrastructure.
|
||||
</para>
|
||||
<para>
|
||||
Functionality provided by <application>repmgrd</application> includes:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
wide range of <link linkend="repmgrd-basic-configuration">configuration options</link>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
option to execute custom scripts ("<link linkend="event-notifications">event notifications</link>")
|
||||
at different points in the failover sequence
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
ability to <link linkend="repmgrd-pausing">pause repmgrd</link>
|
||||
operation on all nodes with a
|
||||
<link linkend="repmgr-daemon-pause"><command>single command</command></link>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
optional <link linkend="repmgrd-witness-server">witness server</link>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
"location" configuration option to restrict
|
||||
potential promotion candidates to a single location
|
||||
(e.g. when nodes are spread over multiple data centres)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<link linkend="connection-check-type">choice of method</link> to determine node availability
|
||||
(PostgreSQL ping, query execution or new connection)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
retention of monitoring statistics (optional)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
</para>
|
||||
|
||||
<sect1 id="repmgrd-demonstration">
|
||||
|
||||
<title>repmgrd demonstration</title>
|
||||
<para>
|
||||
To demonstrate automatic failover, set up a 3-node replication cluster (one primary
|
||||
and two standbys streaming directly from the primary) so that the cluster looks
|
||||
something like this:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show --compact
|
||||
ID | Name | Role | Status | Upstream | Location | Prio.
|
||||
----+-------+---------+-----------+----------+----------+-------
|
||||
1 | node1 | primary | * running | | default | 100
|
||||
2 | node2 | standby | running | node1 | default | 100
|
||||
3 | node3 | standby | running | node1 | default | 100</programlisting>
|
||||
</para>
|
||||
|
||||
<tip>
|
||||
<para>
|
||||
See section <link linkend="repmgrd-automatic-failover-configuration">Required configuration for automatic failover</link>
|
||||
for an example of minimal <filename>repmgr.conf</filename> file settings suitable for use with <application>repmgrd</application>.
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
Start <application>repmgrd</application> on each standby and verify that it's running by examining the
|
||||
log output, which at log level <literal>INFO</literal> will look like this:
|
||||
<programlisting>
|
||||
[2019-03-15 06:32:05] [NOTICE] repmgrd (repmgrd 4.3) starting up
|
||||
[2019-03-15 06:32:05] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr connect_timeout=2"
|
||||
INFO: set_repmgrd_pid(): provided pidfile is /var/run/repmgr/repmgrd-11.pid
|
||||
[2019-03-15 06:32:05] [NOTICE] starting monitoring of node "node2" (ID: 2)
|
||||
[2019-03-15 06:32:05] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Each <application>repmgrd</application> should also have recorded its successful startup as an event:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+---------------+----+---------------------+-------------------------------------------------------------
|
||||
3 | node3 | repmgrd_start | t | 2019-03-14 04:17:30 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
2 | node2 | repmgrd_start | t | 2019-03-14 04:11:47 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
1 | node1 | repmgrd_start | t | 2019-03-14 04:04:31 | monitoring cluster primary "node1" (node ID: 1)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Now stop the current primary server with e.g.:
|
||||
<programlisting>
|
||||
pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will force the primary to shut down straight away, aborting all processes
|
||||
and transactions. This will cause a flurry of activity in the <application>repmgrd</application> log
|
||||
files as each <application>repmgrd</application> detects the failure of the primary and a failover
|
||||
decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
|
||||
which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
|
||||
<programlisting>
|
||||
[2019-03-15 06:37:50] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2019-03-15 06:37:50] [INFO] checking state of node 1, 1 of 3 attempts
|
||||
[2019-03-15 06:37:50] [INFO] sleeping 5 seconds until next reconnection attempt
|
||||
[2019-03-15 06:37:55] [INFO] checking state of node 1, 2 of 3 attempts
|
||||
[2019-03-15 06:37:55] [INFO] sleeping 5 seconds until next reconnection attempt
|
||||
[2019-03-15 06:38:00] [INFO] checking state of node 1, 3 of 3 attempts
|
||||
[2019-03-15 06:38:00] [WARNING] unable to reconnect to node 1 after 3 attempts
|
||||
[2019-03-15 06:38:00] [INFO] primary and this node have the same location ("default")
|
||||
[2019-03-15 06:38:00] [INFO] local node's last receive lsn: 0/900CBF8
|
||||
[2019-03-15 06:38:00] [INFO] node 3 last saw primary node 12 second(s) ago
|
||||
[2019-03-15 06:38:00] [INFO] last receive LSN for sibling node "node3" (ID: 3) is: 0/900CBF8
|
||||
[2019-03-15 06:38:00] [INFO] node "node3" (ID: 3) has same LSN as current candidate "node2" (ID: 2)
|
||||
[2019-03-15 06:38:00] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
|
||||
[2019-03-15 06:38:00] [NOTICE] promotion candidate is "node2" (ID: 2)
|
||||
[2019-03-15 06:38:00] [NOTICE] this node is the winner, will now promote itself and inform other nodes
|
||||
[2019-03-15 06:38:00] [INFO] promote_command is:
|
||||
"/usr/pgsql-11/bin/repmgr -f /etc/repmgr/11/repmgr.conf standby promote"
|
||||
NOTICE: promoting standby to primary
|
||||
DETAIL: promoting server "node2" (ID: 2) using "/usr/pgsql-11/bin/pg_ctl -w -D '/var/lib/pgsql/11/data' promote"
|
||||
NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
|
||||
NOTICE: STANDBY PROMOTE successful
|
||||
DETAIL: server "node2" (ID: 2) was successfully promoted to primary
|
||||
[2019-03-15 06:38:01] [INFO] 3 followers to notify
|
||||
[2019-03-15 06:38:01] [NOTICE] notifying node "node3" (node ID: 3) to follow node 2
|
||||
INFO: node 3 received notification to follow node 2
|
||||
[2019-03-15 06:38:01] [INFO] switching to primary monitoring mode
|
||||
[2019-03-15 06:38:01] [NOTICE] monitoring cluster primary "node2" (node ID: 2)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The cluster status will now look like this, with the original primary (<literal>node1</literal>)
|
||||
marked as inactive, and standby <literal>node3</literal> now following the new primary
|
||||
(<literal>node2</literal>):
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show --compact
|
||||
ID | Name | Role | Status | Upstream | Location | Prio.
|
||||
----+-------+---------+-----------+----------+----------+-------
|
||||
1 | node1 | primary | - failed | | default | 100
|
||||
2 | node2 | primary | * running | | default | 100
|
||||
3 | node3 | standby | running | node2 | default | 100</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
<link linkend="repmgr-cluster-event"><command>repmgr cluster event</command></link> will display a summary of
|
||||
what happened to each server during the failover:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+----------------------------+----+---------------------+-------------------------------------------------------------
|
||||
3 | node3 | repmgrd_failover_follow | t | 2019-03-15 06:38:03 | node 3 now following new upstream node 2
|
||||
3 | node3 | standby_follow | t | 2019-03-15 06:38:02 | standby attached to upstream node "node2" (node ID: 2)
|
||||
2 | node2 | repmgrd_reload | t | 2019-03-15 06:38:01 | monitoring cluster primary "node2" (node ID: 2)
|
||||
2 | node2 | repmgrd_failover_promote | t | 2019-03-15 06:38:01 | node 2 promoted to primary; old primary 1 marked as failed
|
||||
2 | node2 | standby_promote | t | 2019-03-15 06:38:01 | server "node2" (ID: 2) was successfully promoted to primary</programlisting>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
</chapter>
|
||||
@@ -1,178 +0,0 @@
|
||||
<chapter id="repmgrd-pausing" xreflabel="Pausing repmgrd">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>pausing</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>pausing repmgrd</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Pausing repmgrd</title>
|
||||
|
||||
<para>
|
||||
In normal operation, <application>repmgrd</application> monitors the state of the
|
||||
PostgreSQL node it is running on, and will take appropriate action if problems
|
||||
are detected, e.g. (if so configured) promote the node to primary, if the existing
|
||||
primary has been determined as failed.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
However, <application>repmgrd</application> is unable to distinguish between
|
||||
planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
|
||||
or installing PostgreSQL maintenance released), and an actual server outage. In versions prior to
|
||||
&repmgr; 4.2 it was necessary to stop <application>repmgrd</application> on all nodes (or at least
|
||||
on all nodes where <application>repmgrd</application> is
|
||||
<link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
|
||||
to prevent <application>repmgrd</application> from making unintentional changes to the
|
||||
replication cluster.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
|
||||
can now be "paused", i.e. instructed not to take any action such as performing a failover.
|
||||
This can be done from any node in the cluster, removing the need to stop/restart
|
||||
each <application>repmgrd</application> individually.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
For major PostgreSQL upgrades, e.g. from PostgreSQL 10 to PostgreSQL 11,
|
||||
<application>repmgrd</application> should be shut down completely and only started up
|
||||
once the &repmgr; packages for the new PostgreSQL major version have been installed.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect1 id="repmgrd-pausing-prerequisites">
|
||||
<title>Prerequisites for pausing <application>repmgrd</application></title>
|
||||
<para>
|
||||
In order to be able to pause/unpause <application>repmgrd</application>, following
|
||||
prerequisites must be met:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
PostgreSQL on all nodes must be accessible from the node where the
|
||||
<literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
|
||||
<varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
These conditions are required for normal &repmgr; operation in any case.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-pausing-execution">
|
||||
<title>Pausing/unpausing <application>repmgrd</application></title>
|
||||
<para>
|
||||
To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon pause
|
||||
NOTICE: node 1 (node1) paused
|
||||
NOTICE: node 2 (node2) paused
|
||||
NOTICE: node 3 (node3) paused</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The state of <application>repmgrd</application> on each node can be checked with
|
||||
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
|
||||
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
||||
ID | Name | Role | Status | repmgrd | PID | Paused?
|
||||
----+-------+---------+---------+---------+------+---------
|
||||
1 | node1 | primary | running | running | 7851 | yes
|
||||
2 | node2 | standby | running | running | 7889 | yes
|
||||
3 | node3 | standby | running | running | 7918 | yes</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If executing a switchover with <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
|
||||
running on one of the standbys (here: <literal>node2</literal>) will react like this:
|
||||
<programlisting>
|
||||
[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
...
|
||||
[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
[2018-09-20 12:22:25] [NOTICE] node is paused
|
||||
[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
|
||||
[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
|
||||
[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
|
||||
will automatically reconnect, e.g.:
|
||||
<programlisting>
|
||||
[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon unpause
|
||||
NOTICE: node 1 (node1) unpaused
|
||||
NOTICE: node 2 (node2) unpaused
|
||||
NOTICE: node 3 (node3) unpaused</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If the previous primary is no longer accessible when <application>repmgrd</application>
|
||||
is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
|
||||
<link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
|
||||
and any standbys attached to the new primary with
|
||||
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
|
||||
</para>
|
||||
<para>
|
||||
This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||
resulting in the automatic promotion of a new primary, which may be a problem particularly
|
||||
in larger clusters, where <application>repmgrd</application> could select a different promotion
|
||||
candidate to the one intended by the administrator.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2 id="repmgrd-pausing-details">
|
||||
<title>Details on the <application>repmgrd</application> pausing mechanism</title>
|
||||
|
||||
<para>
|
||||
The pause state of each node will be stored over a PostgreSQL restart.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
|
||||
executed even if <application>repmgrd</application> is not running; in this case,
|
||||
<application>repmgrd</application> will start up in whichever pause state has been set.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||
<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
|
||||
</para>
|
||||
</note>
|
||||
</sect2>
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
<chapter id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>witness server</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Using a witness server with repmgrd</title>
|
||||
<para>
|
||||
In a situation caused e.g. by a network interruption between two
|
||||
data centres, it's important to avoid a "split-brain" situation where
|
||||
both sides of the network assume they are the active segment and the
|
||||
side without an active primary unilaterally promotes one of its standbys.
|
||||
</para>
|
||||
<para>
|
||||
To prevent this situation happening, it's essential to ensure that one
|
||||
network segment has a "voting majority", so other segments will know
|
||||
they're in the minority and not attempt to promote a new primary. Where
|
||||
an odd number of servers exists, this is not an issue. However, if each
|
||||
network has an even number of nodes, it's necessary to provide some way
|
||||
of ensuring a majority, which is where the witness server becomes useful.
|
||||
</para>
|
||||
<para>
|
||||
This is not a fully-fledged standby node and is not integrated into
|
||||
replication, but it effectively represents the "casting vote" when
|
||||
deciding which network segment has a majority. A witness server can
|
||||
be set up using <xref linkend="repmgr-witness-register">. Note that it only
|
||||
makes sense to create a witness server in conjunction with running
|
||||
<application>repmgrd</application>; the witness server will require its own
|
||||
<application>repmgrd</application> instance.
|
||||
</para>
|
||||
</chapter>
|
||||
@@ -72,7 +72,8 @@
|
||||
Ensure that a passwordless SSH connection is possible from the promotion candidate
|
||||
(standby) to the demotion candidate (current primary). If <literal>--siblings-follow</literal>
|
||||
will be used, ensure that passwordless SSH connections are possible from the
|
||||
promotion candidate to all standbys attached to the demotion candidate.
|
||||
promotion candidate to all nodes attached to the demotion candidate
|
||||
(including the witness server, if in use).
|
||||
</para>
|
||||
|
||||
<note>
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
<!ENTITY repmgrversion "4.3dev">
|
||||
2
log.c
2
log.c
@@ -85,7 +85,7 @@ _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_li
|
||||
|
||||
time(&t);
|
||||
tm = localtime(&t);
|
||||
strftime(buf, 100, "[%Y-%m-%d %H:%M:%S]", tm);
|
||||
strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", tm);
|
||||
fprintf(stderr, "%s [%s] ", buf, level_name);
|
||||
}
|
||||
else
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION set_primary_last_seen()
|
||||
CREATE FUNCTION set_upstream_last_seen()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_primary_last_seen'
|
||||
AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_primary_last_seen()
|
||||
CREATE FUNCTION get_upstream_last_seen()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_primary_last_seen'
|
||||
AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_wal_receiver_pid()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
@@ -118,16 +118,17 @@ CREATE FUNCTION standby_get_last_updated()
|
||||
AS 'MODULE_PATHNAME', 'standby_get_last_updated'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION set_primary_last_seen()
|
||||
CREATE FUNCTION set_upstream_last_seen()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_primary_last_seen'
|
||||
AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_primary_last_seen()
|
||||
CREATE FUNCTION get_upstream_last_seen()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_primary_last_seen'
|
||||
AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
|
||||
/* failover functions */
|
||||
|
||||
CREATE FUNCTION notify_follow_primary(INT)
|
||||
@@ -185,6 +186,15 @@ CREATE FUNCTION repmgrd_is_paused()
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_wal_receiver_pid()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
|
||||
|
||||
|
||||
/* views */
|
||||
|
||||
CREATE VIEW repmgr.replication_status AS
|
||||
SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
|
||||
|
||||
@@ -93,6 +93,15 @@ do_bdr_register(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (get_bdr_version_num() > 2)
|
||||
{
|
||||
log_error(_("\"repmgr bdr register\" is for BDR 2.x only"));
|
||||
PQfinish(conn);
|
||||
pfree(dbname);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
|
||||
/* check for a matching BDR node */
|
||||
{
|
||||
PQExpBufferData bdr_local_node_name;
|
||||
@@ -216,7 +225,7 @@ do_bdr_register(void)
|
||||
ExtensionStatus other_node_extension_status = REPMGR_UNKNOWN;
|
||||
|
||||
/* skip the local node */
|
||||
if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, MAXLEN) == 0)
|
||||
if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, sizeof(node_info.node_name)) == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -304,9 +313,9 @@ do_bdr_register(void)
|
||||
node_info.active = true;
|
||||
node_info.priority = config_file_options.priority;
|
||||
|
||||
strncpy(node_info.node_name, config_file_options.node_name, MAXLEN);
|
||||
strncpy(node_info.location, config_file_options.location, MAXLEN);
|
||||
strncpy(node_info.conninfo, config_file_options.conninfo, MAXLEN);
|
||||
strncpy(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name));
|
||||
strncpy(node_info.location, config_file_options.location, sizeof(node_info.location));
|
||||
strncpy(node_info.conninfo, config_file_options.conninfo, sizeof(node_info.conninfo));
|
||||
|
||||
if (record_status == RECORD_FOUND)
|
||||
{
|
||||
@@ -330,7 +339,7 @@ do_bdr_register(void)
|
||||
* name set when the node was registered.
|
||||
*/
|
||||
|
||||
if (strncmp(node_info.node_name, config_file_options.node_name, MAXLEN) != 0)
|
||||
if (strncmp(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name)) != 0)
|
||||
{
|
||||
log_error(_("a record for node %i is already registered with node_name \"%s\""),
|
||||
config_file_options.node_id, node_info.node_name);
|
||||
|
||||
@@ -156,7 +156,7 @@ do_cluster_show(void)
|
||||
else
|
||||
{
|
||||
/* check if node is reachable, but just not letting us in */
|
||||
if (is_server_available(cell->node_info->conninfo))
|
||||
if (is_server_available_quiet(cell->node_info->conninfo))
|
||||
cell->node_info->node_status = NODE_STATUS_REJECTED;
|
||||
else
|
||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||
@@ -1063,7 +1063,9 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
|
||||
matrix_rec_list[i] = (t_node_matrix_rec *) pg_malloc0(sizeof(t_node_matrix_rec));
|
||||
|
||||
matrix_rec_list[i]->node_id = cell->node_info->node_id;
|
||||
strncpy(matrix_rec_list[i]->node_name, cell->node_info->node_name, MAXLEN);
|
||||
strncpy(matrix_rec_list[i]->node_name,
|
||||
cell->node_info->node_name,
|
||||
sizeof(matrix_rec_list[i]->node_name));
|
||||
|
||||
/*
|
||||
* Find the maximum length of a node name
|
||||
@@ -1161,6 +1163,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
|
||||
(void) remote_command(host,
|
||||
runtime_options.remote_user,
|
||||
command.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
p = command_output.data;
|
||||
@@ -1277,7 +1280,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
|
||||
|
||||
cube[h] = (t_node_status_cube *) pg_malloc(sizeof(t_node_status_cube));
|
||||
cube[h]->node_id = cell->node_info->node_id;
|
||||
strncpy(cube[h]->node_name, cell->node_info->node_name, MAXLEN);
|
||||
strncpy(cube[h]->node_name, cell->node_info->node_name, sizeof(cube[h]->node_name));
|
||||
|
||||
/*
|
||||
* Find the maximum length of a node name
|
||||
@@ -1299,7 +1302,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
|
||||
/* we don't need the name here */
|
||||
cube[h]->matrix_list_rec[i]->node_name[0] = '\0';
|
||||
|
||||
cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec) * nodes.node_count);
|
||||
cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec *) * nodes.node_count);
|
||||
|
||||
j = 0;
|
||||
|
||||
@@ -1373,6 +1376,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
|
||||
(void) remote_command(host,
|
||||
runtime_options.remote_user,
|
||||
quoted_command.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
free_conninfo_params(&remote_conninfo);
|
||||
|
||||
@@ -30,14 +30,14 @@ typedef struct
|
||||
typedef struct
|
||||
{
|
||||
int node_id;
|
||||
char node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
t_node_status_rec **node_status_list;
|
||||
} t_node_matrix_rec;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int node_id;
|
||||
char node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
t_node_matrix_rec **matrix_list_rec;
|
||||
} t_node_status_cube;
|
||||
|
||||
|
||||
@@ -201,8 +201,7 @@ do_daemon_status(void)
|
||||
}
|
||||
}
|
||||
|
||||
repmgrd_info[i]->upstream_last_seen = get_primary_last_seen(cell->node_info->conn);
|
||||
|
||||
repmgrd_info[i]->upstream_last_seen = get_upstream_last_seen(cell->node_info->conn, cell->node_info->type);
|
||||
if (repmgrd_info[i]->upstream_last_seen < 0)
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, "%s", _("n/a"));
|
||||
@@ -260,14 +259,24 @@ do_daemon_status(void)
|
||||
{
|
||||
if (runtime_options.output_mode == OM_CSV)
|
||||
{
|
||||
int running = repmgrd_info[i]->running ? 1 : 0;
|
||||
int paused = repmgrd_info[i]->paused ? 1 : 0;
|
||||
|
||||
/* If PostgreSQL is not running, repmgrd status is unknown */
|
||||
if (repmgrd_info[i]->pg_running == false)
|
||||
{
|
||||
running = -1;
|
||||
paused = -1;
|
||||
}
|
||||
|
||||
printf("%i,%s,%s,%i,%i,%i,%i,%i,%i\n",
|
||||
cell->node_info->node_id,
|
||||
cell->node_info->node_name,
|
||||
get_node_type_string(cell->node_info->type),
|
||||
repmgrd_info[i]->pg_running ? 1 : 0,
|
||||
repmgrd_info[i]->running ? 1 : 0,
|
||||
running,
|
||||
repmgrd_info[i]->pid,
|
||||
repmgrd_info[i]->paused ? 1 : 0,
|
||||
paused,
|
||||
cell->node_info->priority,
|
||||
repmgrd_info[i]->pid == UNKNOWN_PID
|
||||
? -1
|
||||
@@ -344,18 +353,9 @@ _do_repmgr_pause(bool pause)
|
||||
PGconn *conn = NULL;
|
||||
NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
NodeInfoListCell *cell = NULL;
|
||||
RepmgrdInfo **repmgrd_info;
|
||||
int i;
|
||||
int error_nodes = 0;
|
||||
|
||||
repmgrd_info = (RepmgrdInfo **) pg_malloc0(sizeof(RepmgrdInfo *) * nodes.node_count);
|
||||
|
||||
if (repmgrd_info == NULL)
|
||||
{
|
||||
log_error(_("unable to allocate memory"));
|
||||
exit(ERR_OUT_OF_MEMORY);
|
||||
}
|
||||
|
||||
/* Connect to local database to obtain cluster connection data */
|
||||
log_verbose(LOG_INFO, _("connecting to database"));
|
||||
|
||||
@@ -370,9 +370,6 @@ _do_repmgr_pause(bool pause)
|
||||
|
||||
for (cell = nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
|
||||
repmgrd_info[i]->node_id = cell->node_info->node_id;
|
||||
|
||||
log_verbose(LOG_DEBUG, "pausing node %i (%s)",
|
||||
cell->node_info->node_id,
|
||||
cell->node_info->node_name);
|
||||
|
||||
@@ -413,7 +413,7 @@ do_node_status(void)
|
||||
node_info.upstream_node_name,
|
||||
node_info.upstream_node_id);
|
||||
|
||||
get_replication_info(conn, &replication_info);
|
||||
get_replication_info(conn, node_info.type, &replication_info);
|
||||
|
||||
key_value_list_set_format(&node_status,
|
||||
"Replication lag",
|
||||
@@ -1408,7 +1408,7 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_i
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (lag_seconds < 0)
|
||||
else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
|
||||
{
|
||||
status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
@@ -2476,6 +2476,8 @@ do_node_rejoin(void)
|
||||
|
||||
termPQExpBuffer(&slotdir_ent_path);
|
||||
}
|
||||
|
||||
closedir(slotdir);
|
||||
}
|
||||
termPQExpBuffer(&slotdir_path);
|
||||
}
|
||||
@@ -2681,6 +2683,48 @@ do_node_rejoin(void)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Currently for testing purposes only, not documented;
|
||||
* use at own risk!
|
||||
*/
|
||||
|
||||
void
|
||||
do_node_control(void)
|
||||
{
|
||||
PGconn *conn = NULL;
|
||||
pid_t wal_receiver_pid = UNKNOWN_PID;
|
||||
conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (runtime_options.disable_wal_receiver == true)
|
||||
{
|
||||
wal_receiver_pid = disable_wal_receiver(conn);
|
||||
|
||||
PQfinish(conn);
|
||||
|
||||
if (wal_receiver_pid == UNKNOWN_PID)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
if (runtime_options.enable_wal_receiver == true)
|
||||
{
|
||||
wal_receiver_pid = enable_wal_receiver(conn, true);
|
||||
|
||||
PQfinish(conn);
|
||||
|
||||
if (wal_receiver_pid == UNKNOWN_PID)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
log_error(_("no option provided"));
|
||||
|
||||
PQfinish(conn);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* For "internal" use by `node rejoin` on the local node when
|
||||
* called by "standby switchover" from the remote node.
|
||||
@@ -2742,6 +2786,7 @@ _do_node_archive_config(void)
|
||||
|
||||
arcdir = opendir(archive_dir.data);
|
||||
|
||||
/* always attempt to open the directory */
|
||||
if (arcdir == NULL)
|
||||
{
|
||||
log_error(_("unable to open archive directory \"%s\""),
|
||||
@@ -2787,10 +2832,11 @@ _do_node_archive_config(void)
|
||||
|
||||
termPQExpBuffer(&arcdir_ent_path);
|
||||
}
|
||||
|
||||
closedir(arcdir);
|
||||
}
|
||||
|
||||
closedir(arcdir);
|
||||
|
||||
|
||||
/*
|
||||
* extract list of config files from --config-files
|
||||
*/
|
||||
@@ -3062,11 +3108,12 @@ copy_file(const char *src_file, const char *dest_file)
|
||||
int a = 0;
|
||||
|
||||
ptr_old = fopen(src_file, "r");
|
||||
ptr_new = fopen(dest_file, "w");
|
||||
|
||||
if (ptr_old == NULL)
|
||||
return false;
|
||||
|
||||
ptr_new = fopen(dest_file, "w");
|
||||
|
||||
if (ptr_new == NULL)
|
||||
{
|
||||
fclose(ptr_old);
|
||||
|
||||
@@ -24,6 +24,7 @@ extern void do_node_check(void);
|
||||
|
||||
extern void do_node_rejoin(void);
|
||||
extern void do_node_service(void);
|
||||
extern void do_node_control(void);
|
||||
|
||||
extern void do_node_help(void);
|
||||
|
||||
|
||||
@@ -96,28 +96,6 @@ do_primary_register(void)
|
||||
|
||||
initialize_voting_term(conn);
|
||||
|
||||
/* Ensure there isn't another registered node which is primary */
|
||||
primary_conn = get_primary_connection(conn, ¤t_primary_id, NULL);
|
||||
|
||||
if (primary_conn != NULL)
|
||||
{
|
||||
if (current_primary_id != config_file_options.node_id)
|
||||
{
|
||||
/*
|
||||
* it's impossible to add a second primary to a streaming
|
||||
* replication cluster
|
||||
*/
|
||||
log_error(_("there is already an active registered primary (node ID: %i) in this cluster"), current_primary_id);
|
||||
PQfinish(primary_conn);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* we've probably connected to ourselves */
|
||||
PQfinish(primary_conn);
|
||||
}
|
||||
|
||||
|
||||
begin_transaction(conn);
|
||||
|
||||
/*
|
||||
@@ -128,12 +106,32 @@ do_primary_register(void)
|
||||
current_primary_id = get_primary_node_id(conn);
|
||||
if (current_primary_id != NODE_NOT_FOUND && current_primary_id != config_file_options.node_id)
|
||||
{
|
||||
log_error(_("another node with id %i is already registered as primary"), current_primary_id);
|
||||
log_detail(_("a streaming replication cluster can have only one primary node"));
|
||||
log_debug("XXX %i", current_primary_id);
|
||||
primary_conn = establish_primary_db_connection(conn, false);
|
||||
|
||||
rollback_transaction(conn);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (get_recovery_type(primary_conn) == RECTYPE_PRIMARY)
|
||||
{
|
||||
log_error(_("there is already an active registered primary (node ID: %i) in this cluster"),
|
||||
current_primary_id);
|
||||
log_detail(_("a streaming replication cluster can have only one primary node"));
|
||||
|
||||
log_hint(_("ensure this node is shut down before registering a new primary"));
|
||||
PQfinish(primary_conn);
|
||||
rollback_transaction(conn);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
log_warning(_("node %is is registered as primary but running as a standby"),
|
||||
current_primary_id);
|
||||
PQfinish(primary_conn);
|
||||
}
|
||||
|
||||
log_notice(_("setting node %i's node record to inactive"),
|
||||
current_primary_id);
|
||||
update_node_record_set_active(conn, current_primary_id, false);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -213,7 +213,7 @@ do_standby_clone(void)
|
||||
param_set(&recovery_conninfo, "application_name", config_file_options.node_name);
|
||||
|
||||
get_conninfo_value(config_file_options.conninfo, "application_name", application_name);
|
||||
if (strlen(application_name) && strncmp(application_name, config_file_options.node_name, MAXLEN) != 0)
|
||||
if (strlen(application_name) && strncmp(application_name, config_file_options.node_name, sizeof(config_file_options.node_name)) != 0)
|
||||
{
|
||||
log_notice(_("\"application_name\" is set in repmgr.conf but will be replaced by the node name"));
|
||||
}
|
||||
@@ -605,7 +605,6 @@ do_standby_clone(void)
|
||||
log_error(_("unknown clone mode"));
|
||||
}
|
||||
|
||||
|
||||
/* If the backup failed then exit */
|
||||
if (r != SUCCESS)
|
||||
{
|
||||
@@ -771,56 +770,73 @@ do_standby_clone(void)
|
||||
void
|
||||
check_barman_config(void)
|
||||
{
|
||||
char command[MAXLEN];
|
||||
PQExpBufferData command;
|
||||
bool command_ok = false;
|
||||
|
||||
/*
|
||||
* Check that there is at least one valid backup
|
||||
*/
|
||||
|
||||
log_info(_("connecting to Barman server to verify backup for %s"), config_file_options.barman_server);
|
||||
log_info(_("connecting to Barman server to verify backup for \"%s\""), config_file_options.barman_server);
|
||||
|
||||
maxlen_snprintf(command, "%s show-backup %s latest > /dev/null",
|
||||
make_barman_ssh_command(barman_command_buf),
|
||||
config_file_options.barman_server);
|
||||
initPQExpBuffer(&command);
|
||||
|
||||
command_ok = local_command(command, NULL);
|
||||
appendPQExpBuffer(&command, "%s show-backup %s latest > /dev/null",
|
||||
make_barman_ssh_command(barman_command_buf),
|
||||
config_file_options.barman_server);
|
||||
|
||||
command_ok = local_command(command.data, NULL);
|
||||
|
||||
if (command_ok == false)
|
||||
{
|
||||
log_error(_("no valid backup for server %s was found in the Barman catalogue"),
|
||||
log_error(_("no valid backup for server \"%s\" was found in the Barman catalogue"),
|
||||
config_file_options.barman_server);
|
||||
log_detail(_("command executed was:\n %s"), command.data),
|
||||
log_hint(_("refer to the Barman documentation for more information"));
|
||||
|
||||
termPQExpBuffer(&command);
|
||||
exit(ERR_BARMAN);
|
||||
}
|
||||
|
||||
|
||||
if (!create_pg_dir(local_data_directory, runtime_options.force))
|
||||
else if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_error(_("unable to use directory %s"),
|
||||
local_data_directory);
|
||||
log_hint(_("use -F/--force option to force this directory to be overwritten"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
log_info(_("valid backup for server \"%s\" found in the Barman catalogue"),
|
||||
config_file_options.barman_server);
|
||||
}
|
||||
|
||||
termPQExpBuffer(&command);
|
||||
|
||||
/*
|
||||
* Create the local repmgr subdirectory
|
||||
* Attempt to create data directory (unless --dry-run specified,
|
||||
* in which case do nothing; warnings will be emitted elsewhere about
|
||||
* any issues with the data directory)
|
||||
*/
|
||||
|
||||
maxlen_snprintf(local_repmgr_tmp_directory,
|
||||
"%s/repmgr", local_data_directory);
|
||||
|
||||
maxlen_snprintf(datadir_list_filename,
|
||||
"%s/data.txt", local_repmgr_tmp_directory);
|
||||
|
||||
if (!create_pg_dir(local_repmgr_tmp_directory, runtime_options.force))
|
||||
if (runtime_options.dry_run == false)
|
||||
{
|
||||
log_error(_("unable to create directory \"%s\""),
|
||||
local_repmgr_tmp_directory);
|
||||
if (!create_pg_dir(local_data_directory, runtime_options.force))
|
||||
{
|
||||
log_error(_("unable to use directory %s"),
|
||||
local_data_directory);
|
||||
log_hint(_("use -F/--force option to force this directory to be overwritten"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
/*
|
||||
* Create the local repmgr subdirectory
|
||||
*/
|
||||
|
||||
maxlen_snprintf(local_repmgr_tmp_directory,
|
||||
"%s/repmgr", local_data_directory);
|
||||
|
||||
maxlen_snprintf(datadir_list_filename,
|
||||
"%s/data.txt", local_repmgr_tmp_directory);
|
||||
|
||||
if (!create_pg_dir(local_repmgr_tmp_directory, runtime_options.force))
|
||||
{
|
||||
log_error(_("unable to create directory \"%s\""),
|
||||
local_repmgr_tmp_directory);
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -828,20 +844,37 @@ check_barman_config(void)
|
||||
*/
|
||||
log_info(_("connecting to Barman server to fetch server parameters"));
|
||||
|
||||
maxlen_snprintf(command, "%s show-server %s > %s/show-server.txt",
|
||||
make_barman_ssh_command(barman_command_buf),
|
||||
config_file_options.barman_server,
|
||||
local_repmgr_tmp_directory);
|
||||
initPQExpBuffer(&command);
|
||||
|
||||
command_ok = local_command(command, NULL);
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
appendPQExpBuffer(&command, "%s show-server %s > /dev/null",
|
||||
make_barman_ssh_command(barman_command_buf),
|
||||
config_file_options.barman_server);
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&command, "%s show-server %s > %s/show-server.txt",
|
||||
make_barman_ssh_command(barman_command_buf),
|
||||
config_file_options.barman_server,
|
||||
local_repmgr_tmp_directory);
|
||||
}
|
||||
|
||||
command_ok = local_command(command.data, NULL);
|
||||
|
||||
if (command_ok == false)
|
||||
{
|
||||
log_error(_("unable to fetch server parameters from Barman server"));
|
||||
|
||||
log_detail(_("command executed was:\n %s"), command.data),
|
||||
termPQExpBuffer(&command);
|
||||
exit(ERR_BARMAN);
|
||||
}
|
||||
else if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("server parameters were successfully fetched from Barman server"));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&command);
|
||||
}
|
||||
|
||||
|
||||
@@ -873,7 +906,7 @@ _do_create_recovery_conf(void)
|
||||
t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
char recovery_file_path[MAXPGPATH] = "";
|
||||
char recovery_file_path[MAXPGPATH + sizeof(RECOVERY_COMMAND_FILE)] = "";
|
||||
struct stat st;
|
||||
bool node_is_running = false;
|
||||
bool slot_creation_required = false;
|
||||
@@ -1118,7 +1151,10 @@ _do_create_recovery_conf(void)
|
||||
|
||||
/* check if recovery.conf exists */
|
||||
|
||||
snprintf(recovery_file_path, MAXPGPATH, "%s/%s", local_data_directory, RECOVERY_COMMAND_FILE);
|
||||
snprintf(recovery_file_path, sizeof(recovery_file_path),
|
||||
"%s/%s",
|
||||
local_data_directory,
|
||||
RECOVERY_COMMAND_FILE);
|
||||
|
||||
if (stat(recovery_file_path, &st) == -1)
|
||||
{
|
||||
@@ -1306,8 +1342,7 @@ do_standby_register(void)
|
||||
log_error(_("unable to connect to local node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
log_detail("%s",
|
||||
PQerrorMessage(conn));
|
||||
log_detail("\n%s", PQerrorMessage(conn));
|
||||
log_hint(_("to register a standby which is not running, provide primary connection parameters and use option -F/--force"));
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
@@ -1437,6 +1472,17 @@ do_standby_register(void)
|
||||
RecordStatus upstream_record_status = RECORD_NOT_FOUND;
|
||||
t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
if (runtime_options.upstream_node_id == config_file_options.node_id)
|
||||
{
|
||||
log_error(_("provided node ID for --upstream-node-id (%i) is the same as the configured local node ID (%i)"),
|
||||
runtime_options.upstream_node_id,
|
||||
config_file_options.node_id);
|
||||
PQfinish(primary_conn);
|
||||
if (PQstatus(conn) == CONNECTION_OK)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
upstream_record_status = get_node_record(primary_conn,
|
||||
runtime_options.upstream_node_id,
|
||||
&upstream_node_record);
|
||||
@@ -1888,7 +1934,7 @@ do_standby_unregister(void)
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_error(_("unable to connect to primary server"));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
log_detail("\n%s", PQerrorMessage(conn));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -2010,7 +2056,7 @@ do_standby_promote(void)
|
||||
|
||||
init_replication_info(&replication_info);
|
||||
|
||||
if (get_replication_info(conn, &replication_info) == false)
|
||||
if (get_replication_info(conn, STANDBY, &replication_info) == false)
|
||||
{
|
||||
log_error(_("unable to retrieve replication information from local node"));
|
||||
PQfinish(conn);
|
||||
@@ -2270,6 +2316,7 @@ void
|
||||
do_standby_follow(void)
|
||||
{
|
||||
PGconn *local_conn = NULL;
|
||||
t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
PGconn *primary_conn = NULL;
|
||||
int primary_node_id = UNKNOWN_NODE_ID;
|
||||
@@ -2308,6 +2355,19 @@ do_standby_follow(void)
|
||||
if (PQserverVersion(local_conn) < 90400)
|
||||
check_93_config();
|
||||
|
||||
/* attempt to retrieve local node record */
|
||||
record_status = get_node_record(local_conn,
|
||||
config_file_options.node_id,
|
||||
&local_node_record);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
log_error(_("unable to retrieve record for local node %i"),
|
||||
config_file_options.node_id);
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/*
|
||||
* --upstream-node-id provided - attempt to follow that node
|
||||
*/
|
||||
@@ -2552,6 +2612,9 @@ do_standby_follow(void)
|
||||
|
||||
conn_to_param_list(local_conn, &local_repl_conninfo);
|
||||
|
||||
/* Set the replication user from the node record */
|
||||
param_set(&local_repl_conninfo, "user", local_node_record.repluser);
|
||||
|
||||
param_set(&local_repl_conninfo, "replication", "1");
|
||||
|
||||
local_repl_conn = establish_db_connection_by_params(&local_repl_conninfo, false);
|
||||
@@ -2838,8 +2901,8 @@ do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_n
|
||||
free_conninfo_params(&local_node_conninfo);
|
||||
|
||||
/*
|
||||
* store the original upstream node id so we can delete the
|
||||
* replication slot, if exists
|
||||
* Store the original upstream node id so we can delete the
|
||||
* replication slot, if it exists.
|
||||
*/
|
||||
if (local_node_record.upstream_node_id != UNKNOWN_NODE_ID)
|
||||
{
|
||||
@@ -2851,9 +2914,17 @@ do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_n
|
||||
}
|
||||
|
||||
|
||||
if (config_file_options.use_replication_slots && runtime_options.host_param_provided == false && original_upstream_node_id != UNKNOWN_NODE_ID)
|
||||
if (config_file_options.use_replication_slots && runtime_options.host_param_provided == false)
|
||||
{
|
||||
remove_old_replication_slot = true;
|
||||
/*
|
||||
* Only attempt to delete the old replication slot if the old upstream
|
||||
* node is known and is different to the follow target node.
|
||||
*/
|
||||
if (original_upstream_node_id != UNKNOWN_NODE_ID
|
||||
&& original_upstream_node_id != follow_target_node_record->node_id)
|
||||
{
|
||||
remove_old_replication_slot = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3000,8 +3071,6 @@ do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_n
|
||||
* Note that if this function is called by do_standby_switchover(), the
|
||||
* "repmgr node rejoin" command executed on the demotion candidate may already
|
||||
* have removed the slot, so there may be nothing to do.
|
||||
*
|
||||
* XXX check if former upstream is current primary?
|
||||
*/
|
||||
|
||||
if (remove_old_replication_slot == true)
|
||||
@@ -3263,7 +3332,7 @@ do_standby_switchover(void)
|
||||
ReplInfo replication_info;
|
||||
init_replication_info(&replication_info);
|
||||
|
||||
if (get_replication_info(local_conn, &replication_info) == false)
|
||||
if (get_replication_info(local_conn, STANDBY, &replication_info) == false)
|
||||
{
|
||||
log_error(_("unable to retrieve replication information from local node"));
|
||||
PQfinish(local_conn);
|
||||
@@ -3403,6 +3472,7 @@ do_standby_switchover(void)
|
||||
command_success = remote_command(remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -3466,6 +3536,7 @@ do_standby_switchover(void)
|
||||
command_success = remote_command(remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -3533,9 +3604,26 @@ do_standby_switchover(void)
|
||||
{
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
PQExpBufferData nodes;
|
||||
NodeInfoListCell *cell;
|
||||
|
||||
initPQExpBuffer(&nodes);
|
||||
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
appendPQExpBuffer(&nodes,
|
||||
" %s (node ID: %i)",
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
if (cell->next)
|
||||
appendPQExpBufferStr(&nodes, "\n");
|
||||
}
|
||||
|
||||
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
||||
sibling_nodes.node_count);
|
||||
log_detail(_("these nodes will remain attached to the current primary"));
|
||||
log_detail(_("these nodes will remain attached to the current primary:\n%s"), nodes.data);
|
||||
|
||||
termPQExpBuffer(&nodes);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -3693,6 +3781,7 @@ do_standby_switchover(void)
|
||||
command_success = remote_command(remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -3745,6 +3834,7 @@ do_standby_switchover(void)
|
||||
command_success = remote_command(remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -3881,7 +3971,7 @@ do_standby_switchover(void)
|
||||
log_detail(_("lag is %i seconds (warning threshold: %i)"),
|
||||
lag_seconds, config_file_options.replication_lag_warning);
|
||||
}
|
||||
else if (lag_seconds < 0)
|
||||
else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
@@ -3983,13 +4073,14 @@ do_standby_switchover(void)
|
||||
|
||||
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||
|
||||
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
|
||||
repmgrd_info[i]->node_id = cell->node_info->node_id;
|
||||
repmgrd_info[i]->pid = UNKNOWN_PID;
|
||||
repmgrd_info[i]->paused = false;
|
||||
repmgrd_info[i]->running = false;
|
||||
repmgrd_info[i]->pg_running = true;
|
||||
|
||||
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||
|
||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||
{
|
||||
@@ -3999,12 +4090,25 @@ do_standby_switchover(void)
|
||||
|
||||
repmgrd_info[i]->pg_running = false;
|
||||
|
||||
item_list_append_format(&repmgrd_connection_errors,
|
||||
_("unable to connect to node \"%s\" (ID %i)"),
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
/*
|
||||
* Only worry about unreachable nodes if they're marked as active
|
||||
* in the repmgr metadata.
|
||||
*/
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
unreachable_node_count++;
|
||||
|
||||
unreachable_node_count++;
|
||||
item_list_append_format(&repmgrd_connection_errors,
|
||||
_("unable to connect to node \"%s\" (ID %i):\n%s"),
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id,
|
||||
PQerrorMessage(cell->node_info->conn));
|
||||
}
|
||||
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -4066,11 +4170,37 @@ do_standby_switchover(void)
|
||||
|
||||
}
|
||||
|
||||
/* pause repmgrd on all reachable nodes */
|
||||
if (repmgrd_running_count > 0)
|
||||
{
|
||||
i = 0;
|
||||
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
|
||||
/*
|
||||
* Skip if node was unreachable
|
||||
*/
|
||||
if (repmgrd_info[i]->pg_running == false)
|
||||
{
|
||||
log_warning(_("node %s (ID %i) unreachable, unable to pause repmgrd"),
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Skip if repmgrd not running on node
|
||||
*/
|
||||
if (repmgrd_info[i]->running == false)
|
||||
{
|
||||
log_warning(_("repmgrd not running on node %s (ID %i)"),
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* Skip if node is already paused. Note we won't unpause these, to
|
||||
* leave the repmgrd instances in the cluster in the same state they
|
||||
@@ -4110,8 +4240,11 @@ do_standby_switchover(void)
|
||||
/* close all connections - we'll reestablish later */
|
||||
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
if (cell->node_info->conn != NULL)
|
||||
{
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4174,6 +4307,7 @@ do_standby_switchover(void)
|
||||
(void) remote_command(remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -4184,6 +4318,7 @@ do_standby_switchover(void)
|
||||
*/
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
/* we use a buffer here as it will be modified by string_remove_trailing_newlines() */
|
||||
char shutdown_command[MAXLEN] = "";
|
||||
|
||||
strncpy(shutdown_command, command_output.data, MAXLEN);
|
||||
@@ -4242,6 +4377,7 @@ do_standby_switchover(void)
|
||||
command_success = remote_command(remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -4299,6 +4435,9 @@ do_standby_switchover(void)
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("connection to local node lost, reconnecting..."));
|
||||
log_detail("\n%s", PQerrorMessage(local_conn));
|
||||
PQfinish(local_conn);
|
||||
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
@@ -4321,7 +4460,7 @@ do_standby_switchover(void)
|
||||
|
||||
for (i = 0; i < config_file_options.wal_receive_check_timeout; i++)
|
||||
{
|
||||
get_replication_info(local_conn, &replication_info);
|
||||
get_replication_info(local_conn, STANDBY, &replication_info);
|
||||
if (replication_info.last_wal_receive_lsn >= remote_last_checkpoint_lsn)
|
||||
break;
|
||||
|
||||
@@ -4462,6 +4601,7 @@ do_standby_switchover(void)
|
||||
command_success = remote_command(remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -4570,6 +4710,7 @@ do_standby_switchover(void)
|
||||
success = remote_command(host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
@@ -4712,9 +4853,10 @@ do_standby_switchover(void)
|
||||
else
|
||||
{
|
||||
item_list_append_format(&repmgrd_unpause_errors,
|
||||
_("unable to connect to node \"%s\" (ID %i)"),
|
||||
_("unable to connect to node \"%s\" (ID %i):\n%s"),
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
cell->node_info->node_id,
|
||||
PQerrorMessage(cell->node_info->conn));
|
||||
error_node_count++;
|
||||
}
|
||||
|
||||
@@ -4726,6 +4868,8 @@ do_standby_switchover(void)
|
||||
PQExpBufferData detail;
|
||||
ItemListCell *cell;
|
||||
|
||||
initPQExpBuffer(&detail);
|
||||
|
||||
for (cell = repmgrd_unpause_errors.head; cell; cell = cell->next)
|
||||
{
|
||||
appendPQExpBuffer(&detail,
|
||||
@@ -4906,19 +5050,41 @@ check_source_server()
|
||||
}
|
||||
|
||||
/*
|
||||
* In the default pg_basebackup mode, we'll cowardly refuse to overwrite
|
||||
* an existing data directory
|
||||
* Check the local directory to see if it appears to be a PostgreSQL
|
||||
* data directory.
|
||||
*
|
||||
* Note: a previous call to check_dir() will have checked whether it contains
|
||||
* a running PostgreSQL instance.
|
||||
*/
|
||||
if (mode == pg_basebackup)
|
||||
if (is_pg_dir(local_data_directory))
|
||||
{
|
||||
if (is_pg_dir(local_data_directory) && runtime_options.force != true)
|
||||
const char *msg = _("target data directory appears to be a PostgreSQL data directory");
|
||||
const char *hint = _("use -F/--force to overwrite the existing data directory");
|
||||
|
||||
if (runtime_options.force == false && runtime_options.dry_run == false)
|
||||
{
|
||||
log_error(_("target data directory appears to be a PostgreSQL data directory"));
|
||||
log_error("%s", msg);
|
||||
log_detail(_("target data directory is \"%s\""), local_data_directory);
|
||||
log_hint(_("use -F/--force to overwrite the existing data directory"));
|
||||
log_hint("%s", hint);
|
||||
PQfinish(source_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
if (runtime_options.force == true)
|
||||
{
|
||||
log_warning("%s and will be overwritten", msg);
|
||||
log_detail(_("target data directory is \"%s\""), local_data_directory);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning("%s", msg);
|
||||
log_detail(_("target data directory is \"%s\""), local_data_directory);
|
||||
log_hint("%s", hint);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -5794,6 +5960,12 @@ run_basebackup(t_node_info *node_record)
|
||||
if (r != 0)
|
||||
return ERR_BAD_BASEBACKUP;
|
||||
|
||||
/* check connections are still available */
|
||||
(void)connection_ping_reconnect(primary_conn);
|
||||
|
||||
if (source_conn != primary_conn)
|
||||
(void)connection_ping_reconnect(source_conn);
|
||||
|
||||
/*
|
||||
* If replication slots in use, check the created slot is on the correct
|
||||
* node; the slot will initially get created on the source node, and will
|
||||
@@ -6002,10 +6174,11 @@ run_file_backup(t_node_info *node_record)
|
||||
* Remove prefix
|
||||
*/
|
||||
p = string_skip_prefix(prefix, output);
|
||||
|
||||
if (p == NULL)
|
||||
{
|
||||
log_error("unexpected output from \"barman list-files\": %s",
|
||||
output);
|
||||
log_error("unexpected output from \"barman list-files\"");
|
||||
log_detail("%s", output);
|
||||
exit(ERR_BARMAN);
|
||||
}
|
||||
|
||||
@@ -6023,6 +6196,14 @@ run_file_backup(t_node_info *node_record)
|
||||
strncat(prefix, backup_id, MAXLEN - 1);
|
||||
strncat(prefix, "/", MAXLEN - 1);
|
||||
p = string_skip_prefix(backup_id, p);
|
||||
|
||||
if (p == NULL)
|
||||
{
|
||||
log_error("unexpected output from \"barman list-files\"");
|
||||
log_detail("%s", output);
|
||||
exit(ERR_BARMAN);
|
||||
}
|
||||
|
||||
p = string_skip_prefix("/", p);
|
||||
|
||||
/*
|
||||
@@ -6034,8 +6215,8 @@ run_file_backup(t_node_info *node_record)
|
||||
basebackups_directory,
|
||||
backup_id,
|
||||
local_repmgr_tmp_directory);
|
||||
(void) local_command(
|
||||
command,
|
||||
|
||||
(void) local_command(command,
|
||||
NULL);
|
||||
|
||||
/*
|
||||
@@ -6359,6 +6540,8 @@ run_file_backup(t_node_info *node_record)
|
||||
|
||||
if (fputs(tablespace_map.data, tablespace_map_file) == EOF)
|
||||
{
|
||||
fclose(tablespace_map_file);
|
||||
|
||||
log_error(_("unable to write to tablespace_map file \"%s\""), tablespace_map_filename.data);
|
||||
|
||||
r = ERR_BAD_BASEBACKUP;
|
||||
@@ -6396,6 +6579,15 @@ stop_backup:
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
PGconn *upstream_conn = NULL;
|
||||
|
||||
|
||||
/* check connections are still available */
|
||||
(void)connection_ping_reconnect(primary_conn);
|
||||
|
||||
if (source_conn != primary_conn)
|
||||
(void)connection_ping_reconnect(source_conn);
|
||||
|
||||
(void)connection_ping_reconnect(source_conn);
|
||||
|
||||
record_status = get_node_record(source_conn, upstream_node_id, &upstream_node_record);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
|
||||
@@ -56,8 +56,7 @@ do_witness_register(void)
|
||||
log_error(_("unable to connect to witness node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
log_detail("%s",
|
||||
PQerrorMessage(witness_conn));
|
||||
log_detail("\n%s", PQerrorMessage(witness_conn));
|
||||
log_hint(_("the witness node must be running before it can be registered"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
@@ -411,7 +410,7 @@ do_witness_unregister(void)
|
||||
log_error(_("unable to connect to node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
log_detail("%s", PQerrorMessage(local_conn));
|
||||
log_detail("\n%s", PQerrorMessage(local_conn));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -437,7 +436,7 @@ do_witness_unregister(void)
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_error(_("unable to connect to primary"));
|
||||
log_detail("%s", PQerrorMessage(primary_conn));
|
||||
log_detail("\n%s", PQerrorMessage(primary_conn));
|
||||
|
||||
if (local_node_available == true)
|
||||
{
|
||||
|
||||
@@ -70,7 +70,7 @@ typedef struct
|
||||
|
||||
/* general node options */
|
||||
int node_id;
|
||||
char node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
char data_dir[MAXPGPATH];
|
||||
int remote_node_id;
|
||||
|
||||
@@ -135,6 +135,8 @@ typedef struct
|
||||
/* following options for internal use */
|
||||
char config_archive_dir[MAXPGPATH];
|
||||
OutputMode output_mode;
|
||||
bool disable_wal_receiver;
|
||||
bool enable_wal_receiver;
|
||||
} t_runtime_options;
|
||||
|
||||
#define T_RUNTIME_OPTIONS_INITIALIZER { \
|
||||
@@ -174,7 +176,7 @@ typedef struct
|
||||
/* "cluster cleanup" options */ \
|
||||
0, \
|
||||
/* following options for internal use */ \
|
||||
"/tmp", OM_TEXT \
|
||||
"/tmp", OM_TEXT, false, false \
|
||||
}
|
||||
|
||||
|
||||
@@ -224,8 +226,6 @@ extern int check_server_version(PGconn *conn, char *server_type, bool exit_on_er
|
||||
extern void check_93_config(void);
|
||||
extern bool create_repmgr_extension(PGconn *conn);
|
||||
extern int test_ssh_connection(char *host, char *remote_user);
|
||||
extern bool local_command(const char *command, PQExpBufferData *outputbuf);
|
||||
extern bool local_command_simple(const char *command, PQExpBufferData *outputbuf);
|
||||
|
||||
extern standy_clone_mode get_standby_clone_mode(void);
|
||||
|
||||
@@ -238,8 +238,6 @@ extern char *make_pg_path(const char *file);
|
||||
|
||||
extern void get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privileged_conn);
|
||||
|
||||
extern bool remote_command(const char *host, const char *user, const char *command, PQExpBufferData *outputbuf);
|
||||
|
||||
extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *remote_node_record);
|
||||
extern void make_repmgrd_path(PQExpBufferData *output_buf);
|
||||
|
||||
|
||||
210
repmgr-client.c
210
repmgr-client.c
@@ -31,6 +31,7 @@
|
||||
* NODE CHECK
|
||||
* NODE REJOIN
|
||||
* NODE SERVICE
|
||||
* NODE CONTROL
|
||||
*
|
||||
* DAEMON STATUS
|
||||
* DAEMON PAUSE
|
||||
@@ -97,8 +98,6 @@ t_node_info target_node_info = T_NODE_INFO_INITIALIZER;
|
||||
static ItemList cli_errors = {NULL, NULL};
|
||||
static ItemList cli_warnings = {NULL, NULL};
|
||||
|
||||
static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple);
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
@@ -357,9 +356,15 @@ main(int argc, char **argv)
|
||||
|
||||
/* --node-name */
|
||||
case OPT_NODE_NAME:
|
||||
strncpy(runtime_options.node_name, optarg, MAXLEN);
|
||||
{
|
||||
if (strlen(optarg) < sizeof(runtime_options.node_name))
|
||||
strncpy(runtime_options.node_name, optarg, sizeof(runtime_options.node_name));
|
||||
else
|
||||
item_list_append_format(&cli_errors,
|
||||
_("value for \"--node-name\" must contain fewer than %lu characters"),
|
||||
sizeof(runtime_options.node_name));
|
||||
break;
|
||||
|
||||
}
|
||||
/* --remote-node-id */
|
||||
case OPT_REMOTE_NODE_ID:
|
||||
runtime_options.remote_node_id = repmgr_atoi(optarg, "--remote-node-id", &cli_errors, MIN_NODE_ID);
|
||||
@@ -626,7 +631,7 @@ main(int argc, char **argv)
|
||||
break;
|
||||
|
||||
|
||||
/*--------------
|
||||
/*---------------
|
||||
* output options
|
||||
*---------------
|
||||
*/
|
||||
@@ -642,6 +647,19 @@ main(int argc, char **argv)
|
||||
runtime_options.optformat = true;
|
||||
break;
|
||||
|
||||
/*---------------------------------
|
||||
* undocumented options for testing
|
||||
*----------------------------------
|
||||
*/
|
||||
|
||||
case OPT_DISABLE_WAL_RECEIVER:
|
||||
runtime_options.disable_wal_receiver = true;
|
||||
break;
|
||||
|
||||
case OPT_ENABLE_WAL_RECEIVER:
|
||||
runtime_options.enable_wal_receiver = true;
|
||||
break;
|
||||
|
||||
/*-----------------------------
|
||||
* options deprecated since 3.3
|
||||
*-----------------------------
|
||||
@@ -914,6 +932,8 @@ main(int argc, char **argv)
|
||||
action = NODE_REJOIN;
|
||||
else if (strcasecmp(repmgr_action, "SERVICE") == 0)
|
||||
action = NODE_SERVICE;
|
||||
else if (strcasecmp(repmgr_action, "CONTROL") == 0)
|
||||
action = NODE_CONTROL;
|
||||
}
|
||||
|
||||
else if (strcasecmp(repmgr_command, "CLUSTER") == 0)
|
||||
@@ -1337,6 +1357,9 @@ main(int argc, char **argv)
|
||||
case NODE_SERVICE:
|
||||
do_node_service();
|
||||
break;
|
||||
case NODE_CONTROL:
|
||||
do_node_control();
|
||||
break;
|
||||
|
||||
/* CLUSTER */
|
||||
case CLUSTER_SHOW:
|
||||
@@ -1657,6 +1680,8 @@ check_cli_parameters(const int action)
|
||||
item_list_append_format(&cli_warnings,
|
||||
_("--replication-user ignored when executing %s"),
|
||||
action_name(action));
|
||||
break;
|
||||
|
||||
default:
|
||||
item_list_append_format(&cli_warnings,
|
||||
_("--replication-user not required when executing %s"),
|
||||
@@ -1905,6 +1930,28 @@ check_cli_parameters(const int action)
|
||||
action_name(action));
|
||||
}
|
||||
}
|
||||
|
||||
/* --disable-wal-receiver / --enable-wal-receiver */
|
||||
if (runtime_options.disable_wal_receiver == true || runtime_options.enable_wal_receiver == true)
|
||||
{
|
||||
switch (action)
|
||||
{
|
||||
case NODE_CONTROL:
|
||||
{
|
||||
if (runtime_options.disable_wal_receiver == true && runtime_options.enable_wal_receiver == true)
|
||||
{
|
||||
item_list_append(&cli_errors,
|
||||
_("provide either --disable-wal-receiver or --enable-wal-receiver"));
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
item_list_append_format(&cli_warnings,
|
||||
_("--disable-wal-receiver / --enable-wal-receiver not effective when executing %s"),
|
||||
action_name(action));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -2172,7 +2219,7 @@ create_repmgr_extension(PGconn *conn)
|
||||
log_detail(_("version %s is installed but newer version %s is available"),
|
||||
extversions.installed_version,
|
||||
extversions.default_version);
|
||||
log_hint(_("execute \"ALTER EXTENSION repmgr UPGRADE\""));
|
||||
log_hint(_("update the installed extension version by executing \"ALTER EXTENSION repmgr UPDATE\""));
|
||||
return false;
|
||||
|
||||
case REPMGR_INSTALLED:
|
||||
@@ -2399,75 +2446,6 @@ test_ssh_connection(char *host, char *remote_user)
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Execute a command locally. "outputbuf" should either be an
|
||||
* initialised PQexpbuffer, or NULL
|
||||
*/
|
||||
bool
|
||||
local_command(const char *command, PQExpBufferData *outputbuf)
|
||||
{
|
||||
return _local_command(command, outputbuf, false);
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
local_command_simple(const char *command, PQExpBufferData *outputbuf)
|
||||
{
|
||||
return _local_command(command, outputbuf, true);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
_local_command(const char *command, PQExpBufferData *outputbuf, bool simple)
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
char output[MAXLEN];
|
||||
int retval = 0;
|
||||
bool success;
|
||||
|
||||
log_verbose(LOG_DEBUG, "executing:\n %s", command);
|
||||
|
||||
if (outputbuf == NULL)
|
||||
{
|
||||
retval = system(command);
|
||||
return (retval == 0) ? true : false;
|
||||
}
|
||||
|
||||
fp = popen(command, "r");
|
||||
|
||||
if (fp == NULL)
|
||||
{
|
||||
log_error(_("unable to execute local command:\n%s"), command);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
while (fgets(output, MAXLEN, fp) != NULL)
|
||||
{
|
||||
appendPQExpBuffer(outputbuf, "%s", output);
|
||||
|
||||
if (!feof(fp) && simple == false)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
retval = pclose(fp);
|
||||
|
||||
/* */
|
||||
success = (WEXITSTATUS(retval) == 0 || WEXITSTATUS(retval) == 141) ? true : false;
|
||||
|
||||
log_verbose(LOG_DEBUG, "result of command was %i (%i)", WEXITSTATUS(retval), retval);
|
||||
|
||||
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
||||
log_verbose(LOG_DEBUG, "local_command(): output returned was:\n%s", outputbuf->data);
|
||||
else
|
||||
log_verbose(LOG_DEBUG, "local_command(): no output returned");
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* get_superuser_connection()
|
||||
*
|
||||
@@ -2487,6 +2465,7 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
|
||||
if (PQstatus(*conn) != CONNECTION_OK)
|
||||
{
|
||||
log_error(_("no database connection available"));
|
||||
log_detail("\n%s", PQerrorMessage(*conn));
|
||||
exit(ERR_INTERNAL);
|
||||
}
|
||||
|
||||
@@ -2674,78 +2653,6 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Execute a command via ssh on the remote host.
|
||||
*
|
||||
* TODO: implement SSH calls using libssh2.
|
||||
*/
|
||||
bool
|
||||
remote_command(const char *host, const char *user, const char *command, PQExpBufferData *outputbuf)
|
||||
{
|
||||
FILE *fp;
|
||||
char ssh_command[MAXLEN] = "";
|
||||
PQExpBufferData ssh_host;
|
||||
|
||||
char output[MAXLEN] = "";
|
||||
|
||||
initPQExpBuffer(&ssh_host);
|
||||
|
||||
if (*user != '\0')
|
||||
{
|
||||
appendPQExpBuffer(&ssh_host, "%s@", user);
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&ssh_host, "%s", host);
|
||||
|
||||
maxlen_snprintf(ssh_command,
|
||||
"ssh -o Batchmode=yes %s %s %s",
|
||||
config_file_options.ssh_options,
|
||||
ssh_host.data,
|
||||
command);
|
||||
|
||||
termPQExpBuffer(&ssh_host);
|
||||
|
||||
log_debug("remote_command():\n %s", ssh_command);
|
||||
|
||||
fp = popen(ssh_command, "r");
|
||||
|
||||
if (fp == NULL)
|
||||
{
|
||||
log_error(_("unable to execute remote command:\n %s"), ssh_command);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (outputbuf != NULL)
|
||||
{
|
||||
/* TODO: better error handling */
|
||||
while (fgets(output, MAXLEN, fp) != NULL)
|
||||
{
|
||||
appendPQExpBuffer(outputbuf, "%s", output);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (fgets(output, MAXLEN, fp) != NULL)
|
||||
{
|
||||
if (!feof(fp))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pclose(fp);
|
||||
|
||||
if (outputbuf != NULL)
|
||||
{
|
||||
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
||||
log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
|
||||
else
|
||||
log_verbose(LOG_DEBUG, "remote_command(): no output returned");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
@@ -3102,7 +3009,7 @@ init_node_record(t_node_info *node_record)
|
||||
strncpy(node_record->location, "default", MAXLEN);
|
||||
|
||||
|
||||
strncpy(node_record->node_name, config_file_options.node_name, MAXLEN);
|
||||
strncpy(node_record->node_name, config_file_options.node_name, sizeof(node_record->node_name));
|
||||
strncpy(node_record->conninfo, config_file_options.conninfo, MAXLEN);
|
||||
strncpy(node_record->config_file, config_file_path, MAXPGPATH);
|
||||
|
||||
@@ -3156,9 +3063,6 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
|
||||
/* "full_page_writes" must be on in any case */
|
||||
if (guc_set(conn, "full_page_writes", "=", "off"))
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
|
||||
appendPQExpBuffer(reason,
|
||||
_("\"full_page_writes\" must be set to \"on\""));
|
||||
|
||||
@@ -3245,6 +3149,8 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
||||
/*
|
||||
* Here we'll perform some timeline sanity checks to ensure the follow target
|
||||
* can actually be followed.
|
||||
*
|
||||
* See also comment for check_node_can_follow() in repmgrd-physical.c .
|
||||
*/
|
||||
bool
|
||||
check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin)
|
||||
@@ -3335,6 +3241,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
||||
return false;
|
||||
}
|
||||
|
||||
/* timelines are the same - check relative positions */
|
||||
if (follow_target_identification.timeline == local_tli)
|
||||
{
|
||||
XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
|
||||
@@ -3346,7 +3253,6 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
||||
return false;
|
||||
}
|
||||
|
||||
/* timeline is the same - check relative positions */
|
||||
if (local_xlogpos <= follow_target_xlogpos)
|
||||
{
|
||||
log_info(_("timelines are same, this server is not ahead"));
|
||||
|
||||
@@ -40,16 +40,17 @@
|
||||
#define NODE_CHECK 14
|
||||
#define NODE_SERVICE 15
|
||||
#define NODE_REJOIN 16
|
||||
#define CLUSTER_SHOW 17
|
||||
#define CLUSTER_CLEANUP 18
|
||||
#define CLUSTER_MATRIX 19
|
||||
#define CLUSTER_CROSSCHECK 20
|
||||
#define CLUSTER_EVENT 21
|
||||
#define DAEMON_STATUS 22
|
||||
#define DAEMON_PAUSE 23
|
||||
#define DAEMON_UNPAUSE 24
|
||||
#define DAEMON_START 25
|
||||
#define DAEMON_STOP 26
|
||||
#define NODE_CONTROL 17
|
||||
#define CLUSTER_SHOW 18
|
||||
#define CLUSTER_CLEANUP 19
|
||||
#define CLUSTER_MATRIX 20
|
||||
#define CLUSTER_CROSSCHECK 21
|
||||
#define CLUSTER_EVENT 22
|
||||
#define DAEMON_STATUS 23
|
||||
#define DAEMON_PAUSE 24
|
||||
#define DAEMON_UNPAUSE 25
|
||||
#define DAEMON_START 26
|
||||
#define DAEMON_STOP 27
|
||||
|
||||
/* command line options without short versions */
|
||||
#define OPT_HELP 1001
|
||||
@@ -97,7 +98,8 @@
|
||||
#define OPT_VERSION_NUMBER 1043
|
||||
#define OPT_DATA_DIRECTORY_CONFIG 1044
|
||||
#define OPT_COMPACT 1045
|
||||
|
||||
#define OPT_DISABLE_WAL_RECEIVER 1046
|
||||
#define OPT_ENABLE_WAL_RECEIVER 1047
|
||||
|
||||
/* deprecated since 3.3 */
|
||||
#define OPT_DATA_DIR 999
|
||||
@@ -202,6 +204,10 @@ static struct option long_options[] =
|
||||
/* "cluster cleanup" options */
|
||||
{"keep-history", required_argument, NULL, 'k'},
|
||||
|
||||
/* undocumented options for testing */
|
||||
{"disable-wal-receiver", no_argument, NULL, OPT_DISABLE_WAL_RECEIVER},
|
||||
{"enable-wal-receiver", no_argument, NULL, OPT_ENABLE_WAL_RECEIVER},
|
||||
|
||||
/* deprecated */
|
||||
{"check-upstream-config", no_argument, NULL, OPT_CHECK_UPSTREAM_CONFIG},
|
||||
{"no-conninfo-password", no_argument, NULL, OPT_NO_CONNINFO_PASSWORD},
|
||||
|
||||
58
repmgr.c
58
repmgr.c
@@ -53,6 +53,7 @@
|
||||
#include "voting.h"
|
||||
|
||||
#define UNKNOWN_NODE_ID -1
|
||||
#define ELECTION_RERUN_NOTIFICATION -2
|
||||
#define UNKNOWN_PID -1
|
||||
|
||||
#define TRANCHE_NAME "repmgrd"
|
||||
@@ -77,7 +78,7 @@ typedef struct repmgrdSharedState
|
||||
char repmgrd_pidfile[MAXPGPATH];
|
||||
bool repmgrd_paused;
|
||||
/* streaming failover */
|
||||
TimestampTz primary_last_seen;
|
||||
TimestampTz upstream_last_seen;
|
||||
NodeVotingStatus voting_status;
|
||||
int current_electoral_term;
|
||||
int candidate_node_id;
|
||||
@@ -108,11 +109,11 @@ PG_FUNCTION_INFO_V1(standby_set_last_updated);
|
||||
Datum standby_get_last_updated(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(standby_get_last_updated);
|
||||
|
||||
Datum set_primary_last_seen(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(set_primary_last_seen);
|
||||
Datum set_upstream_last_seen(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(set_upstream_last_seen);
|
||||
|
||||
Datum get_primary_last_seen(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(get_primary_last_seen);
|
||||
Datum get_upstream_last_seen(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(get_upstream_last_seen);
|
||||
|
||||
Datum notify_follow_primary(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(notify_follow_primary);
|
||||
@@ -147,6 +148,8 @@ PG_FUNCTION_INFO_V1(repmgrd_pause);
|
||||
Datum repmgrd_is_paused(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(repmgrd_is_paused);
|
||||
|
||||
Datum get_wal_receiver_pid(PG_FUNCTION_ARGS);
|
||||
PG_FUNCTION_INFO_V1(get_wal_receiver_pid);
|
||||
|
||||
|
||||
/*
|
||||
@@ -226,7 +229,7 @@ repmgr_shmem_startup(void)
|
||||
shared_state->repmgrd_paused = false;
|
||||
shared_state->current_electoral_term = 0;
|
||||
/* arbitrary "magic" date to indicate this field hasn't been updated */
|
||||
shared_state->primary_last_seen = POSTGRES_EPOCH_JDATE;
|
||||
shared_state->upstream_last_seen = POSTGRES_EPOCH_JDATE;
|
||||
shared_state->voting_status = VS_NO_VOTE;
|
||||
shared_state->candidate_node_id = UNKNOWN_NODE_ID;
|
||||
shared_state->follow_new_primary = false;
|
||||
@@ -363,17 +366,14 @@ standby_get_last_updated(PG_FUNCTION_ARGS)
|
||||
|
||||
|
||||
Datum
|
||||
set_primary_last_seen(PG_FUNCTION_ARGS)
|
||||
set_upstream_last_seen(PG_FUNCTION_ARGS)
|
||||
{
|
||||
if (!shared_state)
|
||||
PG_RETURN_VOID();
|
||||
|
||||
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
||||
|
||||
shared_state->primary_last_seen = GetCurrentTimestamp();
|
||||
elog(INFO,
|
||||
"primary_last_seen: %s",
|
||||
timestamptz_to_str( shared_state->primary_last_seen));
|
||||
shared_state->upstream_last_seen = GetCurrentTimestamp();
|
||||
|
||||
LWLockRelease(shared_state->lock);
|
||||
|
||||
@@ -382,7 +382,7 @@ set_primary_last_seen(PG_FUNCTION_ARGS)
|
||||
|
||||
|
||||
Datum
|
||||
get_primary_last_seen(PG_FUNCTION_ARGS)
|
||||
get_upstream_last_seen(PG_FUNCTION_ARGS)
|
||||
{
|
||||
long secs;
|
||||
int microsecs;
|
||||
@@ -391,13 +391,9 @@ get_primary_last_seen(PG_FUNCTION_ARGS)
|
||||
if (!shared_state)
|
||||
PG_RETURN_INT32(-1);
|
||||
|
||||
/* A primary is always visible */
|
||||
if (!RecoveryInProgress())
|
||||
PG_RETURN_INT32(0);
|
||||
|
||||
LWLockAcquire(shared_state->lock, LW_SHARED);
|
||||
|
||||
last_seen = shared_state->primary_last_seen;
|
||||
last_seen = shared_state->upstream_last_seen;
|
||||
|
||||
LWLockRelease(shared_state->lock);
|
||||
|
||||
@@ -442,9 +438,17 @@ notify_follow_primary(PG_FUNCTION_ARGS)
|
||||
/* only do something if local_node_id is initialised */
|
||||
if (shared_state->local_node_id != UNKNOWN_NODE_ID)
|
||||
{
|
||||
elog(INFO, "node %i received notification to follow node %i",
|
||||
shared_state->local_node_id,
|
||||
primary_node_id);
|
||||
if (primary_node_id == ELECTION_RERUN_NOTIFICATION)
|
||||
{
|
||||
elog(INFO, "node %i received notification to rerun promotion candidate election",
|
||||
shared_state->local_node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
elog(INFO, "node %i received notification to follow node %i",
|
||||
shared_state->local_node_id,
|
||||
primary_node_id);
|
||||
}
|
||||
|
||||
LWLockRelease(shared_state->lock);
|
||||
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
||||
@@ -743,3 +747,17 @@ repmgrd_is_paused(PG_FUNCTION_ARGS)
|
||||
|
||||
PG_RETURN_BOOL(is_paused);
|
||||
}
|
||||
|
||||
|
||||
Datum
|
||||
get_wal_receiver_pid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int wal_receiver_pid;
|
||||
|
||||
if (!shared_state)
|
||||
PG_RETURN_NULL();
|
||||
|
||||
wal_receiver_pid = WalRcv->pid;
|
||||
|
||||
PG_RETURN_INT32(wal_receiver_pid);
|
||||
}
|
||||
|
||||
@@ -5,7 +5,14 @@
|
||||
# Some configuration items will be set with a default value; this
|
||||
# is noted for each item. Where no default value is shown, the
|
||||
# parameter will be treated as empty or false.
|
||||
|
||||
#
|
||||
# IMPORTANT: string values can be provided as-is, or enclosed in single quotes
|
||||
# (but not double-quotes, which will be interpreted as part of the string),
|
||||
# e.g.:
|
||||
#
|
||||
# node_name=foo
|
||||
# node_name = 'foo'
|
||||
#
|
||||
# =============================================================================
|
||||
# Required configuration items
|
||||
# =============================================================================
|
||||
@@ -18,9 +25,11 @@
|
||||
# using the server's hostname or another identifier
|
||||
# unambiguously associated with the server to avoid
|
||||
# confusion. Avoid choosing names which reflect the
|
||||
# node's current role, e.g. "primary" or "standby1",
|
||||
# node's current role, e.g. 'primary' or 'standby1',
|
||||
# as roles can change and it will be confusing if
|
||||
# the current primary is called "standby1".
|
||||
# the current primary is called 'standby1'.
|
||||
# The string's maximum length is 63 characters and it should
|
||||
# contain only printable ASCII characters.
|
||||
|
||||
#conninfo='' # Database connection information as a conninfo string.
|
||||
# All servers in the cluster must be able to connect to
|
||||
@@ -63,6 +72,7 @@
|
||||
# to the user defined in "conninfo".
|
||||
|
||||
#replication_type=physical # Must be one of 'physical' or 'bdr'.
|
||||
# NOTE: "bdr" can only be used with BDR 2.x
|
||||
|
||||
#location=default # arbitrary string defining the location of the node; this
|
||||
# is used during failover to check visibilty of the
|
||||
@@ -281,10 +291,13 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# manual attention to reattach it to replication
|
||||
# (does not apply to BDR mode)
|
||||
|
||||
#priority=100 # indicate a preferred priority for promoting nodes;
|
||||
#priority=100 # indicates a preferred priority for promoting nodes;
|
||||
# a value of zero prevents the node being promoted to primary
|
||||
# (default: 100)
|
||||
|
||||
#connection_check_type=ping # How to check availability of the upstream node; valid options:
|
||||
# 'ping': use PQping() to check if the node is accepting connections
|
||||
# 'connection': execute a throwaway query on the current connection
|
||||
#reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable
|
||||
# primary (or other upstream node)
|
||||
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
||||
@@ -308,7 +321,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
#monitoring_history=no # Whether to write monitoring data to the "montoring_history" table
|
||||
#monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data
|
||||
#degraded_monitoring_timeout=-1 # Interval (in seconds) after which repmgrd will terminate if the
|
||||
# server being monitored is no longer available. -1 (default)
|
||||
# server(s) being monitored are no longer available. -1 (default)
|
||||
# disables the timeout completely.
|
||||
#async_query_timeout=60 # Interval (in seconds) which repmgrd will wait before
|
||||
# cancelling an asynchronous query.
|
||||
@@ -319,6 +332,18 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# "--no-pid-file" will force PID file creation to be skipped.
|
||||
# Note: there is normally no need to set this, particularly if
|
||||
# repmgr was installed from packages.
|
||||
#standby_disconnect_on_failover=false # If "true", in a failover situation wait for all standbys to
|
||||
# disconnect their WAL receivers before electing a new primary
|
||||
# (PostgreSQL 9.5 and later only; repmgr user must be a superuser for this)
|
||||
#sibling_nodes_disconnect_timeout=30 # If "standby_disconnect_on_failover" is true, the maximum length of time
|
||||
# (in seconds) to wait for other standbys to confirm they have disconnected their
|
||||
# WAL receivers
|
||||
#failover_validation_command= # Script to execute for an external mechanism to validate the failover
|
||||
# decision made by repmgrd. One or both of the following parameter placeholders
|
||||
# should be provided, which will be replaced by repmgrd with the appropriate
|
||||
# value: %n (node_id), %a (node_name). *Must* be the same on all nodes.
|
||||
#election_rerun_interval=15 # if "failover_validation_command" is set, and the command returns
|
||||
# an error, pause the specified amount of seconds before rerunning the election.
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# service control commands
|
||||
|
||||
8
repmgr.h
8
repmgr.h
@@ -41,6 +41,7 @@
|
||||
#include "configfile.h"
|
||||
#include "dbutils.h"
|
||||
#include "log.h"
|
||||
#include "sysutils.h"
|
||||
|
||||
#define MIN_SUPPORTED_VERSION "9.3"
|
||||
#define MIN_SUPPORTED_VERSION_NUM 90300
|
||||
@@ -54,13 +55,16 @@
|
||||
#define UNKNOWN_TIMELINE_ID -1
|
||||
#define UNKNOWN_SYSTEM_IDENTIFIER 0
|
||||
#define UNKNOWN_PID -1
|
||||
#define UNKNOWN_REPLICATION_LAG -1
|
||||
|
||||
#define NODE_NOT_FOUND -1
|
||||
#define NO_UPSTREAM_NODE -1
|
||||
#define UNKNOWN_NODE_ID -1
|
||||
#define MIN_NODE_ID 1
|
||||
#define ELECTION_RERUN_NOTIFICATION -2
|
||||
#define VOTING_TERM_NOT_SET -1
|
||||
#define ARCHIVE_STATUS_DIR_ERROR -1
|
||||
#define NO_DEGRADED_MONITORING_ELAPSED -1
|
||||
|
||||
#define BDR2_REPLICATION_SET_NAME "repmgr"
|
||||
|
||||
@@ -90,6 +94,10 @@
|
||||
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_ELECTION_RERUN_INTERVAL 15 /* seconds */
|
||||
|
||||
#define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */
|
||||
|
||||
#ifndef RECOVERY_COMMAND_FILE
|
||||
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
#define REPMGR_VERSION_DATE ""
|
||||
#define REPMGR_VERSION "4.3dev"
|
||||
#define REPMGR_VERSION_NUM 40300
|
||||
#define REPMGR_VERSION "4.3.1"
|
||||
#define REPMGR_VERSION_NUM 40301
|
||||
|
||||
157
repmgrd-bdr.c
157
repmgrd-bdr.c
@@ -68,7 +68,6 @@ monitor_bdr(void)
|
||||
t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status;
|
||||
NodeInfoListCell *cell;
|
||||
PQExpBufferData event_details;
|
||||
instr_time log_status_interval_start;
|
||||
|
||||
/* sanity check local database */
|
||||
@@ -97,9 +96,21 @@ monitor_bdr(void)
|
||||
if (!is_bdr_db(local_conn, NULL))
|
||||
{
|
||||
log_error(_("database is not BDR-enabled"));
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check this is a supported BDR version (basically BDR 2.x)
|
||||
*/
|
||||
if (get_bdr_version_num() > 2)
|
||||
{
|
||||
log_error(_("\"bdr\" mode is for BDR 2.x only"));
|
||||
log_hint(_("for BDR 3 and later, use \"replication_type=physical\""));
|
||||
log_error(_("database is not BDR-enabled"));
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false)
|
||||
{
|
||||
log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
|
||||
@@ -229,6 +240,7 @@ monitor_bdr(void)
|
||||
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||
{
|
||||
int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
@@ -366,7 +378,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
{
|
||||
PGconn *next_node_conn = NULL;
|
||||
NodeInfoListCell *cell;
|
||||
PQExpBufferData event_details;
|
||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
|
||||
@@ -460,45 +471,49 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
|
||||
log_debug("this node is the failover handler");
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
event_info.conninfo_str = target_node.conninfo;
|
||||
event_info.node_name = target_node.node_name;
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
/* update node record on the active node */
|
||||
update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
|
||||
event_info.conninfo_str = target_node.conninfo;
|
||||
event_info.node_name = target_node.node_name;
|
||||
|
||||
log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id);
|
||||
/* update node record on the active node */
|
||||
update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node \"%s\" (ID: %i) detected as failed; next available node is \"%s\" (ID: %i)"),
|
||||
monitored_node->node_name,
|
||||
monitored_node->node_id,
|
||||
target_node.node_name,
|
||||
target_node.node_id);
|
||||
log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id);
|
||||
|
||||
/*
|
||||
* Create an event record
|
||||
*
|
||||
* If we were able to connect to another node, we'll update the event log
|
||||
* there.
|
||||
*
|
||||
* In any case the event notification command will be triggered with the
|
||||
* event "bdr_failover"
|
||||
*/
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node \"%s\" (ID: %i) detected as failed; next available node is \"%s\" (ID: %i)"),
|
||||
monitored_node->node_name,
|
||||
monitored_node->node_id,
|
||||
target_node.node_name,
|
||||
target_node.node_id);
|
||||
|
||||
/*
|
||||
* Create an event record
|
||||
*
|
||||
* If we were able to connect to another node, we'll update the event log
|
||||
* there.
|
||||
*
|
||||
* In any case the event notification command will be triggered with the
|
||||
* event "bdr_failover"
|
||||
*/
|
||||
|
||||
|
||||
create_event_notification_extended(next_node_conn,
|
||||
&config_file_options,
|
||||
monitored_node->node_id,
|
||||
"bdr_failover",
|
||||
true,
|
||||
event_details.data,
|
||||
&event_info);
|
||||
create_event_notification_extended(next_node_conn,
|
||||
&config_file_options,
|
||||
monitored_node->node_id,
|
||||
"bdr_failover",
|
||||
true,
|
||||
event_details.data,
|
||||
&event_info);
|
||||
|
||||
log_info("%s", event_details.data);
|
||||
log_info("%s", event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
|
||||
unset_bdr_failover_handler(next_node_conn);
|
||||
|
||||
@@ -513,7 +528,6 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
{
|
||||
PGconn *recovered_node_conn;
|
||||
|
||||
PQExpBufferData event_details;
|
||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||
int i;
|
||||
bool slot_reactivated = false;
|
||||
@@ -543,6 +557,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
*/
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
local_conn = NULL;
|
||||
log_warning(_("unable to reconnect to local node"));
|
||||
|
||||
@@ -613,49 +629,50 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||
monitored_node->monitoring_state = MS_NORMAL;
|
||||
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node \"%s\" (ID: %i) has recovered after %i seconds"),
|
||||
monitored_node->node_name,
|
||||
monitored_node->node_id,
|
||||
node_recovery_elapsed);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
|
||||
/* other node will generate the event */
|
||||
if (monitored_node->node_id == local_node_info.node_id)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node \"%s\" (ID: %i) has recovered after %i seconds"),
|
||||
monitored_node->node_name,
|
||||
monitored_node->node_id,
|
||||
node_recovery_elapsed);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
|
||||
/* other node will generate the event */
|
||||
if (monitored_node->node_id == local_node_info.node_id)
|
||||
{
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(recovered_node_conn);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* generate the event on the currently active node only */
|
||||
if (monitored_node->node_id != local_node_info.node_id)
|
||||
{
|
||||
event_info.conninfo_str = monitored_node->conninfo;
|
||||
event_info.node_name = monitored_node->node_name;
|
||||
|
||||
create_event_notification_extended(local_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"bdr_recovery",
|
||||
true,
|
||||
event_details.data,
|
||||
&event_info);
|
||||
}
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(recovered_node_conn);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* generate the event on the currently active node only */
|
||||
if (monitored_node->node_id != local_node_info.node_id)
|
||||
{
|
||||
event_info.conninfo_str = monitored_node->conninfo;
|
||||
event_info.node_name = monitored_node->node_name;
|
||||
|
||||
create_event_notification_extended(
|
||||
local_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"bdr_recovery",
|
||||
true,
|
||||
event_details.data,
|
||||
&event_info);
|
||||
}
|
||||
|
||||
|
||||
update_node_record_set_active(local_conn, monitored_node->node_id, true);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
PQfinish(recovered_node_conn);
|
||||
|
||||
return;
|
||||
|
||||
1561
repmgrd-physical.c
1561
repmgrd-physical.c
File diff suppressed because it is too large
Load Diff
98
repmgrd.c
98
repmgrd.c
@@ -383,6 +383,15 @@ main(int argc, char **argv)
|
||||
* repmgr has not been properly configured.
|
||||
*/
|
||||
|
||||
|
||||
/* warn about any settings which might not be relevant for the current PostgreSQL version */
|
||||
if (config_file_options.standby_disconnect_on_failover == true && PQserverVersion(local_conn) < 90500)
|
||||
{
|
||||
log_warning(_("\"standby_disconnect_on_failover\" specified, but not available for this PostgreSQL version"));
|
||||
/* TODO: format server version */
|
||||
log_detail(_("available from PostgreSQL 9.5, this PostgreSQL version is %i"), PQserverVersion(local_conn));
|
||||
}
|
||||
|
||||
/* Check "repmgr" the extension is installed */
|
||||
extension_status = get_repmgr_extension_status(local_conn, &extversions);
|
||||
|
||||
@@ -400,8 +409,8 @@ main(int argc, char **argv)
|
||||
log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
|
||||
REPMGR_VERSION,
|
||||
extversions.installed_version);
|
||||
log_hint(_("update the repmgr binaries to match the installed extension version"));
|
||||
|
||||
log_hint(_("verify the repmgr installation on this server is updated properly before continuing"));
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
@@ -412,8 +421,8 @@ main(int argc, char **argv)
|
||||
log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
|
||||
REPMGR_VERSION,
|
||||
extversions.installed_version);
|
||||
log_hint(_("update the installed extension version by executing \"ALTER EXTENSION repmgr UPDATE\""));
|
||||
|
||||
log_hint(_("verify the repmgr extension is updated properly before continuing"));
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
@@ -424,7 +433,7 @@ main(int argc, char **argv)
|
||||
if (extension_status == REPMGR_UNKNOWN)
|
||||
{
|
||||
log_error(_("unable to determine status of \"repmgr\" extension"));
|
||||
log_detail("%s", PQerrorMessage(local_conn));
|
||||
log_detail("\n%s", PQerrorMessage(local_conn));
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
@@ -552,6 +561,8 @@ start_monitoring(void)
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
|
||||
log_info(_("\"connection_check_type\" set to \"%s\""), print_connection_check_type(config_file_options.connection_check_type));
|
||||
|
||||
while (true)
|
||||
{
|
||||
switch (local_node_info.type)
|
||||
@@ -818,6 +829,82 @@ show_help(void)
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
check_upstream_connection(PGconn **conn, const char *conninfo)
|
||||
{
|
||||
/* Check the connection status twice in case it changes after reset */
|
||||
bool twice = false;
|
||||
|
||||
if (config_file_options.connection_check_type == CHECK_PING)
|
||||
return is_server_available(conninfo);
|
||||
|
||||
if (config_file_options.connection_check_type == CHECK_CONNECTION)
|
||||
{
|
||||
bool success = true;
|
||||
PGconn *test_conn = PQconnectdb(conninfo);
|
||||
|
||||
log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo);
|
||||
|
||||
if (PQstatus(test_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("unable to connect to \"%s\""), conninfo);
|
||||
log_detail("\n%s", PQerrorMessage(test_conn));
|
||||
success = false;
|
||||
}
|
||||
PQfinish(test_conn);
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
for (;;)
|
||||
{
|
||||
if (PQstatus(*conn) != CONNECTION_OK)
|
||||
{
|
||||
log_debug("check_upstream_connection(): connection not OK");
|
||||
if (twice)
|
||||
return false;
|
||||
/* reconnect */
|
||||
PQfinish(*conn);
|
||||
*conn = PQconnectdb(conninfo);
|
||||
twice = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!cancel_query(*conn, config_file_options.async_query_timeout))
|
||||
goto failed;
|
||||
|
||||
if (wait_connection_availability(*conn, config_file_options.async_query_timeout) != 1)
|
||||
goto failed;
|
||||
|
||||
/* execute a simple query to verify connection availability */
|
||||
if (PQsendQuery(*conn, "SELECT 1") == 0)
|
||||
{
|
||||
log_warning(_("unable to send query to upstream"));
|
||||
log_detail("%s", PQerrorMessage(*conn));
|
||||
goto failed;
|
||||
}
|
||||
|
||||
if (wait_connection_availability(*conn, config_file_options.async_query_timeout) != 1)
|
||||
goto failed;
|
||||
|
||||
break;
|
||||
|
||||
failed:
|
||||
/* retry once */
|
||||
if (twice)
|
||||
return false;
|
||||
|
||||
/* reconnect */
|
||||
PQfinish(*conn);
|
||||
*conn = PQconnectdb(conninfo);
|
||||
twice = true;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||
{
|
||||
@@ -843,8 +930,7 @@ try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||
node_info->node_id, i + 1, max_attempts);
|
||||
if (is_server_available_params(&conninfo_params) == true)
|
||||
{
|
||||
|
||||
log_notice(_("node has recovered, reconnecting"));
|
||||
log_notice(_("node %i has recovered, reconnecting"), node_info->node_id);
|
||||
|
||||
/*
|
||||
* XXX we should also handle the case where node is pingable but
|
||||
@@ -874,7 +960,7 @@ try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||
|
||||
if (ping_result != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_info("original connnection no longer available, using new connection");
|
||||
log_info("original connection no longer available, using new connection");
|
||||
close_connection(conn);
|
||||
*conn = our_conn;
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ extern PGconn *local_conn;
|
||||
extern bool startup_event_logged;
|
||||
extern char pid_file[MAXPGPATH];
|
||||
|
||||
bool check_upstream_connection(PGconn **conn, const char *conninfo);
|
||||
void try_reconnect(PGconn **conn, t_node_info *node_info);
|
||||
|
||||
int calculate_elapsed(instr_time start_time);
|
||||
@@ -31,5 +32,4 @@ const char *print_monitoring_state(MonitoringState monitoring_state);
|
||||
void update_registration(PGconn *conn);
|
||||
void terminate(int retval);
|
||||
|
||||
|
||||
#endif /* _REPMGRD_H_ */
|
||||
|
||||
366
sysutils.c
Normal file
366
sysutils.c
Normal file
@@ -0,0 +1,366 @@
|
||||
/*
|
||||
* sysutils.c
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <signal.h>
|
||||
|
||||
#include "repmgr.h"
|
||||
|
||||
static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple, int *return_value);
|
||||
|
||||
|
||||
/*
|
||||
* Execute a command locally. "outputbuf" should either be an
|
||||
* initialised PQExpPuffer, or NULL
|
||||
*/
|
||||
bool
|
||||
local_command(const char *command, PQExpBufferData *outputbuf)
|
||||
{
|
||||
return _local_command(command, outputbuf, false, NULL);
|
||||
}
|
||||
|
||||
bool
|
||||
local_command_return_value(const char *command, PQExpBufferData *outputbuf, int *return_value)
|
||||
{
|
||||
return _local_command(command, outputbuf, false, return_value);
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
local_command_simple(const char *command, PQExpBufferData *outputbuf)
|
||||
{
|
||||
return _local_command(command, outputbuf, true, NULL);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
_local_command(const char *command, PQExpBufferData *outputbuf, bool simple, int *return_value)
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
char output[MAXLEN];
|
||||
int retval = 0;
|
||||
bool success;
|
||||
|
||||
log_verbose(LOG_DEBUG, "executing:\n %s", command);
|
||||
|
||||
if (outputbuf == NULL)
|
||||
{
|
||||
retval = system(command);
|
||||
|
||||
if (return_value != NULL)
|
||||
*return_value = WEXITSTATUS(retval);
|
||||
|
||||
return (retval == 0) ? true : false;
|
||||
}
|
||||
|
||||
fp = popen(command, "r");
|
||||
|
||||
if (fp == NULL)
|
||||
{
|
||||
log_error(_("unable to execute local command:\n%s"), command);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
while (fgets(output, MAXLEN, fp) != NULL)
|
||||
{
|
||||
appendPQExpBufferStr(outputbuf, output);
|
||||
|
||||
if (!feof(fp) && simple == false)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
retval = pclose(fp);
|
||||
|
||||
/* */
|
||||
success = (WEXITSTATUS(retval) == 0 || WEXITSTATUS(retval) == 141) ? true : false;
|
||||
|
||||
log_verbose(LOG_DEBUG, "result of command was %i (%i)", WEXITSTATUS(retval), retval);
|
||||
|
||||
if (return_value != NULL)
|
||||
*return_value = WEXITSTATUS(retval);
|
||||
|
||||
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
||||
log_verbose(LOG_DEBUG, "local_command(): output returned was:\n%s", outputbuf->data);
|
||||
else
|
||||
log_verbose(LOG_DEBUG, "local_command(): no output returned");
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Execute a command via ssh on the remote host.
|
||||
*
|
||||
* TODO: implement SSH calls using libssh2.
|
||||
*/
|
||||
bool
|
||||
remote_command(const char *host, const char *user, const char *command, const char *ssh_options, PQExpBufferData *outputbuf)
|
||||
{
|
||||
FILE *fp;
|
||||
char ssh_command[MAXLEN] = "";
|
||||
PQExpBufferData ssh_host;
|
||||
|
||||
char output[MAXLEN] = "";
|
||||
|
||||
initPQExpBuffer(&ssh_host);
|
||||
|
||||
if (*user != '\0')
|
||||
{
|
||||
appendPQExpBuffer(&ssh_host, "%s@", user);
|
||||
}
|
||||
|
||||
appendPQExpBufferStr(&ssh_host, host);
|
||||
|
||||
maxlen_snprintf(ssh_command,
|
||||
"ssh -o Batchmode=yes %s %s %s",
|
||||
ssh_options,
|
||||
ssh_host.data,
|
||||
command);
|
||||
|
||||
termPQExpBuffer(&ssh_host);
|
||||
|
||||
log_debug("remote_command():\n %s", ssh_command);
|
||||
|
||||
fp = popen(ssh_command, "r");
|
||||
|
||||
if (fp == NULL)
|
||||
{
|
||||
log_error(_("unable to execute remote command:\n %s"), ssh_command);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (outputbuf != NULL)
|
||||
{
|
||||
/* TODO: better error handling */
|
||||
while (fgets(output, MAXLEN, fp) != NULL)
|
||||
{
|
||||
appendPQExpBufferStr(outputbuf, output);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (fgets(output, MAXLEN, fp) != NULL)
|
||||
{
|
||||
if (!feof(fp))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pclose(fp);
|
||||
|
||||
if (outputbuf != NULL)
|
||||
{
|
||||
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
||||
log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
|
||||
else
|
||||
log_verbose(LOG_DEBUG, "remote_command(): no output returned");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Disable the WAL receiver on a standby node.
 *
 * Strategy: raise "wal_retrieve_retry_interval" high enough that, once the
 * WAL receiver process is killed, the server will not restart it for a long
 * time, then SIGTERM the WAL receiver (twice if necessary - see loop comment
 * below).
 *
 * Requires a superuser connection to a node in recovery.
 *
 * Returns the last-observed WAL receiver PID (expected to be 0 once the
 * process is gone), or UNKNOWN_PID on error / when there is nothing to do.
 */
pid_t
disable_wal_receiver(PGconn *conn)
{
	char		buf[MAXLEN];
	int			wal_retrieve_retry_interval, new_wal_retrieve_retry_interval;
	pid_t		wal_receiver_pid = UNKNOWN_PID;
	int			kill_ret;
	int			i, j;
	int			max_retries = 2;

	/* ALTER SYSTEM and signalling backends require superuser rights */
	if (is_superuser_connection(conn, NULL) == false)
	{
		log_error(_("superuser connection required"));
		return UNKNOWN_PID;
	}

	/* a WAL receiver only exists on a node in recovery */
	if (get_recovery_type(conn) == RECTYPE_PRIMARY)
	{
		log_error(_("node is not in recovery"));
		log_detail(_("wal receiver can only run on standby nodes"));
		return UNKNOWN_PID;
	}

	wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);

	if (wal_receiver_pid == UNKNOWN_PID)
	{
		log_warning(_("unable to retrieve wal receiver PID"));
		return UNKNOWN_PID;
	}

	get_pg_setting(conn, "wal_retrieve_retry_interval", buf);

	/* TODO: potentially handle atoi error, though unlikely at this point */
	wal_retrieve_retry_interval = atoi(buf);

	/*
	 * Bump the retry interval by WALRECEIVER_DISABLE_TIMEOUT_VALUE ms so the
	 * server delays restarting the WAL receiver after we kill it.
	 */
	new_wal_retrieve_retry_interval = wal_retrieve_retry_interval + WALRECEIVER_DISABLE_TIMEOUT_VALUE;

	if (wal_retrieve_retry_interval < WALRECEIVER_DISABLE_TIMEOUT_VALUE)
	{
		log_notice(_("setting \"wal_retrieve_retry_interval\" to %i milliseconds"),
				   new_wal_retrieve_retry_interval);
		alter_system_int(conn, "wal_retrieve_retry_interval", new_wal_retrieve_retry_interval);
		/* reload so the new interval takes effect before we kill the process */
		pg_reload_conf(conn);
	}

	/*
	 * If, at this point, the WAL receiver is not running, we don't need to (and indeed can't)
	 * kill it.
	 */
	if (wal_receiver_pid == 0)
	{
		log_warning(_("wal receiver not running"));
		return UNKNOWN_PID;
	}


	/* why 5? */
	log_info(_("sleeping 5 seconds"));
	sleep(5);

	/* see comment below as to why we need a loop here */
	for (i = 0; i < max_retries; i++)
	{
		log_notice(_("killing WAL receiver with PID %i"), (int)wal_receiver_pid);

		kill((int)wal_receiver_pid, SIGTERM);

		/* poll (up to ~30s) until the signalled process has actually exited */
		for (j = 0; j < 30; j++)
		{
			/* signal 0 performs no action but reports whether the PID still exists */
			kill_ret = kill(wal_receiver_pid, 0);

			if (kill_ret != 0)
			{
				log_info(_("WAL receiver with pid %i killed"), (int)wal_receiver_pid);
				break;
			}
			sleep(1);
		}

		/*
		 * Wait briefly to check that the WAL receiver has indeed gone away -
		 * for reasons as yet unclear, after a server start/restart, immediately
		 * after the first time a WAL receiver is killed, a new one is started
		 * straight away, so we'll need to kill that too.
		 */
		sleep(1);
		wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);
		if (wal_receiver_pid == UNKNOWN_PID || wal_receiver_pid == 0)
			break;
	}

	/* 0 on success; a live PID here means the receiver kept respawning */
	return wal_receiver_pid;
}
|
||||
|
||||
pid_t
|
||||
enable_wal_receiver(PGconn *conn, bool wait_startup)
|
||||
{
|
||||
char buf[MAXLEN];
|
||||
int wal_retrieve_retry_interval;
|
||||
pid_t wal_receiver_pid = UNKNOWN_PID;
|
||||
|
||||
/* make timeout configurable */
|
||||
int i, timeout = 30;
|
||||
|
||||
if (is_superuser_connection(conn, NULL) == false)
|
||||
{
|
||||
log_error(_("superuser connection required"));
|
||||
return UNKNOWN_PID;
|
||||
}
|
||||
|
||||
if (get_recovery_type(conn) == RECTYPE_PRIMARY)
|
||||
{
|
||||
log_error(_("node is not in recovery"));
|
||||
log_detail(_("wal receiver can only run on standby nodes"));
|
||||
return UNKNOWN_PID;
|
||||
}
|
||||
|
||||
if (get_pg_setting(conn, "wal_retrieve_retry_interval", buf) == false)
|
||||
{
|
||||
log_error(_("unable to retrieve \"wal_retrieve_retry_interval\""));
|
||||
return UNKNOWN_PID;
|
||||
}
|
||||
|
||||
/* TODO: potentially handle atoi error, though unlikely at this point */
|
||||
wal_retrieve_retry_interval = atoi(buf);
|
||||
|
||||
if (wal_retrieve_retry_interval > WALRECEIVER_DISABLE_TIMEOUT_VALUE)
|
||||
{
|
||||
int new_wal_retrieve_retry_interval = wal_retrieve_retry_interval - WALRECEIVER_DISABLE_TIMEOUT_VALUE;
|
||||
bool success;
|
||||
|
||||
log_notice(_("setting \"wal_retrieve_retry_interval\" to %i ms"),
|
||||
new_wal_retrieve_retry_interval);
|
||||
|
||||
success = alter_system_int(conn,
|
||||
"wal_retrieve_retry_interval",
|
||||
new_wal_retrieve_retry_interval);
|
||||
|
||||
if (success == false)
|
||||
{
|
||||
log_warning(_("unable to change \"wal_retrieve_retry_interval\""));
|
||||
return UNKNOWN_PID;
|
||||
}
|
||||
|
||||
pg_reload_conf(conn);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* TODO: add threshold sanity check */
|
||||
log_info(_("\"wal_retrieve_retry_interval\" is %i, not changing"),
|
||||
wal_retrieve_retry_interval);
|
||||
}
|
||||
|
||||
if (wait_startup == false)
|
||||
return UNKNOWN_PID;
|
||||
|
||||
for (i = 0; i < timeout; i++)
|
||||
{
|
||||
wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);
|
||||
|
||||
if (wal_receiver_pid > 0)
|
||||
break;
|
||||
|
||||
log_info(_("sleeping %i of maximum %i seconds waiting for WAL receiver to start up"),
|
||||
i + 1, timeout)
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
if (wal_receiver_pid == UNKNOWN_PID)
|
||||
{
|
||||
log_warning(_("unable to retrieve WAL receiver PID"));
|
||||
return UNKNOWN_PID;
|
||||
}
|
||||
else if (wal_receiver_pid == 0)
|
||||
{
|
||||
log_error(_("WAL receiver did not start up after %i seconds"), timeout);
|
||||
return UNKNOWN_PID;
|
||||
}
|
||||
|
||||
log_info(_("WAL receiver started up with PID %i"), (int)wal_receiver_pid);
|
||||
|
||||
return wal_receiver_pid;
|
||||
}
|
||||
32
sysutils.h
Normal file
32
sysutils.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/*
 * sysutils.h
 * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * NOTE(review): guard name begins with underscore + uppercase, which is
 * reserved for the implementation; kept as-is for consistency with the
 * project's existing header guard convention.
 */
#ifndef _SYSUTILS_H_
#define _SYSUTILS_H_

/*
 * Execute a command on the local host, optionally capturing its standard
 * output into "outputbuf"; the *_return_value variant additionally reports
 * the command's exit status, and *_simple captures output without further
 * processing.
 */
extern bool local_command(const char *command, PQExpBufferData *outputbuf);
extern bool local_command_return_value(const char *command, PQExpBufferData *outputbuf, int *return_value);
extern bool local_command_simple(const char *command, PQExpBufferData *outputbuf);

/*
 * Execute "command" on "host" via ssh (as "user" if non-empty), optionally
 * capturing standard output; returns false only if the ssh process could
 * not be launched.
 */
extern bool remote_command(const char *host, const char *user, const char *command, const char *ssh_options, PQExpBufferData *outputbuf);

/*
 * Stop/start the WAL receiver on a standby node; both require a superuser
 * connection and return the relevant WAL receiver PID, or UNKNOWN_PID on
 * error.
 */
extern pid_t disable_wal_receiver(PGconn *conn);
extern pid_t enable_wal_receiver(PGconn *conn, bool wait_startup);


#endif /* _SYSUTILS_H_ */
|
||||
Reference in New Issue
Block a user