Bump version number

4.3
Fix default return value in alter_system_int()
2026-03-23 07:06:30 +00:00 · 2019-04-01 15:25:48 +09:00 · 2019-04-01 14:52:37 +09:00 · 2019-04-01 12:24:57 +09:00 · 2019-04-01 11:29:16 +09:00 · 2019-04-01 11:03:47 +09:00
106 changed files with 9320 additions and 2746 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,9 @@ lib*.pc
 # other
 /.lineno
 *.dSYM
+*.orig
+*.rej
+
 # generated binaries
 repmgr
 repmgrd
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,7 +2,7 @@ License and Contributions
 =========================

 `repmgr` is licensed under the GPL v3.  All of its code and documentation is
-Copyright 2010-2018, 2ndQuadrant Limited.  See the files COPYRIGHT and LICENSE for
+Copyright 2010-2019, 2ndQuadrant Limited.  See the files COPYRIGHT and LICENSE for
 details.

 The development of repmgr has primarily been sponsored by 2ndQuadrant customers.
@@ -24,7 +24,7 @@ Code style
 Code in repmgr should be formatted to the same standards as the main PostgreSQL
 project. For more details see:

-    https://www.postgresql.org/docs/current/static/source-format.html
+    https://www.postgresql.org/docs/current/source-format.html

 Contributors should reformat their code similarly before submitting code to
 the project, in order to minimize merge conflicts with other work.
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-Copyright (c) 2010-2018, 2ndQuadrant Limited
+Copyright (c) 2010-2019, 2ndQuadrant Limited
 All rights reserved.

 This program is free software: you can redistribute it and/or modify
--- a/29
+++ b/29
@@ -1,3 +1,32 @@
+4.3     2019-??
+        repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
+        repmgr: add --version-number command line option (Ian)
+        repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
+        repmgr: cluster show - differentiate between unreachable nodes
+          and nodes which are running but rejecting connections (Ian)
+        repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
+        repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
+        repmgr: prevent potential race condition in "standby switchover"
+          when checking received WAL location; GitHub #518 (Ian)
+        repmgr: ensure "standby switchover" verifies repmgr can read the
+          data directory on the demotion candidate; GitHub #523 (Ian)
+        repmgr: ensure "standby switchover" verifies replication connection
+          exists; GitHub #519 (Ian)
+        repmgr: add sanity check for correct extension version (Ian)
+        repmgr: ensure "witness register --dry-run" does not attempt to read node
+          tables if repmgr extension not installed; GitHub #513 (Ian)
+        repmgr: ensure "standby register" fails when --upstream-node-id is the
+          same as the local node ID (Ian)
+        repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
+        repmgrd: on a cascaded standby, don't fail over if "failover=manual";
+          GitHub #531 (Ian)
+        repmgrd: don't consider nodes where repmgrd is not running as promotion
+          candidates (Ian)
+        repmgrd: add option "connection_check_type" (Ian)
+        repmgrd: improve witness monitoring when primary node not available (Ian)
+        repmgrd: handle situation where a primary has unexpectedly appeared
+          during failover; GitHub #420 (Ian)
+
 4.2     2018-10-24
        repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover";
          GitHub #504 (Ian)
--- a/Makefile.in
+++ b/Makefile.in
@@ -15,7 +15,9 @@ DATA = \
  repmgr--4.0--4.1.sql \
  repmgr--4.1.sql \
  repmgr--4.1--4.2.sql \
-  repmgr--4.2.sql
+  repmgr--4.2.sql \
+  repmgr--4.2--4.3.sql \
+  repmgr--4.3.sql

 REGRESS = repmgr_extension

@@ -48,8 +50,8 @@ $(info Building against PostgreSQL $(MAJORVERSION))
 REPMGR_CLIENT_OBJS = repmgr-client.o \
 	repmgr-action-primary.o repmgr-action-standby.o repmgr-action-witness.o \
 	repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o repmgr-action-daemon.o \
-	configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o
-REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o
+	configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o sysutils.o
+REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o sysutils.o
 DATE=$(shell date "+%Y-%m-%d")

 repmgr_version.h: repmgr_version.h.in
@@ -84,29 +86,15 @@ clean: additional-clean
 maintainer-clean: additional-maintainer-clean

 additional-clean:
-	rm -f repmgr-client.o
-	rm -f repmgr-action-primary.o
-	rm -f repmgr-action-standby.o
-	rm -f repmgr-action-witness.o
-	rm -f repmgr-action-bdr.o
-	rm -f repmgr-action-node.o
-	rm -f repmgr-action-cluster.o
-	rm -f repmgr-action-daemon.o
-	rm -f repmgrd.o
-	rm -f repmgrd-physical.o
-	rm -f repmgrd-bdr.o
-	rm -f compat.o
-	rm -f configfile.o
-	rm -f controldata.o
-	rm -f dbutils.o
-	rm -f dirutil.o
-	rm -f log.o
-	rm -f strutil.o
+	rm -f *.o

-maintainer-additional-clean: clean
-	rm -f configure
+additional-maintainer-clean: clean
+	$(MAKE) -C doc maintainer-clean
 	rm -f config.status config.log
+	rm -f config.h
+	rm -f repmgr_version.h
 	rm -f Makefile
+	rm -f Makefile.global
 	@rm -rf autom4te.cache/

 ifeq ($(MAJORVERSION),$(filter $(MAJORVERSION),9.3 9.4))
--- a/compat.c
+++ b/compat.c
@@ -6,7 +6,7 @@
 *    supported PostgreSQL versions. They're unlikely to change but
 *    it would be worth keeping an eye on them for any fixes/improvements.
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -98,9 +98,42 @@ appendShellString(PQExpBuffer buf, const char *str)

 		if (*p == '\'')
 			appendPQExpBufferStr(buf, "'\"'\"'");
+		else if (*p == '&')
+			appendPQExpBufferStr(buf, "\\&");
 		else
 			appendPQExpBufferChar(buf, *p);
 	}

 	appendPQExpBufferChar(buf, '\'');
 }
+
+/*
+ * Adapted from: src/fe_utils/string_utils.c
+ */
+void
+appendRemoteShellString(PQExpBuffer buf, const char *str)
+{
+	const char *p;
+
+	appendPQExpBufferStr(buf, "\\'");
+
+	for (p = str; *p; p++)
+	{
+		if (*p == '\n' || *p == '\r')
+		{
+			fprintf(stderr,
+					_("shell command argument contains a newline or carriage return: \"%s\"\n"),
+					str);
+			exit(ERR_BAD_CONFIG);
+		}
+
+		if (*p == '\'')
+			appendPQExpBufferStr(buf, "'\"'\"'");
+		else if (*p == '&')
+			appendPQExpBufferStr(buf, "\\&");
+		else
+			appendPQExpBufferChar(buf, *p);
+	}
+
+	appendPQExpBufferStr(buf, "\\'");
+}
--- a/compat.h
+++ b/compat.h
@@ -1,6 +1,6 @@
 /*
 * compat.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -27,4 +27,6 @@ extern void appendConnStrVal(PQExpBuffer buf, const char *str);

 extern void appendShellString(PQExpBuffer buf, const char *str);

+extern void appendRemoteShellString(PQExpBuffer buf, const char *str);
+
 #endif
--- a/configfile.c
+++ b/configfile.c
@@ -1,7 +1,7 @@
 /*
 * config.c - parse repmgr.conf and other configuration-related functionality
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -123,9 +123,9 @@ load_config(const char *config_file, bool verbose, bool terse, t_configuration_o

 		if (stat(config_file_path, &stat_config) != 0)
 		{
-			log_error(_("provided configuration file \"%s\" not found: %s"),
-					  config_file,
-					  strerror(errno));
+			log_error(_("provided configuration file \"%s\" not found"),
+					  config_file);
+			log_detail("%s", strerror(errno));
 			exit(ERR_BAD_CONFIG);
 		}

@@ -335,6 +335,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 	 */
 	options->shutdown_check_timeout = DEFAULT_SHUTDOWN_CHECK_TIMEOUT;
 	options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
+	options->wal_receive_check_timeout = DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT;

 	/*-----------------
 	 * repmgrd settings
@@ -357,6 +358,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 	options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
 	options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
 	memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
+	options->standby_disconnect_on_failover = false;
+	options->sibling_nodes_disconnect_timeout = DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT;
+	options->connection_check_type = CHECK_PING;
+	options->primary_visibility_consensus = false;
+	memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
+	options->election_rerun_interval = DEFAULT_ELECTION_RERUN_INTERVAL;

 	/*-------------
 	 * witness settings
@@ -371,17 +378,24 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 	options->bdr_local_monitoring_only = false;
 	options->bdr_recovery_timeout = DEFAULT_BDR_RECOVERY_TIMEOUT;

-	/*-----------------
-	 * service settings
-	 *-----------------
+	/*-------------------------
+	 * service command settings
+	 *-------------------------
 	 */
 	memset(options->pg_ctl_options, 0, sizeof(options->pg_ctl_options));
-	memset(options->service_stop_command, 0, sizeof(options->service_stop_command));
 	memset(options->service_start_command, 0, sizeof(options->service_start_command));
+	memset(options->service_stop_command, 0, sizeof(options->service_stop_command));
 	memset(options->service_restart_command, 0, sizeof(options->service_restart_command));
 	memset(options->service_reload_command, 0, sizeof(options->service_reload_command));
 	memset(options->service_promote_command, 0, sizeof(options->service_promote_command));

+	/*---------------------------------
+	 * repmgrd service command settings
+	 *---------------------------------
+	 */
+	memset(options->repmgrd_service_start_command, 0, sizeof(options->repmgrd_service_start_command));
+	memset(options->repmgrd_service_stop_command, 0, sizeof(options->repmgrd_service_stop_command));
+
 	/*----------------------------
 	 * event notification settings
 	 *----------------------------
@@ -466,11 +480,18 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 		/* Copy into correct entry in parameters struct */
 		if (strcmp(name, "node_id") == 0)
 		{
-			options->node_id = repmgr_atoi(value, name, error_list, 1);
+			options->node_id = repmgr_atoi(value, name, error_list, MIN_NODE_ID);
 			node_id_found = true;
 		}
 		else if (strcmp(name, "node_name") == 0)
-			strncpy(options->node_name, value, MAXLEN);
+		{
+			if (strlen(value) < sizeof(options->node_name))
+				strncpy(options->node_name, value, sizeof(options->node_name));
+			else
+				item_list_append_format(error_list,
+										_("value for \"node_name\" must contain fewer than %lu characters"),
+										sizeof(options->node_name));
+		}
 		else if (strcmp(name, "conninfo") == 0)
 			strncpy(options->conninfo, value, MAXLEN);
 		else if (strcmp(name, "data_directory") == 0)
@@ -480,11 +501,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *

 		else if (strcmp(name, "replication_user") == 0)
 		{
-			if (strlen(value) < NAMEDATALEN)
-				strncpy(options->replication_user, value, NAMEDATALEN);
+			if (strlen(value) < sizeof(options->replication_user))
+				strncpy(options->replication_user, value, sizeof(options->replication_user));
 			else
-				item_list_append(error_list,
-								 _("value for \"replication_user\" must contain fewer than " STR(NAMEDATALEN) " characters"));
+				item_list_append_format(error_list,
+										_("value for \"replication_user\" must contain fewer than %lu characters"),
+										sizeof(options->replication_user));
 		}
 		else if (strcmp(name, "pg_bindir") == 0)
 			strncpy(options->pg_bindir, value, MAXPGPATH);
@@ -550,6 +572,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 			options->shutdown_check_timeout = repmgr_atoi(value, name, error_list, 0);
 		else if (strcmp(name, "standby_reconnect_timeout") == 0)
 			options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
+		else if (strcmp(name, "wal_receive_check_timeout") == 0)
+			options->wal_receive_check_timeout = repmgr_atoi(value, name, error_list, 0);

 		/* node rejoin settings */
 		else if (strcmp(name, "node_rejoin_timeout") == 0)
@@ -585,11 +609,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 		else if (strcmp(name, "priority") == 0)
 			options->priority = repmgr_atoi(value, name, error_list, 0);
 		else if (strcmp(name, "location") == 0)
-			strncpy(options->location, value, MAXLEN);
+			strncpy(options->location, value, sizeof(options->location));
 		else if (strcmp(name, "promote_command") == 0)
-			strncpy(options->promote_command, value, MAXLEN);
+			strncpy(options->promote_command, value, sizeof(options->promote_command));
 		else if (strcmp(name, "follow_command") == 0)
-			strncpy(options->follow_command, value, MAXLEN);
+			strncpy(options->follow_command, value, sizeof(options->follow_command));
 		else if (strcmp(name, "reconnect_attempts") == 0)
 			options->reconnect_attempts = repmgr_atoi(value, name, error_list, 0);
 		else if (strcmp(name, "reconnect_interval") == 0)
@@ -608,6 +632,36 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 			options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
 		else if (strcmp(name, "repmgrd_pid_file") == 0)
 			strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
+		else if (strcmp(name, "standby_disconnect_on_failover") == 0)
+			options->standby_disconnect_on_failover = parse_bool(value, name, error_list);
+		else if (strcmp(name, "sibling_nodes_disconnect_timeout") == 0)
+			options->sibling_nodes_disconnect_timeout = repmgr_atoi(value, name, error_list, 0);
+		else if (strcmp(name, "connection_check_type") == 0)
+		{
+			if (strcasecmp(value, "ping") == 0)
+			{
+				options->connection_check_type = CHECK_PING;
+			}
+			else if (strcasecmp(value, "connection") == 0)
+			{
+				options->connection_check_type = CHECK_CONNECTION;
+			}
+			else if (strcasecmp(value, "query") == 0)
+			{
+				options->connection_check_type = CHECK_QUERY;
+			}
+			else
+			{
+				item_list_append(error_list,
+								 _("value for \"connection_check_type\" must be \"ping\", \"connection\" or \"query\"\n"));
+			}
+		}
+		else if (strcmp(name, "primary_visibility_consensus") == 0)
+			options->primary_visibility_consensus = parse_bool(value, name, error_list);
+		else if (strcmp(name, "failover_validation_command") == 0)
+			strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
+		else if (strcmp(name, "election_rerun_interval") == 0)
+			options->election_rerun_interval = repmgr_atoi(value, name, error_list, 0);

 		/* witness settings */
 		else if (strcmp(name, "witness_sync_interval") == 0)
@@ -621,41 +675,48 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *

 		/* service settings */
 		else if (strcmp(name, "pg_ctl_options") == 0)
-			strncpy(options->pg_ctl_options, value, MAXLEN);
-		else if (strcmp(name, "service_stop_command") == 0)
-			strncpy(options->service_stop_command, value, MAXLEN);
+			strncpy(options->pg_ctl_options, value, sizeof(options->pg_ctl_options));
 		else if (strcmp(name, "service_start_command") == 0)
-			strncpy(options->service_start_command, value, MAXLEN);
+			strncpy(options->service_start_command, value, sizeof(options->service_start_command));
+		else if (strcmp(name, "service_stop_command") == 0)
+			strncpy(options->service_stop_command, value, sizeof(options->service_stop_command));
 		else if (strcmp(name, "service_restart_command") == 0)
-			strncpy(options->service_restart_command, value, MAXLEN);
+			strncpy(options->service_restart_command, value, sizeof(options->service_restart_command));
 		else if (strcmp(name, "service_reload_command") == 0)
-			strncpy(options->service_reload_command, value, MAXLEN);
+			strncpy(options->service_reload_command, value, sizeof(options->service_reload_command));
 		else if (strcmp(name, "service_promote_command") == 0)
-			strncpy(options->service_promote_command, value, MAXLEN);
+			strncpy(options->service_promote_command, value, sizeof(options->service_promote_command));
+
+		/* repmgrd service settings */
+		else if (strcmp(name, "repmgrd_service_start_command") == 0)
+			strncpy(options->repmgrd_service_start_command, value, sizeof(options->repmgrd_service_start_command));
+		else if (strcmp(name, "repmgrd_service_stop_command") == 0)
+			strncpy(options->repmgrd_service_stop_command, value, sizeof(options->repmgrd_service_stop_command));
+

 		/* event notification settings */
 		else if (strcmp(name, "event_notification_command") == 0)
-			strncpy(options->event_notification_command, value, MAXLEN);
+			strncpy(options->event_notification_command, value, sizeof(options->event_notification_command));
 		else if (strcmp(name, "event_notifications") == 0)
 		{
 			/* store unparsed value for comparison when reloading config */
-			strncpy(options->event_notifications_orig, value, MAXLEN);
+			strncpy(options->event_notifications_orig, value, sizeof(options->event_notifications_orig));
 			parse_event_notifications_list(options, value);
 		}

 		/* barman settings */
 		else if (strcmp(name, "barman_host") == 0)
-			strncpy(options->barman_host, value, MAXLEN);
+			strncpy(options->barman_host, value, sizeof(options->barman_host));
 		else if (strcmp(name, "barman_server") == 0)
-			strncpy(options->barman_server, value, MAXLEN);
+			strncpy(options->barman_server, value, sizeof(options->barman_server));
 		else if (strcmp(name, "barman_config") == 0)
-			strncpy(options->barman_config, value, MAXLEN);
+			strncpy(options->barman_config, value, sizeof(options->barman_config));

 		/* rsync/ssh settings */
 		else if (strcmp(name, "rsync_options") == 0)
-			strncpy(options->rsync_options, value, MAXLEN);
+			strncpy(options->rsync_options, value, sizeof(options->rsync_options));
 		else if (strcmp(name, "ssh_options") == 0)
-			strncpy(options->ssh_options, value, MAXLEN);
+			strncpy(options->ssh_options, value, sizeof(options->ssh_options));

 		/* undocumented settings for testing */
 		else if (strcmp(name, "promote_delay") == 0)
@@ -1032,15 +1093,19 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
 * loop is started up; it therefore only needs to reload options required
 * by repmgrd, which are as follows:
 *
- * changeable options:
+ * changeable options (keep the list in "doc/repmgrd-configuration.sgml" in sync
+ * with these):
+ *
 * - async_query_timeout
 * - bdr_local_monitoring_only
 * - bdr_recovery_timeout
+ * - connection_check_type
 * - conninfo
 * - degraded_monitoring_timeout
 * - event_notification_command
 * - event_notifications
 * - failover
+ * - failover_validation_command
 * - follow_command
 * - log_facility
 * - log_file
@@ -1048,12 +1113,19 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
 * - log_status_interval
 * - monitor_interval_secs
 * - monitoring_history
+ * - primary_notification_timeout
+ * - primary_visibility_consensus
 * - promote_command
- * - promote_delay
 * - reconnect_attempts
 * - reconnect_interval
 * - repmgrd_standby_startup_timeout
 * - retry_promote_interval_secs
+ * - sibling_nodes_disconnect_timeout
+ * - standby_disconnect_on_failover
+ *
+ *
+ * Not publicly documented:
+ * - promote_delay
 *
 * non-changeable options (repmgrd references these from the "repmgr.nodes"
 * table, not the configuration file)
@@ -1132,13 +1204,12 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
 		return false;
 	}

-	if (strncmp(new_options.node_name, orig_options->node_name, MAXLEN) != 0)
+	if (strncmp(new_options.node_name, orig_options->node_name, sizeof(orig_options->node_name)) != 0)
 	{
 		log_warning(_("\"node_name\" cannot be changed, keeping current configuration"));
 		return false;
 	}

-
 	/*
 	 * No configuration problems detected - copy any changed values
 	 *
@@ -1188,8 +1259,8 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
 		{
 			strncpy(orig_options->conninfo, new_options.conninfo, MAXLEN);
 			log_info(_("\"conninfo\" is now \"%s\""), new_options.conninfo);
-
 		}
+
 		PQfinish(conn);
 	}

@@ -1267,7 +1338,6 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
 		config_changed = true;
 	}

-
 	/* promote_command */
 	if (strncmp(orig_options->promote_command, new_options.promote_command, MAXLEN) != 0)
 	{
@@ -1313,6 +1383,51 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
 		config_changed = true;
 	}

+	/* standby_disconnect_on_failover */
+	if (orig_options->standby_disconnect_on_failover != new_options.standby_disconnect_on_failover)
+	{
+		orig_options->standby_disconnect_on_failover = new_options.standby_disconnect_on_failover;
+		log_info(_("\"standby_disconnect_on_failover\" is now \"%s\""),
+				 new_options.standby_disconnect_on_failover == true ? "TRUE" : "FALSE");
+		config_changed = true;
+	}
+
+	/* sibling_nodes_disconnect_timeout */
+	if (orig_options->sibling_nodes_disconnect_timeout != new_options.sibling_nodes_disconnect_timeout)
+	{
+		orig_options->sibling_nodes_disconnect_timeout = new_options.sibling_nodes_disconnect_timeout;
+		log_info(_("\"sibling_nodes_disconnect_timeout\" is now \"%i\""),
+				 new_options.sibling_nodes_disconnect_timeout);
+		config_changed = true;
+	}
+
+	/* connection_check_type */
+	if (orig_options->connection_check_type != new_options.connection_check_type)
+	{
+		orig_options->connection_check_type = new_options.connection_check_type;
+		log_info(_("\"connection_check_type\" is now \"%s\""),
+				 print_connection_check_type(new_options.connection_check_type));
+		config_changed = true;
+	}
+
+	/* primary_visibility_consensus */
+	if (orig_options->primary_visibility_consensus != new_options.primary_visibility_consensus)
+	{
+		orig_options->primary_visibility_consensus = new_options.primary_visibility_consensus;
+		log_info(_("\"primary_visibility_consensus\" is now \"%s\""),
+				 new_options.primary_visibility_consensus == true ? "TRUE" : "FALSE");
+		config_changed = true;
+	}
+
+	/* failover_validation_command */
+	if (strncmp(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH) != 0)
+	{
+		strncpy(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH);
+		log_info(_("\"failover_validation_command\" is now \"%s\""), new_options.failover_validation_command);
+
+		config_changed = true;
+	}
+
 	/*
 	 * Handle changes to logging configuration
 	 */
@@ -1524,7 +1639,7 @@ repmgr_atoi(const char *value, const char *config_item, ItemList *error_list, in
 *
 * TODO: accept "any unambiguous prefix of one of these" as per postgresql.conf:
 *
- *   https://www.postgresql.org/docs/current/static/config-setting.html
+ *   https://www.postgresql.org/docs/current/config-setting.html
 */
 bool
 parse_bool(const char *s, const char *config_item, ItemList *error_list)
@@ -1910,3 +2025,21 @@ parse_pg_basebackup_options(const char *pg_basebackup_options, t_basebackup_opti

 	return backup_options_ok;
 }
+
+
+const char *
+print_connection_check_type(ConnectionCheckType type)
+{
+	switch (type)
+	{
+		case CHECK_PING:
+			return "ping";
+		case CHECK_QUERY:
+			return "query";
+		case CHECK_CONNECTION:
+			return "connection";
+	}
+
+	/* should never reach here */
+	return "UNKNOWN";
+}
--- a/configfile.h
+++ b/configfile.h
@@ -1,7 +1,7 @@
 /*
 * configfile.h
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 *
 * This program is free software: you can redistribute it and/or modify
@@ -37,6 +37,13 @@ typedef enum
 	FAILOVER_AUTOMATIC
 } failover_mode_opt;

+typedef enum
+{
+	CHECK_PING,
+	CHECK_QUERY,
+	CHECK_CONNECTION
+} ConnectionCheckType;
+
 typedef struct EventNotificationListCell
 {
 	struct EventNotificationListCell *next;
@@ -69,7 +76,7 @@ typedef struct
 {
 	/* node information */
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	char		conninfo[MAXLEN];
 	char		replication_user[NAMEDATALEN];
 	char		data_directory[MAXPGPATH];
@@ -106,6 +113,7 @@ typedef struct
 	/* standby switchover settings */
 	int			shutdown_check_timeout;
 	int			standby_reconnect_timeout;
+	int			wal_receive_check_timeout;

 	/* node rejoin settings */
 	int			node_rejoin_timeout;
@@ -134,6 +142,12 @@ typedef struct
 	int			primary_notification_timeout;
 	int			repmgrd_standby_startup_timeout;
 	char		repmgrd_pid_file[MAXPGPATH];
+	bool		standby_disconnect_on_failover;
+	int			sibling_nodes_disconnect_timeout;
+	ConnectionCheckType connection_check_type;
+	bool		primary_visibility_consensus;
+	char		failover_validation_command[MAXPGPATH];
+	int			election_rerun_interval;

 	/* BDR settings */
 	bool		bdr_local_monitoring_only;
@@ -141,14 +155,18 @@ typedef struct

 	/* service settings */
 	char		pg_ctl_options[MAXLEN];
-	char		service_stop_command[MAXLEN];
-	char		service_start_command[MAXLEN];
-	char		service_restart_command[MAXLEN];
-	char		service_reload_command[MAXLEN];
-	char		service_promote_command[MAXLEN];
+	char		service_start_command[MAXPGPATH];
+	char		service_stop_command[MAXPGPATH];
+	char		service_restart_command[MAXPGPATH];
+	char		service_reload_command[MAXPGPATH];
+	char		service_promote_command[MAXPGPATH];
+
+	/* repmgrd service settings */
+	char		repmgrd_service_start_command[MAXPGPATH];
+	char		repmgrd_service_stop_command[MAXPGPATH];

 	/* event notification settings */
-	char		event_notification_command[MAXLEN];
+	char		event_notification_command[MAXPGPATH];
 	char		event_notifications_orig[MAXLEN];
 	EventNotificationList event_notifications;

@@ -174,7 +192,7 @@ typedef struct
 		/* node information */ \
 		UNKNOWN_NODE_ID, "", "", "", "", "", "", "", REPLICATION_TYPE_PHYSICAL,	\
 		/* log settings */ \
-		"", "", "", DEFAULT_LOG_STATUS_INTERVAL,	\
+		"", "", "", DEFAULT_LOG_STATUS_INTERVAL, \
 		/* standby clone settings */ \
 		false, "", "", { NULL, NULL }, "", false, "", false, "", \
 		/* standby promote settings */ \
@@ -185,6 +203,7 @@ typedef struct
 		/* standby switchover settings */ \
 		DEFAULT_SHUTDOWN_CHECK_TIMEOUT, \
 		DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
+		DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT, \
 		/* node rejoin settings */ \
 		DEFAULT_NODE_REJOIN_TIMEOUT, \
 		/* node check settings */ \
@@ -199,12 +218,15 @@ typedef struct
        DEFAULT_RECONNECTION_INTERVAL, \
        false, -1, \
 		DEFAULT_ASYNC_QUERY_TIMEOUT, \
-		DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT,	\
-		-1, "", \
+		DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
+		-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, \
+		CHECK_PING, true, "", DEFAULT_ELECTION_RERUN_INTERVAL, \
 		/* BDR settings */ \
 		false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
 		/* service settings */ \
 		"", "", "", "", "", "", \
+		/* repmgrd service settings */ \
+		"", "",  \
 		/* event notification settings */ \
 		"", "", { NULL, NULL }, \
 		/* barman settings */ \
@@ -307,5 +329,6 @@ void free_parsed_argv(char ***argv_array);
 /* called by repmgr-client and repmgrd */
 void		exit_with_cli_errors(ItemList *error_list, const char *repmgr_command);
 void		print_item_list(ItemList *item_list);
+const char *print_connection_check_type(ConnectionCheckType type);

 #endif							/* _REPMGR_CONFIGFILE_H_ */
--- a/38
+++ b/38
@@ -1,8 +1,8 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for repmgr 4.2.
+# Generated by GNU Autoconf 2.69 for repmgr 4.3.
 #
-# Report bugs to <pgsql-bugs@postgresql.org>.
+# Report bugs to <repmgr@googlegroups.com>.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -11,7 +11,7 @@
 # This configure script is free software; the Free Software Foundation
 # gives unlimited permission to copy, distribute and modify it.
 #
-# Copyright (c) 2010-2018, 2ndQuadrant Ltd.
+# Copyright (c) 2010-2019, 2ndQuadrant Ltd.
 ## -------------------- ##
 ## M4sh Initialization. ##
 ## -------------------- ##
@@ -269,7 +269,7 @@ fi
    $as_echo "$0: be upgraded to zsh 4.3.4 or later."
  else
    $as_echo "$0: Please tell bug-autoconf@gnu.org and
-$0: pgsql-bugs@postgresql.org about your system, including
+$0: repmgr@googlegroups.com about your system, including
 $0: any error possibly output before this message. Then
 $0: install a modern shell, or manually run the script
 $0: under such a shell if you do have one."
@@ -582,10 +582,10 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='repmgr'
 PACKAGE_TARNAME='repmgr'
-PACKAGE_VERSION='4.2'
-PACKAGE_STRING='repmgr 4.2'
-PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org'
-PACKAGE_URL='https://2ndquadrant.com/en/resources/repmgr/'
+PACKAGE_VERSION='4.3'
+PACKAGE_STRING='repmgr 4.3'
+PACKAGE_BUGREPORT='repmgr@googlegroups.com'
+PACKAGE_URL='https://repmgr.org/'

 ac_subst_vars='LTLIBOBJS
 LIBOBJS
@@ -1178,7 +1178,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures repmgr 4.2 to adapt to many kinds of systems.
+\`configure' configures repmgr 4.3 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1239,7 +1239,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of repmgr 4.2:";;
+     short | recursive ) echo "Configuration of repmgr 4.3:";;
   esac
  cat <<\_ACEOF

@@ -1249,8 +1249,8 @@ Some influential environment variables:
 Use these variables to override the choices made by `configure' or to help
 it to find libraries and programs with nonstandard names/locations.

-Report bugs to <pgsql-bugs@postgresql.org>.
-repmgr home page: <https://2ndquadrant.com/en/resources/repmgr/>.
+Report bugs to <repmgr@googlegroups.com>.
+repmgr home page: <https://repmgr.org/>.
 _ACEOF
 ac_status=$?
 fi
@@ -1313,14 +1313,14 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-repmgr configure 4.2
+repmgr configure 4.3
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
 This configure script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it.

-Copyright (c) 2010-2018, 2ndQuadrant Ltd.
+Copyright (c) 2010-2019, 2ndQuadrant Ltd.
 _ACEOF
  exit
 fi
@@ -1332,7 +1332,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by repmgr $as_me 4.2, which was
+It was created by repmgr $as_me 4.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2359,7 +2359,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by repmgr $as_me 4.2, which was
+This file was extended by repmgr $as_me 4.3, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -2415,14 +2415,14 @@ $config_files
 Configuration headers:
 $config_headers

-Report bugs to <pgsql-bugs@postgresql.org>.
-repmgr home page: <https://2ndquadrant.com/en/resources/repmgr/>."
+Report bugs to <repmgr@googlegroups.com>.
+repmgr home page: <https://repmgr.org/>."

 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-repmgr config.status 4.2
+repmgr config.status 4.3
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.in
+++ b/configure.in
@@ -1,6 +1,6 @@
-AC_INIT([repmgr], [4.2], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
+AC_INIT([repmgr], [4.3], [repmgr@googlegroups.com], [repmgr], [https://repmgr.org/])

-AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.])
+AC_COPYRIGHT([Copyright (c) 2010-2019, 2ndQuadrant Ltd.])

 AC_CONFIG_HEADER(config.h)

--- a/controldata.c
+++ b/controldata.c
@@ -1,6 +1,12 @@
 /*
- * controldata.c
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * controldata.c - functions for reading the pg_control file
+ *
+ * The functions provided here enable repmgr to read a pg_control file
+ * in a version-independent way, even if the PostgreSQL instance is not
+ * running. For that reason we can't rely on the pg_control_*() functions
+ * provided in PostgreSQL 9.6 and later.
+ *
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -30,6 +36,53 @@

 static ControlFileInfo *get_controlfile(const char *DataDir);

+int
+get_pg_version(const char *data_directory, char *version_string)
+{
+	char		PgVersionPath[MAXPGPATH] = "";
+	FILE	   *fp = NULL;
+	char	   *endptr = NULL;
+	char		file_version_string[MAX_VERSION_STRING] = "";
+	long		file_major, file_minor;
+	int			ret;
+
+	snprintf(PgVersionPath, MAXPGPATH, "%s/PG_VERSION", data_directory);
+
+	fp = fopen(PgVersionPath, "r");
+
+	if (fp == NULL)
+	{
+		log_warning(_("could not open file \"%s\" for reading"),
+					PgVersionPath);
+		log_detail("%s", strerror(errno));
+		return UNKNOWN_SERVER_VERSION_NUM;
+	}
+
+	file_version_string[0] = '\0';
+
+	ret = fscanf(fp, "%23s", file_version_string);
+	fclose(fp);
+
+	if (ret != 1 || endptr == file_version_string)
+	{
+		log_warning(_("unable to determine major version number from PG_VERSION"));
+
+		return UNKNOWN_SERVER_VERSION_NUM;
+	}
+
+	file_major = strtol(file_version_string, &endptr, 10);
+	file_minor = 0;
+
+	if (*endptr == '.')
+		file_minor = strtol(endptr + 1, NULL, 10);
+
+	if (version_string != NULL)
+		strncpy(version_string, file_version_string, MAX_VERSION_STRING);
+
+	return ((int) file_major * 10000) + ((int) file_minor * 100);
+}
+
+
 uint64
 get_system_identifier(const char *data_directory)
 {
@@ -44,6 +97,7 @@ get_system_identifier(const char *data_directory)
 	return system_identifier;
 }

+
 DBState
 get_db_state(const char *data_directory)
 {
@@ -60,7 +114,7 @@ get_db_state(const char *data_directory)
 }


-extern XLogRecPtr
+XLogRecPtr
 get_latest_checkpoint_location(const char *data_directory)
 {
 	ControlFileInfo *control_file_info = NULL;
@@ -112,10 +166,59 @@ describe_db_state(DBState state)
 		case DB_IN_PRODUCTION:
 			return _("in production");
 	}
+
 	return _("unrecognized status code");
 }


+TimeLineID
+get_timeline(const char *data_directory)
+{
+	ControlFileInfo *control_file_info = NULL;
+	TimeLineID		 timeline = -1;
+
+	control_file_info = get_controlfile(data_directory);
+
+	timeline = (int) control_file_info->timeline;
+
+	pfree(control_file_info);
+
+	return timeline;
+}
+
+
+TimeLineID
+get_min_recovery_end_timeline(const char *data_directory)
+{
+	ControlFileInfo *control_file_info = NULL;
+	TimeLineID		 timeline = -1;
+
+	control_file_info = get_controlfile(data_directory);
+
+	timeline = (int) control_file_info->minRecoveryPointTLI;
+
+	pfree(control_file_info);
+
+	return timeline;
+}
+
+
+XLogRecPtr
+get_min_recovery_location(const char *data_directory)
+{
+	ControlFileInfo *control_file_info = NULL;
+	XLogRecPtr	minRecoveryPoint  = InvalidXLogRecPtr;
+
+	control_file_info = get_controlfile(data_directory);
+
+	minRecoveryPoint = control_file_info->minRecoveryPoint;
+
+	pfree(control_file_info);
+
+	return minRecoveryPoint;
+}
+
+
 /*
 * We maintain our own version of get_controlfile() as we need cross-version
 * compatibility, and also don't care if the file isn't readable.
@@ -123,14 +226,10 @@ describe_db_state(DBState state)
 static ControlFileInfo *
 get_controlfile(const char *DataDir)
 {
+	char		file_version_string[MAX_VERSION_STRING] = "";
 	ControlFileInfo *control_file_info;
-	FILE	   *fp = NULL;
-	int			fd, ret, version_num;
-	char		PgVersionPath[MAXPGPATH] = "";
+	int			fd, version_num;
 	char		ControlFilePath[MAXPGPATH] = "";
-	char		file_version_string[64] = "";
-	long		file_major, file_minor;
-	char	   *endptr = NULL;
 	void	   *ControlFileDataPtr = NULL;
 	int			expected_size = 0;

@@ -142,50 +241,32 @@ get_controlfile(const char *DataDir)
 	control_file_info->state = DB_SHUTDOWNED;
 	control_file_info->checkPoint = InvalidXLogRecPtr;
 	control_file_info->data_checksum_version = -1;
+	control_file_info->timeline = -1;
+	control_file_info->minRecoveryPointTLI = -1;
+	control_file_info->minRecoveryPoint = InvalidXLogRecPtr;

 	/*
 	 * Read PG_VERSION, as we'll need to determine which struct to read
 	 * the control file contents into
 	 */
-	snprintf(PgVersionPath, MAXPGPATH, "%s/PG_VERSION", DataDir);

-	fp = fopen(PgVersionPath, "r");
+	version_num = get_pg_version(DataDir, file_version_string);

-	if (fp == NULL)
+	if (version_num == UNKNOWN_SERVER_VERSION_NUM)
 	{
-		log_warning(_("could not open file \"%s\" for reading"),
-					PgVersionPath);
-		log_detail("%s", strerror(errno));
+		log_warning(_("unable to determine server version number from PG_VERSION"));
 		return control_file_info;
 	}

-	file_version_string[0] = '\0';
-
-	ret = fscanf(fp, "%63s", file_version_string);
-	fclose(fp);
-
-	if (ret != 1 || endptr == file_version_string)
+	if (version_num < MIN_SUPPORTED_VERSION_NUM)
 	{
-		log_warning(_("unable to determine major version number from PG_VERSION"));
-
+		log_warning(_("data directory appears to be initialised for %s"),
+					file_version_string);
+		log_detail(_("minimum supported PostgreSQL version is %s"),
+				   MIN_SUPPORTED_VERSION);
 		return control_file_info;
 	}

-	file_major = strtol(file_version_string, &endptr, 10);
-	file_minor = 0;
-
-	if (*endptr == '.')
-		file_minor = strtol(endptr + 1, NULL, 10);
-
-	version_num = ((int) file_major * 10000) + ((int) file_minor * 100);
-
-	if (version_num < 90300)
-	{
-		log_warning(_("Data directory appears to be initialised for %s"), file_version_string);
-		return control_file_info;
-	}
-
-
 	snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);

 	if ((fd = open(ControlFilePath, O_RDONLY | PG_BINARY, 0)) == -1)
@@ -220,6 +301,8 @@ get_controlfile(const char *DataDir)
 					ControlFilePath);
 		log_detail("%s", strerror(errno));

+		close(fd);
+
 		return control_file_info;
 	}

@@ -234,6 +317,9 @@ get_controlfile(const char *DataDir)
 		control_file_info->state = ptr->state;
 		control_file_info->checkPoint = ptr->checkPoint;
 		control_file_info->data_checksum_version = ptr->data_checksum_version;
+		control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
+		control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
+		control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
 	}
 	else if (version_num >= 90500)
 	{
@@ -242,6 +328,9 @@ get_controlfile(const char *DataDir)
 		control_file_info->state = ptr->state;
 		control_file_info->checkPoint = ptr->checkPoint;
 		control_file_info->data_checksum_version = ptr->data_checksum_version;
+		control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
+		control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
+		control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
 	}
 	else if (version_num >= 90400)
 	{
@@ -250,6 +339,9 @@ get_controlfile(const char *DataDir)
 		control_file_info->state = ptr->state;
 		control_file_info->checkPoint = ptr->checkPoint;
 		control_file_info->data_checksum_version = ptr->data_checksum_version;
+		control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
+		control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
+		control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
 	}
 	else if (version_num >= 90300)
 	{
@@ -258,6 +350,9 @@ get_controlfile(const char *DataDir)
 		control_file_info->state = ptr->state;
 		control_file_info->checkPoint = ptr->checkPoint;
 		control_file_info->data_checksum_version = ptr->data_checksum_version;
+		control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
+		control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
+		control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
 	}

 	pfree(ControlFileDataPtr);
@@ -265,9 +360,7 @@ get_controlfile(const char *DataDir)
 	/*
 	 * We don't check the CRC here as we're potentially checking a pg_control
 	 * file from a different PostgreSQL version to the one repmgr was compiled
-	 * against. However we're only interested in the first few fields, which
-	 * should be constant across supported versions
-	 *
+	 * against.
 	 */

 	return control_file_info;
--- a/controldata.h
+++ b/controldata.h
@@ -1,6 +1,6 @@
 /*
 * controldata.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -12,6 +12,7 @@
 #include "postgres_fe.h"
 #include "catalog/pg_control.h"

+#define MAX_VERSION_STRING 24
 /*
 * A simplified representation of pg_control containing only those fields
 * required by repmgr.
@@ -23,6 +24,9 @@ typedef struct
 	DBState		state;
 	XLogRecPtr	checkPoint;
 	uint32		data_checksum_version;
+	TimeLineID	timeline;
+	TimeLineID	minRecoveryPointTLI;
+	XLogRecPtr	minRecoveryPoint;
 } ControlFileInfo;


@@ -134,13 +138,11 @@ typedef struct ControlFileData93


 /*
- * Following fields added since 9.3:
+ * Following field added since 9.3:
 *
 * 	int			max_worker_processes;
- *  int			max_prepared_xacts;
- *  int			max_locks_per_xact;
- *
 */
+
 typedef struct ControlFileData94
 {
 	uint64		system_identifier;
@@ -331,11 +333,14 @@ typedef struct ControlFileData11
 } ControlFileData11;


-
+extern int get_pg_version(const char *data_directory, char *version_string);
 extern DBState get_db_state(const char *data_directory);
 extern const char *describe_db_state(DBState state);
 extern int	get_data_checksum_version(const char *data_directory);
 extern uint64 get_system_identifier(const char *data_directory);
 extern XLogRecPtr get_latest_checkpoint_location(const char *data_directory);
+extern TimeLineID get_timeline(const char *data_directory);
+extern TimeLineID get_min_recovery_end_timeline(const char *data_directory);
+extern XLogRecPtr get_min_recovery_location(const char *data_directory);

 #endif							/* _CONTROLDATA_H_ */
--- a/dbutils.c
+++ b/dbutils.c
--- a/dbutils.h
+++ b/dbutils.h
@@ -1,7 +1,7 @@
 /*
 * dbutils.h
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -20,6 +20,7 @@
 #ifndef _REPMGR_DBUTILS_H_
 #define _REPMGR_DBUTILS_H_

+#include "access/timeline.h"
 #include "access/xlogdefs.h"
 #include "pqexpbuffer.h"
 #include "portability/instr_time.h"
@@ -79,7 +80,8 @@ typedef enum
 	NODE_STATUS_UP,
 	NODE_STATUS_SHUTTING_DOWN,
 	NODE_STATUS_DOWN,
-	NODE_STATUS_UNCLEAN_SHUTDOWN
+	NODE_STATUS_UNCLEAN_SHUTDOWN,
+	NODE_STATUS_REJECTED
 } NodeStatus;

 typedef enum
@@ -111,12 +113,16 @@ typedef enum

 typedef struct s_extension_versions {
 	char		default_version[8];
+	int			default_version_num;
 	char		installed_version[8];
+	int			installed_version_num;
 } t_extension_versions;

 #define T_EXTENSION_VERSIONS_INITIALIZER { \
 	"", \
+	UNKNOWN_SERVER_VERSION_NUM, \
 	"", \
+	UNKNOWN_SERVER_VERSION_NUM \
 }

 /*
@@ -128,8 +134,8 @@ typedef struct s_node_info
 	int			node_id;
 	int			upstream_node_id;
 	t_server_type type;
-	char		node_name[MAXLEN];
-	char		upstream_node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
+	char		upstream_node_name[NAMEDATALEN];
 	char		conninfo[MAXLEN];
 	char		repluser[NAMEDATALEN];
 	char		location[MAXLEN];
@@ -178,7 +184,7 @@ typedef struct s_node_info
 	MS_NORMAL, \
 	NULL, \
 	/* for ad-hoc use e.g. when working with a list of nodes */ \
-	"", true, true \
+	"", true, true,	\
 	/* various statistics */ \
 	-1, -1, -1, -1, -1, -1					\
 }
@@ -296,22 +302,16 @@ typedef struct BdrNodeInfoList
 typedef struct
 {
 	char		current_timestamp[MAXLEN];
-	uint64		last_wal_receive_lsn;
-	uint64		last_wal_replay_lsn;
+	bool		in_recovery;
+	XLogRecPtr	last_wal_receive_lsn;
+	XLogRecPtr	last_wal_replay_lsn;
 	char		last_xact_replay_timestamp[MAXLEN];
 	int			replication_lag_time;
 	bool		receiving_streamed_wal;
+	bool		wal_replay_paused;
+	int			upstream_last_seen;
 } ReplInfo;

-#define T_REPLINFO_INTIALIZER { \
-	"", \
-	InvalidXLogRecPtr, \
-	InvalidXLogRecPtr, \
-	"", \
-	0 \
-}
-
-
 typedef struct
 {
 	char		filepath[MAXPGPATH];
@@ -351,16 +351,16 @@ typedef struct RepmgrdInfo {
 	char pid_file[MAXLEN];
 	bool pg_running;
 	char pg_running_text[MAXLEN];
+	RecoveryType recovery_type;
 	bool running;
 	char repmgrd_running[MAXLEN];
 	bool paused;
+	bool wal_paused_pending_wal;
+	int  upstream_last_seen;
+	char upstream_last_seen_text[MAXLEN];
 } RepmgrdInfo;


-/* global variables */
-
-extern int	server_version_num;
-
 /* macros */

 #define is_streaming_replication(x) (x == PRIMARY || x == STANDBY)
@@ -401,6 +401,7 @@ void		param_set_ine(t_conninfo_param_list *param_list, const char *param, const
 char	   *param_get(t_conninfo_param_list *param_list, const char *param);
 bool		parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char **errmsg, bool ignore_local_params);
 char	   *param_list_to_string(t_conninfo_param_list *param_list);
+char	   *normalize_conninfo_string(const char *conninfo_str);
 bool		has_passfile(void);


@@ -415,22 +416,30 @@ bool		set_config_bool(PGconn *conn, const char *config_param, bool state);
 int		    guc_set(PGconn *conn, const char *parameter, const char *op, const char *value);
 int			guc_set_typed(PGconn *conn, const char *parameter, const char *op, const char *value, const char *datatype);
 bool		get_pg_setting(PGconn *conn, const char *setting, char *output);
+bool		alter_system_int(PGconn *conn, const char *name, int value);
+bool		pg_reload_conf(PGconn *conn);

 /* server information functions */
 bool		get_cluster_size(PGconn *conn, char *size);
-int			get_server_version(PGconn *conn, char *server_version);
+int			get_server_version(PGconn *conn, char *server_version_buf);
+
 RecoveryType get_recovery_type(PGconn *conn);
 int			get_primary_node_id(PGconn *conn);
 int			get_ready_archive_files(PGconn *conn, const char *data_directory);
 bool		identify_system(PGconn *repl_conn, t_system_identification *identification);
+TimeLineHistoryEntry *get_timeline_history(PGconn *repl_conn, TimeLineID tli);
+
+/* repmgrd shared memory functions */
 bool		repmgrd_set_local_node_id(PGconn *conn, int local_node_id);
 int			repmgrd_get_local_node_id(PGconn *conn);
+bool		repmgrd_check_local_node_id(PGconn *conn);
 BackupState	server_in_exclusive_backup_mode(PGconn *conn);
 void		repmgrd_set_pid(PGconn *conn, pid_t repmgrd_pid, const char *pidfile);
 pid_t		repmgrd_get_pid(PGconn *conn);
 bool		repmgrd_is_running(PGconn *conn);
 bool		repmgrd_is_paused(PGconn *conn);
 bool		repmgrd_pause(PGconn *conn, bool pause);
+pid_t		get_wal_receiver_pid(PGconn *conn);

 /* extension functions */
 ExtensionStatus get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions);
@@ -438,13 +447,16 @@ ExtensionStatus get_repmgr_extension_status(PGconn *conn, t_extension_versions *
 /* node management functions */
 void		checkpoint(PGconn *conn);
 bool		vacuum_table(PGconn *conn, const char *table);
-
+bool		promote_standby(PGconn *conn, bool wait, int wait_seconds);
+bool		resume_wal_replay(PGconn *conn);

 /* node record functions */
 t_server_type parse_node_type(const char *type);
 const char *get_node_type_string(t_server_type type);

 RecordStatus get_node_record(PGconn *conn, int node_id, t_node_info *node_info);
+RecordStatus refresh_node_record(PGconn *conn, int node_id, t_node_info *node_info);
+
 RecordStatus get_node_record_with_upstream(PGconn *conn, int node_id, t_node_info *node_info);

 RecordStatus get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_info);
@@ -491,7 +503,7 @@ PGresult   *get_event_records(PGconn *conn, int node_id, const char *node_name,

 /* replication slot functions */
 void		create_slot_name(char *slot_name, int node_id);
-bool		create_replication_slot(PGconn *conn, char *slot_name, int server_version_num, PQExpBufferData *error_msg);
+bool		create_replication_slot(PGconn *conn, char *slot_name, PQExpBufferData *error_msg);
 bool		drop_replication_slot(PGconn *conn, char *slot_name);
 RecordStatus get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record);
 int			get_free_replication_slot_count(PGconn *conn);
@@ -502,12 +514,14 @@ bool		get_tablespace_name_by_location(PGconn *conn, const char *location, char *

 /* asynchronous query functions */
 bool		cancel_query(PGconn *conn, int timeout);
-int			wait_connection_availability(PGconn *conn, long long timeout);
+int			wait_connection_availability(PGconn *conn, int timeout);

 /* node availability functions */
 bool		is_server_available(const char *conninfo);
+bool		is_server_available_quiet(const char *conninfo);
 bool		is_server_available_params(t_conninfo_param_list *param_list);
 ExecStatusType	connection_ping(PGconn *conn);
+ExecStatusType	connection_ping_reconnect(PGconn *conn);

 /* monitoring functions  */
 void
@@ -538,12 +552,17 @@ bool		get_new_primary(PGconn *conn, int *primary_node_id);
 void		reset_voting_status(PGconn *conn);

 /* replication status functions */
-XLogRecPtr	get_current_wal_lsn(PGconn *conn);
+XLogRecPtr	get_primary_current_lsn(PGconn *conn);
+XLogRecPtr	get_node_current_lsn(PGconn *conn);
 XLogRecPtr	get_last_wal_receive_location(PGconn *conn);
-bool		get_replication_info(PGconn *conn, ReplInfo *replication_info);
+void		init_replication_info(ReplInfo *replication_info);
+bool		get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info);
 int			get_replication_lag_seconds(PGconn *conn);
-void		get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *node_info);
+void		get_node_replication_stats(PGconn *conn, t_node_info *node_info);
 bool		is_downstream_node_attached(PGconn *conn, char *node_name);
+void		set_upstream_last_seen(PGconn *conn);
+int			get_upstream_last_seen(PGconn *conn, t_server_type node_type);
+bool		is_wal_replay_paused(PGconn *conn, bool check_pending_wal);

 /* BDR functions */
 int			get_bdr_version_num(void);
--- a/dirutil.c
+++ b/dirutil.c
@@ -3,7 +3,7 @@
 * dirmod.c
 *	  directory handling functions
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -50,7 +50,7 @@ typedef long pgpid_t;
 * and tablespace directories.
 */
 DataDirState
-check_dir(char *path)
+check_dir(const char *path)
 {
 	DIR		   *chkdir = NULL;
 	struct dirent *file = NULL;
@@ -91,12 +91,17 @@ check_dir(char *path)
 * Create directory with error log message when failing
 */
 bool
-create_dir(char *path)
+create_dir(const char *path)
 {
-	if (mkdir_p(path, 0700) == 0)
+	char create_dir_path[MAXPGPATH];
+
+	/* mkdir_p() may modify the supplied path */
+	strncpy(create_dir_path, path, MAXPGPATH);
+
+	if (mkdir_p(create_dir_path, 0700) == 0)
 		return true;

-	log_error(_("unable to create directory \"%s\""), path);
+	log_error(_("unable to create directory \"%s\""), create_dir_path);
 	log_detail("%s", strerror(errno));

 	return false;
@@ -104,13 +109,12 @@ create_dir(char *path)


 bool
-set_dir_permissions(char *path)
+set_dir_permissions(const char *path)
 {
 	return (chmod(path, 0700) != 0) ? false : true;
 }


-
 /* function from initdb.c */
 /* source adapted from FreeBSD /src/bin/mkdir/mkdir.c */

@@ -198,9 +202,9 @@ mkdir_p(char *path, mode_t omode)


 bool
-is_pg_dir(char *path)
+is_pg_dir(const char *path)
 {
-	char		dirpath[MAXPGPATH];
+	char		dirpath[MAXPGPATH] = "";
 	struct stat sb;

 	/* test pgdata */
@@ -223,7 +227,7 @@ is_pg_dir(char *path)
 * any further useful progress can be made.
 */
 PgDirState
-is_pg_running(char *path)
+is_pg_running(const char *path)
 {
 	long		pid;
 	FILE	   *pidf;
@@ -272,6 +276,8 @@ is_pg_running(char *path)
 			log_warning(_("invalid data in PostgreSQL PID file \"%s\""), path);
 		}

+		fclose(pidf);
+
 		return PG_DIR_NOT_RUNNING;
 	}

@@ -291,7 +297,7 @@ is_pg_running(char *path)


 bool
-create_pg_dir(char *path, bool force)
+create_pg_dir(const char *path, bool force)
 {
 	/* Check this directory can be used as a PGDATA dir */
 	switch (check_dir(path))
@@ -347,8 +353,9 @@ create_pg_dir(char *path, bool force)
 			}
 			break;
 		case DIR_ERROR:
-			log_error(_("could not access directory \"%s\": %s"),
-					  path, strerror(errno));
+			log_error(_("could not access directory \"%s\"")
+					  , path);
+			log_detail("%s", strerror(errno));
 			return false;
 	}

@@ -358,7 +365,7 @@ create_pg_dir(char *path, bool force)


 int
-rmdir_recursive(char *path)
+rmdir_recursive(const char *path)
 {
 	return nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
 }
--- a/dirutil.h
+++ b/dirutil.h
@@ -1,6 +1,6 @@
 /*
 * dirutil.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -35,13 +35,13 @@ typedef enum
 } PgDirState;

 extern int	mkdir_p(char *path, mode_t omode);
-extern bool set_dir_permissions(char *path);
+extern bool set_dir_permissions(const char *path);

-extern DataDirState	check_dir(char *path);
-extern bool create_dir(char *path);
-extern bool is_pg_dir(char *path);
-extern PgDirState is_pg_running(char *path);
-extern bool create_pg_dir(char *path, bool force);
-extern int rmdir_recursive(char *path);
+extern DataDirState	check_dir(const char *path);
+extern bool create_dir(const char *path);
+extern bool is_pg_dir(const char *path);
+extern PgDirState is_pg_running(const char *path);
+extern bool create_pg_dir(const char *path, bool force);
+extern int rmdir_recursive(const char *path);

 #endif
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -61,7 +61,7 @@ clean:

 maintainer-clean:
 	rm -rf html
-	rm -rf Makefile
+	rm -f Makefile

 zip: html
 	cp -r html repmgr-docs-$(REPMGR_VERSION)
--- a/doc/appendix-faq.sgml
+++ b/doc/appendix-faq.sgml
@@ -76,7 +76,7 @@
   <para>
    Before PostgreSQL 10, hash indexes were not WAL logged and are therefore not suitable
    for use in streaming replication in PostgreSQL 9.6 and earlier. See the
-    <ulink url="https://www.postgresql.org/docs/9.6/static/sql-createindex.html#AEN80279">PostgreSQL documentation</ulink>
+    <ulink url="https://www.postgresql.org/docs/9.6/sql-createindex.html#AEN80279">PostgreSQL documentation</ulink>
    for details.
   </para>
   <para>
@@ -96,12 +96,11 @@
   <para>
     For <emphasis>major</emphasis> version upgrades (e.g. from PostgreSQL 9.6 to PostgreSQL 10),
     the traditional approach is to "reseed" a cluster by upgrading a single
-     node with <ulink url="https://www.postgresql.org/docs/current/static/pgupgrade.html">pg_upgrade</ulink>
+     node with <ulink url="https://www.postgresql.org/docs/current/pgupgrade.html">pg_upgrade</ulink>
     and recloning standbys from this.
   </para>
   <para>
-     To minimize downtime during major upgrades, for more recent PostgreSQL
-     versions (PostgreSQL 9.4 and later),
+     To minimize downtime during major upgrades from PostgreSQL 9.4 and later,
     <ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
     can be used to set up a parallel cluster using the newer PostgreSQL version,
     which can be kept in sync with the existing production cluster until the
--- a/doc/appendix-packages.sgml
+++ b/doc/appendix-packages.sgml
@@ -293,7 +293,7 @@
            </row>
            <row>
              <entry>Repository documentation:</entry>
-              <entry><ulink url="https://wiki.postgresql.org/wiki/Apt)">https://wiki.postgresql.org/wiki/Apt)</ulink></entry>
+              <entry><ulink url="https://wiki.postgresql.org/wiki/Apt">https://wiki.postgresql.org/wiki/Apt</ulink></entry>
            </row>
          </tbody>
        </tgroup>
@@ -456,14 +456,21 @@ repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>

  <sect1 id="packages-old-versions" xreflabel="Installing old package versions">
    <title>Installing old package versions</title>
+
    <indexterm>
      <primary>old packages</primary>
    </indexterm>
+
    <indexterm>
      <primary>packages</primary>
      <secondary>old versions</secondary>
    </indexterm>

+    <indexterm>
+      <primary>installation</primary>
+      <secondary>old package versions</secondary>
+    </indexterm>
+
    <sect2 id="packages-old-versions-debian" xreflabel="old Debian package versions">
      <title>Debian/Ubuntu</title>
      <para>
--- a/doc/appendix-release-notes.sgml
+++ b/doc/appendix-release-notes.sgml
@@ -15,6 +15,278 @@
    See also: <xref linkend="upgrading-repmgr">
  </para>

+  <sect1 id="release-4.3">
+    <title>Release 4.3</title>
+    <para><emphasis>Mar ???, 2019</emphasis></para>
+    <para>
+      &repmgr; 4.3 is a major release.
+    </para>
+
+	<important>
+	  <para>
+	    On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
+	    please ensure that in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
+	    <varname>REPMGRD_OPTS</varname> contains &quot;<literal>--daemonize=false</literal>&quot;, e.g.:
+	    <programlisting>
+# additional options
+REPMGRD_OPTS="--daemonize=false"</programlisting>
+	  </para>
+	  <para>
+	    For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd configuration on Debian/Ubuntu</link>.
+	  </para>
+	</important>
+
+    <sect2>
+      <title>repmgr enhancements</title>
+      <para>
+        <itemizedlist>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>:
+              option <option>--upstream-node-id</option> can now be used to specify another standby
+              to follow.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>:
+              verify that it is actually possible to follow another node.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>:
+              verify that it is actually possible to attach the node to the current primary.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              New commands <link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link> and
+              <link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link>:
+              these provide a standardized way of starting and stopping <application>repmgrd</application>.
+              GitHub #528.
+            </para>
+            <note>
+              <para>
+                These commands require the configuration file settings
+                <varname>repmgrd_service_start_command</varname> and <varname>repmgrd_service_stop_command</varname>
+                in <filename>repmgr.conf</filename> to be set.
+              </para>
+            </note>
+          </listitem>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>
+              additionally displays the node priority and the interval (in seconds) since the
+              <application>repmgrd</application> instance last verified its upstream node was available.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              Add <option>--compact</option> option to <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command> (GitHub #521).
+            </para>
+            <para>
+              This makes it easier to copy the output into emails, chats etc. as a compact table.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
+	      differentiate between unreachable nodes and nodes which are running but rejecting connections.
+            </para>
+            <para>
+	      This makes it possible to see whether a node is unreachable at network level,
+	      or if it is running but rejecting connections for some reason.
+            </para>
+          </listitem>
+
+
+          <listitem>
+            <para>
+              Add <option>--dry-run</option> to <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> (GitHub #522).
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <command>repmgr --version-number</command> outputs the &quot;raw&quot;
+              repmgr version number (e.g. <literal>40300</literal>). This is intended
+              for use by scripts etc. requiring an easily parseable representation
+              of the &repmgr; version.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-node-check"><command>repmgr node check --data-directory-config</command></link>
+              option added; this is to confirm &repmgr; is correctly configured. GitHub #523.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              Add check to <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>
+              to ensure the data directory on the demotion candidate is configured correctly in <filename>repmgr.conf</filename>.
+              This is to ensure that &repmgr;, when remotely executed on the demotion candidate, can correctly verify
+              that PostgreSQL on the demotion candidate was shut down cleanly. GitHub #523.
+            </para>
+          </listitem>
+
+
+       </itemizedlist>
+      </para>
+    </sect2>
+
+    <sect2>
+      <title>repmgrd enhancements</title>
+      <para>
+        <itemizedlist>
+
+          <listitem>
+            <para>
+              <application>repmgrd</application> will no longer consider nodes where <application>repmgrd</application>
+			  is not running as promotion candidates.
+            </para>
+            <para>
+              Previously, if <application>repmgrd</application> was not running on a node, but
+              that node qualified as the promotion candidate, it would never be promoted due to
+              the absence of a running <application>repmgrd</application>.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              Add option <option>connection_check_type</option> to enable selection of the method
+              <application>repmgrd</application> uses to determine whether the upstream node is available.
+            </para>
+            <para>
+              Possible values are <literal>ping</literal> (default; uses <command>PQping()</command> to
+              determine server availability), <literal>connection</literal> (attempts to make a new connection to
+              the upstream node), and <literal>query</literal> (determines server availability
+              by executing an SQL statement on the node via the existing connection).
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              New configuration option <link linkend="repmgrd-failover-validation"><option>failover_validation_command</option></link>
+              to allow an external mechanism to validate the failover decision made by <application>repmgrd</application>.
+            </para>
+          </listitem>
+
+         <listitem>
+            <para>
+              New configuration option <link linkend="repmgrd-standby-disconnection-on-failover"><option>standby_disconnect_on_failover</option></link>
+              to force standbys to disconnect their WAL receivers before making a failover decision.
+            </para>
+          </listitem>
+
+         <listitem>
+            <para>
+			  In a failover situation, <application>repmgrd</application> will not attempt to promote a
+			  node if another primary has already appeared (e.g. by being promoted manually).
+			  GitHub #420.
+			</para>
+          </listitem>
+
+		</itemizedlist>
+	  </para>
+	</sect2>
+
+    <sect2>
+      <title>Bug fixes</title>
+      <para>
+        <itemizedlist>
+
+          <listitem>
+            <para>
+              &repmgr;: when executing <command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>,
+              prevent escaping issues with connection URIs when executing <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>
+              on the demotion candidate. GitHub #525.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              &repmgr;: when executing <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>,
+              check the node being connected to is actually the primary (i.e. not the witness server).  GitHub #528.
+            </para>
+          </listitem>
+
+         <listitem>
+            <para>
+              &repmgr;: when executing <link linkend="repmgr-standby-clone"><command>repmgr standby clone</command></link>,
+              recheck primary/upstream connection(s) after the data copy operation is complete, as these may
+              have gone away.
+            </para>
+          </listitem>
+
+         <listitem>
+            <para>
+              &repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
+              avoid a potential race condition when comparing received WAL on the standby to the primary's shutdown location,
+	      as the standby's walreceiver may not have yet flushed all received WAL to disk. GitHub #518.
+            </para>
+          </listitem>
+
+
+         <listitem>
+            <para>
+              &repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
+	      verify the standby (promotion candidate) is currently attached to the primary (demotion candidate). GitHub #519.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <application>repmgrd</application>: on a cascaded standby, don't fail over if
+              <literal>failover=manual</literal>. GitHub #531.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
+	          fix display of node IDs with multiple digits.
+            </para>
+	      </listitem>
+
+          <listitem>
+            <para>
+              ensure <command><link linkend="repmgr-primary-unregister">repmgr primary unregister</link></command>
+	          behaves correctly when executed on a witness server. GitHub #548.
+            </para>
+	      </listitem>
+
+          <listitem>
+            <para>
+              ensure <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>
+	          fails when <option>--upstream-node-id</option> is the same as the local node ID.
+            </para>
+	      </listitem>
+
+          <listitem>
+            <para>
+              <command><link linkend="repmgr-node-check">repmgr node check</link></command>
+	          will only consider physical replication slots, as the purpose
+	          of slot checks is to warn about potential issues with
+	          streaming replication standbys which are no longer attached.
+	        </para>
+	      </listitem>
+
+        </itemizedlist>
+      </para>
+    </sect2>
+
+  </sect1>
+
  <sect1 id="release-4.2">
    <title>Release 4.2</title>
    <para><emphasis>Wed October 24, 2018</emphasis></para>
@@ -31,6 +303,20 @@
      <link linkend="upgrading-major-version">Upgrading a major version release</link>.
    </para>

+	<important>
+	  <para>
+	    On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
+	    please ensure that in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
+	    <varname>REPMGRD_OPTS</varname> contains &quot;<literal>--daemonize=false</literal>&quot;, e.g.:
+	    <programlisting>
+# additional options
+REPMGRD_OPTS="--daemonize=false"</programlisting>
+	  </para>
+	  <para>
+	    For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd daemon configuration on Debian/Ubuntu</link>.
+	  </para>
+	</important>
+
    <sect2>
      <title>Configuration file changes</title>
      <para>
@@ -143,7 +429,6 @@
            </para>
          </listitem>

-
          <listitem>
            <para>
              <application>repmgrd</application>: fix parsing of <option>-d/--daemonize</option> option.
@@ -1272,7 +1557,7 @@
            <emphasis>easier upgrades</emphasis>: &repmgr; is now implemented as a native
            PostgreSQL extension, which means future upgrades can be carried out by
            installing the upgraded package and issuing
-            <ulink url="https://www.postgresql.org/docs/current/static/sql-alterextension.html">ALTER EXTENSION repmgr UPDATE</ulink>.
+            <ulink url="https://www.postgresql.org/docs/current/sql-alterextension.html">ALTER EXTENSION repmgr UPDATE</ulink>.
          </para>
        </listitem>

--- a/doc/appendix-support.sgml
+++ b/doc/appendix-support.sgml
@@ -0,0 +1,96 @@
+<appendix id="appendix-support" xreflabel="repmgr support">
+  <indexterm>
+    <primary>support</primary>
+  </indexterm>
+
+  <title>&repmgr; support</title>
+  <para>
+    <ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides 24x7
+    production support for &repmgr; and other PostgreSQL
+    products, including configuration assistance, installation
+    verification and training for running a robust replication cluster.
+  </para>
+  <para>
+    For further details see: <ulink url="https://2ndquadrant.com/en/support/">https://2ndquadrant.com/en/support/</ulink>
+  </para>
+
+  <para>
+    A mailing list/forum is provided via Google groups to discuss contributions or issues: <ulink url="https://groups.google.com/group/repmgr">https://groups.google.com/group/repmgr</ulink>.
+  </para>
+  <para>
+    Please report bugs and other issues to: <ulink url="https://github.com/2ndQuadrant/repmgr">https://github.com/2ndQuadrant/repmgr</ulink>.
+  </para>
+
+  <important>
+    <para>
+      Please read the <link linkend="appendix-support-reporting-issues">following section</link> before submitting questions or issue reports.
+    </para>
+  </important>
+
+  <sect1 id="appendix-support-reporting-issues" xreflabel="Reporting Issues">
+    <indexterm>
+      <primary>support</primary>
+      <secondary>reporting issues</secondary>
+    </indexterm>
+
+    <title>Reporting Issues</title>
+
+    <para>
+      When asking questions or reporting issues, it is extremely helpful if the following information is included:
+
+    <itemizedlist spacing="compact" mark="bullet">
+
+     <listitem>
+      <simpara>
+        &repmgr; version
+      </simpara>
+     </listitem>
+
+     <listitem>
+      <simpara>
+        How was &repmgr; installed? From source? From packages? If
+        so from which repository?
+      </simpara>
+     </listitem>
+
+     <listitem>
+      <simpara>
+        <filename>repmgr.conf</filename> files (suitably anonymized if necessary)
+      </simpara>
+     </listitem>
+
+     <listitem>
+      <simpara>
+        Contents of the <literal>repmgr.nodes</literal> table (suitably anonymized if necessary)
+      </simpara>
+     </listitem>
+
+     <listitem>
+      <simpara>
+        PostgreSQL version
+      </simpara>
+     </listitem>
+
+    </itemizedlist>
+    </para>
+    <para>
+      If issues are encountered with a &repmgr; client command, please provide
+      the output of that command executed with the options
+      <option>-LDEBUG --verbose</option>, which will ensure &repmgr; emits
+      the maximum level of logging output.
+    </para>
+    <para>
+      If issues are encountered with <application>repmgrd</application>,
+      please provide relevant extracts from the &repmgr; log files
+      and if possible the PostgreSQL log itself. Please ensure these
+      logs do not contain any confidential data.
+    </para>
+    <para>
+      In all cases it is <emphasis>extremely</emphasis> useful to receive
+      information on how to reliably reproduce an issue with as much detail as
+      possible.
+    </para>
+
+  </sect1>
+
+</appendix>
--- a/doc/cloning-standbys.sgml
+++ b/doc/cloning-standbys.sgml
@@ -262,7 +262,7 @@
    meaning replication changes "cascade" down through a hierarchy of servers. This
    can be used to reduce load on the primary and minimize bandwith usage between
    sites. For more details, see the
-    <ulink url="https://www.postgresql.org/docs/current/static/warm-standby.html#CASCADING-REPLICATION">
+    <ulink url="https://www.postgresql.org/docs/current/warm-standby.html#CASCADING-REPLICATION">
    PostgreSQL cascading replication documentation</ulink>.
   </para>
   <para>
@@ -391,7 +391,7 @@
      a symlink will automatically be created from the main data directory.
    </para>
    <para>
-     See the <ulink url="https://www.postgresql.org/docs/current/static/app-pgbasebackup.html">PostgreSQL pg_basebackup documentation</ulink>
+     See the <ulink url="https://www.postgresql.org/docs/current/app-pgbasebackup.html">PostgreSQL pg_basebackup documentation</ulink>
     for more details of available options.
    </para>
   </sect2>
@@ -413,7 +413,7 @@
     user's <filename>~/.pgpass</filename> file. It's also possible to store the password in the
     environment variable <varname>PGPASSWORD</varname>, however this is not recommended for
     security reasons. For more details see the
-     <ulink url="https://www.postgresql.org/docs/current/static/libpq-pgpass.html">PostgreSQL password file documentation</ulink>.
+     <ulink url="https://www.postgresql.org/docs/current/libpq-pgpass.html">PostgreSQL password file documentation</ulink>.
    </para>

    <note>
--- a/doc/configuration-file-required-settings.sgml
+++ b/doc/configuration-file-required-settings.sgml
@@ -39,6 +39,10 @@
       called <varname>standby1</varname> (for example), things will be confusing
       to say the least.
     </para>
+     <para>
+       The string's maximum length is 63 characters and it should
+       contain only printable ASCII characters.
+     </para>
    </listitem>
   </varlistentry>

@@ -56,7 +60,7 @@
     </para>
     <para>
       For details on conninfo strings, see section <ulink
-       url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING">Connection Strings</>
+       url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING">Connection Strings</>
        in the PosgreSQL documentation.
     </para>
     <para>
@@ -64,7 +68,7 @@
        <varname>connect_timeout</varname> in the <varname>conninfo</varname>
        string to determine the length of time which elapses before a network
        connection attempt is abandoned; for details see <ulink
-        url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT">
+        url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT">
        the PostgreSQL documentation</>.
     </para>
    </listitem>
--- a/doc/configuration-file.sgml
+++ b/doc/configuration-file.sgml
@@ -36,9 +36,9 @@
    </para>
    <para>
      Whitespace is insignificant (except within a quoted parameter value) and blank lines are ignored.
-      Hash marks (#) designate the remainder of the line as a comment. Parameter values that are not simple
-      identifiers or numbers should be single-quoted. Note that single quote can not be embedded
-      in a parameter value.
+      Hash marks (<literal>#</literal>) designate the remainder of the line as a comment.
+      Parameter values that are not simple identifiers or numbers should be single-quoted.
+      Note that a single quote cannot be embedded in a parameter value.
    </para>
    <important>
      <para>
--- a/doc/configuring-witness-server.sgml
+++ b/doc/configuring-witness-server.sgml
@@ -1,93 +0,0 @@
-<chapter id="using-witness-server">
- <indexterm>
-  <primary>witness server</primary>
-  <seealso>Using a witness server with repmgrd</seealso>
- </indexterm>
-
-
- <title>Using a witness server</title>
- <para>
-   A <xref linkend="witness-server"> is a normal PostgreSQL instance which
-   is not part of the streaming replication cluster; its purpose is, if a
-   failover situation occurs, to provide proof that the primary server
-   itself is unavailable.
- </para>
-
- <para>
-   A typical use case for a witness server is a two-node streaming replication
-   setup, where the primary and standby are in different locations (data centres).
-   By creating a witness server in the same location (data centre) as the primary,
-   if the primary becomes unavailable it's possible for the standby to decide whether
-   it can promote itself without risking a "split brain" scenario: if it can't see either the
-   witness or the primary server, it's likely there's a network-level interruption
-   and it should not promote itself. If it can seen the witness but not the primary,
-   this proves there is no network interruption and the primary itself is unavailable,
-   and it can therefore promote itself (and ideally take action to fence the
-   former primary).
- </para>
- <note>
-   <para>
-     <emphasis>Never</emphasis> install a witness server on the same physical host
-     as another node in the replication cluster managed by &repmgr; - it's essential
-     the witness is not affected in any way by failure of another node.
-   </para>
- </note>
- <para>
-   For more complex replication scenarios,e.g. with multiple datacentres, it may
-   be preferable to use location-based failover, which ensures that only nodes
-   in the same location as the primary will ever be promotion candidates;
-   see <xref linkend="repmgrd-network-split"> for more details.
- </para>
-
- <note>
-   <simpara>
-     A witness server will only be useful if <application>repmgrd</application>
-     is in use.
-   </simpara>
- </note>
-
- <sect1 id="creating-witness-server">
-   <title>Creating a witness server</title>
- <para>
-   To create a witness server, set up a normal PostgreSQL instance on a server
-   in the same physical location as the cluster's primary server.
- </para>
- <para>
-   This instance should *not* be on the same physical host as the primary server,
-   as otherwise if the primary server fails due to hardware issues, the witness
-   server will be lost too.
- </para>
- <note>
-   <simpara>
-     &repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
-     command, which would automatically create a PostgreSQL instance. However
-     this often resulted in an unsatisfactory, hard-to-customise instance.
-   </simpara>
- </note>
- <para>
-   The witness server should be configured in the same way as a normal
-   &repmgr; node; see section <xref linkend="configuration">.
- </para>
- <para>
-   Register the witness server with <xref linkend="repmgr-witness-register">.
-   This will create the &repmgr; extension on the witness server, and make
-   a copy of the &repmgr; metadata.
- </para>
- <note>
-   <simpara>
-    As the witness server is not part of the replication cluster, further
-    changes to the &repmgr; metadata will be synchronised by
-    <application>repmgrd</application>.
-   </simpara>
- </note>
- <para>
-   Once the witness server has been configured, <application>repmgrd</application>
-   should be started; for more details see <xref linkend="repmgrd-witness-server">.
- </para>
-
- <para>
-  To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
- </para>
-
- </sect1>
-</chapter>
--- a/doc/event-notifications.sgml
+++ b/doc/event-notifications.sgml
@@ -88,7 +88,7 @@

 <para>
  The values provided for <literal>%t</literal> and <literal>%d</literal>
-  will probably contain spaces, so should be quoted in the provided command
+  may contain spaces, so should be quoted in the provided command
  configuration, e.g.:
  <programlisting>
    event_notification_command='/path/to/some/script %n %e %s "%t" "%d"'
--- a/doc/filelist.sgml
+++ b/doc/filelist.sgml
@@ -45,20 +45,14 @@
 <!ENTITY promoting-standby  SYSTEM "promoting-standby.sgml">
 <!ENTITY follow-new-primary  SYSTEM "follow-new-primary.sgml">
 <!ENTITY switchover  SYSTEM "switchover.sgml">
-<!ENTITY configuring-witness-server SYSTEM "configuring-witness-server.sgml">

 <!ENTITY event-notifications  SYSTEM "event-notifications.sgml">
 <!ENTITY upgrading-repmgr  SYSTEM "upgrading-repmgr.sgml">

+<!ENTITY repmgrd-overview SYSTEM "repmgrd-overview.sgml">
 <!ENTITY repmgrd-automatic-failover SYSTEM "repmgrd-automatic-failover.sgml">
 <!ENTITY repmgrd-configuration SYSTEM "repmgrd-configuration.sgml">
-<!ENTITY repmgrd-demonstration SYSTEM "repmgrd-demonstration.sgml">
-<!ENTITY repmgrd-monitoring SYSTEM "repmgrd-monitoring.sgml">
-<!ENTITY repmgrd-degraded-monitoring SYSTEM "repmgrd-degraded-monitoring.sgml">
-<!ENTITY repmgrd-cascading-replication SYSTEM "repmgrd-cascading-replication.sgml">
-<!ENTITY repmgrd-network-split SYSTEM "repmgrd-network-split.sgml">
-<!ENTITY repmgrd-witness-server SYSTEM "repmgrd-witness-server.sgml">
-<!ENTITY repmgrd-pausing SYSTEM "repmgrd-pausing.sgml">
+<!ENTITY repmgrd-operation SYSTEM "repmgrd-operation.sgml">
 <!ENTITY repmgrd-bdr SYSTEM "repmgrd-bdr.sgml">

 <!ENTITY repmgr-primary-register SYSTEM "repmgr-primary-register.sgml">
@@ -81,6 +75,8 @@
 <!ENTITY repmgr-cluster-event SYSTEM "repmgr-cluster-event.sgml">
 <!ENTITY repmgr-cluster-cleanup SYSTEM "repmgr-cluster-cleanup.sgml">
 <!ENTITY repmgr-daemon-status SYSTEM "repmgr-daemon-status.sgml">
+<!ENTITY repmgr-daemon-start SYSTEM "repmgr-daemon-start.sgml">
+<!ENTITY repmgr-daemon-stop SYSTEM "repmgr-daemon-stop.sgml">
 <!ENTITY repmgr-daemon-pause SYSTEM "repmgr-daemon-pause.sgml">
 <!ENTITY repmgr-daemon-unpause SYSTEM "repmgr-daemon-unpause.sgml">

@@ -88,6 +84,7 @@
 <!ENTITY appendix-faq      SYSTEM "appendix-faq.sgml">
 <!ENTITY appendix-signatures      SYSTEM "appendix-signatures.sgml">
 <!ENTITY appendix-packages      SYSTEM "appendix-packages.sgml">
+<!ENTITY appendix-support SYSTEM "appendix-support.sgml">

 <!ENTITY bookindex  SYSTEM "bookindex.sgml">

--- a/doc/follow-new-primary.sgml
+++ b/doc/follow-new-primary.sgml
@@ -15,7 +15,7 @@
  end of the preceding section (<xref linkend="promoting-standby">),
  execute this:
  <programlisting>
-    $ repmgr -f /etc/repmgr.conf repmgr standby follow
+    $ repmgr -f /etc/repmgr.conf standby follow
    INFO: changing node 3's primary to node 2
    NOTICE: restarting server using "pg_ctl -l /var/log/postgresql/startup.log -w -D '/var/lib/postgresql/data' restart"
    waiting for server to shut down......... done
--- a/doc/install-packages.sgml
+++ b/doc/install-packages.sgml
@@ -1,5 +1,11 @@
 <sect1 id="installation-packages" xreflabel="Installing from packages">
 <title>Installing &repmgr; from packages</title>
+
+  <indexterm>
+   <primary>installation</primary>
+   <secondary>from packages</secondary>
+  </indexterm>
+
 <para>
  We recommend installing &repmgr; using the available packages for your
  system.
@@ -160,7 +166,17 @@ yum search repmgr</programlisting>
      <programlisting>
        [root@localhost ~]# yum install repmgr10-4.0.3-1.rhel7</programlisting>
    </para>
+
+    <para>
+      <emphasis>Installing old packages</emphasis>
+    </para>
+    <para>
+      See appendix <link linkend="packages-old-versions-rhel-centos">Installing old package versions</link>
+      for details on how to retrieve older package versions.
+    </para>
+
  </sect3>
+
 </sect2>

 <sect2 id="installation-packages-debian" xreflabel="Installing from packages on Debian or Ubuntu">
@@ -224,7 +240,6 @@ curl https://dl.2ndquadrant.com/default/release/get/deb | sudo bash</programlist
          </note>
        </listitem>

-
 	<listitem>
 	  <para>
            Install the &repmgr version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
@@ -244,6 +259,15 @@ sudo apt-get install postgresql-10-repmgr</programlisting>

    </para>

+    <para>
+      <emphasis>Installing old packages</emphasis>
+    </para>
+    <para>
+      See appendix <link linkend="packages-old-versions-debian">Installing old package versions</link>
+      for details on how to retrieve older package versions.
+    </para>
+
+
  </sect3>
 </sect2>

--- a/doc/install-source.sgml
+++ b/doc/install-source.sgml
@@ -12,8 +12,8 @@
   To install &repmgr; the prerequisites for compiling
   &postgres; must be installed. These are described in &postgres;'s
   documentation
-   on <ulink url="https://www.postgresql.org/docs/current/static/install-requirements.html">build requirements</ulink>
-   and <ulink url="https://www.postgresql.org/docs/current/static/docguide-toolsets.html">build requirements for documentation</ulink>.
+   on <ulink url="https://www.postgresql.org/docs/current/install-requirements.html">build requirements</ulink>
+   and <ulink url="https://www.postgresql.org/docs/current/docguide-toolsets.html">build requirements for documentation</ulink>.
  </para>

  <para>
@@ -136,6 +136,16 @@ deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisti
         </itemizedlist>
       </para>
     </note>
+
+     <tip>
+       <para>
+         If building against PostgreSQL 11 or later configured with the <option>--with-llvm</option> option
+         (this is the case with the PGDG-provided packages) you'll also need to install the
+         <literal>llvm-toolset-7-clang</literal> package. This is available via the
+         <ulink url="https://wiki.centos.org/AdditionalResources/Repositories/SCL">Software Collections (SCL) Repository</ulink>.
+       </para>
+     </tip>
+
    </listitem>
   </itemizedlist>
  </para>
@@ -232,7 +242,7 @@ deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisti
    The &repmgr; documentation is (like the main PostgreSQL project)
    written in DocBook format. To build it locally as HTML, you'll need to
    install the required packages as described in the
-    <ulink url="https://www.postgresql.org/docs/9.6/static/docguide-toolsets.html">
+    <ulink url="https://www.postgresql.org/docs/9.6/docguide-toolsets.html">
      PostgreSQL documentation</ulink> then execute:
   <programlisting>
    ./configure && make install-doc</programlisting>
--- a/doc/legal.sgml
+++ b/doc/legal.sgml
@@ -3,7 +3,7 @@
 <date>2017</date>

 <copyright>
- <year>2010-2018</year>
+ <year>2010-2019</year>
 <holder>2ndQuadrant, Ltd.</holder>
 </copyright>

@@ -11,7 +11,7 @@
 <title>Legal Notice</title>

 <para>
-  <productname>repmgr</productname> is Copyright &copy; 2010-2018
+  <productname>repmgr</productname> is Copyright &copy; 2010-2019
  by 2ndQuadrant, Ltd. All rights reserved.
 </para>

--- a/doc/quickstart.sgml
+++ b/doc/quickstart.sgml
@@ -97,7 +97,7 @@
    #  PostgreSQL 9.6 and later: one of 'replica' or 'logical'
    #    ('hot_standby' will still be accepted as an alias for 'replica')
    #
-    # See: https://www.postgresql.org/docs/current/static/runtime-config-wal.html#GUC-WAL-LEVEL
+    # See: https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-WAL-LEVEL

    wal_level = 'hot_standby'

@@ -224,7 +224,7 @@
  <note>
   <para>
    &repmgr; stores connection information as <ulink
-    url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING">libpq
+    url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING">libpq
    connection strings</ulink> throughout. This documentation refers to them as <literal>conninfo</literal>
    strings; an alternative name is <literal>DSN</literal> (<literal>data source name</literal>).
    We'll use these in place of the <command>-h hostname -d databasename -U username</command> syntax.
@@ -446,7 +446,7 @@
  </para>
  <para>
    From PostgreSQL 9.6 you can also use the view
-    <ulink url="https://www.postgresql.org/docs/current/static/monitoring-stats.html#PG-STAT-WAL-RECEIVER-VIEW">
+    <ulink url="https://www.postgresql.org/docs/current/monitoring-stats.html#PG-STAT-WAL-RECEIVER-VIEW">
    <literal>pg_stat_wal_receiver</literal></ulink> to check the replication status from the standby.

   <programlisting>
--- a/doc/repmgr-cluster-crosscheck.sgml
+++ b/doc/repmgr-cluster-crosscheck.sgml
@@ -42,7 +42,7 @@
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr cluster crosscheck</command>:
+      One of the following exit codes will be emitted by <command>repmgr cluster crosscheck</command>:
    </para>
    <variablelist>

--- a/doc/repmgr-cluster-matrix.sgml
+++ b/doc/repmgr-cluster-matrix.sgml
@@ -102,7 +102,7 @@
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr cluster matrix</command>:
+      One of the following exit codes will be emitted by <command>repmgr cluster matrix</command>:
    </para>
    <variablelist>

--- a/doc/repmgr-cluster-show.sgml
+++ b/doc/repmgr-cluster-show.sgml
@@ -22,6 +22,14 @@
      directly and can be run on any node in the cluster; this is also useful when analyzing
      connectivity from a particular node.
    </para>
+
+    <para>
+      Node availability is tested by connecting from the node where
+      <command>repmgr cluster show</command> is executed, and does not necessarily imply the node
+      is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
+      better overviews of connections between nodes.
+    </para>
+
  </refsect1>

  <refsect1>
@@ -44,39 +52,59 @@
    <programlisting>
    $ repmgr -f /etc/repmgr.conf cluster show

-     ID | Name  | Role    | Status    | Upstream | Location | Connection string
-    ----+-------+---------+-----------+----------+----------+-----------------------------------------
-     1  | node1 | primary | * running |          | default  | host=db_node1 dbname=repmgr user=repmgr
-     2  | node2 | standby |   running | node1    | default  | host=db_node2 dbname=repmgr user=repmgr
-     3  | node3 | standby |   running | node1    | default  | host=db_node3 dbname=repmgr user=repmgr</programlisting>
+     ID | Name  | Role    | Status    | Upstream | Location | Priority | Connection string
+    ----+-------+---------+-----------+----------+----------+----------+-----------------------------------------
+     1  | node1 | primary | * running |          | default  | 100      | host=db_node1 dbname=repmgr user=repmgr
+     2  | node2 | standby |   running | node1    | default  | 100      | host=db_node2 dbname=repmgr user=repmgr
+     3  | node3 | standby |   running | node1    | default  | 100      | host=db_node3 dbname=repmgr user=repmgr</programlisting>
  </para>
  </refsect1>
  <refsect1>
    <title>Notes</title>
    <para>
      The column <literal>Role</literal> shows the expected server role according to the
-      &repmgr; metadata. <literal>Status</literal> shows whether the server is running or unreachable.
+      &repmgr; metadata.
+	</para>
+	<para>
+	  <literal>Status</literal> shows whether the server is running or unreachable.
      If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually
-      promoted to primary, this will be highlighted with an exclamation mark, e.g.:
+      promoted to primary, this will be highlighted with an exclamation mark.
+	  If a connection to the node cannot be made, this will be highlighted with a question mark.
+	  Note that the node will only be shown as <literal>? unreachable</literal>
+	  if a connection is not possible at network level; if the PostgreSQL instance on the
+	  node is pingable but not accepting connections, it will be shown as <literal>? running</literal>.
+	</para>
+	<para>
+	  In the following example, executed on <literal>node3</literal>, <literal>node1</literal> is not reachable
+	  at network level and assumed to be down; <literal>node2</literal> has been promoted to primary
+	  (but <literal>node3</literal> is not attached to it, and its metadata has not yet been updated);
+	  <literal>node4</literal> is running but rejecting connections (from <literal>node3</literal> at least).
      <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster show
+	 ID | Name  | Role    | Status               | Upstream | Location | Priority | Connection string
+	----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
+	 1  | node1 | primary | ? unreachable        |          | default  | 100      | host=db_node1 dbname=repmgr user=repmgr
+	 2  | node2 | standby | ! running as primary | node1    | default  | 100      | host=db_node2 dbname=repmgr user=repmgr
+	 3  | node3 | standby |   running            | node1    | default  | 100      | host=db_node3 dbname=repmgr user=repmgr
+	 4  | node4 | standby | ? running            | node1    | default  | 100      | host=db_node4 dbname=repmgr user=repmgr

-     ID | Name  | Role    | Status               | Upstream | Location | Connection string
-    ----+-------+---------+----------------------+----------+----------+-----------------------------------------
-     1  | node1 | primary | ? unreachable        |          | default  | host=db_node1 dbname=repmgr user=repmgr
-     2  | node2 | standby | ! running as primary | node1    | default  | host=db_node2 dbname=repmgr user=repmgr
-     3  | node3 | standby |   running            | node1    | default  | host=db_node3 dbname=repmgr user=repmgr
-
-    WARNING: following issues were detected
-      node "node1" (ID: 1) is registered as an active primary but is unreachable
-      node "node2" (ID: 2) is registered as standby but running as primary</programlisting>
-    </para>
-    <para>
-      Node availability is tested by connecting from the node where
-      <command>repmgr cluster show</command> is executed, and does not necessarily imply the node
-      is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
-          a better overviews of connections between nodes.
+	WARNING: following issues were detected
+	  - unable to connect to node "node1" (ID: 1)
+	  - node "node1" (ID: 1) is registered as an active primary but is unreachable
+	  - node "node2" (ID: 2) is registered as standby but running as primary
+	  - unable to connect to node "node4" (ID: 4)
+    HINT: execute with --verbose option to see connection error messages</programlisting>
    </para>
+	<para>
+	  To diagnose connection issues, execute <command>repmgr cluster show</command>
+	  with the <option>--verbose</option> option; this will display the error message
+	  for each failed connection attempt.
+	</para>
+	<tip>
+	  <para>
+		Use <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck">
+		to diagnose connection issues across the whole replication cluster.
+	  </para>
+	</tip>
  </refsect1>

  <refsect1>
@@ -87,38 +115,56 @@
      <varlistentry>
        <term><option>--csv</option></term>
        <listitem>
-		  <para>
-			<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
-			outputs the replication cluster's status in a simple CSV format, suitable for
-			parsing by scripts, e.g.:
-			<programlisting>
+	  <para>
+	    <command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
+	    outputs the replication cluster's status in a simple CSV format, suitable for
+	    parsing by scripts, e.g.:
+	    <programlisting>
    $ repmgr -f /etc/repmgr.conf cluster show --csv
    1,-1,-1
    2,0,0
    3,0,1</programlisting>
-		  </para>
-		  <para>
-			The columns have following meanings:
-			<itemizedlist spacing="compact" mark="bullet">
-			  <listitem>
-				<simpara>
-				  node ID
-				</simpara>
-			  </listitem>
-			  <listitem>
-				<simpara>
+	  </para>
+	  <para>
+	    The columns have the following meanings:
+	    <itemizedlist spacing="compact" mark="bullet">
+	      <listitem>
+		<simpara>
+		  node ID
+		</simpara>
+	      </listitem>
+	      <listitem>
+		<simpara>
            availability (0 = available, -1 = unavailable)
-				</simpara>
-			  </listitem>
-			  <listitem>
-				<simpara>
-				  recovery state (0 = not in recovery, 1 = in recovery, -1 = unknown)
-				</simpara>
-			  </listitem>
-			</itemizedlist>
-		  </para>
-		</listitem>
-	  </varlistentry>
+		</simpara>
+	      </listitem>
+	      <listitem>
+		<simpara>
+            recovery state (0 = not in recovery, 1 = in recovery, -1 = unknown)
+		</simpara>
+	      </listitem>
+	    </itemizedlist>
+	  </para>
+	</listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>--compact</option></term>
+        <listitem>
+          <para>
+			Suppress display of the <literal>conninfo</literal> column.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>--terse</option></term>
+        <listitem>
+          <para>
+			Suppress warnings about connection issues.
+          </para>
+        </listitem>
+      </varlistentry>

      <varlistentry>
        <term><option>--verbose</option></term>
@@ -137,7 +183,7 @@
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr cluster show</command>:
+      One of the following exit codes will be emitted by <command>repmgr cluster show</command>:
    </para>
    <variablelist>

--- a/doc/repmgr-daemon-pause.sgml
+++ b/doc/repmgr-daemon-pause.sgml
@@ -3,6 +3,11 @@
    <primary>repmgr daemon pause</primary>
  </indexterm>

+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>pausing</secondary>
+  </indexterm>
+
  <refmeta>
    <refentrytitle>repmgr daemon pause</refentrytitle>
  </refmeta>
@@ -74,7 +79,7 @@ NOTICE: node 3 (node3) paused</programlisting>
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr daemon unpause</command>:
+      One of the following exit codes will be emitted by <command>repmgr daemon pause</command>:
    </para>
    <variablelist>

--- a/doc/repmgr-daemon-start.sgml
+++ b/doc/repmgr-daemon-start.sgml
@@ -0,0 +1,203 @@
+<refentry id="repmgr-daemon-start">
+  <indexterm>
+    <primary>repmgr daemon start</primary>
+  </indexterm>
+
+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>starting</secondary>
+  </indexterm>
+
+  <refmeta>
+    <refentrytitle>repmgr daemon start</refentrytitle>
+  </refmeta>
+
+  <refnamediv>
+    <refname>repmgr daemon start</refname>
+    <refpurpose>Start the <application>repmgrd</application> daemon</refpurpose>
+  </refnamediv>
+
+  <refsect1>
+    <title>Description</title>
+    <para>
+      This command starts the <application>repmgrd</application> daemon on the
+      local node.
+    </para>
+    <para>
+      By default, &repmgr; will wait for up to 15 seconds to confirm that <application>repmgrd</application>
+      started. This behaviour can be overridden by specifying a different value using the <option>--wait</option>
+      option, or disabled altogether with the <option>--no-wait</option> option.
+    </para>
+
+    <important>
+      <para>
+        The <filename>repmgr.conf</filename> parameter <varname>repmgrd_service_start_command</varname>
+        must be set for <command>repmgr daemon start</command> to work; see section
+        <xref linkend="repmgr-daemon-start-configuration"> for details.
+      </para>
+    </important>
+  </refsect1>
+
+
+
+  <refsect1>
+
+    <title>Options</title>
+
+    <variablelist>
+
+      <varlistentry>
+        <term><option>--dry-run</option></term>
+        <listitem>
+          <para>
+            Check prerequisites but don't actually attempt to start <application>repmgrd</application>.
+          </para>
+          <para>
+            This action will output the command which would be executed.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>-w</option></term>
+        <term><option>--wait</option></term>
+        <listitem>
+          <para>
+            Wait for the specified number of seconds to confirm that <application>repmgrd</application>
+            started successfully.
+          </para>
+          <para>
+            Note that providing <option>--wait=0</option> is the equivalent of <option>--no-wait</option>.
+          </para>
+
+         </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>--no-wait</option></term>
+        <listitem>
+          <para>
+            Don't wait to confirm that <application>repmgrd</application>
+            started successfully.
+          </para>
+          <para>
+            This is equivalent to providing <option>--wait=0</option>.
+          </para>
+         </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1 id="repmgr-daemon-start-configuration" xreflabel="repmgr daemon start configuration">
+    <title>Configuration file settings</title>
+    <para>
+     The following parameter in <filename>repmgr.conf</filename> is relevant
+     to <command>repmgr daemon start</command>:
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+        <indexterm>
+          <primary>repmgrd_service_start_command</primary>
+          <secondary>with &quot;repmgr daemon start&quot;</secondary>
+        </indexterm>
+
+        <term><option>repmgrd_service_start_command</option></term>
+        <listitem>
+          <para>
+            <command>repmgr daemon start</command> will execute the command defined by the
+            <varname>repmgrd_service_start_command</varname> parameter in <filename>repmgr.conf</filename>.
+            This must be set to a shell command which will start <application>repmgrd</application>;
+            if &repmgr; was installed from a package, this will be the service command defined by the
+            package. For more details see <link linkend="appendix-packages">Appendix: &repmgr; package details</link>.
+          </para>
+          <important>
+            <para>
+              If &repmgr; was installed from a system package, and you do not configure
+              <varname>repmgrd_service_start_command</varname> to an appropriate service command, this may
+              result in the system becoming confused about the state of the <application>repmgrd</application>
+              service; this is particularly the case with <literal>systemd</literal>.
+            </para>
+          </important>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+
+  </refsect1>
+
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      One of the following exit codes will be emitted by <command>repmgr daemon start</command>:
+    </para>
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            The <application>repmgrd</application> start command (defined in
+            <varname>repmgrd_service_start_command</varname>) was successfully executed.
+          </para>
+          <para>
+            If the <option>--wait</option> option was provided, &repmgr; will confirm that
+            <application>repmgrd</application> has actually started up.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_BAD_CONFIG (1)</option></term>
+        <listitem>
+          <para>
+            <varname>repmgrd_service_start_command</varname> is not defined in
+            <filename>repmgr.conf</filename>.
+          </para>
+        </listitem>
+      </varlistentry>
+
+
+      <varlistentry>
+        <term><option>ERR_DB_CONN (6)</option></term>
+        <listitem>
+          <para>
+            &repmgr; was unable to connect to the local PostgreSQL node.
+          </para>
+          <para>
+            PostgreSQL must be running before <application>repmgrd</application>
+            can be started. Additionally, unless the <option>--no-wait</option> option was
+            provided, &repmgr; needs to be able to connect to the local PostgreSQL node
+            to determine the state of <application>repmgrd</application>.
+          </para>
+        </listitem>
+      </varlistentry>
+
+
+      <varlistentry>
+        <term><option>ERR_REPMGRD_SERVICE (27)</option></term>
+        <listitem>
+          <para>
+            The <application>repmgrd</application> start command (defined in
+            <varname>repmgrd_service_start_command</varname>) was not successfully executed.
+          </para>
+          <para>
+            This can also mean that &repmgr; was unable to confirm whether <application>repmgrd</application>
+            successfully started (unless the <option>--no-wait</option> option was provided).
+          </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>See also</title>
+    <para>
+      <xref linkend="repmgr-daemon-stop">, <xref linkend="repmgr-daemon-status">, <xref linkend="repmgrd-daemon">
+    </para>
+  </refsect1>
+
+</refentry>
--- a/doc/repmgr-daemon-status.sgml
+++ b/doc/repmgr-daemon-status.sgml
@@ -3,6 +3,11 @@
    <primary>repmgr daemon status</primary>
  </indexterm>

+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>displaying daemon status</secondary>
+  </indexterm>
+
  <refmeta>
    <refentrytitle>repmgr daemon status</refentrytitle>
  </refmeta>
@@ -28,7 +33,10 @@
      <command>repmgr daemon status</command> can be executed on any active node in the
      replication cluster. A valid <filename>repmgr.conf</filename> file is required.
    </para>
-
+    <para>
+      If PostgreSQL is not running on a node, &repmgr; will not be able to determine the
+      status of that node's <application>repmgrd</application> instance.
+    </para>
    <note>
      <para>
        After restarting PostgreSQL on any node, the <application>repmgrd</application> instance
@@ -44,33 +52,34 @@
    <para>
      <application>repmgrd</application> running normally on all nodes:
    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
- ID | Name  | Role    | Status  | repmgrd | PID  | Paused?
----+-------+---------+---------+---------+------+---------
- 1  | node1 | primary | running | running | 7851 | no
- 2  | node2 | standby | running | running | 7889 | no
- 3  | node3 | standby | running | running | 7918 | no</programlisting>
+ ID | Name  | Role    | Priority | Status  | repmgrd | PID   | Paused? | Upstream last seen
+----+-------+---------+----------+---------+---------+-------+---------+--------------------
+ 1  | node1 | primary | 100      | running | running | 71987 | no      | n/a
+ 2  | node2 | standby | 100      | running | running | 71996 | no      | 1 second(s) ago
+ 3  | node3 | standby | 100      | running | running | 72042 | no      | 1 second(s) ago
+</programlisting>
    </para>

    <para>
      <application>repmgrd</application> paused on all nodes (using <xref linkend="repmgr-daemon-pause">):
    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
- ID | Name  | Role    | Status  | repmgrd | PID  | Paused?
----+-------+---------+---------+---------+------+---------
- 1  | node1 | primary | running | running | 7851 | yes
- 2  | node2 | standby | running | running | 7889 | yes
- 3  | node3 | standby | running | running | 7918 | yes</programlisting>
+ ID | Name  | Role    | Priority | Status  | repmgrd | PID   | Paused? | Upstream last seen
+----+-------+---------+----------+---------+---------+-------+---------+--------------------
+ 1  | node1 | primary | 100      | running | running | 71987 | yes     | n/a
+ 2  | node2 | standby | 100      | running | running | 71996 | yes     | 0 second(s) ago
+ 3  | node3 | standby | 100      | running | running | 72042 | yes     | 0 second(s) ago
+</programlisting>
    </para>

    <para>
      <application>repmgrd</application> not running on one node:
    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
- ID | Name  | Role    | Status  | repmgrd     | PID  | Paused?
----+-------+---------+---------+-------------+------+---------
- 1  | node1 | primary | running | running     | 7851 | yes
- 2  | node2 | standby | running | not running | n/a  | n/a
- 3  | node3 | standby | running | running     | 7918 | yes</programlisting>
+ ID | Name  | Role    | Priority | Status  | repmgrd     | PID   | Paused? | Upstream last seen
+----+-------+---------+----------+---------+-------------+-------+---------+--------------------
+ 1  | node1 | primary | 100      | running | running     | 71987 | yes     | n/a
+ 2  | node2 | standby | 100      | running | not running | n/a   | n/a     | n/a
+ 3  | node3 | standby | 100      | running | running     | 72042 | yes     | 0 second(s) ago</programlisting>
    </para>
-
  </refsect1>

  <refsect1>
@@ -81,76 +90,88 @@
      <varlistentry>
        <term><option>--csv</option></term>
        <listitem>
-		  <para>
-			<command>repmgr daemon status</command> accepts an optional parameter <literal>--csv</literal>, which
-			outputs the replication cluster's status in a simple CSV format, suitable for
-			parsing by scripts, e.g.:
-			<programlisting>
+          <para>
+            <command>repmgr daemon status</command> accepts an optional parameter <literal>--csv</literal>, which
+            outputs the replication cluster's status in a simple CSV format, suitable for
+            parsing by scripts, e.g.:
+            <programlisting>
    $ repmgr -f /etc/repmgr.conf daemon status --csv
-    1,node1,primary,1,1,10204,1
-    2,node2,standby,1,0,-1,1
-    3,node3,standby,1,1,10225,1</programlisting>
-		  </para>
-		  <para>
-			The columns have following meanings:
-			<itemizedlist spacing="compact" mark="bullet">
-			  <listitem>
-				<simpara>
-				  node ID
-				</simpara>
-			  </listitem>
+    1,node1,primary,1,1,5722,1,100,-1
+    2,node2,standby,1,0,-1,1,100,1
+    3,node3,standby,1,1,5779,1,100,1</programlisting>
+          </para>
+          <para>
+            The columns have the following meanings:
+            <itemizedlist spacing="compact" mark="bullet">
+              <listitem>
+                <simpara>
+                  node ID
+                </simpara>
+              </listitem>

-			  <listitem>
-				<simpara>
+              <listitem>
+                <simpara>
                  node name
-				</simpara>
-			  </listitem>
+                </simpara>
+              </listitem>

-			  <listitem>
-				<simpara>
+              <listitem>
+                <simpara>
                  node type (primary or standby)
-				</simpara>
-			  </listitem>
+                </simpara>
+              </listitem>

-			  <listitem>
-				<simpara>
-                  PostgreSQL server running
-				</simpara>
-			  </listitem>
+              <listitem>
+                <simpara>
+                  PostgreSQL server running (1 = running, 0 = not running)
+                </simpara>
+              </listitem>

-			  <listitem>
-				<simpara>
-                  <application>repmgrd</application> running (1 = running, 0 = not running)
-				</simpara>
-			  </listitem>
+              <listitem>
+                <simpara>
+                  <application>repmgrd</application> running (1 = running, 0 = not running, -1 = unknown)
+                </simpara>
+              </listitem>

-			  <listitem>
-				<simpara>
-                  <application>repmgrd</application> PID (-1 if not running)
-				</simpara>
-			  </listitem>
+              <listitem>
+                <simpara>
+                  <application>repmgrd</application> PID (-1 if not running or status unknown)
+                </simpara>
+              </listitem>

-			  <listitem>
-				<simpara>
-                  <application>repmgrd</application> paused (1 = paused, 0 = not paused)
-				</simpara>
-			  </listitem>
+              <listitem>
+                <simpara>
+                  <application>repmgrd</application> paused (1 = paused, 0 = not paused, -1 = unknown)
+                </simpara>
+              </listitem>

-			</itemizedlist>
-		  </para>
-		</listitem>
-	  </varlistentry>
+              <listitem>
+                <simpara>
+                  <application>repmgrd</application> node priority
+                </simpara>
+              </listitem>
+
+              <listitem>
+                <simpara>
+                  interval in seconds since the node's upstream was last seen (this will be -1 if the value could not be retrieved, or the node is primary)
+                </simpara>
+              </listitem>
+
+            </itemizedlist>
+          </para>
+        </listitem>
+      </varlistentry>

      <varlistentry>
        <term><option>--verbose</option></term>
        <listitem>
          <para>
-			Display the full text of any database connection error messages
+            Display the full text of any database connection error messages
          </para>
        </listitem>
      </varlistentry>

-	</variablelist>
+    </variablelist>

  </refsect1>

--- a/doc/repmgr-daemon-stop.sgml
+++ b/doc/repmgr-daemon-stop.sgml
@@ -0,0 +1,200 @@
+<refentry id="repmgr-daemon-stop">
+  <indexterm>
+    <primary>repmgr daemon stop</primary>
+  </indexterm>
+
+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>stopping</secondary>
+  </indexterm>
+
+  <refmeta>
+    <refentrytitle>repmgr daemon stop</refentrytitle>
+  </refmeta>
+
+  <refnamediv>
+    <refname>repmgr daemon stop</refname>
+    <refpurpose>Stop the <application>repmgrd</application> daemon</refpurpose>
+  </refnamediv>
+
+  <refsect1>
+    <title>Description</title>
+    <para>
+      This command stops the <application>repmgrd</application> daemon on the
+      local node.
+    </para>
+
+    <para>
+      By default, &repmgr; will wait for up to 15 seconds to confirm that <application>repmgrd</application>
+      stopped. This behaviour can be overridden by specifying a different value using the <option>--wait</option>
+      option, or disabled altogether with the <option>--no-wait</option> option.
+    </para>
+    <note>
+      <para>
+        If PostgreSQL is not running on the local node, under some circumstances &repmgr; may not
+        be able to confirm if <application>repmgrd</application> has actually stopped.
+      </para>
+    </note>
+
+   <important>
+      <para>
+        The <filename>repmgr.conf</filename> parameter <varname>repmgrd_service_stop_command</varname>
+        must be set for <command>repmgr daemon stop</command> to work; see section
+        <xref linkend="repmgr-daemon-stop-configuration"> for details.
+      </para>
+    </important>
+  </refsect1>
+
+  <refsect1>
+    <title>Configuration</title>
+    <para>
+      <command>repmgr daemon stop</command> will execute the command defined by the
+      <varname>repmgrd_service_stop_command</varname> parameter in <filename>repmgr.conf</filename>.
+      This must be set to a shell command which will stop <application>repmgrd</application>;
+      if &repmgr; was installed from a package, this will be the service command defined by the
+      package. For more details see <link linkend="appendix-packages">Appendix: &repmgr; package details</link>.
+    </para>
+
+    <important>
+      <para>
+        If &repmgr; was installed from a system package, and you do not configure
+        <varname>repmgrd_service_stop_command</varname> to an appropriate service command, this may
+        result in the system becoming confused about the state of the <application>repmgrd</application>
+        service; this is particularly the case with <literal>systemd</literal>.
+      </para>
+    </important>
+
+  </refsect1>
+
+  <refsect1>
+
+    <title>Options</title>
+
+    <variablelist>
+
+      <varlistentry>
+        <term><option>--dry-run</option></term>
+        <listitem>
+          <para>
+            Check prerequisites but don't actually attempt to stop <application>repmgrd</application>.
+          </para>
+          <para>
+            This action will output the command which would be executed.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>-w</option></term>
+        <term><option>--wait</option></term>
+        <listitem>
+          <para>
+            Wait for the specified number of seconds to confirm that <application>repmgrd</application>
+            stopped successfully.
+          </para>
+          <para>
+            Note that providing <option>--wait=0</option> is the equivalent of <option>--no-wait</option>.
+          </para>
+
+         </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>--no-wait</option></term>
+        <listitem>
+          <para>
+            Don't wait to confirm that <application>repmgrd</application>
+            stopped successfully.
+          </para>
+          <para>
+            This is equivalent to providing <option>--wait=0</option>.
+          </para>
+         </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1 id="repmgr-daemon-stop-configuration" xreflabel="repmgr daemon stop configuration">
+    <title>Configuration file settings</title>
+    <para>
+     The following parameter in <filename>repmgr.conf</filename> is relevant
+     to <command>repmgr daemon stop</command>:
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+        <indexterm>
+          <primary>repmgrd_service_stop_command</primary>
+          <secondary>with &quot;repmgr daemon stop&quot;</secondary>
+        </indexterm>
+
+        <term><option>repmgrd_service_stop_command</option></term>
+        <listitem>
+          <para>
+            <command>repmgr daemon stop</command> will execute the command defined by the
+            <varname>repmgrd_service_stop_command</varname> parameter in <filename>repmgr.conf</filename>.
+            This must be set to a shell command which will stop <application>repmgrd</application>;
+            if &repmgr; was installed from a package, this will be the service command defined by the
+            package. For more details see <link linkend="appendix-packages">Appendix: &repmgr; package details</link>.
+          </para>
+          <important>
+            <para>
+              If &repmgr; was installed from a system package, and you do not configure
+              <varname>repmgrd_service_stop_command</varname> to an appropriate service command, this may
+              result in the system becoming confused about the state of the <application>repmgrd</application>
+              service; this is particularly the case with <literal>systemd</literal>.
+            </para>
+          </important>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      One of the following exit codes will be emitted by <command>repmgr daemon stop</command>:
+    </para>
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            <application>repmgrd</application> could be stopped.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_BAD_CONFIG (1)</option></term>
+        <listitem>
+          <para>
+            <varname>repmgrd_service_stop_command</varname> is not defined in
+            <filename>repmgr.conf</filename>.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_REPMGRD_SERVICE (27)</option></term>
+        <listitem>
+          <para>
+            <application>repmgrd</application> could not be stopped.
+          </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>See also</title>
+    <para>
+      <xref linkend="repmgr-daemon-start">, <xref linkend="repmgr-daemon-status">, <xref linkend="repmgrd-daemon">
+    </para>
+  </refsect1>
+
+</refentry>
--- a/doc/repmgr-daemon-unpause.sgml
+++ b/doc/repmgr-daemon-unpause.sgml
@@ -3,6 +3,12 @@
    <primary>repmgr daemon unpause</primary>
  </indexterm>

+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>unpausing</secondary>
+  </indexterm>
+
+
  <refmeta>
    <refentrytitle>repmgr daemon unpause</refentrytitle>
  </refmeta>
@@ -68,7 +74,7 @@ NOTICE: node 3 (node3) unpaused</programlisting>
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr daemon unpause</command>:
+      One of the following exit codes will be emitted by <command>repmgr daemon unpause</command>:
    </para>
    <variablelist>

--- a/doc/repmgr-node-check.sgml
+++ b/doc/repmgr-node-check.sgml
@@ -18,6 +18,14 @@
      Performs some health checks on a node from a replication perspective.
      This command must be run on the local node.
    </para>
+	<note>
+	  <para>
+		Currently &repmgr; performs health checks on physical replication
+		slots only, with the aim of warning about streaming replication standbys which
+		have become detached and the associated risk of uncontrolled WAL file
+		growth.
+	  </para>
+	</note>
  </refsect1>

  <refsect1>
@@ -30,8 +38,8 @@
            Replication lag: OK (N/A - node is primary)
            WAL archiving: OK (0 pending files)
            Downstream servers: OK (2 of 2 downstream nodes attached)
-            Replication slots: OK (node has no replication slots)
-            Missing replication slots: OK (node has no missing replication slots)</programlisting>
+            Replication slots: OK (node has no physical replication slots)
+            Missing replication slots: OK (node has no missing physical replication slots)</programlisting>
    </para>
  </refsect1>
  <refsect1>
@@ -44,7 +52,7 @@
        OK (node is primary)</programlisting>
    </para>
    <para>
-   Parameters for individual checks are as follows:
+	  Parameters for individual checks are as follows:
    <itemizedlist spacing="compact" mark="bullet">

     <listitem>
@@ -76,16 +84,26 @@

     <listitem>
      <simpara>
-        <literal>--slots</literal>: checks there are no inactive replication slots
+        <literal>--slots</literal>: checks there are no inactive physical replication slots
      </simpara>
     </listitem>

     <listitem>
      <simpara>
-        <literal>--missing-slots</literal>: checks there are no missing replication slots
+        <literal>--missing-slots</literal>: checks there are no missing physical replication slots
      </simpara>
     </listitem>

+     <listitem>
+      <simpara>
+        <literal>--data-directory-config</literal>: checks the data directory configured in
+        <filename>repmgr.conf</filename> matches the actual data directory.
+        This check is not directly related to replication, but is useful to verify &repmgr;
+        is correctly configured.
+      </simpara>
+     </listitem>
+
+
    </itemizedlist>
  </para>
  </refsect1>
@@ -105,6 +123,7 @@
        <listitem>
          <simpara>
            <literal>--nagios</literal>: generate output in a Nagios-compatible format
+            (for individual checks only)
          </simpara>
        </listitem>
      </itemizedlist>
@@ -151,9 +170,10 @@


    <para>
-      Following exit codes can be emitted by <command>repmgr status check</command>
+      One of the following exit codes will be emitted by <command>repmgr node check</command>
      if no individual check was specified.
    </para>
+
    <variablelist>

      <varlistentry>
@@ -175,6 +195,7 @@
      </varlistentry>

   </variablelist>
+
  </refsect1>


--- a/doc/repmgr-node-rejoin.sgml
+++ b/doc/repmgr-node-rejoin.sgml
@@ -119,6 +119,7 @@

    </variablelist>
  </refsect1>
+
  <refsect1>
    <title>Configuration file settings</title>

@@ -132,6 +133,11 @@
 		   the value set in <literal>standby_reconnect_timeout</literal>,
 		   60 seconds).
 		 </simpara>
+         <simpara>
+           Note that <literal>standby_reconnect_timeout</literal> must be
+           set to a value equal to or greater than
+           <literal>node_rejoin_timeout</literal>.
+         </simpara>
 	   </listitem>
 	  </itemizedlist>
 	</para>
@@ -144,6 +150,55 @@
      A <literal>node_rejoin</literal> <link linkend="event-notifications">event notification</link> will be generated.
    </para>
  </refsect1>
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      One of the following exit codes will be emitted by <command>repmgr node rejoin</command>:
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            The node rejoin succeeded; or if <option>--dry-run</option> was provided,
+            no issues were detected which would prevent the node rejoin.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_BAD_CONFIG (1)</option></term>
+        <listitem>
+          <para>
+            A configuration issue was detected which prevented &repmgr; from
+            continuing with the node rejoin.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_NO_RESTART (4)</option></term>
+        <listitem>
+          <para>
+            The node could not be restarted.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_REJOIN_FAIL (24)</option></term>
+        <listitem>
+          <para>
+            The node rejoin operation failed.
+          </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+
+  </refsect1>

  <refsect1>
    <title>Notes</title>
@@ -167,6 +222,10 @@
          postgres --single -D /var/lib/pgsql/data/ &lt; /dev/null</programlisting>
      </para>
    </tip>
+    <para>
+      &repmgr; will attempt to verify whether the node can rejoin as-is, or whether
+      <command>pg_rewind</command> must be used (see following section).
+    </para>
  </refsect1>

  <refsect1 id="repmgr-node-rejoin-pg-rewind" xreflabel="Using pg_rewind">
@@ -188,73 +247,137 @@
        <command>pg_rewind</command> <emphasis>requires</emphasis> that either
        <varname>wal_log_hints</varname> is enabled, or that
        data checksums were enabled when the cluster was initialized. See the
-        <ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html"><command>pg_rewind</command> documentation</ulink> for details.
+        <ulink url="https://www.postgresql.org/docs/current/app-pgrewind.html"><command>pg_rewind</command> documentation</ulink> for details.
      </para>
    </note>

+    <para>
+      We strongly recommend familiarizing yourself with <command>pg_rewind</command> before attempting
+      to use it with &repmgr;, as while it is an extremely useful tool, it is <emphasis>not</emphasis>
+      a &quot;magic bullet&quot; which can resolve all problematic replication situations.
+    </para>
+
+    <para>
+      A typical use-case for <command>pg_rewind</command> is when a scenario like the following
+      is encountered:
+      <programlisting>
+    $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
+        --config-files=postgresql.local.conf,postgresql.conf --verbose --dry-run
+    INFO: replication connection to the rejoin target node was successful
+    INFO: local and rejoin target system identifiers match
+    DETAIL: system identifier is 6652184002263212600
+    ERROR: this node cannot attach to rejoin target node 3
+    DETAIL: rejoin target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/610D710
+    HINT: use --force-rewind to execute pg_rewind</programlisting>
+
+      Here, <literal>node3</literal> was promoted to a primary while the local node was
+      still attached to the previous primary; this can potentially happen during e.g. a
+      network split. <command>pg_rewind</command> can re-sync the local node with <literal>node3</literal>,
+      removing the need for a full reclone.
+    </para>
+
    <para>
      To have <command>repmgr node rejoin</command> use <command>pg_rewind</command>,
      pass the command line option <literal>--force-rewind</literal>, which will tell &repmgr;
      to execute <command>pg_rewind</command> to ensure the node can be rejoined successfully.
    </para>

-    <para>
-      Be aware that if <command>pg_rewind</command> is executed and actually performs a
-      rewind operation, any configuration files in the PostgreSQL data directory will be
-      overwritten with those from the source server.
-    </para>
-    <para>
-      To prevent this happening, provide a comma-separated list of files to retain
-      using the <literal>--config-file</literal> command line option; the specified files
-      will be archived in a temporary directory (whose parent directory can be specified with
-      <literal>--config-archive-dir</literal>) and restored once the rewind operation is
-      complete.
-    </para>
+    <important>
+      <para>
+        Be aware that if <command>pg_rewind</command> is executed and actually performs a
+        rewind operation, any configuration files in the PostgreSQL data directory will be
+        overwritten with those from the source server.
+      </para>
+      <para>
+        To prevent this happening, provide a comma-separated list of files to retain
+        using the <literal>--config-files</literal> command line option; the specified files
+        will be archived in a temporary directory (whose parent directory can be specified with
+        <literal>--config-archive-dir</literal>) and restored once the rewind operation is
+        complete.
+      </para>
+    </important>

    <para>
      Example, first using <literal>--dry-run</literal>, then actually executing the
      <literal>node rejoin command</literal>.
    <programlisting>
-    $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \
-         --force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose --dry-run
-    NOTICE: using provided configuration file "/etc/repmgr.conf"
+    $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
+        --config-files=postgresql.local.conf,postgresql.conf --verbose --force-rewind --dry-run
+    INFO: replication connection to the rejoin target node was successful
+    INFO: local and rejoin target system identifiers match
+    DETAIL: system identifier is 6652460429293670710
+    NOTICE: pg_rewind execution required for this node to attach to rejoin target node 3
+    DETAIL: rejoin target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/610D710
    INFO: prerequisites for using pg_rewind are met
-    INFO: file "postgresql.local.conf" would be copied to "/tmp/repmgr-config-archive-node1/postgresql.local.conf"
-    INFO: file "postgresql.conf" would be copied to "/tmp/repmgr-config-archive-node1/postgresql.local.conf"
-    INFO: 2 files would have been copied to "/tmp/repmgr-config-archive-node1"
-    INFO: directory "/tmp/repmgr-config-archive-node1" deleted
+    INFO: file "postgresql.local.conf" would be copied to "/tmp/repmgr-config-archive-node2/postgresql.local.conf"
+    INFO: file "postgresql.replication-setup.conf" would be copied to "/tmp/repmgr-config-archive-node2/postgresql.replication-setup.conf"
    INFO: pg_rewind would now be executed
    DETAIL: pg_rewind command is:
-      pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node1 dbname=repmgr user=repmgr'</programlisting>
+      pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node3 dbname=repmgr user=repmgr'
+    INFO: prerequisites for executing NODE REJOIN are met</programlisting>

    <note>
      <para>
        If <option>--force-rewind</option> is used with the <option>--dry-run</option> option,
-        this checks the prerequisites for using <application>pg_rewind</application>, but cannot
-        predict the outcome of actually executing <application>pg_rewind</application>.
+        this checks the prerequisites for using <application>pg_rewind</application>, but is
+        not an absolute guarantee that actually executing <application>pg_rewind</application>
+        will succeed. See also section <xref linkend="repmgr-node-rejoin-caveats"> below.
      </para>
+
    </note>

    <programlisting>
-    $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \
-         --force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose
-    NOTICE: using provided configuration file "/etc/repmgr.conf"
-    INFO: prerequisites for using pg_rewind are met
-    INFO: 2 files copied to "/tmp/repmgr-config-archive-node1"
+    $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
+        --config-files=postgresql.local.conf,postgresql.conf --verbose --force-rewind
+    NOTICE: pg_rewind execution required for this node to attach to rejoin target node 3
+    DETAIL: rejoin target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/610D710
    NOTICE: executing pg_rewind
-    NOTICE: 2 files copied to /var/lib/pgsql/data
-    INFO: directory "/tmp/repmgr-config-archive-node1" deleted
-    INFO: deleting "recovery.done"
-    INFO: setting node 1's primary to node 2
-    NOTICE: starting server using "pg_ctl-l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' start"
-    waiting for server to start.... done
-    server started
+    DETAIL: pg_rewind command is "pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node3 dbname=repmgr user=repmgr'"
+    NOTICE: 2 files copied to /var/lib/postgresql/data
+    NOTICE: setting node 2's upstream to node 3
+    NOTICE: starting server using "pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/postgresql/data' start"
    NOTICE: NODE REJOIN successful
-    DETAIL: node 1 is now attached to node 2</programlisting>
+    DETAIL: node 2 is now attached to node 3</programlisting>
    </para>

  </refsect1>

+  <refsect1 id="repmgr-node-rejoin-caveats" xreflabel="Caveats">
+
+   <indexterm>
+      <primary>repmgr node rejoin</primary>
+      <secondary>caveats</secondary>
+    </indexterm>
+
+   <title>Caveats when using <command>repmgr node rejoin</command></title>
+   <para>
+     <command>repmgr node rejoin</command> attempts to determine whether it will succeed by
+     comparing the timelines and relative WAL positions of the local node (rejoin candidate) and primary
+     (rejoin target). This is particularly important if planning to use <application>pg_rewind</application>,
+     which currently (as of PostgreSQL 11) may appear to succeed (or indicate there is no action
+     needed) but potentially allow an impossible action, such as trying to rejoin a standby to a
+     primary which is behind the standby. &repmgr; will prevent this situation from occurring.
+   </para>
+   <para>
+     Currently it is <emphasis>not</emphasis> possible to detect a situation where the rejoin target
+     is a standby which has been &quot;promoted&quot; by removing <filename>recovery.conf</filename>
+     (PostgreSQL 12 and later: <filename>standby.signal</filename>) and restarting it.
+     In this case there will be no information about the point the rejoin target diverged
+     from the current standby; the rejoin operation will fail and
+     the current standby's PostgreSQL log will contain entries with the text
+     &quot;<literal>record with incorrect prev-link</literal>&quot;.
+   </para>
+   <para>
+     We strongly recommend running <command>repmgr node rejoin</command> with the
+     <option>--dry-run</option> option first. Additionally it might be a good idea
+     to execute the <application>pg_rewind</application> command displayed by
+     &repmgr; with the <application>pg_rewind</application> <option>--dry-run</option>
+     option. Note that <application>pg_rewind</application> does not indicate that it
+     is running in <option>--dry-run</option> mode.
+   </para>
+
+  </refsect1>
+
  <refsect1>
    <title>See also</title>
    <para>
--- a/doc/repmgr-node-service.sgml
+++ b/doc/repmgr-node-service.sgml
@@ -84,7 +84,7 @@
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr node service</command>:
+      One of the following exit codes will be emitted by <command>repmgr node service</command>:
    </para>
    <variablelist>

--- a/doc/repmgr-node-status.sgml
+++ b/doc/repmgr-node-status.sgml
@@ -55,7 +55,7 @@
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr node status</command>:
+      One of the following exit codes will be emitted by <command>repmgr node status</command>:
    </para>
    <variablelist>

--- a/doc/repmgr-primary-register.sgml
+++ b/doc/repmgr-primary-register.sgml
@@ -21,6 +21,15 @@
      installing the &repmgr; extension. This command needs to be executed before any
      standby nodes are registered.
    </para>
+
+    <note>
+      <para>
+        It's possible to install the &repmgr; extension manually before executing
+        <command>repmgr primary register</command>; in this case &repmgr; will
+        detect the presence of the extension and skip that step.
+      </para>
+    </note>
+
  </refsect1>

  <refsect1>
@@ -35,16 +44,16 @@
    </para>

    <note>
-    <para>
-      If providing the configuration file location with <option>-f/--config-file</option>,
-      avoid using a relative path, as &repmgr; stores the configuration file location
-      in the repmgr metadata for use when &repmgr; is executed remotely (e.g. during
-      <xref linkend="repmgr-standby-switchover">). &repmgr; will attempt to convert the
-        a relative path into an absolute one, but this may not be the same as the path you
-        would explicitly provide (e.g. <filename>./repmgr.conf</filename> might be converted
-        to <filename>/path/to/./repmgr.conf</filename>, whereas you'd normally write
-        <filename>/path/to/repmgr.conf</filename>).
-    </para>
+      <para>
+        If providing the configuration file location with <option>-f/--config-file</option>,
+        avoid using a relative path, as &repmgr; stores the configuration file location
+        in the repmgr metadata for use when &repmgr; is executed remotely (e.g. during
+        <xref linkend="repmgr-standby-switchover">). &repmgr; will attempt to convert
+          a relative path into an absolute one, but this may not be the same as the path you
+          would explicitly provide (e.g. <filename>./repmgr.conf</filename> might be converted
+          to <filename>/path/to/./repmgr.conf</filename>, whereas you'd normally write
+          <filename>/path/to/repmgr.conf</filename>).
+      </para>
    </note>
  </refsect1>

--- a/doc/repmgr-standby-clone.sgml
+++ b/doc/repmgr-standby-clone.sgml
@@ -87,7 +87,7 @@
  <refsect1 id="repmgr-standby-clone-recovery-conf">
   <indexterm>
     <primary>recovery.conf</primary>
-     <secondary>customising with "repmgr standby clone"</secondary>
+     <secondary>customising with &quot;repmgr standby clone&quot;</secondary>
   </indexterm>

   <title>Customising recovery.conf</title>
@@ -170,7 +170,7 @@
      pg_basebackup_options='--xlog-method=fetch'</programlisting>

    and ensure that <literal>wal_keep_segments</literal> is set to an appropriately high value.
-    See the <ulink url="https://www.postgresql.org/docs/current/static/app-pgbasebackup.html">
+    See the <ulink url="https://www.postgresql.org/docs/current/app-pgbasebackup.html">
    pg_basebackup</ulink> documentation for details.
   </para>

@@ -194,10 +194,11 @@
   <title>Using a standby cloned by another method</title>
   <para>
     &repmgr; supports standbys cloned by another method (e.g. using <application>barman</application>'s
-     <command><ulink url="http://docs.pgbarman.org/release/2.4/#recover">barman recover</ulink></command> command).
+     <command><ulink url="http://docs.pgbarman.org/release/2.5/#recover">barman recover</ulink></command> command).
   </para>
   <para>
-     To integrate the standby as a &repmgr; node, ensure the <filename>repmgr.conf</filename>
+     To integrate the standby as a &repmgr; node, once the standby has been cloned,
+     ensure the <filename>repmgr.conf</filename>
     file is created for the node, and that it has been registered using
     <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>.
     Then execute the command <command>repmgr standby clone --recovery-conf-only</command>.
--- a/doc/repmgr-standby-follow.sgml
+++ b/doc/repmgr-standby-follow.sgml
@@ -9,23 +9,35 @@

  <refnamediv>
    <refname>repmgr standby follow</refname>
-    <refpurpose>attach a standby to a new primary</refpurpose>
+    <refpurpose>attach a running standby to a new upstream node</refpurpose>
  </refnamediv>

  <refsect1>
    <title>Description</title>

    <para>
-      Attaches the standby to a new primary. This command requires a valid
+      Attaches the standby (&quot;follow candidate&quot;) to a new upstream node
+      (&quot;follow target&quot;). Typically this will be the primary, but this
+      command can also be used to attach the standby to another standby.
+    </para>
+    <para>
+      This command requires a valid
      <filename>repmgr.conf</filename> file for the standby, either specified
      explicitly with <literal>-f/--config-file</literal> or located in a
      default location; no additional arguments are required.
    </para>
+
+	<para>
+	  By default &repmgr; will attempt to attach the standby to the current primary.
+	  If <option>--upstream-node-id</option> is provided, &repmgr; will attempt
+	  to attach the standby to the specified node, which can be another standby.
+	</para>
+
    <para>
      This command will force a restart of the standby server, which must be
-      running. It can only be used to attach an active standby to the current primary node
-   (and not to another standby).
+      running.
    </para>
+
 	<tip>
      <para>
 		To re-add an inactive node to the replication cluster, use
@@ -36,9 +48,22 @@
 	<para>
 	  <command>repmgr standby follow</command> will wait up to
 	  <varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
-	  to verify the standby has actually connected to the new primary.
+	  to verify the standby has actually connected to the new upstream node.
 	</para>

+	<note>
+	  <para>
+	    If <option>recovery_min_apply_delay</option> is set for the standby, it
+	    will not attach to the new upstream node until it has replayed available
+	    WAL.
+	  </para>
+	  <para>
+	    Conversely, if the standby is attached to an upstream standby
+	    which has <option>recovery_min_apply_delay</option> set, the upstream
+            standby's replay state may actually be behind that of its new downstream node.
+	  </para>
+	</note>
+
  </refsect1>

  <refsect1>
@@ -65,19 +90,46 @@
        <term><option>--dry-run</option></term>
        <listitem>
          <para>
-            Check prerequisites but don't actually follow a new standby.
+            Check prerequisites but don't actually follow a new upstream node.
+          </para>
+          <para>
+            This will also verify whether the standby is capable of following the new upstream node.
          </para>
          <important>
            <para>
-              This does not guarantee the standby can follow the primary; in
-              particular, whether the primary and standby timelines have diverged,
-              can currently only be determined by actually attempting to
-              attach the standby to the primary.
+              If a standby was turned into a primary by removing <filename>recovery.conf</filename>
+              (<application>PostgreSQL 12</application> and later: <filename>standby.signal</filename>),
+              &repmgr; will <emphasis>not</emphasis> be able to determine whether that primary's timeline
+              has diverged from the timeline of the standby (&quot;follow candidate&quot;).
+            </para>
+            <para>
+              We recommend always using <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>
+              to promote a standby to primary, as this will ensure that the new primary
+              will perform a timeline switch (making it practical to check for timeline divergence)
+              and also that &repmgr; metadata is updated correctly.
            </para>
          </important>
        </listitem>
      </varlistentry>

+      <varlistentry>
+        <term><option>--upstream-node-id</option></term>
+        <listitem>
+          <para>
+            Node ID of the new upstream node (&quot;follow target&quot;).
+          </para>
+          <para>
+            If not provided, &repmgr; will attempt to follow the current primary node.
+          </para>
+          <para>
+            Note that when using <application>repmgrd</application>, <option>--upstream-node-id</option>
+            should always be configured;
+			see <link linkend="repmgrd-automatic-failover-configuration">Automatic failover configuration</link>
+            for details.
+          </para>
+        </listitem>
+      </varlistentry>
+
      <varlistentry>
        <term><option>-w</option></term>
        <term><option>--wait</option></term>
@@ -94,13 +146,104 @@
    </variablelist>
  </refsect1>

+  <refsect1>
+    <title>Execution</title>
+
+    <para>
+      Execute with the <literal>--dry-run</literal> option to test the follow operation as
+      far as possible, without actually changing the status of the node.
+    </para>
+
+    <para>
+      Note that &repmgr; will first attempt to determine whether the standby
+      (&quot;follow candidate&quot;) is capable of following the
+      new upstream node (&quot;follow target&quot;).
+    </para>
+    <para>
+      If the new upstream node has diverged from this node's timeline,
+      for example if the new upstream node was promoted to primary while this node
+      was still attached to the original primary, it will <emphasis>not</emphasis>
+      be possible to follow the new upstream node, and &repmgr; will emit an error
+      message like this:
+      <programlisting>
+ERROR: this node cannot attach to follow target node 3
+DETAIL: follow target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/6108880</programlisting>
+    </para>
+    <para>
+      In this case, it may be possible to have this node follow the new upstream
+      using <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>
+      with the <option>--force-rewind</option> option to execute <command>pg_rewind</command>.
+      This does mean that transactions which exist on this node, but not the new upstream,
+      will be lost.
+    </para>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      One of the following exit codes will be emitted by <command>repmgr standby follow</command>:
+    </para>
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            The follow operation succeeded; or if <option>--dry-run</option> was provided,
+            no issues were detected which would prevent the follow operation.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_BAD_CONFIG (1)</option></term>
+        <listitem>
+          <para>
+            A configuration issue was detected which prevented &repmgr; from
+            continuing with the follow operation.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_NO_RESTART (4)</option></term>
+        <listitem>
+          <para>
+            The node could not be restarted.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_DB_CONN (6)</option></term>
+        <listitem>
+          <para>
+            &repmgr; was unable to establish a database connection to one of the nodes.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_FOLLOW_FAIL (23)</option></term>
+        <listitem>
+          <para>
+            &repmgr; was unable to complete the follow command.
+          </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+
+  </refsect1>
+
  <refsect1 id="repmgr-standby-follow-events">
    <title>Event notifications</title>
    <para>
      A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated.
    </para>
    <para>
-      If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the primary
+      If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the node
      being followed, <literal>%c</literal> with its <literal>conninfo</literal> string, and
      <literal>%a</literal> with its node name.
    </para>
@@ -113,4 +256,3 @@
    </para>
  </refsect1>
 </refentry>
-
--- a/doc/repmgr-standby-promote.sgml
+++ b/doc/repmgr-standby-promote.sgml
@@ -33,8 +33,26 @@
      Both values can be defined in <filename>repmgr.conf</filename>.
    </para>

+    <note>
+      <para>
+        If WAL replay is paused on the standby, and not all WAL files on the standby have been
+        replayed, &repmgr; will not attempt to promote it.
+      </para>
+      <para>
+        This is because if WAL replay is paused, PostgreSQL itself will not react to a promote command
+        until WAL replay is resumed and all pending WAL has been replayed. This means
+        attempting to promote PostgreSQL in this state will leave PostgreSQL in a condition where the
+        promotion may occur at an unpredictable point in the future.
+      </para>
+      <para>
+        Note that if the standby is in archive recovery, &repmgr; will not be able to determine
+        if more WAL is pending replay, and will abort the promotion attempt if WAL replay is paused.
+      </para>
+    </note>
+
  </refsect1>

+
  <refsect1>
    <title>Example</title>
    <para>
@@ -50,6 +68,127 @@
  </refsect1>


+  <refsect1>
+    <title>Options</title>
+    <variablelist>
+      <varlistentry>
+        <term><option>--dry-run</option></term>
+        <listitem>
+          <para>
+            Check if this node can be promoted, but don't carry out the promotion.
+          </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>Configuration file settings</title>
+   <para>
+     The following parameters in <filename>repmgr.conf</filename> are relevant to the
+     promote operation:
+    </para>
+
+    <para>
+      <itemizedlist spacing="compact" mark="bullet">
+
+       <listitem>
+        <indexterm>
+          <primary>promote_check_interval</primary>
+          <secondary>with &quot;repmgr standby promote&quot;</secondary>
+        </indexterm>
+         <simpara>
+           <literal>promote_check_interval</literal>:
+           interval (in seconds, default: 1 second) to wait between each check
+           to determine whether the standby has been promoted.
+		 </simpara>
+	   </listitem>
+
+       <listitem>
+        <indexterm>
+          <primary>promote_check_timeout</primary>
+          <secondary>with &quot;repmgr standby promote&quot;</secondary>
+        </indexterm>
+         <simpara>
+           <literal>promote_check_timeout</literal>:
+           time (in seconds, default: 60 seconds) to wait to verify that the standby has been promoted
+           before exiting with <literal>ERR_PROMOTION_FAIL</literal>.
+		 </simpara>
+	   </listitem>
+
+	  </itemizedlist>
+	</para>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      One of the following exit codes will be emitted by <command>repmgr standby promote</command>:
+    </para>
+    <variablelist>
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            The standby was successfully promoted to primary.
+          </para>
+        </listitem>
+      </varlistentry>
+
+     <varlistentry>
+        <term><option>ERR_DB_CONN (6)</option></term>
+        <listitem>
+          <para>
+            &repmgr; was unable to connect to the local PostgreSQL node.
+          </para>
+          <para>
+            PostgreSQL must be running before the node can be promoted.
+          </para>
+        </listitem>
+      </varlistentry>
+
+     <varlistentry>
+       <term><option>ERR_PROMOTION_FAIL (8)</option></term>
+        <listitem>
+          <para>
+            The node could not be promoted to primary for one of the following
+            reasons:
+            <itemizedlist spacing="compact" mark="bullet">
+
+              <listitem>
+                <simpara>
+                  there is an existing primary node in the replication cluster
+                </simpara>
+              </listitem>
+
+              <listitem>
+                <simpara>
+                  the node is not a standby
+                </simpara>
+              </listitem>
+
+              <listitem>
+                <simpara>
+                  WAL replay is paused on the node
+                </simpara>
+              </listitem>
+
+              <listitem>
+                <simpara>
+                  execution of the PostgreSQL promote command failed
+                </simpara>
+              </listitem>
+
+            </itemizedlist>
+          </para>
+        </listitem>
+     </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+
  <refsect1 id="repmgr-standby-promote-events">
    <title>Event notifications</title>
    <para>
--- a/doc/repmgr-standby-switchover.sgml
+++ b/doc/repmgr-standby-switchover.sgml
@@ -37,7 +37,7 @@
      </para>
      <para>
        &repmgr; will refuse to perform the switchover if an exclusive backup is running on
-        the current primary.
+        the current primary, or if WAL replay is paused on the standby.
      </para>
    </note>
    <para>
@@ -146,6 +146,7 @@


     <varlistentry>
+
        <term><option>--siblings-follow</option></term>
        <listitem>
          <para>
@@ -161,29 +162,45 @@
    <title>Configuration file settings</title>

    <para>
-     Note that following parameters in <filename>repmgr.conf</filename> are relevant to the
+     The following parameters in <filename>repmgr.conf</filename> are relevant to the
     switchover operation:
-     <itemizedlist spacing="compact" mark="bullet">
+    </para>

-       <listitem>
-         <simpara>
-           <literal>replication_lag_critical</literal>:
-           if replication lag (in seconds) on the standby exceeds this value, the
-           switchover will be aborted (unless the <literal>-F/--force</literal> option
-           is provided)
-         </simpara>
-       </listitem>
+    <variablelist>

-       <listitem>
-         <simpara>
-           <literal>shutdown_check_timeout</literal>: maximum number of seconds to wait for the
-           demotion candidate (current primary) to shut down, before aborting the switchover.
-         </simpara>
-         <simpara>
-           Note that this parameter is set on the node where <command>repmgr standby switchover</command>
-           is executed (promotion candidate); setting it on the demotion candidate (former primary) will
-           have no effect.
-         </simpara>
+      <varlistentry>
+        <indexterm>
+          <primary>replication_lag_critical</primary>
+          <secondary>with &quot;repmgr standby switchover&quot;</secondary>
+        </indexterm>
+
+        <term><option>replication_lag_critical</option></term>
+        <listitem>
+          <para>
+            If replication lag (in seconds) on the standby exceeds this value, the
+            switchover will be aborted (unless the <literal>-F/--force</literal> option
+            is provided)
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <indexterm>
+          <primary>shutdown_check_timeout</primary>
+          <secondary>with &quot;repmgr standby switchover&quot;</secondary>
+        </indexterm>
+
+        <term><option>shutdown_check_timeout</option></term>
+        <listitem>
+          <para>
+            The maximum number of seconds to wait for the
+            demotion candidate (current primary) to shut down, before aborting the switchover.
+          </para>
+          <para>
+            Note that this parameter is set on the node where <command>repmgr standby switchover</command>
+            is executed (promotion candidate); setting it on the demotion candidate (former primary) will
+            have no effect.
+          </para>
         <note>
           <para>
             In versions prior to <link linkend="release-4.2">&repmgr; 4.2</link>, <command>repmgr standby switchover</command> would
@@ -191,18 +208,73 @@
             to determine the timeout for demotion candidate shutdown.
           </para>
         </note>
-       </listitem>
+        </listitem>
+      </varlistentry>

-       <listitem>
-         <simpara>
-           <literal>standby_reconnect_timeout</literal>:
-           maximum number of seconds to attempt to wait for the demotion candidate (former primary)
-           to reconnect to the promoted primary (default: 60 seconds)
-         </simpara>
-       </listitem>

-     </itemizedlist>
-    </para>
+      <varlistentry>
+        <indexterm>
+          <primary>wal_receive_check_timeout</primary>
+          <secondary>with &quot;repmgr standby switchover&quot;</secondary>
+        </indexterm>
+
+        <term><option>wal_receive_check_timeout</option></term>
+        <listitem>
+          <para>
+            After the primary has shut down, the maximum number of seconds to wait for the
+            walreceiver on the standby to flush WAL to disk before comparing WAL receive location
+            with the primary's shut down location.
+         </para>
+        </listitem>
+      </varlistentry>
+
+
+      <varlistentry>
+        <indexterm>
+          <primary>standby_reconnect_timeout</primary>
+          <secondary>with &quot;repmgr standby switchover&quot;</secondary>
+        </indexterm>
+
+        <term><option>standby_reconnect_timeout</option></term>
+        <listitem>
+          <para>
+            The maximum number of seconds to attempt to wait for the demotion candidate (former primary)
+            to reconnect to the promoted primary (default: 60 seconds)
+          </para>
+          <para>
+            Note that this parameter is set on the node where <command>repmgr standby switchover</command>
+            is executed (promotion candidate); setting it on the demotion candidate (former primary) will
+            have no effect.
+          </para>
+        </listitem>
+      </varlistentry>
+
+     <varlistentry>
+        <indexterm>
+          <primary>node_rejoin_timeout</primary>
+          <secondary>with &quot;repmgr standby switchover&quot;</secondary>
+        </indexterm>
+
+        <term><option>node_rejoin_timeout</option></term>
+        <listitem>
+          <para>
+            The maximum number of seconds to attempt to wait for the demotion candidate (former primary)
+            to reconnect to the promoted primary (default: 60 seconds)
+          </para>
+          <para>
+            Note that this parameter is set on the demotion candidate (former primary);
+            setting it on the node where <command>repmgr standby switchover</command> is
+            executed will have no effect.
+          </para>
+          <para>
+            However, this value <emphasis>must</emphasis> be less than <option>standby_reconnect_timeout</option> on the
+            promotion candidate (the node where <command>repmgr standby switchover</command> is executed).
+          </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+
  </refsect1>


@@ -238,7 +310,7 @@
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <command>repmgr standby switchover</command>:
+      One of the following exit codes will be emitted by <command>repmgr standby switchover</command>:
    </para>
    <variablelist>

@@ -246,7 +318,8 @@
        <term><option>SUCCESS (0)</option></term>
        <listitem>
          <para>
-            The switchover completed successfully.
+            The switchover completed successfully; or if <option>--dry-run</option> was provided,
+            no issues were detected which would prevent the switchover operation.
          </para>
        </listitem>
      </varlistentry>
@@ -277,7 +350,10 @@
  <refsect1>
    <title>See also</title>
    <para>
-      For more details see the section <xref linkend="performing-switchover">.
+      <xref linkend="repmgr-standby-follow">, <xref linkend="repmgr-node-rejoin">
+    </para>
+    <para>
+      For more details on performing a switchover operation, see the section <xref linkend="performing-switchover">.
    </para>
  </refsect1>

--- a/doc/repmgr-witness-register.sgml
+++ b/doc/repmgr-witness-register.sgml
@@ -34,6 +34,14 @@
      witness node's <filename>repmgr.conf</filename>, unless these are explicitly
      provided as command line options.
    </para>
+
+    <note>
+      <para>
+        The primary server must be registered with <command><link linkend="repmgr-primary-register">repmgr primary register</link></command> before the witness
+        server can be registered.
+      </para>
+    </note>
+
    <para>
      Execute with the <option>--dry-run</option> option to check what would happen
      without actually registering the witness server.
--- a/doc/repmgr.sgml
+++ b/doc/repmgr.sgml
@@ -25,31 +25,25 @@
   <para>
   This is the official documentation of &repmgr; &repmgrversion; for
   use with PostgreSQL 9.3 - PostgreSQL 11.
-   </para>
-   <para>
-     &repmgr; is being continually developed and we strongly recommend using the
-     latest version. Please check the
-     <ulink url="https://repmgr.org/">repmgr website</ulink> for details
-     about the current &repmgr; version as well as the
-     <ulink url="https://repmgr.org/docs/current/index.html">current documentation</ulink>.
+   It describes the functionality supported by the current version of &repmgr;.
   </para>

   <para>
-    &repmgr; was developed by
+    &repmgr; is developed by
    <ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
    along with contributions from other individuals and companies.
    Contributions from the community are appreciated and welcome - get
-    in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</>
-    or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</>.
+    in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</ulink>
+    or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</ulink>.
    Multiple 2ndQuadrant customers contribute funding
    to make repmgr development possible.
   </para>

   <para>
-    2ndQuadrant, a Platinum sponsor of the PostgreSQL project,
-    continues to develop repmgr to meet internal needs and those of customers.
-     Other companies as well as individual developers
-    are welcome to participate in the efforts.
+     &repmgr; is fully supported by 2ndQuadrant's
+     <ulink url="https://www.2ndquadrant.com/en/support/support-postgresql/">24/7 Production Support</ulink>.
+     2ndQuadrant, a Major Sponsor of the PostgreSQL project, continues to develop and maintain &repmgr;.
+     Other companies as well as individual developers are welcome to participate in the efforts.
   </para>
  </abstract>

@@ -79,22 +73,16 @@
  &promoting-standby;
  &follow-new-primary;
  &switchover;
-  &configuring-witness-server;
  &event-notifications;
  &upgrading-repmgr;
 </part>

 <part id="using-repmgrd">
  <title>Using repmgrd</title>
+  &repmgrd-overview;
  &repmgrd-automatic-failover;
  &repmgrd-configuration;
-  &repmgrd-demonstration;
-  &repmgrd-cascading-replication;
-  &repmgrd-network-split;
-  &repmgrd-witness-server;
-  &repmgrd-pausing;
-  &repmgrd-degraded-monitoring;
-  &repmgrd-monitoring;
+  &repmgrd-operation;
  &repmgrd-bdr;
 </part>

@@ -121,6 +109,8 @@
  &repmgr-cluster-event;
  &repmgr-cluster-cleanup;
  &repmgr-daemon-status;
+  &repmgr-daemon-start;
+  &repmgr-daemon-stop;
  &repmgr-daemon-pause;
  &repmgr-daemon-unpause;
 </part>
@@ -129,6 +119,7 @@
 &appendix-signatures;
 &appendix-faq;
 &appendix-packages;
+ &appendix-support;

 <![%include-index;[&bookindex;]]>
 <![%include-xslt-index;[<index id="bookindex"></index>]]>
--- a/doc/repmgrd-automatic-failover.sgml
+++ b/doc/repmgrd-automatic-failover.sgml
@@ -13,5 +13,285 @@
  providing monitoring information about the state of each standby.
 </para>

+<sect1 id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
+ <indexterm>
+   <primary>repmgrd</primary>
+   <secondary>witness server</secondary>
+ </indexterm>
+
+ <indexterm>
+   <primary>witness server</primary>
+   <secondary>repmgrd</secondary>
+ </indexterm>
+ <title>Using a witness server</title>
+ <para>
+   A <xref linkend="witness-server"> is a normal PostgreSQL instance which
+   is not part of the streaming replication cluster; its purpose is, if a
+   failover situation occurs, to provide proof that it is the primary server
+   itself which is unavailable, rather than e.g. a network split between
+   different physical locations.
+ </para>
+
+ <para>
+   A typical use case for a witness server is a two-node streaming replication
+   setup, where the primary and standby are in different locations (data centres).
+   By creating a witness server in the same location (data centre) as the primary,
+   if the primary becomes unavailable it's possible for the standby to decide whether
+   it can promote itself without risking a "split brain" scenario: if it can't see either the
+   witness or the primary server, it's likely there's a network-level interruption
+   and it should not promote itself. If it can see the witness but not the primary,
+   this proves there is no network interruption and the primary itself is unavailable,
+   and it can therefore promote itself (and ideally take action to fence the
+   former primary).
+ </para>
+ <note>
+   <para>
+     <emphasis>Never</emphasis> install a witness server on the same physical host
+     as another node in the replication cluster managed by &repmgr; - it's essential
+     the witness is not affected in any way by failure of another node.
+   </para>
+ </note>
+ <para>
+   For more complex replication scenarios, e.g. with multiple datacentres, it may
+   be preferable to use location-based failover, which ensures that only nodes
+   in the same location as the primary will ever be promotion candidates;
+   see <xref linkend="repmgrd-network-split"> for more details.
+ </para>
+
+ <note>
+   <simpara>
+     A witness server will only be useful if <application>repmgrd</application>
+     is in use.
+   </simpara>
+ </note>
+
+ <sect2 id="creating-witness-server">
+   <title>Creating a witness server</title>
+ <para>
+   To create a witness server, set up a normal PostgreSQL instance on a server
+   in the same physical location as the cluster's primary server.
+ </para>
+ <para>
+   This instance should <emphasis>not</emphasis> be on the same physical host as the primary server,
+   as otherwise if the primary server fails due to hardware issues, the witness
+   server will be lost too.
+ </para>
+ <note>
+   <simpara>
+     &repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
+     command, which would automatically create a PostgreSQL instance. However
+     this often resulted in an unsatisfactory, hard-to-customise instance.
+   </simpara>
+ </note>
+ <para>
+   The witness server should be configured in the same way as a normal
+   &repmgr; node; see section <xref linkend="configuration">.
+ </para>
+ <para>
+   Register the witness server with <xref linkend="repmgr-witness-register">.
+   This will create the &repmgr; extension on the witness server, and make
+   a copy of the &repmgr; metadata.
+ </para>
+ <note>
+   <simpara>
+    As the witness server is not part of the replication cluster, further
+    changes to the &repmgr; metadata will be synchronised by
+    <application>repmgrd</application>.
+   </simpara>
+ </note>
+ <para>
+   Once the witness server has been configured, <application>repmgrd</application>
+   should be started.
+ </para>
+
+ <para>
+  To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
+ </para>
+
+ </sect2>
+
+</sect1>
+
+
+<sect1 id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
+ <indexterm>
+   <primary>repmgrd</primary>
+   <secondary>network splits</secondary>
+ </indexterm>
+
+ <indexterm>
+   <primary>network splits</primary>
+ </indexterm>
+
+ <title>Handling network splits with repmgrd</title>
+ <para>
+  A common pattern for replication cluster setups is to spread servers over
+  more than one datacentre. This can provide benefits such as geographically-
+  distributed read replicas and DR (disaster recovery) capability. However
+  this also means there is a risk of disconnection at network level between
+  datacentre locations, which would result in a split-brain scenario if
+  servers in a secondary data centre were no longer able to see the primary
+  in the main data centre and promoted a standby among themselves.
+ </para>
+ <para>
+  &repmgr; enables provision of &quot;<xref linkend="witness-server">&quot; to
+  artificially create a quorum of servers in a particular location, ensuring
+  that nodes in another location will not elect a new primary if they
+  are unable to see the majority of nodes. However this approach does not
+  scale well, particularly with more complex replication setups, e.g.
+  where the majority of nodes are located outside of the primary datacentre.
+  It also means the <literal>witness</literal> node needs to be managed as an
+  extra PostgreSQL instance outside of the main replication cluster, which
+  adds administrative and programming complexity.
+ </para>
+ <para>
+  <literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
+  each node is associated with an arbitrary location string (default is
+  <literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
+  <programlisting>
+    node_id=1
+    node_name=node1
+    conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
+    data_directory='/var/lib/postgresql/data'
+    location='dc1'</programlisting>
+ </para>
+ <para>
+  In a failover situation, <application>repmgrd</application> will check if any servers in the
+  same location as the current primary node are visible.  If not, <application>repmgrd</application>
+  will assume a network interruption and not promote any node in any
+  other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
+  mode until a primary becomes visible).
+ </para>
+
+</sect1>
+
+<sect1 id="repmgrd-standby-disconnection-on-failover" xreflabel="Standby disconnection on failover">
+  <indexterm>
+   <primary>repmgrd</primary>
+   <secondary>standby disconnection on failover</secondary>
+ </indexterm>
+
+  <indexterm>
+    <primary>standby disconnection on failover</primary>
+  </indexterm>
+
+  <title>Standby disconnection on failover</title>
+  <para>
+    If <option>standby_disconnect_on_failover</option> is set to <literal>true</literal> in
+    <filename>repmgr.conf</filename>, in a failover situation <application>repmgrd</application> will forcibly disconnect
+    the local node's WAL receiver before making a failover decision.
+  </para>
+  <note>
+    <para>
+      <option>standby_disconnect_on_failover</option> is available with PostgreSQL 9.5 and later.
+      Additionally this requires that the <literal>repmgr</literal> database user is a superuser.
+    </para>
+  </note>
+  <para>
+    By doing this, it's possible to ensure that, at the point the failover decision is made, no nodes
+    are receiving data from the primary and their LSN location will be static.
+  </para>
+  <important>
+    <para>
+      <option>standby_disconnect_on_failover</option> <emphasis>must</emphasis> be set to the same value on
+      all nodes.
+    </para>
+  </important>
+  <para>
+    Note that when using <option>standby_disconnect_on_failover</option> there will be a delay of 5 seconds
+    plus however many seconds it takes to confirm the WAL receiver is disconnected before
+    <application>repmgrd</application> proceeds with the failover decision.
+  </para>
+  <para>
+    Following the failover operation, no matter what the outcome, each node will reconnect its WAL receiver.
+  </para>
+
+</sect1>
+
+<sect1 id="repmgrd-failover-validation" xreflabel="Failover validation">
+  <indexterm>
+   <primary>repmgrd</primary>
+   <secondary>failover validation</secondary>
+ </indexterm>
+
+  <indexterm>
+    <primary>failover validation</primary>
+  </indexterm>
+
+  <title>Failover validation</title>
+  <para>
+    From <link linkend="release-4.3">repmgr 4.3</link>, &repmgr; makes it possible to provide a script
+    to <application>repmgrd</application> which, in a failover situation,
+    will be executed by the promotion candidate (the node which has been selected
+    to be the new primary) to confirm whether the node should actually be promoted.
+  </para>
+  <para>
+    To use this, set <option>failover_validation_command</option> in <filename>repmgr.conf</filename>
+    to a script executable by the <literal>postgres</literal> system user, e.g.:
+    <programlisting>
+      failover_validation_command=/path/to/script.sh %n %a</programlisting>
+  </para>
+  <para>
+    The <literal>%n</literal> parameter will be replaced with the node ID, and the
+    <literal>%a</literal> parameter will be replaced by the node name when the script is executed.
+  </para>
+  <para>
+    This script must return an exit code of <literal>0</literal> to indicate the node should promote itself.
+    Any other value will result in the promotion being aborted and the election rerun.
+    There is a pause of <option>election_rerun_interval</option> seconds before the election is rerun.
+  </para>
+  <para>
+    Sample <application>repmgrd</application> log file output during which the failover validation
+    script rejects the proposed promotion candidate:
+    <programlisting>
+[2019-03-13 21:01:30] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
+[2019-03-13 21:01:30] [NOTICE] promotion candidate is "node2" (ID: 2)
+[2019-03-13 21:01:30] [NOTICE] executing "failover_validation_command"
+[2019-03-13 21:01:30] [DETAIL] /usr/local/bin/failover-validation.sh 2
+[2019-03-13 21:01:30] [INFO] output returned by failover validation command:
+Node ID: 2
+
+[2019-03-13 21:01:30] [NOTICE] failover validation command returned a non-zero value: "1"
+[2019-03-13 21:01:30] [NOTICE] promotion candidate election will be rerun
+[2019-03-13 21:01:30] [INFO] 1 followers to notify
+[2019-03-13 21:01:30] [NOTICE] notifying node "node3" (node ID: 3) to rerun promotion candidate selection
+INFO:  node 3 received notification to rerun promotion candidate election
+[2019-03-13 21:01:30] [NOTICE] rerunning election after 15 seconds ("election_rerun_interval")</programlisting>
+  </para>
+
+
+</sect1>
+
+  <sect1 id="cascading-replication" xreflabel="Cascading replication">
+ <indexterm>
+   <primary>repmgrd</primary>
+   <secondary>cascading replication</secondary>
+ </indexterm>
+
+ <indexterm>
+   <primary>cascading replication</primary>
+   <secondary>repmgrd</secondary>
+ </indexterm>
+
+ <title>repmgrd and cascading replication</title>
+ <para>
+  Cascading replication - where a standby can connect to an upstream node and not
+  the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
+  <application>repmgrd</application> support cascading replication by keeping track of the relationship
+  between standby servers - each node record is stored with the node id of its
+  upstream ("parent") server (except of course the primary server).
+ </para>
+ <para>
+  In a failover situation where the primary node fails and a top-level standby
+  is promoted, a standby connected to another standby will not be affected
+  and will continue working as normal (even if the upstream standby it's connected
+  to becomes the primary node). If however the node's direct upstream fails,
+  the &quot;cascaded standby&quot; will attempt to reconnect to that node's parent
+  (unless <varname>failover</varname> is set to <literal>manual</literal> in
+  <filename>repmgr.conf</filename>).
+ </para>
+
+  </sect1>
+

 </chapter>
--- a/doc/repmgrd-bdr.sgml
+++ b/doc/repmgrd-bdr.sgml
@@ -10,7 +10,7 @@

  <title>BDR failover with repmgrd</title>
  <para>
-    &repmgr; 4.x provides support for monitoring a pair of BDR 2.x nodes and taking action in
+    &repmgr; 4.x provides support for monitoring BDR nodes and taking action in
    case one of the nodes fails.
  </para>
  <note>
@@ -31,21 +31,8 @@
    reconfigure a proxy server/connection pooler such as <application>PgBouncer</application>.
  </para>

-  <note>
-    <simpara>
-      This &repmgr; functionality is for BDR 2.x only running on PostgreSQL 9.4/9.6.
-      It is <emphasis>not</emphasis> required for later BDR versions.
-    </simpara>
-  </note>
-
  <sect1 id="bdr-prerequisites" xreflabel="BDR prequisites">
    <title>Prerequisites</title>
-    <important>
-      <para>
-        This &repmgr; functionality is for BDR 2.x only running on PostgreSQL 9.4/9.6.
-        It is <emphasis>not</emphasis> required for later BDR versions.
-      </para>
-    </important>
    <para>
      &repmgr; 4 requires PostgreSQL 9.4 or 9.6 with the BDR 2 extension
      enabled and configured for a two-node BDR network. &repmgr; 4 packages
--- a/doc/repmgrd-cascading-replication.sgml
+++ b/doc/repmgrd-cascading-replication.sgml
@@ -1,22 +0,0 @@
-<chapter id="repmgrd-cascading-replication">
- <indexterm>
-   <primary>repmgrd</primary>
-   <secondary>cascading replication</secondary>
- </indexterm>
-
- <title>repmgrd and cascading replication</title>
- <para>
-  Cascading replication - where a standby can connect to an upstream node and not
-  the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
-  <application>repmgrd</application> support cascading replication by keeping track of the relationship
-  between standby servers - each node record is stored with the node id of its
-  upstream ("parent") server (except of course the primary server).
- </para>
- <para>
-  In a failover situation where the primary node fails and a top-level standby
-  is promoted, a standby connected to another standby will not be affected
-  and continue working as normal (even if the upstream standby it's connected
-  to becomes the primary node). If however the node's direct upstream fails,
-  the "cascaded standby" will attempt to reconnect to that node's parent.
- </para>
-</chapter>
--- a/doc/repmgrd-configuration.sgml
+++ b/doc/repmgrd-configuration.sgml
@@ -5,7 +5,7 @@
    <secondary>configuration</secondary>
  </indexterm>

-  <title>repmgrd configuration</title>
+  <title>repmgrd setup and configuration</title>

  <para>
    <application>repmgrd</application> is a daemon which runs on each PostgreSQL node,
@@ -20,7 +20,7 @@
  </para>

  <sect1 id="repmgrd-basic-configuration">
-    <title>repmgrd basic configuration</title>
+    <title>repmgrd configuration</title>

    <para>
      To use <application>repmgrd</application>, its associated function library <emphasis>must</emphasis> be
@@ -31,79 +31,417 @@
    </para>
    <para>
      Changing this setting requires a restart of PostgreSQL; for more details see
-      the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
+      the <ulink url="https://www.postgresql.org/docs/current/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
    </para>

+    <para>
+      The following configuration options apply to <application>repmgrd</application> in all circumstances:
+    </para>
+    <variablelist>
+
+        <varlistentry>
+
+         <indexterm>
+            <primary>monitor_interval_secs</primary>
+          </indexterm>
+          <term><option>monitor_interval_secs</option></term>
+          <listitem>
+            <para>
+              The interval (in seconds, default: <literal>2</literal>) to check the availability of the upstream node.
+            </para>
+          </listitem>
+
+        </varlistentry>
+
+        <varlistentry id="connection-check-type">
+
+          <indexterm>
+            <primary>connection_check_type</primary>
+          </indexterm>
+          <term><option>connection_check_type</option></term>
+          <listitem>
+            <para>
+              The option <option>connection_check_type</option> is used to select the method
+              <application>repmgrd</application> uses to determine whether the upstream node is available.
+            </para>
+            <para>
+              Possible values are:
+              <itemizedlist spacing="compact" mark="bullet">
+                <listitem>
+                  <simpara>
+                    <literal>ping</literal> (default) - uses <command>PQping()</command> to
+                    determine server availability
+                  </simpara>
+                </listitem>
+                <listitem>
+                  <simpara>
+                    <literal>connection</literal> - determines server availability
+                    by attempting to make a new connection to the upstream node
+                  </simpara>
+                </listitem>
+                <listitem>
+                  <simpara>
+                    <literal>query</literal> - determines server availability
+                    by executing an SQL statement on the node via the existing connection
+                  </simpara>
+                </listitem>
+
+              </itemizedlist>
+            </para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+         <indexterm>
+            <primary>reconnect_attempts</primary>
+          </indexterm>
+          <term><option>reconnect_attempts</option></term>
+          <listitem>
+            <para>
+              The number of attempts (default: <literal>6</literal>) which will be made to reconnect to an unreachable
+	      upstream node before initiating a failover.
+            </para>
+            <para>
+              There will be an interval of <option>reconnect_interval</option> seconds between each reconnection
+              attempt.
+            </para>
+          </listitem>
+        </varlistentry>
+        <varlistentry>
+         <indexterm>
+            <primary>reconnect_interval</primary>
+          </indexterm>
+          <term><option>reconnect_interval</option></term>
+          <listitem>
+            <para>
+              Interval (in seconds, default: <literal>10</literal>) between attempts to reconnect to an unreachable
+              upstream node.
+            </para>
+            <para>
+              The number of reconnection attempts is defined by the parameter <option>reconnect_attempts</option>.
+            </para>
+          </listitem>
+        </varlistentry>
+
+
+
+        <varlistentry>
+          <indexterm>
+            <primary>degraded_monitoring_timeout</primary>
+          </indexterm>
+          <term><option>degraded_monitoring_timeout</option></term>
+          <listitem>
+	    <para>
+              Interval (in seconds) after which <application>repmgrd</application> will terminate if
+              either of the servers (local node and/or upstream node) being monitored is no longer available
+              (<link linkend="repmgrd-degraded-monitoring">degraded monitoring mode</link>).
+            </para>
+            <para>
+              <literal>-1</literal> (default) disables this timeout completely.
+            </para>
+	  </listitem>
+	</varlistentry>
+
+    </variablelist>
+
+      <para>
+        See also <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename> for an annotated sample configuration file.
+      </para>

    <sect2 id="repmgrd-automatic-failover-configuration">
-      <title>automatic failover configuration</title>
+      <title>Required configuration for automatic failover</title>
+
      <para>
-        If using automatic failover, the following <application>repmgrd</application> options *must* be set in
-        <filename>repmgr.conf</filename> :
+        The following <application>repmgrd</application> options <emphasis>must</emphasis> be set in
+        <filename>repmgr.conf</filename>:
+
+        <itemizedlist spacing="compact" mark="bullet">
+          <listitem>
+            <simpara><option>failover</option></simpara>
+          </listitem>
+          <listitem>
+            <simpara><option>promote_command</option></simpara>
+          </listitem>
+          <listitem>
+            <simpara><option>follow_command</option></simpara>
+          </listitem>
+        </itemizedlist>
+      </para>
+
+
+      <para>
+        Example:
        <programlisting>
          failover=automatic
          promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'
          follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
      </para>
      <para>
-        Adjust file paths as appropriate; alway specify the full path to the &repmgr; binary.
+        Details of each option are as follows:
      </para>
+      <variablelist>
+        <varlistentry>

-      <note>
-        <para>
-          &repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
-          or <option>follow_command</option>; these can be user-defined scripts so must always be
-          specified with the full path.
-        </para>
-      </note>
+          <indexterm>
+            <primary>failover</primary>
+          </indexterm>
+          <term><option>failover</option></term>
+          <listitem>
+            <para>
+              <option>failover</option> can be one of <literal>automatic</literal> or <literal>manual</literal>.
+            </para>
+            <note>
+              <para>
+                If <option>failover</option> is set to <literal>manual</literal>, <application>repmgrd</application>
+                will not take any action if a failover situation is detected, and the node may need to
+                be modified manually (e.g. by executing <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>).
+              </para>
+            </note>
+
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <indexterm>
+            <primary>promote_command</primary>
+          </indexterm>
+          <term><option>promote_command</option></term>
+          <listitem>
+            <para>
+              The program or script defined in <option>promote_command</option> will be executed
+              in a failover situation when <application>repmgrd</application> determines that
+              the current node is to become the new primary node.
+            </para>
+            <para>
+              Normally <option>promote_command</option> is set as &repmgr;'s
+              <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> command.
+            </para>
+            <para>
+              It is also possible to provide e.g. a shell script to perform user-defined tasks
+              before promoting the current node. In this case the script <emphasis>must</emphasis>
+              at some point execute <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
+              to promote the node; if this is not done, &repmgr; metadata will not be updated and
+              &repmgr; will no longer function reliably.
+            </para>
+            <para>
+              Example:
+              <programlisting>
+                promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'</programlisting>
+            </para>
+
+            <para>
+              Note that the <literal>--log-to-file</literal> option will cause
+              output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
+              to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
+            </para>
+            <note>
+              <para>
+                &repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
+                or <option>follow_command</option>; these can be user-defined scripts so must always be
+                specified with the full path.
+              </para>
+            </note>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <indexterm>
+            <primary>follow_command</primary>
+          </indexterm>
+          <term><option>follow_command</option></term>
+          <listitem>
+            <para>
+              The program or script defined in <option>follow_command</option> will be executed
+              in a failover situation when <application>repmgrd</application> determines that
+              the current node is to follow the new primary node.
+            </para>
+            <para>
+              Normally <option>follow_command</option> is set as &repmgr;'s
+              <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command> command.
+            </para>
+            <para>
+              The <option>follow_command</option> parameter
+              should provide the <literal>--upstream-node-id=%n</literal>
+              option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
+              <application>repmgrd</application> with the ID of the new primary node. If this is not provided,
+              <command>repmgr standby follow</command> will attempt to determine the new primary by itself, but if the
+              original primary comes back online after the new primary is promoted, there is a risk that
+              <command>repmgr standby follow</command> will result in the node continuing to follow
+              the original primary.
+            </para>
+            <para>
+              It is also possible to provide e.g. a shell script to perform user-defined tasks
+              before following the new primary. In this case the script <emphasis>must</emphasis>
+              at some point execute <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>
+              to follow the new primary; if this is not done, &repmgr; metadata will not be updated and
+              &repmgr; will no longer function reliably.
+            </para>
+            <para>
+              Example:
+              <programlisting>
+          follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
+            </para>
+
+            <para>
+              Note that the <literal>--log-to-file</literal> option will cause
+              output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
+              to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
+            </para>
+            <note>
+              <para>
+                &repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
+                or <option>follow_command</option>; these can be user-defined scripts so must always be
+                specified with the full path.
+              </para>
+            </note>
+          </listitem>
+
+        </varlistentry>
+
+      </variablelist>

-      <para>
-        Note that the <literal>--log-to-file</literal> option will cause
-        output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
-        to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
-        See <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename>
-        for further <application>repmgrd</application>-specific settings.
-      </para>
-      <para>
-        When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure
-        of the current  primary, <application>repmgrd</application> will execute one of:
-      </para>
-      <itemizedlist spacing="compact" mark="bullet">
-        <listitem>
-          <simpara>
-            <varname>promote_command</varname> (if the current server is to become the new primary)
-          </simpara>
-        </listitem>
-        <listitem>
-          <simpara>
-            <varname>follow_command</varname> (if the current server needs to follow another server which has
-            become the new primary)
-          </simpara>
-        </listitem>
-      </itemizedlist>
-      <note>
-        <para>
-          These commands can be any valid shell script which results in one of these
-          two actions happening, but if &repmgr;'s <command>standby follow</command> or
-          <command>standby promote</command>
-          commands are not executed (either directly as shown here, or from a script which
-          performs other actions), the &repmgr; metadata will not be updated and
-          &repmgr; will no longer function reliably.
-        </para>
-      </note>

-      <para>
-        The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal>
-        option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
-        <application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr;
-        will attempt to determine the new primary by itself, but if the
-        original primary comes back online after the new primary is promoted, there is a risk that
-        <command>repmgr standby follow</command> will result in the node continuing to follow
-        the original primary.
-      </para>
    </sect2>

-    <sect2 id="repmgrd-service-configuration">
+    <sect2 id="repmgrd-automatic-failover-configuration-optional">
+      <title>Optional configuration for automatic failover</title>
+
+      <para>
+        The following configuration options can be used to fine-tune automatic failover:
+      </para>
+      <variablelist>
+
+        <varlistentry>
+          <indexterm>
+            <primary>priority</primary>
+          </indexterm>
+          <term><option>priority</option></term>
+          <listitem>
+            <para>
+              Indicates a preferred priority (default: <literal>100</literal>) for promoting nodes;
+			  a value of zero prevents the node being promoted to primary.
+            </para>
+            <para>
+              Note that the priority setting is only applied if two or more nodes are
+              determined as promotion candidates; in that case the node with the
+              higher priority is selected.
+            </para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+          <indexterm>
+            <primary>failover_validation_command</primary>
+          </indexterm>
+          <term><option>failover_validation_command</option></term>
+          <listitem>
+            <para>
+              User-defined script to execute for an external mechanism to validate the failover
+	      decision made by <application>repmgrd</application>.
+            </para>
+            <note>
+              <para>
+                This option <emphasis>must</emphasis> be identically configured
+                on all nodes.
+              </para>
+            </note>
+            <para>
+              One or both of the following parameter placeholders
+			  should be provided, which will be replaced by repmgrd with the appropriate
+	          value:
+              <itemizedlist spacing="compact" mark="bullet">
+                <listitem>
+                  <simpara><literal>%n</literal>: node ID</simpara>
+                </listitem>
+                <listitem>
+                  <simpara><literal>%a</literal>: node name</simpara>
+                </listitem>
+              </itemizedlist>
+            </para>
+            <para>
+              See also: <link linkend="repmgrd-failover-validation">Failover validation</link>.
+            </para>
+          </listitem>
+        </varlistentry>
+
+        <varlistentry>
+
+         <indexterm>
+            <primary>standby_disconnect_on_failover</primary>
+          </indexterm>
+          <term><option>standby_disconnect_on_failover</option></term>
+          <listitem>
+            <para>
+              In a failover situation, disconnect the local node's WAL receiver.
+            </para>
+            <para>
+              This option is available from PostgreSQL 9.5 and later.
+            </para>
+            <note>
+              <para>
+                This option <emphasis>must</emphasis> be identically configured
+                on all nodes.
+              </para>
+              <para>
+                Additionally the &repmgr; user <emphasis>must</emphasis> be a superuser
+                for this option.
+              </para>
+              <para>
+                <application>repmgrd</application> will refuse to start if this option is set
+                but either of these prerequisites is not met.
+              </para>
+            </note>
+
+            <para>
+              See also: <link linkend="repmgrd-standby-disconnection-on-failover">Standby disconnection on failover</link>.
+            </para>
+          </listitem>
+        </varlistentry>
+
+      </variablelist>
+
+      <para>
+        The following options can be used to further fine-tune failover behaviour.
+        In practice it's unlikely these will need to be changed from their default
+        values, but they are available as configuration options should the need arise.
+      </para>
+      <variablelist>
+
+        <varlistentry>
+          <indexterm>
+            <primary>election_rerun_interval</primary>
+          </indexterm>
+          <term><option>election_rerun_interval</option></term>
+          <listitem>
+			<para>
+			  If <option>failover_validation_command</option> is set, and the command returns
+			  an error, pause the specified number of seconds (default: 15) before rerunning the election.
+			</para>
+		  </listitem>
+		</varlistentry>
+
+
+        <varlistentry>
+          <indexterm>
+            <primary>sibling_nodes_disconnect_timeout</primary>
+          </indexterm>
+          <term><option>sibling_nodes_disconnect_timeout</option></term>
+          <listitem>
+			<para>
+              If <option>standby_disconnect_on_failover</option> is <literal>true</literal>, the
+              maximum length of time (in seconds, default: <literal>30</literal>)
+			  to wait for other standbys to confirm they have disconnected their
+		      WAL receivers.
+			</para>
+		  </listitem>
+		</varlistentry>
+      </variablelist>
+
+
+
+    </sect2>
+
+    <sect2 id="postgresql-service-configuration">
      <indexterm>
        <primary>repmgrd</primary>
        <secondary>PostgreSQL service configuration</secondary>
@@ -126,6 +464,42 @@
      </para>
    </sect2>

+    <sect2 id="repmgrd-service-configuration">
+      <indexterm>
+        <primary>repmgrd</primary>
+        <secondary>repmgrd service configuration</secondary>
+      </indexterm>
+      <title>repmgrd service configuration</title>
+      <para>
+        If you are intending to use the <link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link>
+        and <link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link> commands, the following
+        parameters <emphasis>must</emphasis> be set in <filename>repmgr.conf</filename>:
+        <itemizedlist spacing="compact" mark="bullet">
+
+          <listitem>
+            <simpara><varname>repmgrd_service_start_command</varname></simpara>
+          </listitem>
+
+          <listitem>
+            <simpara><varname>repmgrd_service_stop_command</varname></simpara>
+          </listitem>
+
+        </itemizedlist>
+
+      </para>
+      <para>
+        Example (for &repmgr; with PostgreSQL 11 on CentOS 7):
+        <programlisting>
+repmgrd_service_start_command='sudo systemctl start repmgr11'
+repmgrd_service_stop_command='sudo systemctl stop repmgr11'
+</programlisting>
+      </para>
+      <para>
+        For more details see the reference page for each command.
+      </para>
+    </sect2>
+
+
    <sect2 id="repmgrd-monitoring-configuration" xreflabel="repmgrd monitoring configuration">
      <indexterm>
        <primary>repmgrd</primary>
@@ -139,10 +513,8 @@
        in <filename>repmgr.conf</filename>.
      </para>
      <para>
-        The default monitoring interval is 2 seconds; this value can be explicitly set using:
-        <programlisting>
-          monitor_interval_secs=&lt;seconds&gt;</programlisting>
-        in <filename>repmgr.conf</filename>.
+        Monitoring data is written at the interval defined by
+        the option <option>monitor_interval_secs</option> (see above).
      </para>
      <para>
        For more details on monitoring, see <xref linkend="repmgrd-monitoring">.
@@ -192,6 +564,13 @@
          </simpara>
        </listitem>

+
+        <listitem>
+          <simpara>
+            <varname>connection_check_type</varname>
+          </simpara>
+        </listitem>
+
        <listitem>
          <simpara>
            <varname>conninfo</varname>
@@ -216,6 +595,12 @@
          </simpara>
        </listitem>

+        <listitem>
+          <simpara>
+            <varname>failover_validation_command</varname>
+          </simpara>
+        </listitem>
+
        <listitem>
          <simpara>
            <varname>failover</varname>
@@ -288,12 +673,30 @@
          </simpara>
        </listitem>

+        <listitem>
+          <simpara>
+            <varname>retry_promote_interval_secs</varname>
+          </simpara>
+        </listitem>
+
        <listitem>
          <simpara>
            <varname>repmgrd_standby_startup_timeout</varname>
          </simpara>
        </listitem>

+        <listitem>
+          <simpara>
+            <varname>sibling_nodes_disconnect_timeout</varname>
+          </simpara>
+        </listitem>
+
+        <listitem>
+          <simpara>
+            <varname>standby_disconnect_on_failover</varname>
+          </simpara>
+        </listitem>
+
      </itemizedlist>

      <para>
@@ -348,7 +751,7 @@

  </sect1>

-  <sect1 id="repmgrd-daemon">
+  <sect1 id="repmgrd-daemon" xreflabel="repmgrd daemon">
    <indexterm>
      <primary>repmgrd</primary>
      <secondary>starting and stopping</secondary>
@@ -363,6 +766,20 @@
      See appendix <xref linkend="appendix-packages"> for details of service commands
      for different distributions.
    </para>
+    <para>
+      The commands <link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link> and
+      <link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link> can be used
+      as convenience wrappers to start and stop <application>repmgrd</application>.
+    </para>
+    <important>
+      <para>
+        <link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link> and
+        <link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link> require
+        that the appropriate start/stop commands are configured as
+        <varname>repmgrd_service_start_command</varname> and <varname>repmgrd_service_stop_command</varname>
+        in <filename>repmgr.conf</filename>.
+      </para>
+    </important>
    <para>
      <application>repmgrd</application> can be started manually like this:
      <programlisting>
@@ -387,7 +804,7 @@
        <simpara>
          This is a behaviour change from previous versions (earlier than 4.1), where
          the PID file had to be explicitly specified with the command line
-          parameter <option> --pid-file</option>.
+          parameter <option>--pid-file</option>.
        </simpara>
      </note>
      <para>
@@ -407,7 +824,7 @@
      </para>
      <para>
        If none of the above apply, <application>repmgrd</application> will create a PID file
-        in the operating system's temporary directory (das etermined by the environment variable
+        in the operating system's temporary directory (as determined by the environment variable
        <varname>TMPDIR</varname>, or if that is not set, will use <filename>/tmp</filename>).
      </para>
      <para>
@@ -509,7 +926,7 @@ REPMGRD_OPTS="--daemonize=false"
 <para>
  For further details on <varname>conninfo</varname> network connection
  parameters, see the
-  <ulink url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS">PostgreSQL documentation</ulink>.
+  <ulink url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS">PostgreSQL documentation</ulink>.
 </para>
 </sect1>

--- a/doc/repmgrd-degraded-monitoring.sgml
+++ b/doc/repmgrd-degraded-monitoring.sgml
@@ -1,83 +0,0 @@
-<chapter id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
- <indexterm>
-   <primary>repmgrd</primary>
-   <secondary>degraded monitoring</secondary>
- </indexterm>
-
- <title>"degraded monitoring" mode</title>
- <para>
-  In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
-  of monitoring the node's upstream server. In these cases it enters &quot;degraded monitoring&quot;
-  mode, where <application>repmgrd</application> remains active but is waiting for the situation
-  to be resolved.
- </para>
- <para>
-  Situations where this happens are:
-  <itemizedlist spacing="compact" mark="bullet">
-
-   <listitem>
-    <simpara>a failover situation has occurred, no nodes in the primary node's location are visible</simpara>
-   </listitem>
-
-   <listitem>
-    <simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
-   </listitem>
-
-   <listitem>
-    <simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
-   </listitem>
-
-   <listitem>
-    <simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
-   </listitem>
-
-   <listitem>
-    <simpara>a failover situation has occurred, but no primary has become available</simpara>
-   </listitem>
-
-   <listitem>
-    <simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
-   </listitem>
-
-   <listitem>
-    <simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
-   </listitem>
-  </itemizedlist>
- </para>
-
- <para>
-  Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
-  and the primary node is unavailable (but is later restarted):
-  <programlisting>
-    [2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
-    [2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
-    [2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
-    [2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
-    (...)
-    [2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
-    [2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
-    [2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
-    [2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
-    [2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
-    [2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
-    [2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
-    [2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
-    [2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
-
- </para>
- <para>
-  By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
-  However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
-  after which <application>repmgrd</application> will terminate.
- </para>
-
- <note>
-   <para>
-     If <application>repmgrd</application> is monitoring a primary mode which has been stopped
-     and manually restarted as a standby attached to a new primary, it will automatically detect
-     the status change and update the node record to reflect the node's new status
-     as an active standby. It will then resume monitoring the node as a standby.
-   </para>
- </note>
-
-</chapter>
--- a/doc/repmgrd-demonstration.sgml
+++ b/doc/repmgrd-demonstration.sgml
@@ -1,96 +0,0 @@
-<chapter id="repmgrd-demonstration">
- <title>repmgrd demonstration</title>
- <para>
-  To demonstrate automatic failover, set up a 3-node replication cluster (one primary
-  and two standbys streaming directly from the primary) so that the cluster looks
-  something like this:
-  <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster show
-     ID | Name  | Role    | Status    | Upstream | Location | Connection string
-    ----+-------+---------+-----------+----------+----------+--------------------------------------
-     1  | node1 | primary | * running |          | default  | host=node1 dbname=repmgr user=repmgr
-     2  | node2 | standby |   running | node1    | default  | host=node2 dbname=repmgr user=repmgr
-     3  | node3 | standby |   running | node1    | default  | host=node3 dbname=repmgr user=repmgr</programlisting>
- </para>
- <para>
-  Start <application>repmgrd</application> on each standby and verify that it's running by examining the
-  log output, which at log level <literal>INFO</literal> will look like this:
-  <programlisting>
-    [2017-08-24 17:31:00] [NOTICE] using configuration file "/etc/repmgr.conf"
-    [2017-08-24 17:31:00] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr"
-    [2017-08-24 17:31:00] [NOTICE] starting monitoring of node <literal>node2</literal> (ID: 2)
-    [2017-08-24 17:31:00] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
- </para>
- <para>
-  Each <application>repmgrd</application> should also have recorded its successful startup as an event:
-  <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
-     Node ID | Name  | Event         | OK | Timestamp           | Details
-    ---------+-------+---------------+----+---------------------+-------------------------------------------------------------
-     3       | node3 | repmgrd_start | t  | 2017-08-24 17:35:54 | monitoring connection to upstream node "node1" (node ID: 1)
-     2       | node2 | repmgrd_start | t  | 2017-08-24 17:35:50 | monitoring connection to upstream node "node1" (node ID: 1)
-     1       | node1 | repmgrd_start | t  | 2017-08-24 17:35:46 | monitoring cluster primary "node1" (node ID: 1)  </programlisting>
- </para>
- <para>
-  Now stop the current primary server with e.g.:
-  <programlisting>
-    pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
- </para>
- <para>
-  This will force the primary to shut down straight away, aborting all processes
-  and transactions.  This will cause a flurry of activity in the <application>repmgrd</application> log
-  files as each <application>repmgrd</application> detects the failure of the primary and a failover
-  decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
-  which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
-  <programlisting>
-    [2017-08-24 23:32:01] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state
-    [2017-08-24 23:32:08] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
-    [2017-08-24 23:32:08] [INFO] checking state of node 1, 1 of 5 attempts
-    [2017-08-24 23:32:08] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:09] [INFO] checking state of node 1, 2 of 5 attempts
-    [2017-08-24 23:32:09] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:10] [INFO] checking state of node 1, 3 of 5 attempts
-    [2017-08-24 23:32:10] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:11] [INFO] checking state of node 1, 4 of 5 attempts
-    [2017-08-24 23:32:11] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:12] [INFO] checking state of node 1, 5 of 5 attempts
-    [2017-08-24 23:32:12] [WARNING] unable to reconnect to node 1 after 5 attempts
-    INFO:  setting voting term to 1
-    INFO:  node 2 is candidate
-    INFO:  node 3 has received request from node 2 for electoral term 1 (our term: 0)
-    [2017-08-24 23:32:12] [NOTICE] this node is the winner, will now promote self and inform other nodes
-    INFO: connecting to standby database
-    NOTICE: promoting standby
-    DETAIL: promoting server using 'pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' promote'
-    INFO: reconnecting to promoted server
-    NOTICE: STANDBY PROMOTE successful
-    DETAIL: node 2 was successfully promoted to primary
-    INFO:  node 3 received notification to follow node 2
-    [2017-08-24 23:32:13] [INFO] switching to primary monitoring mode</programlisting>
- </para>
- <para>
-  The cluster status will now look like this, with the original primary (<literal>node1</literal>)
-  marked as inactive, and standby <literal>node3</literal> now following the new primary
-  (<literal>node2</literal>):
-  <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster show
-     ID | Name  | Role    | Status    | Upstream | Location | Connection string
-    ----+-------+---------+-----------+----------+----------+----------------------------------------------------
-     1  | node1 | primary | - failed  |          | default  | host=node1 dbname=repmgr user=repmgr
-     2  | node2 | primary | * running |          | default  | host=node2 dbname=repmgr user=repmgr
-     3  | node3 | standby |   running | node2    | default  | host=node3 dbname=repmgr user=repmgr</programlisting>
-
- </para>
- <para>
-  <command>repmgr cluster event</command> will display a summary of what happened to each server
-  during the failover:
-  <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster event
-     Node ID | Name  | Event                    | OK | Timestamp           | Details
-    ---------+-------+--------------------------+----+---------------------+-----------------------------------------------------------------------------------
-     3       | node3 | repmgrd_failover_follow  | t  | 2017-08-24 23:32:16 | node 3 now following new upstream node 2
-     3       | node3 | standby_follow           | t  | 2017-08-24 23:32:16 | node 3 is now attached to node 2
-     2       | node2 | repmgrd_failover_promote | t  | 2017-08-24 23:32:13 | node 2 promoted to primary; old primary 1 marked as failed
-     2       | node2 | standby_promote          | t  | 2017-08-24 23:32:13 | node 2 was successfully promoted to primary</programlisting>
- </para>
-</chapter>
--- a/doc/repmgrd-monitoring.sgml
+++ b/doc/repmgrd-monitoring.sgml
@@ -1,80 +0,0 @@
-<chapter id="repmgrd-monitoring" xreflabel="Monitoring with repmgrd">
- <indexterm>
-   <primary>repmgrd</primary>
-   <secondary>monitoring</secondary>
- </indexterm>
- <indexterm>
-   <primary>monitoring</primary>
-   <secondary>with repmgrd</secondary>
- </indexterm>
-
- <title>Monitoring with repmgrd</title>
- <para>
-   When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
-  it will constantly write standby node status information to the
-  <varname>monitoring_history</varname> table, providing a near-real time
-  overview of replication status on all nodes
-  in the cluster.
- </para>
- <para>
-   The view <literal>replication_status</literal> shows the most recent state
-   for each node, e.g.:
-  <programlisting>
-    repmgr=# select * from repmgr.replication_status;
-    -[ RECORD 1 ]-------------+------------------------------
-    primary_node_id           | 1
-    standby_node_id           | 2
-    standby_name              | node2
-    node_type                 | standby
-    active                    | t
-    last_monitor_time         | 2017-08-24 16:28:41.260478+09
-    last_wal_primary_location | 0/6D57A00
-    last_wal_standby_location | 0/5000000
-    replication_lag           | 29 MB
-    replication_time_lag      | 00:00:11.736163
-    apply_lag                 | 15 MB
-    communication_time_lag    | 00:00:01.365643</programlisting>
- </para>
- <para>
-  The interval in which monitoring history is written is controlled by the
-  configuration parameter <varname>monitor_interval_secs</varname>;
-  default is 2.
- </para>
- <para>
-  As this can generate a large amount of monitoring data in the table
-  <literal>repmgr.monitoring_history</literal>. it's advisable to regularly
-  purge historical data using the <xref linkend="repmgr-cluster-cleanup">
-  command; use the <literal>-k/--keep-history</literal> option to
-  specify how many day's worth of data should be retained.
- </para>
- <para>
-  It's possible to use <application>repmgrd</application> to run in monitoring
-  mode only (without automatic failover capability) for some or all
-  nodes by setting <literal>failover=manual</literal> in the node's
-  <filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
-  no failover action will be taken and the node will require manual intervention to
-  be reattached to replication. If this occurs, an
-  <link linkend="event-notifications">event notification</link>
-  <varname>standby_disconnect_manual</varname> will be created.
- </para>
- <para>
-  Note that when a standby node is not streaming directly from its upstream
-  node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
-  <literal>0 bytes</literal>.
- </para>
- <tip>
-  <para>
-   If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
-   table will be replicated to attached standbys. This means there will be a small but
-   constant stream of replication activity which may not be desirable. To prevent
-   this, convert the table to an <literal>UNLOGGED</literal> one with:
-   <programlisting>
-     ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
-  </para>
-  <para>
-   This will however mean that monitoring history will not be available on
-   another node following a failover, and the view <literal>repmgr.replication_status</literal>
-   will not work on standbys.
-  </para>
- </tip>
-</chapter>
--- a/doc/repmgrd-network-split.sgml
+++ b/doc/repmgrd-network-split.sgml
@@ -1,48 +0,0 @@
-<chapter id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
- <indexterm>
-   <primary>repmgrd</primary>
-   <secondary>network splits</secondary>
- </indexterm>
-
- <title>Handling network splits with repmgrd</title>
- <para>
-  A common pattern for replication cluster setups is to spread servers over
-  more than one datacentre. This can provide benefits such as geographically-
-  distributed read replicas and DR (disaster recovery capability). However
-  this also means there is a risk of disconnection at network level between
-  datacentre locations, which would result in a split-brain scenario if
-  servers in a secondary data centre were no longer able to see the primary
-  in the main data centre and promoted a standby among themselves.
- </para>
- <para>
-  &repmgr; enables provision of &quot;<xref linkend="witness-server">&quot; to
-  artificially create a quorum of servers in a particular location, ensuring
-  that nodes in another location will not elect a new primary if they
-  are unable to see the majority of nodes. However this approach does not
-  scale well, particularly with more complex replication setups, e.g.
-  where the majority of nodes are located outside of the primary datacentre.
-  It also means the <literal>witness</literal> node needs to be managed as an
-  extra PostgreSQL instance outside of the main replication cluster, which
-  adds administrative and programming complexity.
- </para>
- <para>
-  <literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
-  each node is associated with an arbitrary location string (default is
-  <literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
-  <programlisting>
-    node_id=1
-    node_name=node1
-    conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
-    data_directory='/var/lib/postgresql/data'
-    location='dc1'</programlisting>
- </para>
- <para>
-  In a failover situation, <application>repmgrd</application> will check if any servers in the
-  same location as the current primary node are visible.  If not, <application>repmgrd</application>
-  will assume a network interruption and not promote any node in any
-  other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
-  mode until a primary becomes visible).
- </para>
-
-</chapter>
-
--- a/doc/repmgrd-operation.sgml
+++ b/doc/repmgrd-operation.sgml
@@ -0,0 +1,386 @@
+<chapter id="repmgrd-operation" xreflabel="repmgrd operation">
+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>operation</secondary>
+  </indexterm>
+
+  <title>repmgrd operation</title>
+
+
+  <sect1 id="repmgrd-pausing">
+
+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>pausing</secondary>
+  </indexterm>
+
+  <indexterm>
+    <primary>pausing repmgrd</primary>
+  </indexterm>
+
+  <title>Pausing repmgrd</title>
+
+  <para>
+    In normal operation, <application>repmgrd</application> monitors the state of the
+    PostgreSQL node it is running on, and will take appropriate action if problems
+    are detected, e.g. (if so configured) promote the node to primary, if the existing
+    primary has been determined as failed.
+  </para>
+
+  <para>
+    However, <application>repmgrd</application> is unable to distinguish between
+    planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
+    or installing a PostgreSQL maintenance release), and an actual server outage. In versions prior to
+    &repmgr; 4.2 it was necessary to stop <application>repmgrd</application> on all nodes (or at least
+    on all nodes where <application>repmgrd</application> is
+    <link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
+    to prevent <application>repmgrd</application> from making unintentional changes to the
+    replication cluster.
+  </para>
+
+  <para>
+    From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
+    can now be &quot;paused&quot;, i.e. instructed not to take any action such as performing a failover.
+    This can be done from any node in the cluster, removing the need to stop/restart
+    each <application>repmgrd</application> individually.
+  </para>
+
+  <note>
+    <para>
+      For major PostgreSQL upgrades, e.g. from PostgreSQL 10 to PostgreSQL 11,
+      <application>repmgrd</application> should be shut down completely and only started up
+      once the &repmgr; packages for the new PostgreSQL major version have been installed.
+    </para>
+  </note>
+
+  <sect2 id="repmgrd-pausing-prerequisites">
+    <title>Prerequisites for pausing <application>repmgrd</application></title>
+    <para>
+      In order to be able to pause/unpause <application>repmgrd</application>, the following
+      prerequisites must be met:
+      <itemizedlist spacing="compact" mark="bullet">
+
+        <listitem>
+          <simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
+        </listitem>
+
+        <listitem>
+          <simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
+        </listitem>
+
+        <listitem>
+          <simpara>
+            PostgreSQL on all nodes must be accessible from the node where the
+            <literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
+            <varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
+          </simpara>
+        </listitem>
+      </itemizedlist>
+    </para>
+    <note>
+      <para>
+        These conditions are required for normal &repmgr; operation in any case.
+      </para>
+    </note>
+
+  </sect2>
+
+  <sect2 id="repmgrd-pausing-execution">
+    <title>Pausing/unpausing <application>repmgrd</application></title>
+    <para>
+      To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
+   <programlisting>
+$ repmgr -f /etc/repmgr.conf daemon pause
+NOTICE: node 1 (node1) paused
+NOTICE: node 2 (node2) paused
+NOTICE: node 3 (node3) paused</programlisting>
+    </para>
+    <para>
+      The state of <application>repmgrd</application> on each node can be checked with
+      <link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
+    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
+ ID | Name  | Role    | Status  | repmgrd | PID  | Paused?
+----+-------+---------+---------+---------+------+---------
+ 1  | node1 | primary | running | running | 7851 | yes
+ 2  | node2 | standby | running | running | 7889 | yes
+ 3  | node3 | standby | running | running | 7918 | yes</programlisting>
+    </para>
+
+    <note>
+      <para>
+        If executing a switchover with  <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
+		&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
+      </para>
+    </note>
+
+    <para>
+      If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
+      running on one of the standbys (here: <literal>node2</literal>) will react like this:
+      <programlisting>
+[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
+[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
+[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
+...
+[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
+[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
+[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
+[2018-09-20 12:22:25] [NOTICE] node is paused
+[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
+[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
+[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
+    </para>
+    <para>
+      If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
+      will automatically reconnect, e.g.:
+      <programlisting>
+[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
+    </para>
+
+    <para>
+      To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
+   <programlisting>
+$ repmgr -f /etc/repmgr.conf daemon unpause
+NOTICE: node 1 (node1) unpaused
+NOTICE: node 2 (node2) unpaused
+NOTICE: node 3 (node3) unpaused</programlisting>
+    </para>
+
+    <note>
+      <para>
+        If the previous primary is no longer accessible when <application>repmgrd</application>
+        is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
+        <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
+		and any standbys attached to the new primary with
+		<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
+      </para>
+      <para>
+        This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
+        resulting in the automatic promotion of a new primary, which may be a problem particularly
+        in larger clusters, where <application>repmgrd</application> could select a different promotion
+        candidate to the one intended by the administrator.
+      </para>
+    </note>
+  </sect2>
+  <sect2 id="repmgrd-pausing-details">
+    <title>Details on the <application>repmgrd</application> pausing mechanism</title>
+
+    <para>
+      The pause state of each node will be stored over a PostgreSQL restart.
+    </para>
+
+	<para>
+	  <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
+	  <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
+	  executed even if <application>repmgrd</application> is not running; in this case,
+	  <application>repmgrd</application> will start up in whichever pause state has been set.
+	</para>
+    <note>
+      <para>
+		<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
+		<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
+		<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
+      </para>
+    </note>
+  </sect2>
+  </sect1>
+
+  <sect1 id="repmgrd-wal-replay-pause">
+    <indexterm>
+      <primary>repmgrd</primary>
+      <secondary>paused WAL replay</secondary>
+    </indexterm>
+
+    <title>repmgrd and paused WAL replay</title>
+    <para>
+      If WAL replay has been paused (using <command>pg_wal_replay_pause()</command> or,
+      on PostgreSQL 9.6 and earlier, <command>pg_xlog_replay_pause()</command>),
+      in a failover situation <application>repmgrd</application> will
+      automatically resume WAL replay.
+    </para>
+    <para>
+      This is because if WAL replay is paused, but WAL is pending replay,
+      PostgreSQL cannot be promoted until WAL replay is resumed.
+    </para>
+    <note>
+      <para>
+        <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
+        will refuse to promote a node in this state, as the PostgreSQL
+        <command>promote</command> command will not be acted on until
+        WAL replay is resumed, leaving the cluster in a potentially
+        unstable state. In this case it is up to the user to
+        decide whether to resume WAL replay.
+      </para>
+    </note>
+  </sect1>
+
+<sect1 id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
+ <indexterm>
+   <primary>repmgrd</primary>
+   <secondary>degraded monitoring</secondary>
+ </indexterm>
+
+ <indexterm>
+   <primary>degraded monitoring</primary>
+ </indexterm>
+
+ <title>"degraded monitoring" mode</title>
+ <para>
+  In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
+  of monitoring the node's upstream server. In these cases it enters &quot;degraded monitoring&quot;
+  mode, where <application>repmgrd</application> remains active but is waiting for the situation
+  to be resolved.
+ </para>
+ <para>
+  Situations where this happens are:
+  <itemizedlist spacing="compact" mark="bullet">
+
+   <listitem>
+    <simpara>a failover situation has occurred, but no nodes in the primary node's location are visible</simpara>
+   </listitem>
+
+   <listitem>
+    <simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
+   </listitem>
+
+   <listitem>
+    <simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
+   </listitem>
+
+   <listitem>
+    <simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
+   </listitem>
+
+   <listitem>
+    <simpara>a failover situation has occurred, but no primary has become available</simpara>
+   </listitem>
+
+   <listitem>
+    <simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
+   </listitem>
+
+   <listitem>
+    <simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
+   </listitem>
+  </itemizedlist>
+ </para>
+
+ <para>
+  Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
+  and the primary node is unavailable (but is later restarted):
+  <programlisting>
+    [2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
+    [2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
+    [2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
+    [2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
+    (...)
+    [2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
+    [2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
+    [2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
+    [2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
+    [2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
+    [2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
+    [2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
+    [2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
+    [2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
+
+ </para>
+ <para>
+  By default, <application>repmgrd</application> will continue in degraded monitoring mode indefinitely.
+  However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
+  after which <application>repmgrd</application> will terminate.
+ </para>
+
+ <note>
+   <para>
+     If <application>repmgrd</application> is monitoring a primary node which has been stopped
+     and manually restarted as a standby attached to a new primary, it will automatically detect
+     the status change and update the node record to reflect the node's new status
+     as an active standby. It will then resume monitoring the node as a standby.
+   </para>
+ </note>
+</sect1>
+
+
+<sect1 id="repmgrd-monitoring" xreflabel="Storing monitoring data">
+ <indexterm>
+   <primary>repmgrd</primary>
+   <secondary>monitoring</secondary>
+ </indexterm>
+ <indexterm>
+   <primary>monitoring</primary>
+   <secondary>with repmgrd</secondary>
+ </indexterm>
+
+ <title>Storing monitoring data</title>
+ <para>
+   When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
+  it will constantly write standby node status information to the
+  <varname>monitoring_history</varname> table, providing a near-real-time
+  overview of replication status on all nodes
+  in the cluster.
+ </para>
+ <para>
+   The view <literal>replication_status</literal> shows the most recent state
+   for each node, e.g.:
+  <programlisting>
+    repmgr=# select * from repmgr.replication_status;
+    -[ RECORD 1 ]-------------+------------------------------
+    primary_node_id           | 1
+    standby_node_id           | 2
+    standby_name              | node2
+    node_type                 | standby
+    active                    | t
+    last_monitor_time         | 2017-08-24 16:28:41.260478+09
+    last_wal_primary_location | 0/6D57A00
+    last_wal_standby_location | 0/5000000
+    replication_lag           | 29 MB
+    replication_time_lag      | 00:00:11.736163
+    apply_lag                 | 15 MB
+    communication_time_lag    | 00:00:01.365643</programlisting>
+ </para>
+ <para>
+  The interval in which monitoring history is written is controlled by the
+  configuration parameter <varname>monitor_interval_secs</varname>;
+  default is 2.
+ </para>
+ <para>
+  As this can generate a large amount of monitoring data in the table
+  <literal>repmgr.monitoring_history</literal>, it's advisable to regularly
+  purge historical data using the <xref linkend="repmgr-cluster-cleanup">
+  command; use the <literal>-k/--keep-history</literal> option to
+  specify how many days' worth of data should be retained.
+ </para>
+ <para>
+  It's possible to run <application>repmgrd</application> in monitoring
+  mode only (without automatic failover capability) for some or all
+  nodes by setting <literal>failover=manual</literal> in the node's
+  <filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
+  no failover action will be taken and the node will require manual intervention to
+  be reattached to replication. If this occurs, an
+  <link linkend="event-notifications">event notification</link>
+  <varname>standby_disconnect_manual</varname> will be created.
+ </para>
+ <para>
+  Note that when a standby node is not streaming directly from its upstream
+  node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
+  <literal>0 bytes</literal>.
+ </para>
+ <tip>
+  <para>
+   If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
+   table will be replicated to attached standbys. This means there will be a small but
+   constant stream of replication activity which may not be desirable. To prevent
+   this, convert the table to an <literal>UNLOGGED</literal> one with:
+   <programlisting>
+     ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
+  </para>
+  <para>
+   This will however mean that monitoring history will not be available on
+   another node following a failover, and the view <literal>repmgr.replication_status</literal>
+   will not work on standbys.
+  </para>
+ </tip>
+</sect1>
+
+
+</chapter>
--- a/doc/repmgrd-overview.sgml
+++ b/doc/repmgrd-overview.sgml
@@ -0,0 +1,187 @@
+<chapter id="repmgrd-overview" xreflabel="repmgrd overview">
+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>overview</secondary>
+  </indexterm>
+
+  <title>repmgrd overview</title>
+
+  <para>
+    <application>repmgrd</application> (&quot;<literal>replication manager daemon</literal>&quot;)
+    is a management and monitoring daemon which runs
+    on each node in a replication cluster. It can automate actions such as
+    failover and updating standbys to follow the new primary, as well as
+    providing monitoring information about the state of each standby.
+  </para>
+  <para>
+    <application>repmgrd</application> is designed to be straightforward to set up
+    and does not require additional external infrastructure.
+  </para>
+  <para>
+    Functionality provided by <application>repmgrd</application> includes:
+    <itemizedlist spacing="compact" mark="bullet">
+
+       <listitem>
+         <simpara>
+           wide range of <link linkend="repmgrd-basic-configuration">configuration options</link>
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           option to execute custom scripts (&quot;<link linkend="event-notifications">event notifications</link>&quot;)
+           at different points in the failover sequence
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           ability to <link linkend="repmgrd-pausing">pause repmgrd</link>
+           operation on all nodes with a
+           <link linkend="repmgr-daemon-pause"><command>single command</command></link>
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           optional <link linkend="repmgrd-witness-server">witness server</link>
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           &quot;location&quot; configuration option to restrict
+           potential promotion candidates to a single location
+           (e.g. when nodes are spread over multiple data centres)
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           <link linkend="connection-check-type">choice of method</link> to determine node availability
+           (PostgreSQL ping, query execution or new connection)
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           retention of monitoring statistics (optional)
+         </simpara>
+       </listitem>
+
+
+    </itemizedlist>
+
+  </para>
+
+  <sect1 id="repmgrd-demonstration">
+
+    <title>repmgrd demonstration</title>
+    <para>
+  To demonstrate automatic failover, set up a 3-node replication cluster (one primary
+  and two standbys streaming directly from the primary) so that the cluster looks
+  something like this:
+  <programlisting>
+    $ repmgr -f /etc/repmgr.conf cluster show --compact
+     ID | Name  | Role    | Status    | Upstream | Location | Prio.
+    ----+-------+---------+-----------+----------+----------+-------
+     1  | node1 | primary | * running |          | default  | 100
+     2  | node2 | standby |   running | node1    | default  | 100
+     3  | node3 | standby |   running | node1    | default  | 100</programlisting>
+ </para>
+
+ <tip>
+   <para>
+     See section <link linkend="repmgrd-automatic-failover-configuration">Required configuration for automatic failover</link>
+     for an example of minimal <filename>repmgr.conf</filename> file settings suitable for use with <application>repmgrd</application>.
+   </para>
+ </tip>
+ <para>
+  Start <application>repmgrd</application> on each standby and verify that it's running by examining the
+  log output, which at log level <literal>INFO</literal> will look like this:
+  <programlisting>
+    [2019-03-15 06:32:05] [NOTICE] repmgrd (repmgrd 4.3) starting up
+    [2019-03-15 06:32:05] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr connect_timeout=2"
+    INFO:  set_repmgrd_pid(): provided pidfile is /var/run/repmgr/repmgrd-11.pid
+    [2019-03-15 06:32:05] [NOTICE] starting monitoring of node "node2" (ID: 2)
+    [2019-03-15 06:32:05] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
+ </para>
+ <para>
+  Each <application>repmgrd</application> should also have recorded its successful startup as an event:
+  <programlisting>
+    $ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
+     Node ID | Name  | Event         | OK | Timestamp           | Details
+    ---------+-------+---------------+----+---------------------+-------------------------------------------------------------
+     3       | node3 | repmgrd_start | t  | 2019-03-14 04:17:30 | monitoring connection to upstream node "node1" (node ID: 1)
+     2       | node2 | repmgrd_start | t  | 2019-03-14 04:11:47 | monitoring connection to upstream node "node1" (node ID: 1)
+     1       | node1 | repmgrd_start | t  | 2019-03-14 04:04:31 | monitoring cluster primary "node1" (node ID: 1)</programlisting>
+ </para>
+ <para>
+  Now stop the current primary server with e.g.:
+  <programlisting>
+    pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
+ </para>
+ <para>
+  This will force the primary to shut down straight away, aborting all processes
+  and transactions.  This will cause a flurry of activity in the <application>repmgrd</application> log
+  files as each <application>repmgrd</application> detects the failure of the primary and a failover
+  decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
+  which has been promoted to new primary after failure of the original primary (<literal>node1</literal>).
+  <programlisting>
+    [2019-03-15 06:37:50] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
+    [2019-03-15 06:37:50] [INFO] checking state of node 1, 1 of 3 attempts
+    [2019-03-15 06:37:50] [INFO] sleeping 5 seconds until next reconnection attempt
+    [2019-03-15 06:37:55] [INFO] checking state of node 1, 2 of 3 attempts
+    [2019-03-15 06:37:55] [INFO] sleeping 5 seconds until next reconnection attempt
+    [2019-03-15 06:38:00] [INFO] checking state of node 1, 3 of 3 attempts
+    [2019-03-15 06:38:00] [WARNING] unable to reconnect to node 1 after 3 attempts
+    [2019-03-15 06:38:00] [INFO] primary and this node have the same location ("default")
+    [2019-03-15 06:38:00] [INFO] local node's last receive lsn: 0/900CBF8
+    [2019-03-15 06:38:00] [INFO] node 3 last saw primary node 12 second(s) ago
+    [2019-03-15 06:38:00] [INFO] last receive LSN for sibling node "node3" (ID: 3) is: 0/900CBF8
+    [2019-03-15 06:38:00] [INFO] node "node3" (ID: 3) has same LSN as current candidate "node2" (ID: 2)
+    [2019-03-15 06:38:00] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
+    [2019-03-15 06:38:00] [NOTICE] promotion candidate is "node2" (ID: 2)
+    [2019-03-15 06:38:00] [NOTICE] this node is the winner, will now promote itself and inform other nodes
+    [2019-03-15 06:38:00] [INFO] promote_command is:
+      "/usr/pgsql-11/bin/repmgr -f /etc/repmgr/11/repmgr.conf standby promote"
+    NOTICE: promoting standby to primary
+    DETAIL: promoting server "node2" (ID: 2) using "/usr/pgsql-11/bin/pg_ctl  -w -D '/var/lib/pgsql/11/data' promote"
+    NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
+    NOTICE: STANDBY PROMOTE successful
+    DETAIL: server "node2" (ID: 2) was successfully promoted to primary
+    [2019-03-15 06:38:01] [INFO] 3 followers to notify
+    [2019-03-15 06:38:01] [NOTICE] notifying node "node3" (node ID: 3) to follow node 2
+    INFO:  node 3 received notification to follow node 2
+    [2019-03-15 06:38:01] [INFO] switching to primary monitoring mode
+    [2019-03-15 06:38:01] [NOTICE] monitoring cluster primary "node2" (node ID: 2)</programlisting>
+ </para>
+ <para>
+  The cluster status will now look like this, with the original primary (<literal>node1</literal>)
+  marked as inactive, and standby <literal>node3</literal> now following the new primary
+  (<literal>node2</literal>):
+  <programlisting>
+    $ repmgr -f /etc/repmgr.conf cluster show --compact
+     ID | Name  | Role    | Status    | Upstream | Location | Prio.
+    ----+-------+---------+-----------+----------+----------+-------
+     1  | node1 | primary | - failed  |          | default  | 100
+     2  | node2 | primary | * running |          | default  | 100
+     3  | node3 | standby |   running | node2    | default  | 100</programlisting>
+
+ </para>
+ <para>
+   <link linkend="repmgr-cluster-event"><command>repmgr cluster event</command></link> will display a summary of
+   what happened to each server during the failover:
+  <programlisting>
+    $ repmgr -f /etc/repmgr.conf cluster event
+     Node ID | Name  | Event                      | OK | Timestamp           | Details
+    ---------+-------+----------------------------+----+---------------------+-------------------------------------------------------------
+     3       | node3 | repmgrd_failover_follow    | t  | 2019-03-15 06:38:03 | node 3 now following new upstream node 2
+     3       | node3 | standby_follow             | t  | 2019-03-15 06:38:02 | standby attached to upstream node "node2" (node ID: 2)
+     2       | node2 | repmgrd_reload             | t  | 2019-03-15 06:38:01 | monitoring cluster primary "node2" (node ID: 2)
+     2       | node2 | repmgrd_failover_promote   | t  | 2019-03-15 06:38:01 | node 2 promoted to primary; old primary 1 marked as failed
+     2       | node2 | standby_promote            | t  | 2019-03-15 06:38:01 | server "node2" (ID: 2) was successfully promoted to primary</programlisting>
+ </para>
+
+  </sect1>
+</chapter>
--- a/doc/repmgrd-pausing.sgml
+++ b/doc/repmgrd-pausing.sgml
@@ -1,178 +0,0 @@
-<chapter id="repmgrd-pausing" xreflabel="Pausing repmgrd">
-
-  <indexterm>
-    <primary>repmgrd</primary>
-    <secondary>pausing</secondary>
-  </indexterm>
-
-  <indexterm>
-    <primary>pausing repmgrd</primary>
-  </indexterm>
-
-  <title>Pausing repmgrd</title>
-
-  <para>
-    In normal operation, <application>repmgrd</application> monitors the state of the
-    PostgreSQL node it is running on, and will take appropriate action if problems
-    are detected, e.g. (if so configured) promote the node to primary, if the existing
-    primary has been determined as failed.
-  </para>
-
-  <para>
-    However, <application>repmgrd</application> is unable to distinguish between
-    planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
-    or installing PostgreSQL maintenance released), and an actual server outage. In versions prior to
-    &repmgr; 4.2 it was necessary to stop <application>repmgrd</application> on all nodes (or at least
-    on all nodes where <application>repmgrd</application> is
-    <link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
-    to prevent <application>repmgrd</application> from making unintentional changes to the
-    replication cluster.
-  </para>
-
-  <para>
-    From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
-    can now be &quot;paused&quot;, i.e. instructed not to take any action such as performing a failover.
-    This can be done from any node in the cluster, removing the need to stop/restart
-    each <application>repmgrd</application> individually.
-  </para>
-
-  <note>
-    <para>
-      For major PostgreSQL upgrades, e.g. from PostgreSQL 10 to PostgreSQL 11,
-      <application>repmgrd</application> should be shut down completely and only started up
-      once the &repmgr; packages for the new PostgreSQL major version have been installed.
-    </para>
-  </note>
-
-  <sect1 id="repmgrd-pausing-prerequisites">
-    <title>Prerequisites for pausing <application>repmgrd</application></title>
-    <para>
-      In order to be able to pause/unpause <application>repmgrd</application>, following
-      prerequisites must be met:
-      <itemizedlist spacing="compact" mark="bullet">
-
-        <listitem>
-          <simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
-        </listitem>
-
-        <listitem>
-          <simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
-        </listitem>
-
-        <listitem>
-          <simpara>
-            PostgreSQL on all nodes must be accessible from the node where the
-            <literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
-            <varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
-          </simpara>
-        </listitem>
-      </itemizedlist>
-    </para>
-    <note>
-      <para>
-        These conditions are required for normal &repmgr; operation in any case.
-      </para>
-    </note>
-
-  </sect1>
-
-  <sect1 id="repmgrd-pausing-execution">
-    <title>Pausing/unpausing <application>repmgrd</application></title>
-    <para>
-      To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
-   <programlisting>
-$ repmgr -f /etc/repmgr.conf daemon pause
-NOTICE: node 1 (node1) paused
-NOTICE: node 2 (node2) paused
-NOTICE: node 3 (node3) paused</programlisting>
-    </para>
-    <para>
-      The state of <application>repmgrd</application> on each node can be checked with
-      <link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
-    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
- ID | Name  | Role    | Status  | repmgrd | PID  | Paused?
----+-------+---------+---------+---------+------+---------
- 1  | node1 | primary | running | running | 7851 | yes
- 2  | node2 | standby | running | running | 7889 | yes
- 3  | node3 | standby | running | running | 7918 | yes</programlisting>
-    </para>
-
-    <note>
-      <para>
-        If executing a switchover with  <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
-		&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
-      </para>
-    </note>
-
-    <para>
-      If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
-      running on one of the standbys (here: <literal>node2</literal>) will react like this:
-      <programlisting>
-[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
-[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
-[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
-...
-[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
-[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
-[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
-[2018-09-20 12:22:25] [NOTICE] node is paused
-[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
-[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
-[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
-    </para>
-    <para>
-      If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
-      will automatically reconnect, e.g.:
-      <programlisting>
-[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
-    </para>
-
-    <para>
-      To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
-   <programlisting>
-$ repmgr -f /etc/repmgr.conf daemon unpause
-NOTICE: node 1 (node1) unpaused
-NOTICE: node 2 (node2) unpaused
-NOTICE: node 3 (node3) unpaused</programlisting>
-    </para>
-
-    <note>
-      <para>
-        If the previous primary is no longer accessible when <application>repmgrd</application>
-        is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
-        <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
-		and any standbys attached to the new primary with
-		<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
-      </para>
-      <para>
-        This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
-        resulting in the automatic promotion of a new primary, which may be a problem particularly
-        in larger clusters, where <application>repmgrd</application> could select a different promotion
-        candidate to the one intended by the administrator.
-      </para>
-    </note>
-
-  <sect2 id="repmgrd-pausing-details">
-    <title>Details on the <application>repmgrd</application> pausing mechanism</title>
-
-    <para>
-      The pause state of each node will be stored over a PostgreSQL restart.
-    </para>
-
-	<para>
-	  <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
-	  <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
-	  executed even if <application>repmgrd</application> is not running; in this case,
-	  <application>repmgrd</application> will start up in whichever pause state has been set.
-	</para>
-    <note>
-      <para>
-		<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
-		<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
-		<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
-      </para>
-    </note>
-  </sect2>
-  </sect1>
-</chapter>
-
--- a/doc/repmgrd-witness-server.sgml
+++ b/doc/repmgrd-witness-server.sgml
@@ -1,31 +0,0 @@
-<chapter id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
- <indexterm>
-   <primary>repmgrd</primary>
-   <secondary>witness server</secondary>
- </indexterm>
-
- <title>Using a witness server with repmgrd</title>
- <para>
-   In a situation caused e.g. by a network interruption between two
-   data centres, it's important to avoid a "split-brain" situation where
-   both sides of the network assume they are the active segment and the
-   side without an active primary unilaterally promotes one of its standbys.
- </para>
- <para>
-   To prevent this situation happening, it's essential to ensure that one
-   network segment has a "voting majority", so other segments will know
-   they're in the minority and not attempt to promote a new primary. Where
-   an odd number of servers exists, this is not an issue. However, if each
-   network has an even number of nodes, it's necessary to provide some way
-   of ensuring a majority, which is where the witness server becomes useful.
- </para>
- <para>
-   This is not a fully-fledged standby node and is not integrated into
-   replication, but it effectively represents the "casting vote" when
-   deciding which network segment has a majority. A witness server can
-   be set up using <xref linkend="repmgr-witness-register">. Note that it only
-   makes sense to create a witness server in conjunction with running
-   <application>repmgrd</application>; the witness server will require its own
-   <application>repmgrd</application> instance.
- </para>
-</chapter>
--- a/doc/switchover.sgml
+++ b/doc/switchover.sgml
@@ -137,8 +137,8 @@

   <note>
     <para>
-       If an exclusive backup is running on the current primary, &repmgr; will not perform the
-       switchover.
+       If an exclusive backup is running on the current primary, or if WAL replay is paused on the standby,
+       &repmgr; will <emphasis>not</emphasis> perform the switchover.
     </para>
   </note>

@@ -236,7 +236,7 @@
    </note>
    <para>
      For more details on <application>pg_rewind</application>, see:
-      <ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html">https://www.postgresql.org/docs/current/static/app-pgrewind.html</ulink>.
+      <ulink url="https://www.postgresql.org/docs/current/app-pgrewind.html">https://www.postgresql.org/docs/current/app-pgrewind.html</ulink>.
    </para>
    <para>
      <application>pg_rewind</application> has been part of the core PostgreSQL distribution since
@@ -347,7 +347,7 @@
     <simpara>
      <command>pg_rewind</command> *requires* that either <varname>wal_log_hints</varname> is enabled, or that
      data checksums were enabled when the cluster was initialized. See the
-      <ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html">pg_rewind documentation</ulink>
+      <ulink url="https://www.postgresql.org/docs/current/app-pgrewind.html">pg_rewind documentation</ulink>
      for details.
     </simpara>
    </listitem>
@@ -418,7 +418,7 @@ HINT: stop backup before attempting the switchover</programlisting>
       To proceed, either wait until the backup has finished, or cancel it with the command
       <command>SELECT pg_stop_backup()</command>. For more details see the PostgreSQL
       documentation section
-       <ulink url="https://www.postgresql.org/docs/current/static/continuous-archiving.html#BACKUP-LOWLEVEL-BASE-BACKUP-EXCLUSIVE">Making an exclusive low level backup</ulink>.
+       <ulink url="https://www.postgresql.org/docs/current/continuous-archiving.html#BACKUP-LOWLEVEL-BASE-BACKUP-EXCLUSIVE">Making an exclusive low level backup</ulink>.
     </para>
   </sect2>
 </sect1>
--- a/doc/upgrading-repmgr.sgml
+++ b/doc/upgrading-repmgr.sgml
@@ -247,7 +247,7 @@ ALTER EXTENSION repmgr UPDATE</programlisting>
    </simpara>
  </note>
  <para>
-    For further details please see the <ulink url="https://www.postgresql.org/docs/current/static/pgupgrade.html">pg_upgrade documentation</ulink>.
+    For further details please see the <ulink url="https://www.postgresql.org/docs/current/pgupgrade.html">pg_upgrade documentation</ulink>.
  </para>
  <para>
    If replication slots are in use, bear in mind these will <emphasis>not</emphasis>
--- a/doc/version.sgml
+++ b/doc/version.sgml
@@ -1 +0,0 @@
-<!ENTITY repmgrversion "4.2">
--- a/errcode.h
+++ b/errcode.h
@@ -1,6 +1,6 @@
 /*
 * errcode.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -48,5 +48,6 @@
 #define ERR_REJOIN_FAIL 24
 #define ERR_NODE_STATUS 25
 #define ERR_REPMGRD_PAUSE 26
+#define ERR_REPMGRD_SERVICE 27

 #endif							/* _ERRCODE_H_ */
--- a/expected/repmgr_extension.out
+++ b/expected/repmgr_extension.out
@@ -47,7 +47,7 @@ SELECT repmgr.am_bdr_failover_handler(NULL);
 SELECT repmgr.get_new_primary();
 get_new_primary 
 -----------------
-                
+              -1
 (1 row)

 SELECT repmgr.notify_follow_primary(-1);
--- a/log.c
+++ b/log.c
@@ -1,6 +1,6 @@
 /*
 * log.c - Logging methods
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -85,7 +85,7 @@ _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_li

 			time(&t);
 			tm = localtime(&t);
-			strftime(buf, 100, "[%Y-%m-%d %H:%M:%S]", tm);
+			strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", tm);
 			fprintf(stderr, "%s [%s] ", buf, level_name);
 		}
 		else
--- a/log.h
+++ b/log.h
@@ -1,6 +1,6 @@
 /*
 * log.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/repmgr--4.2--4.3.sql
+++ b/repmgr--4.2--4.3.sql
@@ -0,0 +1,17 @@
+-- repmgr 4.2 -> 4.3 upgrade script; complain if it is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION repmgr UPDATE TO '4.3'" to load this file. \quit
+
+CREATE FUNCTION set_upstream_last_seen()
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_upstream_last_seen()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_wal_receiver_pid()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
+  LANGUAGE C STRICT;
--- a/repmgr--4.3.sql
+++ b/repmgr--4.3.sql
@@ -0,0 +1,217 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
+
+CREATE TABLE repmgr.nodes (
+  node_id          INTEGER     PRIMARY KEY,
+  upstream_node_id INTEGER     NULL REFERENCES nodes (node_id) DEFERRABLE,
+  active           BOOLEAN     NOT NULL DEFAULT TRUE,
+  node_name        TEXT        NOT NULL,
+  type             TEXT        NOT NULL CHECK (type IN('primary','standby','witness','bdr')),
+  location         TEXT        NOT NULL DEFAULT 'default',
+  priority         INT         NOT NULL DEFAULT 100,
+  conninfo         TEXT        NOT NULL,
+  repluser         VARCHAR(63) NOT NULL,
+  slot_name        TEXT        NULL,
+  config_file      TEXT        NOT NULL
+);
+
+CREATE TABLE repmgr.events (
+  node_id          INTEGER NOT NULL,
+  event            TEXT NOT NULL,
+  successful       BOOLEAN NOT NULL DEFAULT TRUE,
+  event_timestamp  TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  details          TEXT NULL
+);
+
+DO $repmgr$
+DECLARE
+  server_version_num INT;  -- numeric server version, e.g. 90400 for 9.4.0
+BEGIN  -- create repmgr.monitoring_history with WAL location columns typed per server version
+  SELECT setting
+    FROM pg_catalog.pg_settings
+   WHERE name = 'server_version_num'
+    INTO server_version_num;
+  IF server_version_num >= 90400 THEN  -- PG_LSN type is available: use it for WAL locations
+    EXECUTE $repmgr_func$
+CREATE TABLE repmgr.monitoring_history (
+  primary_node_id                INTEGER NOT NULL,
+  standby_node_id                INTEGER NOT NULL,
+  last_monitor_time              TIMESTAMP WITH TIME ZONE NOT NULL,
+  last_apply_time                TIMESTAMP WITH TIME ZONE,
+  last_wal_primary_location      PG_LSN NOT NULL,
+  last_wal_standby_location      PG_LSN,
+  replication_lag                BIGINT NOT NULL,
+  apply_lag                      BIGINT NOT NULL
+)
+    $repmgr_func$;
+  ELSE  -- older servers: store WAL locations as TEXT instead
+    EXECUTE $repmgr_func$
+CREATE TABLE repmgr.monitoring_history (
+  primary_node_id                INTEGER NOT NULL,
+  standby_node_id                INTEGER NOT NULL,
+  last_monitor_time              TIMESTAMP WITH TIME ZONE NOT NULL,
+  last_apply_time                TIMESTAMP WITH TIME ZONE,
+  last_wal_primary_location      TEXT NOT NULL,
+  last_wal_standby_location      TEXT,
+  replication_lag                BIGINT NOT NULL,
+  apply_lag                      BIGINT NOT NULL
+)
+    $repmgr_func$;
+  END IF;
+END$repmgr$;
+
+
+
+CREATE INDEX idx_monitoring_history_time
+          ON repmgr.monitoring_history (last_monitor_time, standby_node_id);
+
+CREATE VIEW repmgr.show_nodes AS
+   SELECT n.node_id,
+          n.node_name,
+          n.active,
+          n.upstream_node_id,
+          un.node_name AS upstream_node_name,
+          n.type,
+          n.priority,
+          n.conninfo
+     FROM repmgr.nodes n
+LEFT JOIN repmgr.nodes un
+       ON un.node_id = n.upstream_node_id;
+
+
+/* XXX update upgrade scripts! */
+CREATE TABLE repmgr.voting_term (
+  term INT NOT NULL
+);
+
+CREATE UNIQUE INDEX voting_term_restrict
+ON repmgr.voting_term ((TRUE));
+
+CREATE RULE voting_term_delete AS
+   ON DELETE TO repmgr.voting_term
+   DO INSTEAD NOTHING;
+
+
+/* ================= */
+/* repmgrd functions */
+/* ================= */
+
+/* monitoring functions */
+
+CREATE FUNCTION set_local_node_id(INT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_local_node_id'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_local_node_id()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_local_node_id'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION standby_set_last_updated()
+  RETURNS TIMESTAMP WITH TIME ZONE
+  AS 'MODULE_PATHNAME', 'standby_set_last_updated'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION standby_get_last_updated()
+  RETURNS TIMESTAMP WITH TIME ZONE
+  AS 'MODULE_PATHNAME', 'standby_get_last_updated'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION set_upstream_last_seen()
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_upstream_last_seen()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
+  LANGUAGE C STRICT;
+
+
+/* failover functions */
+
+CREATE FUNCTION notify_follow_primary(INT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'notify_follow_primary'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_new_primary()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_new_primary'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION reset_voting_status()
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'reset_voting_status'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION am_bdr_failover_handler(INT)
+  RETURNS BOOL
+  AS 'MODULE_PATHNAME', 'am_bdr_failover_handler'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION unset_bdr_failover_handler()
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'unset_bdr_failover_handler'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_repmgrd_pid()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pid'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_repmgrd_pidfile()
+  RETURNS TEXT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pidfile'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION set_repmgrd_pid(INT, TEXT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_repmgrd_pid'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_is_running()
+  RETURNS BOOL
+  AS 'MODULE_PATHNAME', 'repmgrd_is_running'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_pause(BOOL)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'repmgrd_pause'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_is_paused()
+  RETURNS BOOL
+  AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_wal_receiver_pid()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
+  LANGUAGE C STRICT;
+
+
+
+
+/* views */
+
+CREATE VIEW repmgr.replication_status AS
+  SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
+ 	     n.type AS node_type, n.active, last_monitor_time,
+         CASE WHEN n.type='standby' THEN m.last_wal_primary_location ELSE NULL END AS last_wal_primary_location,
+         m.last_wal_standby_location,
+         CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.replication_lag) ELSE NULL END AS replication_lag,
+         CASE WHEN n.type='standby' THEN
+           CASE WHEN replication_lag > 0 THEN age(now(), m.last_apply_time) ELSE '0'::INTERVAL END
+           ELSE NULL
+         END AS replication_time_lag,
+         CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.apply_lag) ELSE NULL END AS apply_lag,
+         AGE(NOW(), CASE WHEN pg_catalog.pg_is_in_recovery() THEN repmgr.standby_get_last_updated() ELSE m.last_monitor_time END) AS communication_time_lag
+    FROM repmgr.monitoring_history m
+    JOIN repmgr.nodes n ON m.standby_node_id = n.node_id
+   WHERE (m.standby_node_id, m.last_monitor_time) IN (
+	          SELECT m1.standby_node_id, MAX(m1.last_monitor_time)
+			    FROM repmgr.monitoring_history m1 GROUP BY 1
+         );
+
--- a/repmgr-action-bdr.c
+++ b/repmgr-action-bdr.c
@@ -3,7 +3,7 @@
 *
 * Implements BDR-related actions for the repmgr command line utility
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -216,7 +216,7 @@ do_bdr_register(void)
 				ExtensionStatus other_node_extension_status = REPMGR_UNKNOWN;

 				/* skip the local node */
-				if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, MAXLEN) == 0)
+				if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, sizeof(node_info.node_name)) == 0)
 				{
 					continue;
 				}
@@ -304,9 +304,9 @@ do_bdr_register(void)
 	node_info.active = true;
 	node_info.priority = config_file_options.priority;

-	strncpy(node_info.node_name, config_file_options.node_name, MAXLEN);
-	strncpy(node_info.location, config_file_options.location, MAXLEN);
-	strncpy(node_info.conninfo, config_file_options.conninfo, MAXLEN);
+	strncpy(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name));
+	strncpy(node_info.location, config_file_options.location, sizeof(node_info.location));
+	strncpy(node_info.conninfo, config_file_options.conninfo, sizeof(node_info.conninfo));

 	if (record_status == RECORD_FOUND)
 	{
@@ -330,7 +330,7 @@ do_bdr_register(void)
 		 * name set when the node was registered.
 		 */

-		if (strncmp(node_info.node_name, config_file_options.node_name, MAXLEN) != 0)
+		if (strncmp(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name)) != 0)
 		{
 			log_error(_("a record for node %i is already registered with node_name \"%s\""),
 					  config_file_options.node_id, node_info.node_name);
--- a/repmgr-action-bdr.h
+++ b/repmgr-action-bdr.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-action-bdr.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/repmgr-action-cluster.c
+++ b/repmgr-action-cluster.c
@@ -3,7 +3,7 @@
 *
 * Implements cluster information actions for the repmgr command line utility
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -24,7 +24,7 @@
 #include "repmgr-client-global.h"
 #include "repmgr-action-cluster.h"

-#define SHOW_HEADER_COUNT 7
+#define SHOW_HEADER_COUNT 8

 typedef enum
 {
@@ -34,6 +34,7 @@ typedef enum
 	SHOW_STATUS,
 	SHOW_UPSTREAM_NAME,
 	SHOW_LOCATION,
+	SHOW_PRIORITY,
 	SHOW_CONNINFO
 }			ShowHeader;

@@ -102,12 +103,19 @@ do_cluster_show(void)
 		exit(ERR_BAD_CONFIG);
 	}

+	/* Initialize column headers  */
 	strncpy(headers_show[SHOW_ID].title, _("ID"), MAXLEN);
 	strncpy(headers_show[SHOW_NAME].title, _("Name"), MAXLEN);
 	strncpy(headers_show[SHOW_ROLE].title, _("Role"), MAXLEN);
 	strncpy(headers_show[SHOW_STATUS].title, _("Status"), MAXLEN);
 	strncpy(headers_show[SHOW_UPSTREAM_NAME].title, _("Upstream"), MAXLEN);
 	strncpy(headers_show[SHOW_LOCATION].title, _("Location"), MAXLEN);
+
+	if (runtime_options.compact == true)
+		strncpy(headers_show[SHOW_PRIORITY].title, _("Prio."), MAXLEN);
+	else
+		strncpy(headers_show[SHOW_PRIORITY].title, _("Priority"), MAXLEN);
+
 	strncpy(headers_show[SHOW_CONNINFO].title, _("Connection string"), MAXLEN);

 	/*
@@ -117,12 +125,26 @@ do_cluster_show(void)

 	for (i = 0; i < SHOW_HEADER_COUNT; i++)
 	{
-		headers_show[i].max_length = strlen(headers_show[i].title);
+		headers_show[i].display = true;
+
+		if (runtime_options.compact == true)
+		{
+			if (i == SHOW_CONNINFO)
+			{
+				headers_show[i].display = false;
+			}
+		}
+
+		if (headers_show[i].display == true)
+		{
+			headers_show[i].max_length = strlen(headers_show[i].title);
+		}
 	}

 	for (cell = nodes.head; cell; cell = cell->next)
 	{
 		PQExpBufferData details;
+		PQExpBufferData buf;

 		cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);

@@ -133,7 +155,12 @@ do_cluster_show(void)
 		}
 		else
 		{
-			cell->node_info->node_status = NODE_STATUS_DOWN;
+			/* check if node is reachable, but just not letting us in */
+			if (is_server_available_quiet(cell->node_info->conninfo))
+				cell->node_info->node_status = NODE_STATUS_REJECTED;
+			else
+				cell->node_info->node_status = NODE_STATUS_DOWN;
+
 			cell->node_info->recovery_type = RECTYPE_UNKNOWN;

 			connection_error_found = true;
@@ -208,6 +235,19 @@ do_cluster_show(void)
 							}
 						}
 					}
+					/* node is running but rejecting connections */
+					else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
+					{
+						if (cell->node_info->active == true)
+						{
+							appendPQExpBufferStr(&details, "? running");
+						}
+						else
+						{
+							appendPQExpBufferStr(&details, "! running");
+								error_found = true;
+						}
+					}
 					/* node is unreachable */
 					else
 					{
@@ -272,6 +312,27 @@ do_cluster_show(void)
 														cell->node_info->node_name, cell->node_info->node_id);
 							}
 						}
+
+						/* warn about issue with paused WAL replay */
+						if (is_wal_replay_paused(cell->node_info->conn, true))
+						{
+							item_list_append_format(&warnings,
+													_("WAL replay is paused on node \"%s\" (ID: %i) with WAL replay pending; this node cannot be manually promoted until WAL replay is resumed"),
+													cell->node_info->node_name, cell->node_info->node_id);
+						}
+					}
+					/* node is running but rejecting connections */
+					else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
+					{
+						if (cell->node_info->active == true)
+						{
+							appendPQExpBufferStr(&details, "? running");
+						}
+						else
+						{
+							appendPQExpBufferStr(&details, "! running");
+								error_found = true;
+						}
 					}
 					/* node is unreachable */
 					else
@@ -286,11 +347,12 @@ do_cluster_show(void)
 						}
 						else
 						{
-							appendPQExpBufferStr(&details, "- failed");
-							error_found = true;
+								appendPQExpBufferStr(&details, "- failed");
+								error_found = true;
 						}
 					}
 				}
+
 				break;
 			case WITNESS:
 			case BDR:
@@ -308,6 +370,20 @@ do_cluster_show(void)
 							error_found = true;
 						}
 					}
+					/* node is running but rejecting connections */
+					else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
+					{
+						if (cell->node_info->active == true)
+						{
+							appendPQExpBufferStr(&details, "? rejected");
+						}
+						else
+						{
+							appendPQExpBufferStr(&details, "! failed");
+							error_found = true;
+						}
+
+					}
 					/* node is unreachable */
 					else
 					{
@@ -338,15 +414,35 @@ do_cluster_show(void)
 		PQfinish(cell->node_info->conn);
 		cell->node_info->conn = NULL;

+		initPQExpBuffer(&buf);
+		appendPQExpBuffer(&buf, "%i", cell->node_info->node_id);
+		headers_show[SHOW_ID].cur_length = strlen(buf.data);
+		termPQExpBuffer(&buf);
+
 		headers_show[SHOW_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type));
 		headers_show[SHOW_NAME].cur_length = strlen(cell->node_info->node_name);
 		headers_show[SHOW_STATUS].cur_length = strlen(cell->node_info->details);
 		headers_show[SHOW_UPSTREAM_NAME].cur_length = strlen(cell->node_info->upstream_node_name);
+
+		initPQExpBuffer(&buf);
+		appendPQExpBuffer(&buf, "%i", cell->node_info->priority);
+		headers_show[SHOW_PRIORITY].cur_length = strlen(buf.data);
+		termPQExpBuffer(&buf);
+
 		headers_show[SHOW_LOCATION].cur_length = strlen(cell->node_info->location);
+
+
+
 		headers_show[SHOW_CONNINFO].cur_length = strlen(cell->node_info->conninfo);

 		for (i = 0; i < SHOW_HEADER_COUNT; i++)
 		{
+			if (runtime_options.compact == true)
+			{
+				if (headers_show[i].display == false)
+					continue;
+			}
+
 			if (headers_show[i].cur_length > headers_show[i].max_length)
 			{
 				headers_show[i].max_length = headers_show[i].cur_length;
@@ -398,7 +494,14 @@ do_cluster_show(void)
 			printf("| %-*s ", headers_show[SHOW_STATUS].max_length, cell->node_info->details);
 			printf("| %-*s ", headers_show[SHOW_UPSTREAM_NAME].max_length, cell->node_info->upstream_node_name);
 			printf("| %-*s ", headers_show[SHOW_LOCATION].max_length, cell->node_info->location);
-			printf("| %-*s\n", headers_show[SHOW_CONNINFO].max_length, cell->node_info->conninfo);
+			printf("| %-*i ", headers_show[SHOW_PRIORITY].max_length, cell->node_info->priority);
+
+			if (headers_show[SHOW_CONNINFO].display == true)
+			{
+				printf("| %-*s", headers_show[SHOW_CONNINFO].max_length, cell->node_info->conninfo);
+			}
+
+			puts("");
 		}
 	}

@@ -960,7 +1063,9 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
 		matrix_rec_list[i] = (t_node_matrix_rec *) pg_malloc0(sizeof(t_node_matrix_rec));

 		matrix_rec_list[i]->node_id = cell->node_info->node_id;
-		strncpy(matrix_rec_list[i]->node_name, cell->node_info->node_name, MAXLEN);
+		strncpy(matrix_rec_list[i]->node_name,
+				cell->node_info->node_name,
+				sizeof(cell->node_info->node_name));

 		/*
 		 * Find the maximum length of a node name
@@ -1058,6 +1163,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
 		(void) remote_command(host,
 							  runtime_options.remote_user,
 							  command.data,
+							  config_file_options.ssh_options,
 							  &command_output);

 		p = command_output.data;
@@ -1174,7 +1280,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item

 		cube[h] = (t_node_status_cube *) pg_malloc(sizeof(t_node_status_cube));
 		cube[h]->node_id = cell->node_info->node_id;
-		strncpy(cube[h]->node_name, cell->node_info->node_name, MAXLEN);
+		strncpy(cube[h]->node_name, cell->node_info->node_name, sizeof(cell->node_info->node_name));

 		/*
 		 * Find the maximum length of a node name
@@ -1196,7 +1302,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
 			/* we don't need the name here */
 			cube[h]->matrix_list_rec[i]->node_name[0] = '\0';

-			cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec) * nodes.node_count);
+			cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec *) * nodes.node_count);

 			j = 0;

@@ -1270,6 +1376,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
 			(void) remote_command(host,
 								  runtime_options.remote_user,
 								  quoted_command.data,
+								  config_file_options.ssh_options,
 								  &command_output);

 			free_conninfo_params(&remote_conninfo);
@@ -1492,6 +1599,7 @@ do_cluster_help(void)
 	printf(_("  Configuration file or database connection required.\n"));
 	puts("");
 	printf(_("    --csv                     emit output as CSV (with a subset of fields)\n"));
+	printf(_("    --compact                 display only a subset of fields\n"));
 	puts("");

 	printf(_("CLUSTER MATRIX\n"));
--- a/repmgr-action-cluster.h
+++ b/repmgr-action-cluster.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-action-cluster.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -30,14 +30,14 @@ typedef struct
 typedef struct
 {
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	t_node_status_rec **node_status_list;
 } t_node_matrix_rec;

 typedef struct
 {
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	t_node_matrix_rec **matrix_list_rec;
 } t_node_status_cube;

--- a/repmgr-action-daemon.c
+++ b/repmgr-action-daemon.c
@@ -2,7 +2,7 @@
 * repmgr-action-daemon.c
 *
 * Implements repmgrd actions for the repmgr command line utility
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -18,12 +18,17 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <signal.h>
+#include <sys/stat.h>			/* for stat() */
+
 #include "repmgr.h"

 #include "repmgr-client-global.h"
 #include "repmgr-action-daemon.h"

-
+#define REPMGR_DAEMON_STOP_START_WAIT 15
+#define REPMGR_DAEMON_STATUS_START_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully started")
+#define REPMGR_DAEMON_STATUS_STOP_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully stopped")

 /*
 * Possibly also show:
@@ -38,13 +43,15 @@ typedef enum
 	STATUS_ID = 0,
 	STATUS_NAME,
 	STATUS_ROLE,
+	STATUS_PRIORITY,
 	STATUS_PG,
 	STATUS_RUNNING,
 	STATUS_PID,
-	STATUS_PAUSED
+	STATUS_PAUSED,
+	STATUS_UPSTREAM_LAST_SEEN
 } StatusHeader;

-#define STATUS_HEADER_COUNT 7
+#define STATUS_HEADER_COUNT 9

 struct ColHeader headers_status[STATUS_HEADER_COUNT];

@@ -61,6 +68,7 @@ do_daemon_status(void)
 	int i;
 	RepmgrdInfo **repmgrd_info;
 	ItemList	warnings = {NULL, NULL};
+	bool		connection_error_found = false;

 	/* Connect to local database to obtain cluster connection data */
 	log_verbose(LOG_INFO, _("connecting to database"));
@@ -83,14 +91,27 @@ do_daemon_status(void)
 	strncpy(headers_status[STATUS_ID].title, _("ID"), MAXLEN);
 	strncpy(headers_status[STATUS_NAME].title, _("Name"), MAXLEN);
 	strncpy(headers_status[STATUS_ROLE].title, _("Role"), MAXLEN);
+
+	if (runtime_options.compact == true)
+		strncpy(headers_status[STATUS_PRIORITY].title, _("Prio."), MAXLEN);
+	else
+		strncpy(headers_status[STATUS_PRIORITY].title, _("Priority"), MAXLEN);
+
 	strncpy(headers_status[STATUS_PG].title, _("Status"), MAXLEN);
 	strncpy(headers_status[STATUS_RUNNING].title, _("repmgrd"), MAXLEN);
 	strncpy(headers_status[STATUS_PID].title, _("PID"), MAXLEN);
 	strncpy(headers_status[STATUS_PAUSED].title, _("Paused?"), MAXLEN);

+	if (runtime_options.compact == true)
+		strncpy(headers_status[STATUS_UPSTREAM_LAST_SEEN].title, _("Upstr. last"), MAXLEN);
+	else
+		strncpy(headers_status[STATUS_UPSTREAM_LAST_SEEN].title, _("Upstream last seen"), MAXLEN);
+
+
 	for (i = 0; i < STATUS_HEADER_COUNT; i++)
 	{
 		headers_status[i].max_length = strlen(headers_status[i].title);
+		headers_status[i].display = true;
 	}

 	i = 0;
@@ -98,18 +119,24 @@ do_daemon_status(void)
 	for (cell = nodes.head; cell; cell = cell->next)
 	{
 		int j;
+		PQExpBufferData buf;

 		repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
 		repmgrd_info[i]->node_id = cell->node_info->node_id;
 		repmgrd_info[i]->pid = UNKNOWN_PID;
+		repmgrd_info[i]->recovery_type = RECTYPE_UNKNOWN;
 		repmgrd_info[i]->paused = false;
 		repmgrd_info[i]->running = false;
 		repmgrd_info[i]->pg_running = true;
+		repmgrd_info[i]->wal_paused_pending_wal = false;
+		repmgrd_info[i]->upstream_last_seen = -1;

 		cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);

 		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
 		{
+			connection_error_found = true;
+
 			if (runtime_options.verbose)
 			{
 				char		error[MAXLEN];
@@ -160,16 +187,55 @@ do_daemon_status(void)

 			repmgrd_info[i]->paused = repmgrd_is_paused(cell->node_info->conn);

+			repmgrd_info[i]->recovery_type = get_recovery_type(cell->node_info->conn);
+
+			if (repmgrd_info[i]->recovery_type == RECTYPE_STANDBY)
+			{
+				repmgrd_info[i]->wal_paused_pending_wal = is_wal_replay_paused(cell->node_info->conn, true);
+
+				if (repmgrd_info[i]->wal_paused_pending_wal == true)
+				{
+					item_list_append_format(&warnings,
+											_("WAL replay is paused on node \"%s\" (ID: %i) with WAL replay pending; this node cannot be manually promoted  until WAL replay is resumed"),
+											cell->node_info->node_name, cell->node_info->node_id);
+				}
+			}
+
+			repmgrd_info[i]->upstream_last_seen = get_upstream_last_seen(cell->node_info->conn, cell->node_info->type);
+			if (repmgrd_info[i]->upstream_last_seen < 0)
+			{
+				maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, "%s", _("n/a"));
+			}
+			else
+			{
+				if (runtime_options.compact == true)
+				{
+					maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, _("%i sec(s) ago"), repmgrd_info[i]->upstream_last_seen);
+				}
+				else
+				{
+					maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, _("%i second(s) ago"), repmgrd_info[i]->upstream_last_seen);
+				}
+			}
+
 			PQfinish(cell->node_info->conn);
 		}


 		headers_status[STATUS_NAME].cur_length = strlen(cell->node_info->node_name);
 		headers_status[STATUS_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type));
+
+		initPQExpBuffer(&buf);
+		appendPQExpBuffer(&buf, "%i", cell->node_info->priority);
+		headers_status[STATUS_PRIORITY].cur_length = strlen(buf.data);
+		termPQExpBuffer(&buf);
+
 		headers_status[STATUS_PID].cur_length = strlen(repmgrd_info[i]->pid_text);
 		headers_status[STATUS_RUNNING].cur_length = strlen(repmgrd_info[i]->repmgrd_running);
 		headers_status[STATUS_PG].cur_length = strlen(repmgrd_info[i]->pg_running_text);

+		headers_status[STATUS_UPSTREAM_LAST_SEEN].cur_length = strlen(repmgrd_info[i]->upstream_last_seen_text);
+
 		for (j = 0; j < STATUS_HEADER_COUNT; j++)
 		{
 			if (headers_status[j].cur_length > headers_status[j].max_length)
@@ -193,38 +259,61 @@ do_daemon_status(void)
 	{
 		if (runtime_options.output_mode == OM_CSV)
 		{
-			printf("%i,%s,%s,%i,%i,%i,%i\n",
+			int running = repmgrd_info[i]->running ? 1 : 0;
+			int paused = repmgrd_info[i]->paused ? 1 : 0;
+
+			/* If PostgreSQL is not running, repmgrd status is unknown */
+			if (repmgrd_info[i]->pg_running == false)
+			{
+				running = -1;
+				paused = -1;
+			}
+
+			printf("%i,%s,%s,%i,%i,%i,%i,%i,%i\n",
 				   cell->node_info->node_id,
 				   cell->node_info->node_name,
 				   get_node_type_string(cell->node_info->type),
 				   repmgrd_info[i]->pg_running ? 1 : 0,
-				   repmgrd_info[i]->running ? 1 : 0,
+				   running,
 				   repmgrd_info[i]->pid,
-				   repmgrd_info[i]->paused ? 1 : 0);
+				   paused,
+				   cell->node_info->priority,
+				   repmgrd_info[i]->pid == UNKNOWN_PID
+				     ? -1
+				     : repmgrd_info[i]->upstream_last_seen);
 		}
 		else
 		{
 			printf(" %-*i ",  headers_status[STATUS_ID].max_length, cell->node_info->node_id);
 			printf("| %-*s ", headers_status[STATUS_NAME].max_length, cell->node_info->node_name);
 			printf("| %-*s ", headers_status[STATUS_ROLE].max_length, get_node_type_string(cell->node_info->type));
+			printf("| %-*i ", headers_status[STATUS_PRIORITY].max_length, cell->node_info->priority);

 			printf("| %-*s ", headers_status[STATUS_PG].max_length, repmgrd_info[i]->pg_running_text);
 			printf("| %-*s ", headers_status[STATUS_RUNNING].max_length, repmgrd_info[i]->repmgrd_running);
 			printf("| %-*s ", headers_status[STATUS_PID].max_length, repmgrd_info[i]->pid_text);

 			if (repmgrd_info[i]->pid == UNKNOWN_PID)
-				printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, "n/a");
+			{
+				printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, _("n/a"));
+				printf("| %-*s ", headers_status[STATUS_UPSTREAM_LAST_SEEN].max_length, _("n/a"));
+
+			}
 			else
-				printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, repmgrd_info[i]->paused ? "yes" : "no");
+			{
+				printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, repmgrd_info[i]->paused ? _("yes") : _("no"));
+
+				printf("| %-*s ", headers_status[STATUS_UPSTREAM_LAST_SEEN].max_length, repmgrd_info[i]->upstream_last_seen_text);
+			}

 			printf("\n");
 		}

-		free(repmgrd_info[i]);
+		pfree(repmgrd_info[i]);
 		i++;
 	}

-	free(repmgrd_info);
+	pfree(repmgrd_info);

 	/* emit any warnings */

@@ -238,7 +327,7 @@ do_daemon_status(void)
 			printf(_("  - %s\n"), cell->string);
 		}

-		if (runtime_options.verbose == false)
+		if (runtime_options.verbose == false && connection_error_found == true)
 		{
 			log_hint(_("execute with --verbose option to see connection error messages"));
 		}
@@ -264,18 +353,9 @@ _do_repmgr_pause(bool pause)
 	PGconn	   *conn = NULL;
 	NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
 	NodeInfoListCell *cell = NULL;
-	RepmgrdInfo **repmgrd_info;
 	int i;
 	int error_nodes = 0;

-	repmgrd_info = (RepmgrdInfo **) pg_malloc0(sizeof(RepmgrdInfo *) * nodes.node_count);
-
-	if (repmgrd_info == NULL)
-	{
-		log_error(_("unable to allocate memory"));
-		exit(ERR_OUT_OF_MEMORY);
-	}
-
 	/* Connect to local database to obtain cluster connection data */
 	log_verbose(LOG_INFO, _("connecting to database"));

@@ -290,9 +370,6 @@ _do_repmgr_pause(bool pause)

 	for (cell = nodes.head; cell; cell = cell->next)
 	{
-		repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
-		repmgrd_info[i]->node_id = cell->node_info->node_id;
-
 		log_verbose(LOG_DEBUG, "pausing node %i (%s)",
 					cell->node_info->node_id,
 					cell->node_info->node_name);
@@ -383,6 +460,285 @@ fetch_node_records(PGconn *conn, NodeInfoList *node_list)
 }


+/* DAEMON START: run "repmgrd_service_start_command", then optionally poll until repmgrd reports as running */
+void
+do_daemon_start(void)
+{
+	PGconn	   *conn = NULL;
+	PQExpBufferData repmgrd_command;
+	PQExpBufferData output_buf;
+	bool		success;
+
+	if (config_file_options.repmgrd_service_start_command[0] == '\0')
+	{
+		log_error(_("\"repmgrd_service_start_command\" is not set"));
+		log_hint(_("set \"repmgrd_service_start_command\" in \"repmgr.conf\""));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	log_verbose(LOG_INFO, _("connecting to local node"));
+
+	conn = establish_db_connection(config_file_options.conninfo, false);
+
+	if (PQstatus(conn) != CONNECTION_OK)
+	{
+		/* TODO: if PostgreSQL is not available, have repmgrd loop and retry connection */
+		log_error(_("unable to connect to local node"));
+		log_detail(_("PostgreSQL must be running before \"repmgrd\" can be started"));
+		exit(ERR_DB_CONN);
+	}
+
+	/*
+	 * a local connection is available at this point: check that repmgr.so is
+	 * installed, and refuse to continue if repmgrd is already running
+	 */
+	check_shared_library(conn);
+
+	if (is_repmgrd_running(conn) == true)
+	{
+		pid_t		pid = UNKNOWN_PID;
+
+		log_error(_("repmgrd appears to be running already"));
+
+		pid = repmgrd_get_pid(conn);
+
+		if (pid != UNKNOWN_PID)
+			log_detail(_("repmgrd PID is %i"), pid);
+		else
+			log_warning(_("unable to determine repmgrd PID"));
+
+		PQfinish(conn);
+		exit(ERR_REPMGRD_SERVICE);
+	}
+
+	PQfinish(conn);
+
+	initPQExpBuffer(&repmgrd_command);
+	appendPQExpBufferStr(&repmgrd_command,
+						 config_file_options.repmgrd_service_start_command);
+
+	if (runtime_options.dry_run == true)
+	{
+		log_info(_("prerequisites for starting repmgrd met"));
+		log_detail("following command would be executed:\n  %s", repmgrd_command.data);
+		exit(SUCCESS);
+	}
+
+	log_notice(_("executing: \"%s\""), repmgrd_command.data);
+
+	initPQExpBuffer(&output_buf);
+
+	success = local_command(repmgrd_command.data, &output_buf);
+	termPQExpBuffer(&repmgrd_command);
+
+	if (success == false)
+	{
+		log_error(_("unable to start repmgrd"));
+		if (output_buf.data[0] != '\0')
+			log_detail("%s", output_buf.data);
+		termPQExpBuffer(&output_buf);
+		exit(ERR_REPMGRD_SERVICE);
+	}
+
+	termPQExpBuffer(&output_buf);
+
+	if (runtime_options.no_wait == true || runtime_options.wait == 0)
+	{
+		log_hint(REPMGR_DAEMON_STATUS_START_HINT);
+	}
+	else
+	{
+		int i = 0;
+		int timeout = REPMGR_DAEMON_STOP_START_WAIT;	/* default unless -w/--wait was provided */
+
+		if (runtime_options.wait_provided)
+			timeout = runtime_options.wait;
+
+		conn = establish_db_connection(config_file_options.conninfo, false);
+
+		if (PQstatus(conn) != CONNECTION_OK)
+		{
+			log_notice(_("unable to connect to local node"));
+			log_hint(REPMGR_DAEMON_STATUS_START_HINT);
+			exit(ERR_DB_CONN);
+		}
+
+		for (;;)
+		{
+			if (is_repmgrd_running(conn) == true)
+			{
+				log_notice(_("repmgrd was successfully started"));
+				PQfinish(conn);
+				break;
+			}
+
+			if (i == timeout)
+			{
+				PQfinish(conn);
+				log_error(_("repmgrd does not appear to have started after %i seconds"),
+						  timeout);
+				log_hint(REPMGR_DAEMON_STATUS_START_HINT);
+				exit(ERR_REPMGRD_SERVICE);
+			}
+
+			/* report the effective timeout, which is not necessarily runtime_options.wait */
+			log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd is running",
+					  i, timeout);
+			sleep(1);
+			i++;
+		}
+	}
+}
+
+
+/* DAEMON STOP: run "repmgrd_service_stop_command", then optionally poll the repmgrd PID until the process has exited */
+void do_daemon_stop(void)
+{
+	PGconn	   *conn = NULL;
+	PQExpBufferData repmgrd_command;
+	PQExpBufferData output_buf;
+	bool		success;
+	bool		have_db_connection = true;
+	pid_t		pid = UNKNOWN_PID;
+
+	if (config_file_options.repmgrd_service_stop_command[0] == '\0')
+	{
+		log_error(_("\"repmgrd_service_stop_command\" is not set"));
+		log_hint(_("set \"repmgrd_service_stop_command\" in \"repmgr.conf\""));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/*
+	 * if local connection available, check if repmgr.so is installed, and
+	 * whether repmgrd is running
+	 */
+	log_verbose(LOG_INFO, _("connecting to local node"));
+
+	conn = establish_db_connection(config_file_options.conninfo, false);
+
+	if (PQstatus(conn) != CONNECTION_OK)
+	{
+		/*
+		 * a PostgreSQL connection is not required to stop repmgrd, so carry on
+		 */
+		log_warning(_("unable to connect to local node"));
+		have_db_connection = false;
+	}
+	else
+	{
+		check_shared_library(conn);
+
+		if (is_repmgrd_running(conn) == false)
+		{
+			log_error(_("repmgrd appears to be stopped already"));
+			PQfinish(conn);
+			exit(ERR_REPMGRD_SERVICE);
+		}
+
+		/* Attempt to fetch the PID, in case we need it later */
+		pid = repmgrd_get_pid(conn);
+		log_debug("retrieved pid is %i", pid);
+	}
+
+	PQfinish(conn);
+
+	initPQExpBuffer(&repmgrd_command);
+
+	appendPQExpBufferStr(&repmgrd_command,
+						 config_file_options.repmgrd_service_stop_command);
+
+	if (runtime_options.dry_run == true)
+	{
+		log_info(_("prerequisites for stopping repmgrd met"));
+		log_detail("following command would be executed:\n  %s", repmgrd_command.data);
+		exit(SUCCESS);
+	}
+
+	log_notice(_("executing: \"%s\""), repmgrd_command.data);
+
+	initPQExpBuffer(&output_buf);
+
+	success = local_command(repmgrd_command.data, &output_buf);
+	termPQExpBuffer(&repmgrd_command);
+
+	if (success == false)
+	{
+		log_error(_("unable to stop repmgrd"));
+		if (output_buf.data[0] != '\0')
+			log_detail("%s", output_buf.data);
+		termPQExpBuffer(&output_buf);
+		exit(ERR_REPMGRD_SERVICE);
+	}
+
+	termPQExpBuffer(&output_buf);
+
+	if (runtime_options.no_wait == true || runtime_options.wait == 0)
+	{
+		if (have_db_connection == true)
+			log_hint(REPMGR_DAEMON_STATUS_STOP_HINT);
+	}
+	else
+	{
+		int i = 0;
+		int timeout = REPMGR_DAEMON_STOP_START_WAIT;	/* default unless -w/--wait was provided */
+		/*
+		 * waiting is done by polling the PID, so an unknown PID means we cannot wait
+		 */
+		if (pid == UNKNOWN_PID)
+		{
+			/*
+			 * XXX attempt to get pidfile from config
+			 *   and get contents
+			 *   ( see check_and_create_pid_file() )
+			 * if PID still unknown, exit here
+			 */
+			log_warning(_("unable to determine repmgrd PID"));
+
+			if (have_db_connection == true)
+				log_hint(REPMGR_DAEMON_STATUS_STOP_HINT);
+
+			exit(ERR_REPMGRD_SERVICE);
+		}
+
+		if (runtime_options.wait_provided)
+			timeout = runtime_options.wait;
+
+		for (;;)
+		{
+			if (kill(pid, 0) == -1)	/* signal 0: only probes for the process, nothing is delivered */
+			{
+				if (errno == ESRCH)
+				{
+					log_notice(_("repmgrd was successfully stopped"));
+					exit(SUCCESS);
+				}
+				else
+				{
+					log_error(_("unable to determine status of process with PID %i"), pid);
+					log_detail("%s", strerror(errno));
+					exit(ERR_REPMGRD_SERVICE);
+				}
+			}
+
+			if (i == timeout)
+			{
+				log_error(_("repmgrd does not appear to have stopped after %i seconds"),
+						  timeout);
+
+				if (have_db_connection == true)
+					log_hint(REPMGR_DAEMON_STATUS_STOP_HINT);
+
+				exit(ERR_REPMGRD_SERVICE);
+			}
+
+			log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd with PID %i is running",
+					  i, timeout, pid);
+			sleep(1);
+			i++;
+		}
+	}
+}
+
+
 void do_daemon_help(void)
 {
 	print_help_header();
@@ -391,6 +747,8 @@ void do_daemon_help(void)
 	printf(_("    %s [OPTIONS] daemon status\n"),  progname());
 	printf(_("    %s [OPTIONS] daemon pause\n"),   progname());
 	printf(_("    %s [OPTIONS] daemon unpause\n"), progname());
+	printf(_("    %s [OPTIONS] daemon start\n"),   progname());
+	printf(_("    %s [OPTIONS] daemon stop\n"),    progname());
 	puts("");

 	printf(_("DAEMON STATUS\n"));
@@ -401,6 +759,24 @@ void do_daemon_help(void)
 	printf(_("    --verbose                 show text of database connection error messages\n"));
 	puts("");

+	printf(_("DAEMON START\n"));
+	puts("");
+	printf(_("  \"daemon start\" attempts to start repmgrd\n"));
+	puts("");
+	printf(_("    --dry-run               check prerequisites but don't start repmgrd\n"));
+	printf(_("    -w/--wait               wait for repmgrd to start (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT);
+	printf(_("    --no-wait               don't wait for repmgrd to start\n"));
+	puts("");
+
+	printf(_("DAEMON STOP\n"));
+	puts("");
+	printf(_("  \"daemon stop\" attempts to stop repmgrd\n"));
+	puts("");
+	printf(_("    --dry-run               check prerequisites but don't stop repmgrd\n"));
+	printf(_("    -w/--wait               wait for repmgrd to stop (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT);
+	printf(_("    --no-wait               don't wait for repmgrd to stop\n"));
+	puts("");
+
 	printf(_("DAEMON PAUSE\n"));
 	puts("");
 	printf(_("  \"daemon pause\" instructs repmgrd on each node to pause failover detection\n"));
@@ -408,13 +784,12 @@ void do_daemon_help(void)
 	printf(_("    --dry-run               check if nodes are reachable but don't pause repmgrd\n"));
 	puts("");

-	printf(_("DAEMON PAUSE\n"));
+	printf(_("DAEMON UNPAUSE\n"));
 	puts("");
 	printf(_("  \"daemon unpause\"  instructs repmgrd on each node to resume failover detection\n"));
 	puts("");
 	printf(_("    --dry-run               check if nodes are reachable but don't unpause repmgrd\n"));
 	puts("");

-
 	puts("");
 }
--- a/repmgr-action-daemon.h
+++ b/repmgr-action-daemon.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-action-daemon.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -23,6 +23,8 @@
 extern void do_daemon_status(void);
 extern void do_daemon_pause(void);
 extern void do_daemon_unpause(void);
+extern void do_daemon_start(void);
+extern void do_daemon_stop(void);

 extern void do_daemon_help(void);
 #endif
--- a/repmgr-action-node.c
+++ b/repmgr-action-node.c
@@ -3,7 +3,7 @@
 *
 * Implements actions available for any kind of node
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -48,6 +48,7 @@ static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode,
 static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
 static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
 static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
+static CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);

 /*
 * NODE STATUS
@@ -66,7 +67,6 @@ do_node_status(void)
 	PGconn	   *conn = NULL;

 	t_node_info node_info = T_NODE_INFO_INITIALIZER;
-	char		server_version[MAXLEN];
 	char		cluster_size[MAXLEN];
 	PQExpBufferData output;

@@ -76,21 +76,29 @@ do_node_status(void)

 	ItemList	warnings = {NULL, NULL};
 	RecoveryType recovery_type = RECTYPE_UNKNOWN;
-	ReplInfo	replication_info = T_REPLINFO_INTIALIZER;
+	ReplInfo	replication_info;
 	t_recovery_conf recovery_conf = T_RECOVERY_CONF_INITIALIZER;

 	char		data_dir[MAXPGPATH] = "";
+	int			server_version_num = UNKNOWN_SERVER_VERSION_NUM;
+	char		server_version_str[MAXVERSIONSTR] = "";

+	/*
+	 * A database connection is *not* required for this check
+	 */
 	if (runtime_options.is_shutdown_cleanly == true)
 	{
 		return _do_node_status_is_shutdown_cleanly();
 	}

+	init_replication_info(&replication_info);
+
+
 	/* config file required, so we should have "conninfo" and "data_directory" */
 	conn = establish_db_connection(config_file_options.conninfo, true);
 	strncpy(data_dir, config_file_options.data_directory, MAXPGPATH);

-	server_version_num = get_server_version(conn, NULL);
+	server_version_num = get_server_version(conn, server_version_str);

 	/* check node exists  */

@@ -101,18 +109,16 @@ do_node_status(void)
 		exit(ERR_BAD_CONFIG);
 	}

-	(void) get_server_version(conn, server_version);
-
 	if (get_cluster_size(conn, cluster_size) == false)
 		strncpy(cluster_size, _("unknown"), MAXLEN);

 	recovery_type = get_recovery_type(conn);

-	get_node_replication_stats(conn, server_version_num, &node_info);
+	get_node_replication_stats(conn, &node_info);

 	key_value_list_set(&node_status,
 					   "PostgreSQL version",
-					   server_version);
+					   server_version_str);

 	key_value_list_set(&node_status,
 					   "Total data size",
@@ -219,19 +225,27 @@ do_node_status(void)

 		ready_files = get_ready_archive_files(conn, data_dir);

-		if (runtime_options.output_mode == OM_CSV)
+		if (ready_files == ARCHIVE_STATUS_DIR_ERROR)
 		{
-			key_value_list_set_format(&node_status,
-									  "WALs pending archiving",
-									  "%i",
-									  ready_files);
+			item_list_append_format(&warnings,
+									"- unable to check archive_status directory\n");
 		}
 		else
 		{
-			key_value_list_set_format(&node_status,
-									  "WALs pending archiving",
-									  "%i pending files",
-									  ready_files);
+			if (runtime_options.output_mode == OM_CSV)
+			{
+				key_value_list_set_format(&node_status,
+										  "WALs pending archiving",
+										  "%i",
+										  ready_files);
+			}
+			else
+			{
+				key_value_list_set_format(&node_status,
+										  "WALs pending archiving",
+										  "%i pending files",
+										  ready_files);
+			}
 		}

 		if (guc_set(conn, "archive_mode", "=", "off"))
@@ -354,7 +368,7 @@ do_node_status(void)
 		initPQExpBuffer(&slotinfo);

 		appendPQExpBuffer(&slotinfo,
-						  "%i (of maximal %i; %i missing)",
+						  "%i physical (of maximal %i; %i missing)",
 						  node_info.active_replication_slots + node_info.inactive_replication_slots,
 						  node_info.max_replication_slots,
 						  missing_slots.node_count);
@@ -371,13 +385,13 @@ do_node_status(void)
 							  node_info.inactive_replication_slots);

 			item_list_append_format(&warnings,
-									_("- node has %i inactive replication slots"),
+									_("- node has %i inactive physical replication slots"),
 									node_info.inactive_replication_slots);

 			for (cell = inactive_replication_slots.head; cell; cell = cell->next)
 			{
 				item_list_append_format(&warnings,
-										"  - %s (%s)", cell->key, cell->value);
+										"  - %s", cell->key);
 			}

 			key_value_list_free(&inactive_replication_slots);
@@ -399,7 +413,7 @@ do_node_status(void)
 								  node_info.upstream_node_name,
 								  node_info.upstream_node_id);

-		get_replication_info(conn, &replication_info);
+		get_replication_info(conn, node_info.type, &replication_info);

 		key_value_list_set_format(&node_status,
 								  "Replication lag",
@@ -654,27 +668,17 @@ _do_node_status_is_shutdown_cleanly(void)
 		node_status = NODE_STATUS_DOWN;
 	}

-	log_verbose(LOG_DEBUG, "node status determined as: %s", print_node_status(node_status));
+	log_verbose(LOG_DEBUG, "node status determined as: %s",
+				print_node_status(node_status));

-	switch (node_status)
+	appendPQExpBuffer(&output,
+					  "%s", print_node_status(node_status));
+
+	if (node_status == NODE_STATUS_DOWN)
 	{
-		case NODE_STATUS_UP:
-			appendPQExpBufferStr(&output, "RUNNING");
-			break;
-		case NODE_STATUS_SHUTTING_DOWN:
-			appendPQExpBufferStr(&output, "SHUTTING_DOWN");
-			break;
-		case NODE_STATUS_DOWN:
-			appendPQExpBuffer(&output,
-							  "SHUTDOWN --last-checkpoint-lsn=%X/%X",
-							  format_lsn(checkPoint));
-			break;
-		case NODE_STATUS_UNCLEAN_SHUTDOWN:
-			appendPQExpBufferStr(&output, "UNCLEAN_SHUTDOWN");
-			break;
-		case NODE_STATUS_UNKNOWN:
-			appendPQExpBufferStr(&output, "UNKNOWN");
-			break;
+		appendPQExpBuffer(&output,
+						  " --last-checkpoint-lsn=%X/%X",
+						  format_lsn(checkPoint));
 	}

 	printf("%s\n", output.data);
@@ -725,10 +729,8 @@ do_node_check(void)
 		exit(ERR_BAD_CONFIG);
 	}

-	server_version_num = get_server_version(conn, NULL);
-
 	/* add replication statistics to node record */
-	get_node_replication_stats(conn, server_version_num, &node_info);
+	get_node_replication_stats(conn, &node_info);

 	/*
 	 * handle specific checks ======================
@@ -792,6 +794,16 @@ do_node_check(void)
 		exit(return_code);
 	}

+	if (runtime_options.data_directory_config == true)
+	{
+		return_code = do_node_check_data_directory(conn,
+												   runtime_options.output_mode,
+												   &node_info,
+												   NULL);
+		PQfinish(conn);
+		exit(return_code);
+	}
+

 	if (runtime_options.output_mode == OM_NAGIOS)
 	{
@@ -824,6 +836,9 @@ do_node_check(void)
 	if (do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
 		issue_detected = true;

+	if (do_node_check_data_directory(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
+		issue_detected = true;
+
 	if (runtime_options.output_mode == OM_CSV)
 	{
 		appendPQExpBuffer(&output,
@@ -1393,7 +1408,7 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_i
 					break;
 			}
 		}
-		else if (lag_seconds < 0)
+		else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
 		{
 			status = CHECK_STATUS_UNKNOWN;

@@ -1445,11 +1460,9 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_i
 	switch (mode)
 	{
 		case OM_OPTFORMAT:
-			{
-				printf("--status=%s %s\n",
-					   output_check_status(status),
-					   details.data);
-			}
+			printf("--status=%s %s\n",
+				   output_check_status(status),
+				   details.data);
 			break;
 		case OM_NAGIOS:
 			printf("REPMGR_REPLICATION_LAG %s: %s\n",
@@ -1618,7 +1631,7 @@ do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, Check

 	initPQExpBuffer(&details);

-	if (server_version_num < 90400)
+	if (PQserverVersion(conn) < 90400)
 	{
 		appendPQExpBufferStr(&details,
 							 _("replication slots not available for this PostgreSQL version"));
@@ -1626,12 +1639,12 @@ do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, Check
 	else if (node_info->total_replication_slots == 0)
 	{
 		appendPQExpBufferStr(&details,
-							 _("node has no replication slots"));
+							 _("node has no physical replication slots"));
 	}
 	else if (node_info->inactive_replication_slots == 0)
 	{
 		appendPQExpBuffer(&details,
-						  _("%i of %i replication slots are active"),
+						  _("%i of %i physical replication slots are active"),
 						  node_info->total_replication_slots,
 						  node_info->total_replication_slots);
 	}
@@ -1640,7 +1653,7 @@ do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, Check
 		status = CHECK_STATUS_CRITICAL;

 		appendPQExpBuffer(&details,
-						  _("%i of %i replication slots are inactive"),
+						  _("%i of %i physical replication slots are inactive"),
 						  node_info->inactive_replication_slots,
 						  node_info->total_replication_slots);
 	}
@@ -1694,7 +1707,7 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf

 	initPQExpBuffer(&details);

-	if (server_version_num < 90400)
+	if (PQserverVersion(conn) < 90400)
 	{
 		appendPQExpBufferStr(&details,
 							 _("replication slots not available for this PostgreSQL version"));
@@ -1708,7 +1721,7 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf
 		if (missing_slots.node_count == 0)
 		{
 			appendPQExpBufferStr(&details,
-								 _("node has no missing replication slots"));
+								 _("node has no missing physical replication slots"));
 		}
 		else
 		{
@@ -1718,7 +1731,7 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf
 			status = CHECK_STATUS_CRITICAL;

 			appendPQExpBuffer(&details,
-							  _("%i replication slots are missing"),
+							  _("%i physical replication slots are missing"),
 							  missing_slots.node_count);

 			if (missing_slots.node_count)
@@ -1779,7 +1792,7 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf
 			if (list_output != NULL)
 			{
 				check_status_list_set(list_output,
-									  "Missing replication slots",
+									  "Missing physical replication slots",
 									  status,
 									  details.data);
 			}
@@ -1800,6 +1813,135 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf
 }


+/*
+ * Check that the "data_directory" configured in repmgr.conf is usable.
+ * With a superuser connection it is compared against the server's actual
+ * setting; otherwise it is only checked to look like a PostgreSQL data
+ * directory. The result is printed, or added to "list_output" if provided.
+ */
+CheckStatus
+do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
+{
+	CheckStatus status = CHECK_STATUS_OK;
+	char actual_data_directory[MAXPGPATH] = "";
+	PQExpBufferData details;
+
+	if (mode == OM_CSV && list_output == NULL)
+	{
+		log_error(_("--csv output not provided with --data-directory-config option"));
+		PQfinish(conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	initPQExpBuffer(&details);
+
+	/* reading the server's actual "data_directory" requires a superuser connection */
+	if (is_superuser_connection(conn, NULL) == true)
+	{
+		/* we expect to have a database connection */
+		if (get_pg_setting(conn, "data_directory", actual_data_directory) == false)
+		{
+			appendPQExpBufferStr(&details,
+								 _("unable to determine current \"data_directory\""));
+			status = CHECK_STATUS_UNKNOWN;
+		}
+		/* compare only if the actual value was retrieved, so UNKNOWN is not overwritten */
+		else if (strncmp(actual_data_directory, config_file_options.data_directory, MAXPGPATH) != 0)
+		{
+			if (mode != OM_NAGIOS)
+			{
+				appendPQExpBuffer(&details,
+								  _("configured \"data_directory\" is \"%s\"; "),
+								  config_file_options.data_directory);
+			}
+
+			appendPQExpBuffer(&details,
+							  "actual data directory is \"%s\"",
+							  actual_data_directory);
+			status = CHECK_STATUS_CRITICAL;
+		}
+		else
+		{
+			appendPQExpBuffer(&details,
+							  _("configured \"data_directory\" is \"%s\""),
+							  config_file_options.data_directory);
+		}
+	}
+	/*
+	 * If no superuser connection available, sanity-check that the configuration directory looks
+	 * like a PostgreSQL directory and hope it's the right one.
+	 */
+	else
+	{
+		if (mode == OM_TEXT)
+		{
+			log_info(_("connection is not a superuser connection, falling back to simple check"));
+			/* XXX add -S/--superuser option */
+			if (PQserverVersion(conn) >= 100000)
+			{
+				log_hint(_("add the \"%s\" user to group \"pg_read_all_settings\""),
+						   PQuser(conn));
+			}
+		}
+
+		if (is_pg_dir(config_file_options.data_directory) == false)
+		{
+			if (mode == OM_NAGIOS)
+			{
+				appendPQExpBufferStr(&details,
+								  _("configured \"data_directory\" is not a PostgreSQL data directory"));
+			}
+			else
+			{
+				appendPQExpBuffer(&details,
+								  _("configured \"data_directory\" \"%s\" is not a PostgreSQL data directory"),
+								  config_file_options.data_directory);
+			}
+			status = CHECK_STATUS_CRITICAL;
+		}
+	}
+
+	switch (mode)
+	{
+		case OM_OPTFORMAT:
+			printf("--configured-data-directory=%s\n",
+				   output_check_status(status));
+			break;
+		case OM_NAGIOS:
+			printf("REPMGR_DATA_DIRECTORY %s: %s",
+				   output_check_status(status),
+				   config_file_options.data_directory);
+			if (status == CHECK_STATUS_CRITICAL)
+			{
+				printf(" | %s", details.data);
+			}
+			puts("");
+			break;
+		case OM_CSV:
+		case OM_TEXT:
+			if (list_output != NULL)
+			{
+				check_status_list_set(list_output,
+									  "Configured data directory",
+									  status,
+									  details.data);
+			}
+			else
+			{
+				printf("%s (%s)\n",
+					   output_check_status(status),
+					   details.data);
+			}
+			break;
+		default:
+			break;
+	}
+
+	termPQExpBuffer(&details);
+
+	return status;
+}
+

 void
 do_node_service(void)
@@ -1993,7 +2135,9 @@ void
 do_node_rejoin(void)
 {
 	PGconn	   *upstream_conn = NULL;
-	RecoveryType upstream_recovery_type = RECTYPE_UNKNOWN;
+	RecoveryType primary_recovery_type = RECTYPE_UNKNOWN;
+	PGconn	   *primary_conn = NULL;
+
 	DBState		db_state;
 	PGPing		status;
 	bool		is_shutdown = true;
@@ -2005,11 +2149,9 @@ do_node_rejoin(void)
 	t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;

 	bool		success = true;
-	int			server_version_num = UNKNOWN_SERVER_VERSION_NUM;
 	int			follow_error_code = SUCCESS;

 	/* check node is not actually running */
-
 	status = PQping(config_file_options.conninfo);

 	switch (status)
@@ -2035,7 +2177,7 @@ do_node_rejoin(void)
 		log_error(_("database is still running in state \"%s\""),
 				  describe_db_state(db_state));
 		log_hint(_("\"repmgr node rejoin\" cannot be executed on a running node"));
-		exit(ERR_BAD_CONFIG);
+		exit(ERR_REJOIN_FAIL);
 	}

 	/* check if cleanly shut down */
@@ -2054,18 +2196,15 @@ do_node_rejoin(void)
 				log_detail(_("pg_rewind will not be able to run"));
 			}
 			log_hint(_("database should be restarted then shut down cleanly after crash recovery completes"));
-			exit(ERR_BAD_CONFIG);
+			exit(ERR_REJOIN_FAIL);
 		}
 	}

-
 	/* check provided upstream connection */
 	upstream_conn = establish_db_connection_by_params(&source_conninfo, true);

 	/* sanity checks for 9.3 */
-	server_version_num = get_server_version(upstream_conn, NULL);
-
-	if (server_version_num < 90400)
+	if (PQserverVersion(upstream_conn) < 90400)
 		check_93_config();

 	if (get_primary_node_record(upstream_conn, &primary_node_record) == false)
@@ -2076,40 +2215,85 @@ do_node_rejoin(void)
 		exit(ERR_BAD_CONFIG);
 	}

-	PQfinish(upstream_conn);
-
 	/* connect to registered primary and check it's not in recovery */
-	upstream_conn = establish_db_connection(primary_node_record.conninfo, false);
+	primary_conn = establish_db_connection(primary_node_record.conninfo, false);

-	if (PQstatus(upstream_conn) != CONNECTION_OK)
+	if (PQstatus(primary_conn) != CONNECTION_OK)
 	{
-		log_error(_("unable to connect to current primary \"%s\" (node ID: %i)"),
+		RecoveryType upstream_recovery_type = get_recovery_type(upstream_conn);
+
+		log_error(_("unable to connect to current registered primary \"%s\" (node ID: %i)"),
 				  primary_node_record.node_name,
 				  primary_node_record.node_id);
-		log_detail(_("primay node conninfo is: \"%s\""),
+		log_detail(_("registered primary node conninfo is: \"%s\""),
 				   primary_node_record.conninfo);
+		/*
+		 * Catch case where provided upstream is not in recovery, but is also
+		 * not registered as primary
+		 */
+
+		if (upstream_recovery_type == RECTYPE_PRIMARY)
+		{
+			log_warning(_("provided upstream connection string is for a server which is not in recovery, but not registered as primary"));
+			log_hint(_("fix repmgr metadata configuration before continuing"));
+		}
+
+		PQfinish(upstream_conn);
 		exit(ERR_BAD_CONFIG);
 	}

-	upstream_recovery_type = get_recovery_type(upstream_conn);
+	PQfinish(upstream_conn);

-	if (upstream_recovery_type != RECTYPE_PRIMARY)
+	primary_recovery_type = get_recovery_type(primary_conn);
+
+	if (primary_recovery_type != RECTYPE_PRIMARY)
 	{
-		log_error(_("primary server is registered node \"%s\" (ID: %i), but server is not a primary"),
+		log_error(_("primary server is registered as node \"%s\" (ID: %i), but server is not a primary"),
 				  primary_node_record.node_name,
 				  primary_node_record.node_id);
 		/* TODO: hint about checking cluster */
-		PQfinish(upstream_conn);
+		PQfinish(primary_conn);

 		exit(ERR_BAD_CONFIG);
 	}

+	/*
+	 * sanity-check that it will actually be possible to stream from the new upstream
+	 */
+	{
+		bool can_follow;
+		TimeLineID tli = get_min_recovery_end_timeline(config_file_options.data_directory);
+		XLogRecPtr min_recovery_location = get_min_recovery_location(config_file_options.data_directory);
+
+		/*
+		 * It's possible this was a former primary, so the minRecoveryPoint*
+		 * fields may be empty.
+		 */
+
+		if (min_recovery_location == InvalidXLogRecPtr)
+			min_recovery_location = get_latest_checkpoint_location(config_file_options.data_directory);
+		if (tli == 0)
+			tli = get_timeline(config_file_options.data_directory);
+
+		can_follow = check_node_can_attach(tli,
+										   min_recovery_location,
+										   primary_conn,
+										   &primary_node_record,
+										   true);
+
+		if (can_follow == false)
+		{
+			PQfinish(primary_conn);
+			exit(ERR_REJOIN_FAIL);
+		}
+	}
+
+
 	/*
 	 * --force-rewind specified - check prerequisites, and attempt to execute
  	 * (if --dry-run provided, just output the command which would be executed)
 	 */

-
 	if (runtime_options.force_rewind_used == true)
 	{
 		PQExpBufferData msg;
@@ -2122,12 +2306,12 @@ do_node_rejoin(void)

 		initPQExpBuffer(&msg);

-		if (can_use_pg_rewind(upstream_conn, config_file_options.data_directory, &msg) == false)
+		if (can_use_pg_rewind(primary_conn, config_file_options.data_directory, &msg) == false)
 		{
 			log_error(_("--force-rewind specified but pg_rewind cannot be used"));
 			log_detail("%s", msg.data);
 			termPQExpBuffer(&msg);
-			PQfinish(upstream_conn);
+			PQfinish(primary_conn);

 			exit(ERR_BAD_CONFIG);
 		}
@@ -2186,8 +2370,8 @@ do_node_rejoin(void)
 		else
 		{
 			log_notice(_("executing pg_rewind"));
-			log_debug("pg_rewind command is:\n  %s",
-					  command.data);
+			log_detail(_("pg_rewind command is \"%s\""),
+					   command.data);

 			initPQExpBuffer(&command_output);

@@ -2203,7 +2387,7 @@ do_node_rejoin(void)

 				termPQExpBuffer(&command_output);

-				exit(ERR_BAD_CONFIG);
+				exit(ERR_REJOIN_FAIL);
 			}

 			termPQExpBuffer(&command_output);
@@ -2292,6 +2476,8 @@ do_node_rejoin(void)

 						termPQExpBuffer(&slotdir_ent_path);
 					}
+
+					closedir(slotdir);
 				}
 				termPQExpBuffer(&slotdir_path);
 			}
@@ -2306,26 +2492,34 @@ do_node_rejoin(void)

 	initPQExpBuffer(&follow_output);

-	success = do_standby_follow_internal(upstream_conn,
+	/*
+	 * do_standby_follow_internal() can handle situations where the follow
+	 * target is not the primary, so requires database handles to both
+	 * (even if they point to the same node). For the time being,
+	 * "node rejoin" will only attach a standby to the primary.
+	 */
+	success = do_standby_follow_internal(primary_conn,
+										 primary_conn,
 										 &primary_node_record,
 										 &follow_output,
+										 ERR_REJOIN_FAIL,
 										 &follow_error_code);

 	if (success == false)
 	{
-		log_notice(_("NODE REJOIN failed"));
+		log_error(_("NODE REJOIN failed"));

 		if (strlen(follow_output.data))
 			log_detail("%s", follow_output.data);

-		create_event_notification(upstream_conn,
+		create_event_notification(primary_conn,
 								  &config_file_options,
 								  config_file_options.node_id,
 								  "node_rejoin",
 								  success,
 								  follow_output.data);

-		PQfinish(upstream_conn);
+		PQfinish(primary_conn);

 		termPQExpBuffer(&follow_output);
 		exit(follow_error_code);
@@ -2368,7 +2562,7 @@ do_node_rejoin(void)

 		for (;  i < config_file_options.node_rejoin_timeout; i++)
 		{
-			success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
+			success = is_downstream_node_attached(primary_conn, config_file_options.node_name);

 			if (success == true)
 			{
@@ -2379,9 +2573,13 @@ do_node_rejoin(void)

 			if (i % 5 == 0)
 			{
-				log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"),
+				log_info(_("waiting for node \"%s\" (ID: %i) to connect to new primary; %i of max %i attempts"),
+						 config_file_options.node_name,
 						 config_file_options.node_id,
 						 i + 1, config_file_options.node_rejoin_timeout);
+				log_detail(_("checking for record in node \"%s\"'s \"pg_stat_replication\" table where \"application_name\" is \"%s\""),
+						   primary_node_record.node_name,
+						   config_file_options.node_name);
 			}
 			else
 			{
@@ -2393,7 +2591,7 @@ do_node_rejoin(void)
 			sleep(1);
 		}

-		create_event_notification(upstream_conn,
+		create_event_notification(primary_conn,
 								  &config_file_options,
 								  config_file_options.node_id,
 								  "node_rejoin",
@@ -2403,13 +2601,18 @@ do_node_rejoin(void)
 		if (success == false)
 		{
 			termPQExpBuffer(&follow_output);
-			log_notice(_("NODE REJOIN failed"));
+			log_error(_("NODE REJOIN failed"));
+			log_detail(_("no record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
+					   config_file_options.node_name,
+					   primary_node_record.node_name);
+			log_hint(_("check the PostgreSQL log on the local node"));
 			exit(ERR_REJOIN_FAIL);
 		}
 	}
 	else
 	{
-		success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
+		/* -W/--no-wait provided - check once */
+		success = is_downstream_node_attached(primary_conn, config_file_options.node_name);
 	}

 	/*
@@ -2480,6 +2683,48 @@ do_node_rejoin(void)
 }


+/*
+ * Currently for testing purposes only, not documented;
+ * use at own risk!
+ *
+ * Runs disable_wal_receiver() or enable_wal_receiver() and exits; the exit
+ * code is ERR_BAD_CONFIG if no PID is returned or no option was provided.
+ */
+void
+do_node_control(void)
+{
+	PGconn	   *conn = NULL;
+	pid_t	    wal_receiver_pid = UNKNOWN_PID;
+	conn = establish_db_connection(config_file_options.conninfo, true);
+
+	if (runtime_options.disable_wal_receiver == true)
+	{
+		wal_receiver_pid = disable_wal_receiver(conn);
+		PQfinish(conn);
+
+		if (wal_receiver_pid == UNKNOWN_PID)
+			exit(ERR_BAD_CONFIG);
+
+		exit(SUCCESS);
+	}
+
+	if (runtime_options.enable_wal_receiver == true)
+	{
+		wal_receiver_pid = enable_wal_receiver(conn, true);
+		PQfinish(conn);
+
+		if (wal_receiver_pid == UNKNOWN_PID)
+			exit(ERR_BAD_CONFIG);
+
+		exit(SUCCESS);
+	}
+
+	log_error(_("no option provided"));
+	PQfinish(conn);
+	exit(ERR_BAD_CONFIG);	/* an error was logged, so do not return to the caller as if successful */
+}
+
+
 /*
 * For "internal" use by `node rejoin` on the local node when
 * called by "standby switchover" from the remote node.
@@ -2541,6 +2786,7 @@ _do_node_archive_config(void)

 	arcdir = opendir(archive_dir.data);

+	/* always attempt to open the directory */
 	if (arcdir == NULL)
 	{
 		log_error(_("unable to open archive directory \"%s\""),
@@ -2586,10 +2832,11 @@ _do_node_archive_config(void)

 			termPQExpBuffer(&arcdir_ent_path);
 		}
-
-		closedir(arcdir);
 	}

+	closedir(arcdir);
+
+
 	/*
 	 * extract list of config files from --config-files
 	 */
@@ -2861,11 +3108,12 @@ copy_file(const char *src_file, const char *dest_file)
 	int			a = 0;

 	ptr_old = fopen(src_file, "r");
-	ptr_new = fopen(dest_file, "w");

 	if (ptr_old == NULL)
 		return false;

+	ptr_new = fopen(dest_file, "w");
+
 	if (ptr_new == NULL)
 	{
 		fclose(ptr_old);
@@ -2922,8 +3170,8 @@ do_node_help(void)
 	puts("");
 	printf(_("  Configuration file required, runs on local node only.\n"));
 	puts("");
-	printf(_("    --csv                   emit output as CSV\n"));
-	printf(_("    --nagios                emit output in Nagios format (individual status output only)\n"));
+	printf(_("    --csv                   emit output as CSV (not available for individual check output)\n"));
+	printf(_("    --nagios                emit output in Nagios format (individual check output only)\n"));
 	puts("");
 	printf(_("  Following options check an individual status:\n"));
 	printf(_("    --archive-ready         number of WAL files ready for archiving\n"));
@@ -2932,6 +3180,7 @@ do_node_help(void)
 	printf(_("    --role                  check node has expected role\n"));
 	printf(_("    --slots                 check for inactive replication slots\n"));
 	printf(_("    --missing-slots         check for missing replication slots\n"));
+	printf(_("    --data-directory-config check repmgr's data directory configuration\n"));

 	puts("");

--- a/repmgr-action-node.h
+++ b/repmgr-action-node.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-action-node.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -24,6 +24,7 @@ extern void do_node_check(void);

 extern void do_node_rejoin(void);
 extern void do_node_service(void);
+extern void do_node_control(void);

 extern void do_node_help(void);

--- a/repmgr-action-primary.c
+++ b/repmgr-action-primary.c
@@ -3,7 +3,7 @@
 *
 * Implements primary actions for the repmgr command line utility
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -251,6 +251,7 @@ do_primary_unregister(void)
 	PGconn	   *primary_conn = NULL;
 	PGconn	   *local_conn = NULL;
 	t_node_info local_node_info = T_NODE_INFO_INITIALIZER;
+	t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;

 	t_node_info *target_node_info_ptr = NULL;
 	PGconn	   *target_node_conn = NULL;
@@ -271,8 +272,6 @@ do_primary_unregister(void)

 	if (PQstatus(primary_conn) != CONNECTION_OK)
 	{
-		t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
-
 		log_error(_("unable to connect to primary server"));

 		if (get_primary_node_record(local_conn, &primary_node_info) == true)
@@ -291,10 +290,19 @@ do_primary_unregister(void)
 	/* Local connection no longer required */
 	PQfinish(local_conn);

+	if (get_primary_node_record(primary_conn, &primary_node_info) == false)
+	{
+		log_error(_("unable to retrieve record for primary node"));
+		PQfinish(primary_conn);
+		exit(ERR_BAD_CONFIG);
+	}

 	/* Target node is local node? */
-	if (target_node_info.node_id == UNKNOWN_NODE_ID
-		|| target_node_info.node_id == config_file_options.node_id)
+	if (target_node_info.node_id == UNKNOWN_NODE_ID)
+	{
+		target_node_info_ptr = &primary_node_info;
+	}
+	else if (target_node_info.node_id == config_file_options.node_id)
 	{
 		target_node_info_ptr = &local_node_info;
 	}
@@ -304,6 +312,24 @@ do_primary_unregister(void)
 		target_node_info_ptr = &target_node_info;
 	}

+	/*
+	 * Sanity-check the target node is not a witness
+	 */
+
+	if (target_node_info_ptr->type == WITNESS)
+	{
+		log_error(_("node %s (id: %i) is a witness server, unable to unregister"),
+					  target_node_info_ptr->node_name,
+					  target_node_info_ptr->node_id);
+		/*
+		 * Always emit the hint: this block is only entered for witness nodes.
+		 */
+		log_hint(_("the node can be unregistered with \"repmgr witness unregister\""));
+
+		PQfinish(primary_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
 	/*
 	 * Check for downstream nodes - if any still defined, we won't be able to
 	 * delete the node record due to foreign key constraints.
--- a/repmgr-action-primary.h
+++ b/repmgr-action-primary.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-action-primary.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
--- a/repmgr-action-standby.h
+++ b/repmgr-action-standby.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-action-standby.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -28,7 +28,7 @@ extern void do_standby_switchover(void);

 extern void do_standby_help(void);

-extern bool do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_record, PQExpBufferData *output, int *error_code);
+extern bool do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_node_info *follow_target_node_record, PQExpBufferData *output, int general_error_code, int *error_code);



--- a/repmgr-action-witness.c
+++ b/repmgr-action-witness.c
@@ -3,7 +3,7 @@
 *
 * Implements witness actions for the repmgr command line utility
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -36,10 +36,12 @@ do_witness_register(void)
 {
 	PGconn	   *witness_conn = NULL;
 	PGconn	   *primary_conn = NULL;
+	int			primary_node_id = UNKNOWN_NODE_ID;
 	RecoveryType recovery_type = RECTYPE_UNKNOWN;
 	ExtensionStatus extension_status = REPMGR_UNKNOWN;
 	NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
 	t_node_info node_record = T_NODE_INFO_INITIALIZER;
+	t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
 	RecordStatus record_status = RECORD_NOT_FOUND;
 	bool		record_created = false;

@@ -54,8 +56,7 @@ do_witness_register(void)
 		log_error(_("unable to connect to witness node \"%s\" (ID: %i)"),
 				  config_file_options.node_name,
 				  config_file_options.node_id);
-		log_detail("%s",
-				   PQerrorMessage(witness_conn));
+		log_detail("\n%s", PQerrorMessage(witness_conn));
 		log_hint(_("the witness node must be running before it can be registered"));
 		exit(ERR_BAD_CONFIG);
 	}
@@ -125,6 +126,59 @@ do_witness_register(void)
 		exit(ERR_BAD_CONFIG);
 	}

+
+	/* check we can determine the primary node */
+	primary_node_id = get_primary_node_id(primary_conn);
+
+	if (primary_node_id == UNKNOWN_NODE_ID)
+	{
+		log_error(_("unable to determine the cluster's primary node"));
+		log_hint(_("ensure the primary node connection details are correct and that it is registered"));
+		PQfinish(witness_conn);
+		PQfinish(primary_conn);
+
+		exit(ERR_BAD_CONFIG);
+	}
+
+	record_status = get_node_record(primary_conn, primary_node_id, &primary_node_record);
+	PQfinish(primary_conn);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve record for primary node %i"),
+				  primary_node_id);
+
+		PQfinish(witness_conn);
+
+
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/*
+	 * Reconnect to the primary node's conninfo - this will
+	 * protect against the situation where the witness connection
+	 * details were provided, and we're actually connected to the
+	 * witness server.
+	 */
+
+	primary_conn = establish_db_connection_quiet(primary_node_record.conninfo);
+
+	if (PQstatus(primary_conn) != CONNECTION_OK)
+	{
+		log_error(_("unable to reconnect to the primary node (node %i)"), primary_node_id);
+		log_detail(_("primary node's conninfo is \"%s\""), primary_node_record.conninfo);
+
+		PQfinish(witness_conn);
+
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/*
+	 * TODO: sanity check witness node is not part of main cluster; we could
+	 * add a random application_name to the respective connections,
+	 * and do a simple check of pg_stat_activity
+	 */
+
 	/* check that primary node is not a BDR node */
 	if (is_bdr_db_quiet(primary_conn) == true)
 	{
@@ -137,11 +191,6 @@ do_witness_register(void)
 		exit(ERR_BAD_CONFIG);
 	}

-	/*
-	 * TODO: sanity check witness node is not part of main cluster; we could
-	 * add a random application_name to the respective connections,
-	 * and do a simple check of pg_stat_activity
-	 */

 	/* create repmgr extension, if does not exist */
 	if (runtime_options.dry_run == false &&  !create_repmgr_extension(witness_conn))
@@ -275,7 +324,7 @@ do_witness_register(void)
 	/* these values are mandatory, setting them to anything else has no point */
 	node_record.type = WITNESS;
 	node_record.priority = 0;
-	node_record.upstream_node_id = get_primary_node_id(primary_conn);
+	node_record.upstream_node_id = primary_node_id;

 	if (record_status == RECORD_FOUND)
 	{
@@ -361,7 +410,7 @@ do_witness_unregister(void)
 			log_error(_("unable to connect to node \"%s\" (ID: %i)"),
 					  config_file_options.node_name,
 					  config_file_options.node_id);
-			log_detail("%s", PQerrorMessage(local_conn));
+			log_detail("\n%s", PQerrorMessage(local_conn));
 			exit(ERR_BAD_CONFIG);
 		}

@@ -387,7 +436,7 @@ do_witness_unregister(void)
 	if (PQstatus(primary_conn) != CONNECTION_OK)
 	{
 		log_error(_("unable to connect to primary"));
-		log_detail("%s", PQerrorMessage(primary_conn));
+		log_detail("\n%s", PQerrorMessage(primary_conn));

 		if (local_node_available == true)
 		{
--- a/repmgr-action-witness.h
+++ b/repmgr-action-witness.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-action-witness.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/repmgr-client-global.h
+++ b/repmgr-client-global.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-client-global.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -35,14 +35,16 @@ typedef struct
 	bool		connection_param_provided;
 	bool		host_param_provided;
 	bool		limit_provided;
+	bool		wait_provided;

 	/* general configuration options */
 	char		config_file[MAXPGPATH];
 	bool		dry_run;
 	bool		force;
 	char		pg_bindir[MAXLEN];	/* overrides setting in repmgr.conf */
-	bool		wait;
+	int			wait;
 	bool		no_wait;
+	bool		compact;

 	/* logging options */
 	char		log_level[MAXLEN];	/* overrides setting in repmgr.conf */
@@ -68,7 +70,7 @@ typedef struct

 	/* general node options */
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	char		data_dir[MAXPGPATH];
 	int			remote_node_id;

@@ -111,8 +113,9 @@ typedef struct
 	bool		missing_slots;
 	bool		has_passfile;
 	bool		replication_connection;
+	bool		data_directory_config;

-	/* "node join" options */
+	/* "node rejoin" options */
 	char		config_files[MAXLEN];

 	/* "node service" options */
@@ -132,13 +135,15 @@ typedef struct
 	/* following options for internal use */
 	char		config_archive_dir[MAXPGPATH];
 	OutputMode	output_mode;
+	bool		disable_wal_receiver;
+	bool		enable_wal_receiver;
 } t_runtime_options;

 #define T_RUNTIME_OPTIONS_INITIALIZER { \
 		/* configuration metadata */ \
-		false, false, false, false,	\
+		false, false, false, false, false,	\
 		/* general configuration options */	\
-		"", false, false, "", false, false,	\
+		"", false, false, "", -1, false, false, \
 		/* logging options */ \
 		"", false, false, false, false,	\
 		/* output options */ \
@@ -161,8 +166,8 @@ typedef struct
 		/* "node status" options */ \
 		false, \
 		/* "node check" options */ \
-		false, false, false, false, false, false, false, false,	\
-		/* "node join" options */ \
+		false, false, false, false, false, false, false, false,	false, \
+		/* "node rejoin" options */ \
 		"", \
 		/* "node service" options */ \
 		"", false, false, false,  \
@@ -171,7 +176,7 @@ typedef struct
 		/* "cluster cleanup" options */ \
 		0, \
 		/* following options for internal use */ \
-		"/tmp", OM_TEXT	\
+		"/tmp", OM_TEXT, false, false \
 }


@@ -199,6 +204,7 @@ typedef struct ColHeader
 	char		title[MAXLEN];
 	int			max_length;
 	int			cur_length;
+	bool		display;
 } ColHeader;


@@ -220,8 +226,6 @@ extern int	check_server_version(PGconn *conn, char *server_type, bool exit_on_er
 extern void check_93_config(void);
 extern bool create_repmgr_extension(PGconn *conn);
 extern int	test_ssh_connection(char *host, char *remote_user);
-extern bool local_command(const char *command, PQExpBufferData *outputbuf);
-extern bool local_command_simple(const char *command, PQExpBufferData *outputbuf);

 extern standy_clone_mode get_standby_clone_mode(void);

@@ -234,9 +238,9 @@ extern char *make_pg_path(const char *file);

 extern void get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privileged_conn);

-extern bool remote_command(const char *host, const char *user, const char *command, PQExpBufferData *outputbuf);
-
 extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *remote_node_record);
+extern void make_repmgrd_path(PQExpBufferData *output_buf);
+

 /* display functions */
 extern void print_help_header(void);
@@ -251,4 +255,8 @@ extern void init_node_record(t_node_info *node_record);
 extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
 extern void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);

+extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin);
+extern void check_shared_library(PGconn *conn);
+extern bool is_repmgrd_running(PGconn *conn);
+
 #endif							/* _REPMGR_CLIENT_GLOBAL_H_ */
--- a/repmgr-client.c
+++ b/repmgr-client.c
@@ -1,7 +1,7 @@
 /*
 * repmgr-client.c - Command interpreter for the repmgr package
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This module is a command-line utility to easily setup a cluster of
 * hot standby servers for an HA environment
@@ -31,11 +31,13 @@
 * NODE CHECK
 * NODE REJOIN
 * NODE SERVICE
+ * NODE CONTROL
 *
 * DAEMON STATUS
 * DAEMON PAUSE
 * DAEMON UNPAUSE
- *
+ * DAEMON START
+ * DAEMON STOP
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -53,6 +55,7 @@

 #include <unistd.h>
 #include <sys/stat.h>
+#include <signal.h>

 #include "repmgr.h"
 #include "compat.h"
@@ -95,8 +98,6 @@ t_node_info target_node_info = T_NODE_INFO_INITIALIZER;
 static ItemList cli_errors = {NULL, NULL};
 static ItemList cli_warnings = {NULL, NULL};

-static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple);
-
 int
 main(int argc, char **argv)
 {
@@ -205,6 +206,7 @@ main(int argc, char **argv)
 				help_option = true;
 				break;

+				/* -V/--version */
 			case 'V':

 				/*
@@ -215,6 +217,11 @@ main(int argc, char **argv)
 				printf("%s %s\n", progname(), REPMGR_VERSION);
 				exit(SUCCESS);

+				/* --version-number */
+			case OPT_VERSION_NUMBER:
+				printf("%i\n", REPMGR_VERSION_NUM);
+				exit(SUCCESS);
+
 				/*------------------------------
 				 * general configuration options
 				 *------------------------------
@@ -247,7 +254,11 @@ main(int argc, char **argv)

 				/* -w/--wait */
 			case 'w':
-				runtime_options.wait = true;
+				runtime_options.wait_provided = true;
+				if (optarg != NULL)
+				{
+					runtime_options.wait = repmgr_atoi(optarg, "--wait", &cli_errors, 0);
+				}
 				break;

 				/* -W/--no-wait */
@@ -255,6 +266,12 @@ main(int argc, char **argv)
 				runtime_options.no_wait = true;
 				break;

+				/* --compact */
+			case OPT_COMPACT:
+				runtime_options.compact = true;
+				break;
+
+
 				/*----------------------------
 				 * database connection options
 				 *----------------------------
@@ -287,7 +304,12 @@ main(int argc, char **argv)
 				break;

 			case 'p':
-				(void) repmgr_atoi(optarg, "-p/--port", &cli_errors, false);
+				/*
+				 * minimum TCP port number is 1; in practice PostgreSQL
+				 * won't be running on a privileged port, but we don't want
+				 * to be concerned with that level of checking
+				 */
+				(void) repmgr_atoi(optarg, "-p/--port", &cli_errors, 1);
 				param_set(&source_conninfo, "port", optarg);
 				strncpy(runtime_options.port,
 						optarg,
@@ -329,17 +351,23 @@ main(int argc, char **argv)

 				/* --node-id */
 			case OPT_NODE_ID:
-				runtime_options.node_id = repmgr_atoi(optarg, "--node-id", &cli_errors, false);
+				runtime_options.node_id = repmgr_atoi(optarg, "--node-id", &cli_errors, MIN_NODE_ID);
 				break;

 				/* --node-name */
 			case OPT_NODE_NAME:
-				strncpy(runtime_options.node_name, optarg, MAXLEN);
+			{
+				if (strlen(optarg) < sizeof(runtime_options.node_name))
+					strncpy(runtime_options.node_name, optarg, sizeof(runtime_options.node_name));
+				else	/* sizeof yields size_t; cast so the argument matches "%lu" */
+					item_list_append_format(&cli_errors,
+											_("value for \"--node-name\" must contain fewer than %lu characters"),
+											(unsigned long) sizeof(runtime_options.node_name));
 				break;
-
+			}
 				/* --remote-node-id */
 			case OPT_REMOTE_NODE_ID:
-				runtime_options.remote_node_id = repmgr_atoi(optarg, "--remote-node-id", &cli_errors, false);
+				runtime_options.remote_node_id = repmgr_atoi(optarg, "--remote-node-id", &cli_errors, MIN_NODE_ID);
 				break;

 				/*
@@ -348,7 +376,7 @@ main(int argc, char **argv)

 				/* --upstream-node-id */
 			case OPT_UPSTREAM_NODE_ID:
-				runtime_options.upstream_node_id = repmgr_atoi(optarg, "--upstream-node-id", &cli_errors, false);
+				runtime_options.upstream_node_id = repmgr_atoi(optarg, "--upstream-node-id", &cli_errors, MIN_NODE_ID);
 				break;

 				/*------------------------
@@ -408,14 +436,14 @@ main(int argc, char **argv)
 				 */

 			case OPT_WAIT_START:
-				runtime_options.wait_start = repmgr_atoi(optarg, "--wait-start", &cli_errors, false);
+				runtime_options.wait_start = repmgr_atoi(optarg, "--wait-start", &cli_errors, 0);
 				break;

 			case OPT_WAIT_SYNC:
 				runtime_options.wait_register_sync = true;
 				if (optarg != NULL)
 				{
-					runtime_options.wait_register_sync_seconds = repmgr_atoi(optarg, "--wait-sync", &cli_errors, false);
+					runtime_options.wait_register_sync_seconds = repmgr_atoi(optarg, "--wait-sync", &cli_errors, 0);
 				}
 				break;

@@ -491,6 +519,10 @@ main(int argc, char **argv)
 				runtime_options.replication_connection = true;
 				break;

+			case OPT_DATA_DIRECTORY_CONFIG:
+				runtime_options.data_directory_config = true;
+				break;
+
 				/*--------------------
 				 * "node rejoin" options
 				 *--------------------
@@ -532,7 +564,7 @@ main(int argc, char **argv)
 				break;

 			case OPT_LIMIT:
-				runtime_options.limit = repmgr_atoi(optarg, "--limit", &cli_errors, false);
+				runtime_options.limit = repmgr_atoi(optarg, "--limit", &cli_errors, 1);
 				runtime_options.limit_provided = true;
 				break;

@@ -547,7 +579,7 @@ main(int argc, char **argv)

 				/* -k/--keep-history */
 			case 'k':
-				runtime_options.keep_history = repmgr_atoi(optarg, "-k/--keep-history", &cli_errors, false);
+				runtime_options.keep_history = repmgr_atoi(optarg, "-k/--keep-history", &cli_errors, 0);
 				break;

 				/*----------------
@@ -599,7 +631,7 @@ main(int argc, char **argv)
 				break;


-				/*--------------
+				/*---------------
 				 * output options
 				 *---------------
 				 */
@@ -615,6 +647,19 @@ main(int argc, char **argv)
 				runtime_options.optformat = true;
 				break;

+				/*---------------------------------
+				 * undocumented options for testing
+				 *----------------------------------
+				 */
+
+			case OPT_DISABLE_WAL_RECEIVER:
+				runtime_options.disable_wal_receiver = true;
+				break;
+
+			case OPT_ENABLE_WAL_RECEIVER:
+				runtime_options.enable_wal_receiver = true;
+				break;
+
 				/*-----------------------------
 				 * options deprecated since 3.3
 				 *-----------------------------
@@ -774,7 +819,7 @@ main(int argc, char **argv)
 	 *   BDR { REGISTER | UNREGISTER } |
 	 *   NODE { STATUS | CHECK | REJOIN | SERVICE } |
 	 *   CLUSTER { CROSSCHECK | MATRIX | SHOW | EVENT | CLEANUP }
-	 *   DAEMON { STATUS | PAUSE | UNPAUSE }
+	 *   DAEMON { STATUS | PAUSE | UNPAUSE | START | STOP }
 	 *
 	 * [node] is an optional hostname, provided instead of the -h/--host
 	 * option
@@ -887,6 +932,8 @@ main(int argc, char **argv)
 				action = NODE_REJOIN;
 			else if (strcasecmp(repmgr_action, "SERVICE") == 0)
 				action = NODE_SERVICE;
+			else if (strcasecmp(repmgr_action, "CONTROL") == 0)
+				action = NODE_CONTROL;
 		}

 		else if (strcasecmp(repmgr_command, "CLUSTER") == 0)
@@ -925,6 +972,10 @@ main(int argc, char **argv)
 				action = DAEMON_PAUSE;
 			else if (strcasecmp(repmgr_action, "UNPAUSE") == 0)
 				action = DAEMON_UNPAUSE;
+			else if (strcasecmp(repmgr_action, "START") == 0)
+				action = DAEMON_START;
+			else if (strcasecmp(repmgr_action, "STOP") == 0)
+				action = DAEMON_STOP;
 		}
 		else
 		{
@@ -1306,6 +1357,9 @@ main(int argc, char **argv)
 		case NODE_SERVICE:
 			do_node_service();
 			break;
+		case NODE_CONTROL:
+			do_node_control();
+			break;

 			/* CLUSTER */
 		case CLUSTER_SHOW:
@@ -1334,6 +1388,12 @@ main(int argc, char **argv)
 		case DAEMON_UNPAUSE:
 			do_daemon_unpause();
 			break;
+		case DAEMON_START:
+			do_daemon_start();
+			break;
+		case DAEMON_STOP:
+			do_daemon_stop();
+			break;

 		default:
 			/* An action will have been determined by this point  */
@@ -1620,6 +1680,8 @@ check_cli_parameters(const int action)
 				item_list_append_format(&cli_warnings,
 										_("--replication-user ignored when executing %s"),
 										action_name(action));
+				break;
+
 			default:
 				item_list_append_format(&cli_warnings,
 										_("--replication-user not required when executing %s"),
@@ -1658,12 +1720,6 @@ check_cli_parameters(const int action)
 		switch (action)
 		{
 			case CLUSTER_EVENT:
-				if (runtime_options.limit < 1)
-				{
-					item_list_append_format(&cli_errors,
-											_("value for --limit must be 1 or greater (provided: %i)"),
-											runtime_options.limit);
-				}
 				break;
 			default:
 				item_list_append_format(&cli_warnings,
@@ -1693,17 +1749,19 @@ check_cli_parameters(const int action)

 	/* --wait/--no-wait */

-	if (runtime_options.wait == true && runtime_options.no_wait == true)
+	if (runtime_options.wait_provided == true && runtime_options.no_wait == true)
 	{
 		item_list_append_format(&cli_errors,
 								_("both --wait and --no-wait options provided"));
 	}
 	else
 	{
-		if (runtime_options.wait)
+		if (runtime_options.wait_provided)
 		{
 			switch (action)
 			{
+				case DAEMON_START:
+				case DAEMON_STOP:
 				case STANDBY_FOLLOW:
 					break;
 				default:
@@ -1712,10 +1770,12 @@ check_cli_parameters(const int action)
 											action_name(action));
 			}
 		}
-		else if (runtime_options.wait)
+		else if (runtime_options.no_wait)
 		{
 			switch (action)
 			{
+				case DAEMON_START:
+				case DAEMON_STOP:
 				case NODE_REJOIN:
 					break;
 				default:
@@ -1817,12 +1877,15 @@ check_cli_parameters(const int action)
 			case STANDBY_REGISTER:
 			case STANDBY_FOLLOW:
 			case STANDBY_SWITCHOVER:
+			case STANDBY_PROMOTE:
 			case WITNESS_REGISTER:
 			case WITNESS_UNREGISTER:
 			case NODE_REJOIN:
 			case NODE_SERVICE:
 			case DAEMON_PAUSE:
 			case DAEMON_UNPAUSE:
+			case DAEMON_START:
+			case DAEMON_STOP:
 				break;
 			default:
 				item_list_append_format(&cli_warnings,
@@ -1851,6 +1914,44 @@ check_cli_parameters(const int action)
 							 "only one of --csv, --nagios and --optformat can be used");
 		}
 	}
+
+	/* --compact */
+
+	if (runtime_options.compact == true)
+	{
+		switch (action)
+		{
+			case CLUSTER_SHOW:
+			case DAEMON_STATUS:
+				break;
+			default:
+				item_list_append_format(&cli_warnings,
+										_("--compact is not effective when executing %s"),
+										action_name(action));
+		}
+	}
+
+	/* --disable-wal-receiver / --enable-wal-receiver */
+	if (runtime_options.disable_wal_receiver == true || runtime_options.enable_wal_receiver == true)
+	{
+		switch (action)
+		{
+			case NODE_CONTROL:
+			{
+				if (runtime_options.disable_wal_receiver == true && runtime_options.enable_wal_receiver == true)
+				{
+						item_list_append(&cli_errors,
+										 _("provide either --disable-wal-receiver or --enable-wal-receiver"));
+				}
+			}
+				break;
+			default:
+					item_list_append_format(&cli_warnings,
+											_("--disable-wal-receiver / --enable-wal-receiver not effective when executing %s"),
+											action_name(action));
+		}
+	}
+
 }


@@ -1909,7 +2010,10 @@ action_name(const int action)
 			return "DAEMON PAUSE";
 		case DAEMON_UNPAUSE:
 			return "DAEMON UNPAUSE";
-
+		case DAEMON_START:
+			return "DAEMON START";
+		case DAEMON_STOP:
+			return "DAEMON STOP";
 	}

 	return "UNKNOWN ACTION";
@@ -1941,9 +2045,20 @@ void
 print_status_header(int cols, ColHeader *headers)
 {
 	int i;
+	int max_cols = 0;
+
+	/* count how many columns we actually need to display */
+	for (i = 0; i < cols; i++)
+	{
+		if (headers[i].display == true)
+			max_cols ++;
+	}

 	for (i = 0; i < cols; i++)
 	{
+		if (headers[i].display == false)
+			continue;
+
 		if (i == 0)
 			printf(" ");
 		else
@@ -1953,17 +2068,22 @@ print_status_header(int cols, ColHeader *headers)
 			   headers[i].max_length,
 			   headers[i].title);
 	}
+
+
 	printf("\n");
 	printf("-");

-	for (i = 0; i < cols; i++)
+	for (i = 0; i < max_cols; i++)
 	{
 		int			j;

+		if (headers[i].display == false)
+			continue;
+
 		for (j = 0; j < headers[i].max_length; j++)
 			printf("-");

-		if (i < (cols - 1))
+		if (i < (max_cols - 1))
 			printf("-+-");
 		else
 			printf("-");
@@ -2001,7 +2121,7 @@ do_help(void)
 	printf(_("    %s [OPTIONS] node    {status|check|rejoin|service}\n"), progname());
 	printf(_("    %s [OPTIONS] cluster {show|event|matrix|crosscheck|cleanup}\n"), progname());
 	printf(_("    %s [OPTIONS] witness {register|unregister}\n"), progname());
-	printf(_("    %s [OPTIONS] daemon  {status|pause|unpause}\n"), progname());
+	printf(_("    %s [OPTIONS] daemon  {status|pause|unpause|start|stop}\n"), progname());

 	puts("");

@@ -2012,6 +2132,7 @@ do_help(void)
 	printf(_("General options:\n"));
 	printf(_("  -?, --help                          show this help, then exit\n"));
 	printf(_("  -V, --version                       output version information, then exit\n"));
+	printf(_("  --version-number                    output version number, then exit\n"));
 	puts("");

 	printf(_("General configuration options:\n"));
@@ -2237,17 +2358,15 @@ create_repmgr_extension(PGconn *conn)
 int
 check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *server_version_string)
 {
-	int			conn_server_version_num = UNKNOWN_SERVER_VERSION_NUM;
+	int			conn_server_version_num = get_server_version(conn, server_version_string);

-	conn_server_version_num = get_server_version(conn, server_version_string);
 	if (conn_server_version_num < MIN_SUPPORTED_VERSION_NUM)
 	{
 		if (conn_server_version_num > 0)
 			log_error(_("%s requires %s to be PostgreSQL %s or later"),
 					  progname(),
 					  server_type,
-					  MIN_SUPPORTED_VERSION
-				);
+					  MIN_SUPPORTED_VERSION);

 		if (exit_on_error == true)
 		{
@@ -2255,7 +2374,7 @@ check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *
 			exit(ERR_BAD_CONFIG);
 		}

-		return -1;
+		return UNKNOWN_SERVER_VERSION_NUM;
 	}

 	return conn_server_version_num;
@@ -2327,75 +2446,6 @@ test_ssh_connection(char *host, char *remote_user)



-/*
- * Execute a command locally. "outputbuf" should either be an
- * initialised PQexpbuffer, or NULL
- */
-bool
-local_command(const char *command, PQExpBufferData *outputbuf)
-{
-	return _local_command(command, outputbuf, false);
-}
-
-
-bool
-local_command_simple(const char *command, PQExpBufferData *outputbuf)
-{
-	return _local_command(command, outputbuf, true);
-}
-
-
-static bool
-_local_command(const char *command, PQExpBufferData *outputbuf, bool simple)
-{
-	FILE	   *fp = NULL;
-	char		output[MAXLEN];
-	int			retval = 0;
-	bool		success;
-
-	log_verbose(LOG_DEBUG, "executing:\n  %s", command);
-
-	if (outputbuf == NULL)
-	{
-		retval = system(command);
-		return (retval == 0) ? true : false;
-	}
-
-	fp = popen(command, "r");
-
-	if (fp == NULL)
-	{
-		log_error(_("unable to execute local command:\n%s"), command);
-		return false;
-	}
-
-
-	while (fgets(output, MAXLEN, fp) != NULL)
-	{
-		appendPQExpBuffer(outputbuf, "%s", output);
-
-		if (!feof(fp) && simple == false)
-		{
-			break;
-		}
-	}
-
-	retval = pclose(fp);
-
-	/*  */
-	success = (WEXITSTATUS(retval) == 0 || WEXITSTATUS(retval) == 141) ? true : false;
-
-	log_verbose(LOG_DEBUG, "result of command was %i (%i)", WEXITSTATUS(retval), retval);
-
-	if (outputbuf->data != NULL)
-		log_verbose(LOG_DEBUG, "local_command(): output returned was:\n%s", outputbuf->data);
-	else
-		log_verbose(LOG_DEBUG, "local_command(): no output returned");
-
-	return success;
-}
-
-
 /*
 * get_superuser_connection()
 *
@@ -2415,6 +2465,7 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
 	if (PQstatus(*conn) != CONNECTION_OK)
 	{
 		log_error(_("no database connection available"));
+		log_detail("\n%s", PQerrorMessage(*conn));
 		exit(ERR_INTERNAL);
 	}

@@ -2602,78 +2653,6 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
 }


-/*
- * Execute a command via ssh on the remote host.
- *
- * TODO: implement SSH calls using libssh2.
- */
-bool
-remote_command(const char *host, const char *user, const char *command, PQExpBufferData *outputbuf)
-{
-	FILE	   *fp;
-	char		ssh_command[MAXLEN] = "";
-	PQExpBufferData ssh_host;
-
-	char		output[MAXLEN] = "";
-
-	initPQExpBuffer(&ssh_host);
-
-	if (*user != '\0')
-	{
-		appendPQExpBuffer(&ssh_host, "%s@", user);
-	}
-
-	appendPQExpBuffer(&ssh_host, "%s", host);
-
-	maxlen_snprintf(ssh_command,
-					"ssh -o Batchmode=yes %s %s %s",
-					config_file_options.ssh_options,
-					ssh_host.data,
-					command);
-
-	termPQExpBuffer(&ssh_host);
-
-	log_debug("remote_command():\n  %s", ssh_command);
-
-	fp = popen(ssh_command, "r");
-
-	if (fp == NULL)
-	{
-		log_error(_("unable to execute remote command:\n  %s"), ssh_command);
-		return false;
-	}
-
-	if (outputbuf != NULL)
-	{
-		/* TODO: better error handling */
-		while (fgets(output, MAXLEN, fp) != NULL)
-		{
-			appendPQExpBuffer(outputbuf, "%s", output);
-		}
-	}
-	else
-	{
-		while (fgets(output, MAXLEN, fp) != NULL)
-		{
-			if (!feof(fp))
-			{
-				break;
-			}
-		}
-	}
-
-	pclose(fp);
-
-	if (outputbuf != NULL)
-	{
-		if (strlen(outputbuf->data))
-			log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
-		else
-			log_verbose(LOG_DEBUG, "remote_command(): no output returned");
-	}
-
-	return true;
-}


 void
@@ -2705,6 +2684,34 @@ make_remote_repmgr_path(PQExpBufferData *output_buf, t_node_info *remote_node_re
 }


+void
+make_repmgrd_path(PQExpBufferData *output_buf)
+{
+	if (config_file_options.repmgr_bindir[0] != '\0')
+	{
+		int			len = strlen(config_file_options.repmgr_bindir);
+
+		appendPQExpBufferStr(output_buf,
+							 config_file_options.repmgr_bindir);
+
+		/* Add trailing slash */
+		if (config_file_options.repmgr_bindir[len - 1] != '/')
+		{
+			appendPQExpBufferChar(output_buf, '/');
+		}
+	}
+	else if (pg_bindir[0] != '\0')
+	{
+		appendPQExpBufferStr(output_buf,
+							 pg_bindir);
+	}
+
+	appendPQExpBuffer(output_buf,
+					  "repmgrd -f %s ",
+					  config_file_path);
+}
+
+
 /* ======================== */
 /* server control functions */
 /* ======================== */
@@ -3002,7 +3009,7 @@ init_node_record(t_node_info *node_record)
 		strncpy(node_record->location, "default", MAXLEN);


-	strncpy(node_record->node_name, config_file_options.node_name, MAXLEN);
+	strncpy(node_record->node_name, config_file_options.node_name, sizeof(node_record->node_name));
 	strncpy(node_record->conninfo, config_file_options.conninfo, MAXLEN);
 	strncpy(node_record->config_file, config_file_path, MAXPGPATH);

@@ -3031,10 +3038,9 @@ bool
 can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason)
 {
 	bool		can_use = true;
-	int			server_version_num = get_server_version(conn, NULL);

 	/* wal_log_hints not available in 9.3, so just determine if data checksums enabled */
-	if (server_version_num < 90400)
+	if (PQserverVersion(conn) < 90400)
 	{
 		int			data_checksum_version = get_data_checksum_version(data_directory);

@@ -3057,9 +3063,6 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
 	/* "full_page_writes" must be on in any case */
 	if (guc_set(conn, "full_page_writes", "=", "off"))
 	{
-		if (can_use == false)
-			appendPQExpBuffer(reason, "; ");
-
 		appendPQExpBuffer(reason,
 						  _("\"full_page_writes\" must be set to \"on\""));

@@ -3141,3 +3144,253 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
 		}
 	}
 }
+
+
+/*
+ * Here we'll perform some timeline sanity checks to ensure the follow target
+ * can actually be followed.
+ *
+ * See also comment for check_node_can_follow() in repmgrd-physical.c .
+ */
+bool
+check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin)
+{
+	uint64		local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
+	t_conninfo_param_list follow_target_repl_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
+	PGconn	   *follow_target_repl_conn = NULL;
+	t_system_identification follow_target_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER;
+	TimeLineHistoryEntry *follow_target_history = NULL;
+	bool success = true;
+
+	const char *action = is_rejoin == true ? "rejoin" : "follow";
+
+	/* check replication connection */
+	initialize_conninfo_params(&follow_target_repl_conninfo, false);
+
+	conn_to_param_list(follow_target_conn, &follow_target_repl_conninfo);
+
+	if (strcmp(param_get(&follow_target_repl_conninfo, "user"), follow_target_node_record->repluser) != 0)
+	{
+		param_set(&follow_target_repl_conninfo, "user", follow_target_node_record->repluser);
+		param_set(&follow_target_repl_conninfo, "dbname", "replication");
+	}
+
+	param_set(&follow_target_repl_conninfo, "replication", "1");
+
+	follow_target_repl_conn = establish_db_connection_by_params(&follow_target_repl_conninfo, false);
+
+	free_conninfo_params(&follow_target_repl_conninfo);
+
+	if (PQstatus(follow_target_repl_conn) != CONNECTION_OK)
+	{
+		log_error(_("unable to establish a replication connection to the %s target node"), action);
+		return false;
+	}
+	else if (runtime_options.dry_run == true)
+	{
+		log_info(_("replication connection to the %s target node was successful"), action);
+	}
+
+	/* check system_identifiers match */
+	if (identify_system(follow_target_repl_conn, &follow_target_identification) == false)
+	{
+		log_error(_("unable to query the %s target node's system identification"), action);
+
+		PQfinish(follow_target_repl_conn);
+		return false;
+	}
+
+	local_system_identifier = get_system_identifier(config_file_options.data_directory);
+
+	/*
+	 * Check for things that should never happen, but expect the unexpected anyway.
+	 */
+	if (follow_target_identification.system_identifier != local_system_identifier)
+	{
+		log_error(_("this node is not part of the %s target node's replication cluster"), action);
+		log_detail(_("this node's system identifier is %lu, %s target node's system identifier is %lu"),
+				   local_system_identifier,
+				   action,
+				   follow_target_identification.system_identifier);
+		PQfinish(follow_target_repl_conn);
+		return false;
+	}
+
+	if (runtime_options.dry_run == true)
+	{
+		log_info(_("local and %s target system identifiers match"), action);
+		log_detail(_("system identifier is %lu"), local_system_identifier);
+	}
+
+	/* check timelines */
+
+	log_verbose(LOG_DEBUG, "local timeline: %i; %s target timeline: %i",
+				local_tli,
+				action,
+				follow_target_identification.timeline);
+
+	/* upstream's timeline is lower than ours - impossible case */
+	if (follow_target_identification.timeline < local_tli)
+	{
+		log_error(_("this node's timeline is ahead of the %s target node's timeline"), action);
+		log_detail(_("this node's timeline is %i, %s target node's timeline is %i"),
+				   local_tli,
+				   action,
+				   follow_target_identification.timeline);
+		PQfinish(follow_target_repl_conn);
+		return false;
+	}
+
+	/* timelines are the same - check relative positions */
+	if (follow_target_identification.timeline == local_tli)
+	{
+		XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
+
+		if (local_xlogpos == InvalidXLogRecPtr || follow_target_xlogpos == InvalidXLogRecPtr)
+		{
+			log_error(_("unable to compare LSN positions"));
+			PQfinish(follow_target_repl_conn);
+			return false;
+		}
+
+		if (local_xlogpos <= follow_target_xlogpos)
+		{
+			log_info(_("timelines are same, this server is not ahead"));
+			log_detail(_("local node lsn is %X/%X, %s target lsn is %X/%X"),
+					   format_lsn(local_xlogpos),
+					   action,
+					   format_lsn(follow_target_xlogpos));
+		}
+		else
+		{
+			log_error(_("this node is ahead of the %s target"), action);
+			log_detail(_("local node lsn is %X/%X, %s target lsn is %X/%X"),
+					   format_lsn(local_xlogpos),
+					   action,
+					   format_lsn(follow_target_xlogpos));
+
+			success = false;
+		}
+	}
+	else
+	{
+		/*
+		 * upstream has higher timeline - check where it forked off from this node's timeline
+		 */
+		follow_target_history = get_timeline_history(follow_target_repl_conn, local_tli + 1);
+
+		if (follow_target_history == NULL)
+		{
+			/* get_timeline_history() will emit relevant error messages */
+			PQfinish(follow_target_repl_conn);
+			return false;
+		}
+
+		log_debug("local tli: %i; local_xlogpos: %X/%X; follow_target_history->tli: %i; follow_target_history->end: %X/%X",
+				  local_tli,
+				  format_lsn(local_xlogpos),
+				  follow_target_history->tli,
+				  format_lsn(follow_target_history->end));
+
+		/*
+		 * Local node has proceeded beyond the follow target's fork, so we
+		 * definitely can't attach.
+		 *
+		 * This could be the case if the follow target was promoted, but does
+		 * not contain all changes which are being replayed to this standby.
+		 */
+		if (local_xlogpos > follow_target_history->end)
+		{
+			if (is_rejoin == true && runtime_options.force_rewind_used == true)
+			{
+				log_notice(_("pg_rewind execution required for this node to attach to rejoin target node %i"),
+						   follow_target_node_record->node_id);
+			}
+			else
+			{
+				log_error(_("this node cannot attach to %s target node %i"),
+						  action,
+						  follow_target_node_record->node_id);
+				success = false;
+			}
+
+			log_detail(_("%s target server's timeline %i forked off current database system timeline %i before current recovery point %X/%X"),
+					   action,
+					   local_tli + 1,
+					   local_tli,
+					   format_lsn(local_xlogpos));
+
+			if (is_rejoin == true && runtime_options.force_rewind_used == false)
+			{
+				log_hint(_("use --force-rewind to execute pg_rewind"));
+			}
+		}
+
+		if (success == true)
+		{
+			if (is_rejoin == false || (is_rejoin == true && runtime_options.force_rewind_used == false))
+			{
+				log_info(_("local node %i can attach to %s target node %i"),
+						 config_file_options.node_id,
+						 action,
+						 follow_target_node_record->node_id);
+
+				log_detail(_("local node's recovery point: %X/%X; %s target node's fork point: %X/%X"),
+						   format_lsn(local_xlogpos),
+						   action,
+						   format_lsn(follow_target_history->end));
+			}
+		}
+	}
+
+	PQfinish(follow_target_repl_conn);
+
+	if (follow_target_history)
+		pfree(follow_target_history);
+
+	return success;
+}
+
+
+/*
+ * Simple check to see if "shared_preload_libraries" includes "repmgr".
+ * Parsing "shared_preload_libraries" is non-trivial, as it's potentially
+ * a comma-separated list, and worse may not be readable by the repmgr
+ * user.
+ *
+ * Instead, we check if a function which should return a value returns
+ * NULL; this indicates the shared library is not installed.
+ */
+void
+check_shared_library(PGconn *conn)
+{
+	bool ok = repmgrd_check_local_node_id(conn);
+
+	if (ok == true)
+		return;
+
+	log_error(_("repmgrd not configured for this node"));
+	log_hint(_("ensure \"shared_preload_libraries\" includes \"repmgr\" and restart PostgreSQL"));
+	PQfinish(conn);
+	exit(ERR_BAD_CONFIG);
+}
+
+
+bool
+is_repmgrd_running(PGconn *conn)
+{
+	pid_t		pid;
+	bool		is_running = false;
+
+	pid = repmgrd_get_pid(conn);
+
+	if (pid != UNKNOWN_PID)
+	{
+		if (kill(pid, 0) != -1)
+		{
+			is_running = true;
+		}
+	}
+
+	return is_running;
+}
--- a/repmgr-client.h
+++ b/repmgr-client.h
@@ -1,6 +1,6 @@
 /*
 * repmgr-client.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -40,14 +40,17 @@
 #define NODE_CHECK			   14
 #define NODE_SERVICE		   15
 #define NODE_REJOIN            16
-#define CLUSTER_SHOW		   17
-#define CLUSTER_CLEANUP		   18
-#define CLUSTER_MATRIX		   19
-#define CLUSTER_CROSSCHECK	   20
-#define CLUSTER_EVENT		   21
-#define DAEMON_STATUS		   22
-#define DAEMON_PAUSE		   23
-#define DAEMON_UNPAUSE		   24
+#define NODE_CONTROL           17
+#define CLUSTER_SHOW		   18
+#define CLUSTER_CLEANUP		   19
+#define CLUSTER_MATRIX		   20
+#define CLUSTER_CROSSCHECK	   21
+#define CLUSTER_EVENT		   22
+#define DAEMON_STATUS		   23
+#define DAEMON_PAUSE		   24
+#define DAEMON_UNPAUSE		   25
+#define DAEMON_START 		   26
+#define DAEMON_STOP 		   27

 /* command line options without short versions */
 #define OPT_HELP						   1001
@@ -92,6 +95,11 @@
 #define OPT_NO_WAIT                        1040
 #define OPT_MISSING_SLOTS                  1041
 #define OPT_REPMGRD_NO_PAUSE               1042
+#define OPT_VERSION_NUMBER				   1043
+#define OPT_DATA_DIRECTORY_CONFIG		   1044
+#define OPT_COMPACT		                   1045
+#define OPT_DISABLE_WAL_RECEIVER           1046
+#define OPT_ENABLE_WAL_RECEIVER            1047

 /* deprecated since 3.3 */
 #define OPT_DATA_DIR						999
@@ -102,16 +110,18 @@
 static struct option long_options[] =
 {
 /* general options */
-	{"version", no_argument, NULL, 'V'},
 	{"help", no_argument, NULL, OPT_HELP},
+	{"version", no_argument, NULL, 'V'},
+	{"version-number", no_argument, NULL, OPT_VERSION_NUMBER},

 /* general configuration options */
 	{"config-file", required_argument, NULL, 'f'},
 	{"dry-run", no_argument, NULL, OPT_DRY_RUN},
 	{"force", no_argument, NULL, 'F'},
 	{"pg_bindir", required_argument, NULL, 'b'},
-	{"wait", no_argument, NULL, 'w'},
+	{"wait", optional_argument, NULL, 'w'},
 	{"no-wait", no_argument, NULL, 'W'},
+	{"compact", no_argument, NULL, OPT_COMPACT},

 /* connection options */
 	{"dbname", required_argument, NULL, 'd'},
@@ -156,7 +166,7 @@ static struct option long_options[] =

 /* "standby switchover" options
 *
- * Note: --force-rewind accepted to pass to "node join"
+ * Note: --force-rewind accepted to pass to "node rejoin"
 */
 	{"always-promote", no_argument, NULL, OPT_ALWAYS_PROMOTE},
 	{"siblings-follow", no_argument, NULL, OPT_SIBLINGS_FOLLOW},
@@ -174,6 +184,7 @@ static struct option long_options[] =
 	{"missing-slots", no_argument, NULL, OPT_MISSING_SLOTS},
 	{"has-passfile", no_argument, NULL, OPT_HAS_PASSFILE},
 	{"replication-connection", no_argument, NULL, OPT_REPL_CONN},
+	{"data-directory-config", no_argument, NULL, OPT_DATA_DIRECTORY_CONFIG},

 /* "node rejoin" options */
 	{"config-files", required_argument, NULL, OPT_CONFIG_FILES},
@@ -193,6 +204,10 @@ static struct option long_options[] =
 /* "cluster cleanup" options */
 	{"keep-history", required_argument, NULL, 'k'},

+/* undocumented options for testing */
+	{"disable-wal-receiver", no_argument, NULL, OPT_DISABLE_WAL_RECEIVER},
+	{"enable-wal-receiver", no_argument, NULL, OPT_ENABLE_WAL_RECEIVER},
+
 /* deprecated */
 	{"check-upstream-config", no_argument, NULL, OPT_CHECK_UPSTREAM_CONFIG},
 	{"no-conninfo-password", no_argument, NULL, OPT_NO_CONNINFO_PASSWORD},
--- a/repmgr.c
+++ b/repmgr.c
@@ -1,7 +1,7 @@
 /*
 * repmgr.c - repmgr extension
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This is the actual extension code; see repmgr-client.c for the code which
 * generates the repmgr binary
@@ -40,7 +40,6 @@

 #include "utils/timestamp.h"

-#include "executor/spi.h"
 #include "lib/stringinfo.h"
 #include "access/xact.h"
 #include "utils/snapmgr.h"
@@ -54,6 +53,7 @@
 #include "voting.h"

 #define UNKNOWN_NODE_ID		-1
+#define ELECTION_RERUN_NOTIFICATION -2
 #define UNKNOWN_PID			-1

 #define TRANCHE_NAME "repmgrd"
@@ -78,6 +78,7 @@ typedef struct repmgrdSharedState
 	char		repmgrd_pidfile[MAXPGPATH];
 	bool		repmgrd_paused;
 	/* streaming failover */
+	TimestampTz upstream_last_seen;
 	NodeVotingStatus voting_status;
 	int			current_electoral_term;
 	int			candidate_node_id;
@@ -108,6 +109,12 @@ PG_FUNCTION_INFO_V1(standby_set_last_updated);
 Datum		standby_get_last_updated(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(standby_get_last_updated);

+Datum		set_upstream_last_seen(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(set_upstream_last_seen);
+
+Datum		get_upstream_last_seen(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(get_upstream_last_seen);
+
 Datum		notify_follow_primary(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(notify_follow_primary);

@@ -141,6 +148,8 @@ PG_FUNCTION_INFO_V1(repmgrd_pause);
 Datum		repmgrd_is_paused(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(repmgrd_is_paused);

+Datum		get_wal_receiver_pid(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(get_wal_receiver_pid);


 /*
@@ -219,6 +228,8 @@ repmgr_shmem_startup(void)
 		memset(shared_state->repmgrd_pidfile, 0, MAXPGPATH);
 		shared_state->repmgrd_paused = false;
 		shared_state->current_electoral_term = 0;
+		/* arbitrary "magic" date to indicate this field hasn't been updated */
+		shared_state->upstream_last_seen = POSTGRES_EPOCH_JDATE;
 		shared_state->voting_status = VS_NO_VOTE;
 		shared_state->candidate_node_id = UNKNOWN_NODE_ID;
 		shared_state->follow_new_primary = false;
@@ -354,6 +365,54 @@ standby_get_last_updated(PG_FUNCTION_ARGS)
 }


+Datum
+set_upstream_last_seen(PG_FUNCTION_ARGS)
+{
+	if (!shared_state)
+		PG_RETURN_VOID();
+
+	LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+
+	shared_state->upstream_last_seen = GetCurrentTimestamp();
+
+	LWLockRelease(shared_state->lock);
+
+	PG_RETURN_VOID();
+}
+
+
+Datum
+get_upstream_last_seen(PG_FUNCTION_ARGS)
+{
+	long		secs;
+	int			microsecs;
+	TimestampTz last_seen;
+
+	if (!shared_state)
+		PG_RETURN_INT32(-1);
+
+	LWLockAcquire(shared_state->lock, LW_SHARED);
+
+	last_seen = shared_state->upstream_last_seen;
+
+	LWLockRelease(shared_state->lock);
+
+	/*
+	 * "last_seen" is initialised with the PostgreSQL epoch as a
+	 * "magic" value to indicate the field hasn't ever been updated
+	 * by repmgrd. We return -1 instead, rather than imply that the
+	 * primary was last seen at the turn of the century.
+	 */
+	if (last_seen == POSTGRES_EPOCH_JDATE)
+		PG_RETURN_INT32(-1);
+
+
+	TimestampDifference(last_seen, GetCurrentTimestamp(),
+						&secs, &microsecs);
+
+	/* the int32 result only holds ~68 years of seconds; let's hope repmgrd never goes that long without seeing a primary */
+	PG_RETURN_INT32((uint32)secs);
+}


 /* ===================*/
@@ -367,10 +426,10 @@ notify_follow_primary(PG_FUNCTION_ARGS)
 	int			primary_node_id = UNKNOWN_NODE_ID;

 	if (!shared_state)
-		PG_RETURN_NULL();
+		PG_RETURN_VOID();

 	if (PG_ARGISNULL(0))
-		PG_RETURN_NULL();
+		PG_RETURN_VOID();

 	primary_node_id = PG_GETARG_INT32(0);

@@ -379,9 +438,17 @@ notify_follow_primary(PG_FUNCTION_ARGS)
 	/* only do something if local_node_id is initialised */
 	if (shared_state->local_node_id != UNKNOWN_NODE_ID)
 	{
-		elog(INFO, "node %i received notification to follow node %i",
-			 shared_state->local_node_id,
-			 primary_node_id);
+		if (primary_node_id == ELECTION_RERUN_NOTIFICATION)
+		{
+			elog(INFO, "node %i received notification to rerun promotion candidate election",
+				 shared_state->local_node_id);
+		}
+		else
+		{
+			elog(INFO, "node %i received notification to follow node %i",
+				 shared_state->local_node_id,
+				 primary_node_id);
+		}

 		LWLockRelease(shared_state->lock);
 		LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
@@ -402,7 +469,7 @@ get_new_primary(PG_FUNCTION_ARGS)
 	int			new_primary_node_id = UNKNOWN_NODE_ID;

 	if (!shared_state)
-		PG_RETURN_NULL();
+		PG_RETURN_INT32(UNKNOWN_NODE_ID);

 	LWLockAcquire(shared_state->lock, LW_SHARED);

@@ -412,7 +479,7 @@ get_new_primary(PG_FUNCTION_ARGS)
 	LWLockRelease(shared_state->lock);

 	if (new_primary_node_id == UNKNOWN_NODE_ID)
-		PG_RETURN_NULL();
+		PG_RETURN_INT32(UNKNOWN_NODE_ID);

 	PG_RETURN_INT32(new_primary_node_id);
 }
@@ -680,3 +747,17 @@ repmgrd_is_paused(PG_FUNCTION_ARGS)

 	PG_RETURN_BOOL(is_paused);
 }
+
+
+Datum
+get_wal_receiver_pid(PG_FUNCTION_ARGS)
+{
+	int wal_receiver_pid;
+
+	if (!shared_state)
+		PG_RETURN_NULL();
+
+	wal_receiver_pid = WalRcv->pid;
+
+	PG_RETURN_INT32(wal_receiver_pid);
+}
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -7,7 +7,8 @@
 # parameter will be treated as empty or false.
 #
 # IMPORTANT: string values can be provided as-is, or enclosed in single quotes
-# (but not double-quotes, which will be interpreted as part of the string), e.g.:
+# (but not double-quotes, which will be interpreted as part of the string),
+# e.g.:
 #
 #  node_name=foo
 #  node_name = 'foo'
@@ -24,22 +25,24 @@
 				 # using the server's hostname or another identifier
 				 # unambiguously associated with the server to avoid
 				 # confusion. Avoid choosing names which reflect the
-				 # node's current role, e.g. "primary" or "standby1",
+				 # node's current role, e.g. 'primary' or 'standby1',
 				 # as roles can change and it will be confusing if
-				 # the current primary is called "standby1".
+				 # the current primary is called 'standby1'.
+                                 # The string's maximum length is 63 characters and it should
+                                 # contain only printable ASCII characters.

 #conninfo=''			 # Database connection information as a conninfo string.
 				 # All servers in the cluster must be able to connect to
 				 # the local node using this string.
 				 #
 				 # For details on conninfo strings, see:
-				 #  https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING
+				 #  https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING
 				 #
 				 # If repmgrd is in use, consider explicitly setting
 				 # "connect_timeout" in the conninfo string to determine
 				 # the length of time which elapses before a network
 				 # connection attempt is abandoned; for details see:
-				 #  https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT
+				 #  https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT

 #data_directory=''		 # The node's data directory. This is needed by repmgr
 				 # when performing operations when the PostgreSQL instance
@@ -247,6 +250,9 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 					# for the demoted standby to reconnect to the promoted
 					# primary (note: this value should be equal to or greater
 					# than that set for "node_rejoin_timeout")
+#wal_receive_check_timeout=30		# The max length of time (in seconds) to wait for the walreceiver
+					# on the standby to flush WAL to disk before comparing location
+					# with the shut-down primary

 #------------------------------------------------------------------------------
 # "node rejoin" settings
@@ -275,11 +281,6 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 # These settings are only applied when repmgrd is running. Values shown
 # are defaults.

-#repmgrd_pid_file=			# Path of PID file to use for repmgrd; if not set, a PID file will
-					# be generated in a temporary directory specified by the environment
-					# variable $TMPDIR, or if not set, in "/tmp". This value can be overridden
-					# by the command line option "-p/--pid-file"; the command line option
-					# "--no-pid-file" will force PID file creation to be skipped.
 #failover=manual			# one of 'automatic', 'manual'.
 					# determines what action to take in the event of upstream failure
 					#
@@ -289,10 +290,13 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 					#    manual attention to reattach it to replication
 					# (does not apply to BDR mode)

-#priority=100				# indicate a preferred priority for promoting nodes;
+#priority=100				# indicates a preferred priority for promoting nodes;
 					# a value of zero prevents the node being promoted to primary
 					# (default: 100)

+#connection_check_type=ping		# How to check availability of the upstream node; valid options:
+                                        #  'ping': use PQping() to check if the node is accepting connections
+                                        #  'connection': attempt to make a new connection to the node
 #reconnect_attempts=6			# Number of attempts which will be made to reconnect to an unreachable
 					# primary (or other upstream node)
 #reconnect_interval=10			# Interval between attempts to reconnect to an unreachable
@@ -316,10 +320,29 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 #monitoring_history=no                  # Whether to write monitoring data to the "monitoring_history" table
 #monitor_interval_secs=2                # Interval (in seconds) at which to write monitoring data
 #degraded_monitoring_timeout=-1		# Interval (in seconds) after which repmgrd will terminate if the
-					# server being monitored is no longer available. -1 (default)
+					# server(s) being monitored are no longer available. -1 (default)
 					# disables the timeout completely.
 #async_query_timeout=60			# Interval (in seconds) which repmgrd will wait before
 					# cancelling an asynchronous query.
+#repmgrd_pid_file=			# Path of PID file to use for repmgrd; if not set, a PID file will
+					# be generated in a temporary directory specified by the environment
+					# variable $TMPDIR, or if not set, in "/tmp". This value can be overridden
+					# by the command line option "-p/--pid-file"; the command line option
+					# "--no-pid-file" will force PID file creation to be skipped.
+					# Note: there is normally no need to set this, particularly if
+					# repmgr was installed from packages.
+#standby_disconnect_on_failover=false	# If "true", in a failover situation wait for all standbys to
+					# disconnect their WAL receivers before electing a new primary
+					# (PostgreSQL 9.5 and later only; repmgr user must be a superuser for this)
+#sibling_nodes_disconnect_timeout=30	# If "standby_disconnect_on_failover" is true, the maximum length of time
+					#  (in seconds) to wait for other standbys to confirm they have disconnected their
+					# WAL receivers
+#failover_validation_command=		# Script to execute for an external mechanism to validate the failover
+					# decision made by repmgrd. One or both of the following parameter placeholders
+					# should be provided, which will be replaced by repmgrd with the appropriate
+					# value: %n (node_id), %a (node_name). *Must* be the same on all nodes.
+#election_rerun_interval=15		# if "failover_validation_command" is set, and the command returns
+					# an error, pause the specified number of seconds before rerunning the election.

 #------------------------------------------------------------------------------
 # service control commands
@@ -328,6 +351,12 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 # repmgr provides options to override the default pg_ctl commands
 # used to stop, start, restart, reload and promote the PostgreSQL cluster
 #
+# These options are useful when PostgreSQL has been installed from a package
+# which provides OS-level service commands. In environments using an init system
+# such as systemd, which keeps track of the state of various services, it is
+# essential that the service commands are correctly configured and pg_ctl is
+# not executed directly.
+#
 # NOTE: These commands must be runnable on remote nodes as well for switchover
 # to function correctly.
 #
@@ -349,7 +378,7 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 #
 # Debian/Ubuntu users: use "sudo pg_ctlcluster" to execute service control commands.
 #
-# For more details, see: https://repmgr.org/docs/4.1/configuration-service-commands.html
+# For more details, see: https://repmgr.org/docs/current/configuration-service-commands.html

 #service_start_command = ''
 #service_stop_command = ''
@@ -361,6 +390,11 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 					# for "promote_command"; do not use "repmgr standby promote"
 					# (or a script which executes "repmgr standby promote") here.

+# Used by "repmgr daemon (start|stop)" to control repmgrd
+#
+#repmgrd_service_start_command = ''
+#repmgrd_service_stop_command = ''
+
 #------------------------------------------------------------------------------
 # Status check thresholds
 #------------------------------------------------------------------------------
--- a/repmgr.control
+++ b/repmgr.control
@@ -1,6 +1,6 @@
 # repmgr extension
 comment = 'Replication manager for PostgreSQL'
-default_version = '4.2'
+default_version = '4.3'
 module_pathname = '$libdir/repmgr'
 relocatable = false
 schema = repmgr
--- a/repmgr.h
+++ b/repmgr.h
@@ -1,6 +1,6 @@
 /*
 * repmgr.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -41,6 +41,7 @@
 #include "configfile.h"
 #include "dbutils.h"
 #include "log.h"
+#include "sysutils.h"

 #define MIN_SUPPORTED_VERSION		"9.3"
 #define MIN_SUPPORTED_VERSION_NUM	90300
@@ -54,12 +55,16 @@
 #define UNKNOWN_TIMELINE_ID -1
 #define UNKNOWN_SYSTEM_IDENTIFIER 0
 #define UNKNOWN_PID			-1
+#define UNKNOWN_REPLICATION_LAG	-1

 #define NODE_NOT_FOUND		-1
 #define NO_UPSTREAM_NODE	-1
 #define UNKNOWN_NODE_ID		-1
-
+#define MIN_NODE_ID          1
+#define ELECTION_RERUN_NOTIFICATION -2
 #define VOTING_TERM_NOT_SET -1
+#define ARCHIVE_STATUS_DIR_ERROR -1
+#define NO_DEGRADED_MONITORING_ELAPSED -1

 #define BDR2_REPLICATION_SET_NAME "repmgr"

@@ -88,6 +93,11 @@
 #define DEFAULT_SHUTDOWN_CHECK_TIMEOUT       60  /* seconds */
 #define DEFAULT_STANDBY_RECONNECT_TIMEOUT    60  /* seconds */
 #define DEFAULT_NODE_REJOIN_TIMEOUT          60  /* seconds */
+#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT    30  /* seconds */
+#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
+#define DEFAULT_ELECTION_RERUN_INTERVAL      15  /* seconds */
+
+#define WALRECEIVER_DISABLE_TIMEOUT_VALUE    86400000 /* milliseconds */

 #ifndef RECOVERY_COMMAND_FILE
 #define RECOVERY_COMMAND_FILE "recovery.conf"
--- a/repmgr_version.h.in
+++ b/repmgr_version.h.in
@@ -1,2 +1,3 @@
 #define REPMGR_VERSION_DATE ""
-#define REPMGR_VERSION "4.2"
+#define REPMGR_VERSION "4.3"
+#define REPMGR_VERSION_NUM 40300
--- a/repmgrd-bdr.c
+++ b/repmgrd-bdr.c
@@ -1,7 +1,7 @@
 /*
 * repmgrd-bdr.c - BDR functionality for repmgrd
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -43,10 +43,12 @@ handle_sigint_bdr(SIGNAL_ARGS)
 	initPQExpBuffer(&event_details);

 	appendPQExpBuffer(&event_details,
-					  "%s signal received",
+					  _("%s signal received"),
 					  postgres_signal_arg == SIGTERM
 					  ? "TERM" : "INT");

+	log_notice("%s", event_details.data);
+
 	create_event_notification(local_conn,
 							  &config_file_options,
 							  config_file_options.node_id,
@@ -66,7 +68,6 @@ monitor_bdr(void)
 	t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
 	RecordStatus record_status;
 	NodeInfoListCell *cell;
-	PQExpBufferData event_details;
 	instr_time	log_status_interval_start;

 	/* sanity check local database */
@@ -227,6 +228,7 @@ monitor_bdr(void)
 								if (cell->node_info->node_status == NODE_STATUS_UP)
 								{
 									int			node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
+									PQExpBufferData event_details;

 									initPQExpBuffer(&event_details);

@@ -364,7 +366,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
 {
 	PGconn	   *next_node_conn = NULL;
 	NodeInfoListCell *cell;
-	PQExpBufferData event_details;
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
 	t_node_info target_node = T_NODE_INFO_INITIALIZER;
 	t_node_info failed_node = T_NODE_INFO_INITIALIZER;
@@ -458,45 +459,49 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)

 	log_debug("this node is the failover handler");

-	initPQExpBuffer(&event_details);
+	{
+		PQExpBufferData event_details;

-	event_info.conninfo_str = target_node.conninfo;
-	event_info.node_name = target_node.node_name;
+		initPQExpBuffer(&event_details);

-	/* update node record on the active node */
-	update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
+		event_info.conninfo_str = target_node.conninfo;
+		event_info.node_name = target_node.node_name;

-	log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id);
+		/* update node record on the active node */
+		update_node_record_set_active(next_node_conn, monitored_node->node_id, false);

-	appendPQExpBuffer(&event_details,
-					  _("node \"%s\" (ID: %i) detected as failed; next available node is \"%s\" (ID: %i)"),
-					  monitored_node->node_name,
-					  monitored_node->node_id,
-					  target_node.node_name,
-					  target_node.node_id);
+		log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id);

-	/*
-	 * Create an event record
-	 *
-	 * If we were able to connect to another node, we'll update the event log
-	 * there.
-	 *
-	 * In any case the event notification command will be triggered with the
-	 * event "bdr_failover"
-	 */
+		appendPQExpBuffer(&event_details,
+						  _("node \"%s\" (ID: %i) detected as failed; next available node is \"%s\" (ID: %i)"),
+						  monitored_node->node_name,
+						  monitored_node->node_id,
+						  target_node.node_name,
+						  target_node.node_id);
+
+		/*
+		 * Create an event record
+		 *
+		 * If we were able to connect to another node, we'll update the event log
+		 * there.
+		 *
+		 * In any case the event notification command will be triggered with the
+		 * event "bdr_failover"
+		 */


-	create_event_notification_extended(next_node_conn,
-									   &config_file_options,
-									   monitored_node->node_id,
-									   "bdr_failover",
-									   true,
-									   event_details.data,
-									   &event_info);
+		create_event_notification_extended(next_node_conn,
+										   &config_file_options,
+										   monitored_node->node_id,
+										   "bdr_failover",
+										   true,
+										   event_details.data,
+										   &event_info);

-	log_info("%s", event_details.data);
+		log_info("%s", event_details.data);

-	termPQExpBuffer(&event_details);
+		termPQExpBuffer(&event_details);
+	}

 	unset_bdr_failover_handler(next_node_conn);

@@ -511,7 +516,6 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
 {
 	PGconn	   *recovered_node_conn;

-	PQExpBufferData event_details;
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
 	int			i;
 	bool		slot_reactivated = false;
@@ -541,6 +545,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
 	 */
 	if (PQstatus(local_conn) != CONNECTION_OK)
 	{
+		PQExpBufferData event_details;
+
 		local_conn = NULL;
 		log_warning(_("unable to reconnect to local node"));

@@ -611,49 +617,50 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
 	node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
 	monitored_node->monitoring_state = MS_NORMAL;

-
-	initPQExpBuffer(&event_details);
-
-	appendPQExpBuffer(&event_details,
-					  _("node \"%s\" (ID: %i) has recovered after %i seconds"),
-					  monitored_node->node_name,
-					  monitored_node->node_id,
-					  node_recovery_elapsed);
-
-	log_notice("%s", event_details.data);
-
-
-	/* other node will generate the event */
-	if (monitored_node->node_id == local_node_info.node_id)
 	{
+		PQExpBufferData event_details;
+
+		initPQExpBuffer(&event_details);
+
+		appendPQExpBuffer(&event_details,
+						  _("node \"%s\" (ID: %i) has recovered after %i seconds"),
+						  monitored_node->node_name,
+						  monitored_node->node_id,
+						  node_recovery_elapsed);
+
+		log_notice("%s", event_details.data);
+
+
+		/* other node will generate the event */
+		if (monitored_node->node_id == local_node_info.node_id)
+		{
+			termPQExpBuffer(&event_details);
+			PQfinish(recovered_node_conn);
+
+			return;
+		}
+
+
+		/* generate the event on the currently active node only */
+		if (monitored_node->node_id != local_node_info.node_id)
+		{
+			event_info.conninfo_str = monitored_node->conninfo;
+			event_info.node_name = monitored_node->node_name;
+
+			create_event_notification_extended(local_conn,
+											   &config_file_options,
+											   config_file_options.node_id,
+											   "bdr_recovery",
+											   true,
+											   event_details.data,
+											   &event_info);
+		}
+
 		termPQExpBuffer(&event_details);
-		PQfinish(recovered_node_conn);
-
-		return;
 	}

-
-	/* generate the event on the currently active node only */
-	if (monitored_node->node_id != local_node_info.node_id)
-	{
-		event_info.conninfo_str = monitored_node->conninfo;
-		event_info.node_name = monitored_node->node_name;
-
-		create_event_notification_extended(
-										   local_conn,
-										   &config_file_options,
-										   config_file_options.node_id,
-										   "bdr_recovery",
-										   true,
-										   event_details.data,
-										   &event_info);
-	}
-
-
 	update_node_record_set_active(local_conn, monitored_node->node_id, true);

-	termPQExpBuffer(&event_details);
-
 	PQfinish(recovered_node_conn);

 	return;
--- a/repmgrd-bdr.h
+++ b/repmgrd-bdr.h
@@ -1,6 +1,6 @@
 /*
 * repmgrd-bdr.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
--- a/repmgrd-physical.h
+++ b/repmgrd-physical.h
@@ -1,6 +1,6 @@
 /*
 * repmgrd-physical.h
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -1,7 +1,7 @@
 /*
 * repmgrd.c - Replication manager daemon
 *
- * Copyright (c) 2ndQuadrant, 2010-2018
+ * Copyright (c) 2ndQuadrant, 2010-2019
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -372,12 +372,6 @@ main(int argc, char **argv)
 	/* abort if local node not available at startup */
 	local_conn = establish_db_connection(config_file_options.conninfo, true);

-	/*
-	 * store the server version number - we'll need this to generate
-	 * version-dependent queries etc.
-	 */
-	server_version_num = get_server_version(local_conn, NULL);
-
 	/*
 	 * sanity checks
 	 *
@@ -389,16 +383,57 @@ main(int argc, char **argv)
 	 * repmgr has not been properly configured.
 	 */

+
+	/* warn about any settings which might not be relevant for the current PostgreSQL version  */
+	if (config_file_options.standby_disconnect_on_failover == true && PQserverVersion(local_conn) < 90500)
+	{
+		log_warning(_("\"standby_disconnect_on_failover\" specified, but not available for this PostgreSQL version"));
+		/* TODO: format server version */
+		log_detail(_("available from PostgreSQL 9.5, this PostgreSQL version is %i"), PQserverVersion(local_conn));
+	}
+
 	/* Check "repmgr" the extension is installed */
 	extension_status = get_repmgr_extension_status(local_conn, &extversions);

-	if (extension_status != REPMGR_INSTALLED)
+	if (extension_status == REPMGR_INSTALLED)
+	{
+		/*
+		 * extension is the latest available according to "pg_available_extensions" -
+		 * - does our (major) version match that?
+		 */
+		log_verbose(LOG_DEBUG, "binary version: %i; extension version: %i",
+					REPMGR_VERSION_NUM, extversions.installed_version_num);
+		if ((REPMGR_VERSION_NUM/100) < (extversions.installed_version_num / 100))
+		{
+			log_error(_("this \"repmgr\" version is older than the installed \"repmgr\" extension version"));
+			log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
+					   REPMGR_VERSION,
+					   extversions.installed_version);
+
+			log_hint(_("verify the repmgr installation on this server is updated properly before continuing"));
+			close_connection(&local_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+
+		if ((REPMGR_VERSION_NUM/100) > (extversions.installed_version_num / 100))
+		{
+			log_error(_("this \"repmgr\" version is newer than the installed \"repmgr\" extension version"));
+			log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
+					   REPMGR_VERSION,
+					   extversions.installed_version);
+
+			log_hint(_("verify the repmgr extension is updated properly before continuing"));
+			close_connection(&local_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+	}
+	else
 	{
 		/* this is unlikely to happen */
 		if (extension_status == REPMGR_UNKNOWN)
 		{
 			log_error(_("unable to determine status of \"repmgr\" extension"));
-			log_detail("%s", PQerrorMessage(local_conn));
+			log_detail("\n%s", PQerrorMessage(local_conn));
 			close_connection(&local_conn);
 			exit(ERR_DB_QUERY);
 		}
@@ -406,11 +441,10 @@ main(int argc, char **argv)
 		if (extension_status == REPMGR_OLD_VERSION_INSTALLED)
 		{
 			log_error(_("an older version of the \"repmgr\" extension is installed"));
-			log_detail(_("version %s is installed but newer version %s is available"),
+			log_detail(_("extension version %s is installed but newer version %s is available"),
 					   extversions.installed_version,
 					   extversions.default_version);
 			log_hint(_("verify the repmgr installation is updated properly before continuing"));
-
 		}
 		else
 		{
@@ -527,6 +561,8 @@ start_monitoring(void)
 			   local_node_info.node_name,
 			   local_node_info.node_id);

+	log_info(_("\"connection_check_type\" set to \"%s\""), print_connection_check_type(config_file_options.connection_check_type));
+
 	while (true)
 	{
 		switch (local_node_info.type)
@@ -593,7 +629,8 @@ daemonize_process(void)
 	switch (pid)
 	{
 		case -1:
-			log_error(_("error in fork():\n  %s"), strerror(errno));
+			log_error(_("error in fork()"));
+			log_detail("%s", strerror(errno));
 			exit(ERR_SYS_FAILURE);
 			break;

@@ -602,7 +639,8 @@ daemonize_process(void)
 			pid = setsid();
 			if (pid == (pid_t) -1)
 			{
-				log_error(_("error in setsid():\n  %s"), strerror(errno));
+				log_error(_("error executing setsid()"));
+				log_detail("%s", strerror(errno));
 				exit(ERR_SYS_FAILURE);
 			}

@@ -612,7 +650,8 @@ daemonize_process(void)
 			/* error case */
 			if (pid == -1)
 			{
-				log_error(_("error in fork():\n  %s"), strerror(errno));
+				log_error(_("error executing fork()"));
+				log_detail("%s", strerror(errno));
 				exit(ERR_SYS_FAILURE);
 			}

@@ -790,6 +829,82 @@ show_help(void)
 }


+bool
+check_upstream_connection(PGconn **conn, const char *conninfo)
+{
+	/* Check the connection status twice in case it changes after reset */
+	bool		twice = false;
+
+	if (config_file_options.connection_check_type == CHECK_PING)
+		return is_server_available(conninfo);
+
+	if (config_file_options.connection_check_type == CHECK_CONNECTION)
+	{
+		bool success = true;
+		PGconn *test_conn = PQconnectdb(conninfo);
+
+		log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo);
+
+		if (PQstatus(test_conn) != CONNECTION_OK)
+		{
+			log_warning(_("unable to connect to \"%s\""), conninfo);
+			log_detail("\n%s", PQerrorMessage(test_conn));
+			success = false;
+		}
+		PQfinish(test_conn);
+
+		return success;
+	}
+
+	for (;;)
+	{
+		if (PQstatus(*conn) != CONNECTION_OK)
+		{
+			log_debug("check_upstream_connection(): connection not OK");
+			if (twice)
+				return false;
+			/* reconnect */
+			PQfinish(*conn);
+			*conn = PQconnectdb(conninfo);
+			twice = true;
+		}
+		else
+		{
+			if (!cancel_query(*conn, config_file_options.async_query_timeout))
+				goto failed;
+
+			if (wait_connection_availability(*conn, config_file_options.async_query_timeout) != 1)
+				goto failed;
+
+			/* execute a simple query to verify connection availability */
+			if (PQsendQuery(*conn, "SELECT 1") == 0)
+			{
+				log_warning(_("unable to send query to upstream"));
+				log_detail("%s", PQerrorMessage(*conn));
+				goto failed;
+			}
+
+			if (wait_connection_availability(*conn, config_file_options.async_query_timeout) != 1)
+				goto failed;
+
+			break;
+
+	failed:
+			/* retry once */
+			if (twice)
+				return false;
+
+			/* reconnect */
+			PQfinish(*conn);
+			*conn = PQconnectdb(conninfo);
+			twice = true;
+		}
+	}
+
+	return true;
+}
+
+
 void
 try_reconnect(PGconn **conn, t_node_info *node_info)
 {
@@ -815,8 +930,7 @@ try_reconnect(PGconn **conn, t_node_info *node_info)
 				 node_info->node_id, i + 1, max_attempts);
 		if (is_server_available_params(&conninfo_params) == true)
 		{
-
-			log_notice(_("node has recovered, reconnecting"));
+			log_notice(_("node %i has recovered, reconnecting"), node_info->node_id);

 			/*
 			 * XXX we should also handle the case where node is pingable but
@@ -846,7 +960,7 @@ try_reconnect(PGconn **conn, t_node_info *node_info)

 					if (ping_result != PGRES_TUPLES_OK)
 					{
-						log_info("original connnection no longer available, using new connection");
+						log_info("original connection no longer available, using new connection");
 						close_connection(conn);
 						*conn = our_conn;
 					}
--- a/Show More
+++ b/Show More