mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 15:16:29 +00:00
Compare commits
138 Commits
dev/FS-708
...
REL4_3_STA
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef382dfede | ||
|
|
bc93d2996c | ||
|
|
0946073406 | ||
|
|
129c8782a4 | ||
|
|
5493055a1d | ||
|
|
a50f0e7cc0 | ||
|
|
adfde1b681 | ||
|
|
d4b17635fe | ||
|
|
e4c573a7f6 | ||
|
|
492665e34c | ||
|
|
2d7c38e2ef | ||
|
|
9ee2448583 | ||
|
|
cf9458161f | ||
|
|
67dc42d2ad | ||
|
|
3b96b2afce | ||
|
|
216f274c15 | ||
|
|
8cb101be1d | ||
|
|
03b29908e2 | ||
|
|
99be03f000 | ||
|
|
7aaac343f8 | ||
|
|
68470a9167 | ||
|
|
35320c27bd | ||
|
|
b7b9db7e9c | ||
|
|
01e11950a5 | ||
|
|
fcaee6e6e8 | ||
|
|
538d5f9df0 | ||
|
|
4e8b94c105 | ||
|
|
9ee51bb0cb | ||
|
|
bab07cdda1 | ||
|
|
b03f07ca8f | ||
|
|
39fbe02c48 | ||
|
|
2249b79811 | ||
|
|
bb0fd944ae | ||
|
|
b4ca6851ab | ||
|
|
347948b79f | ||
|
|
83e492d4ef | ||
|
|
1906ea89bd | ||
|
|
eab4fd2795 | ||
|
|
3f1fe9b6c2 | ||
|
|
e672f7e3ee | ||
|
|
fd86160dff | ||
|
|
f19cf62f09 | ||
|
|
8018ba97d6 | ||
|
|
73554c6e16 | ||
|
|
f23a93e12d | ||
|
|
d9947a46e8 | ||
|
|
e3a632e29d | ||
|
|
939cbd0721 | ||
|
|
c45c5abfb8 | ||
|
|
1953ec7459 | ||
|
|
a6eacca6e4 | ||
|
|
948e076ad9 | ||
|
|
a3bd9d33ff | ||
|
|
9dc928a7d5 | ||
|
|
9acf7bdfea | ||
|
|
29acd10f37 | ||
|
|
9df511eee3 | ||
|
|
6441db23ff | ||
|
|
7792de3543 | ||
|
|
94fe3e395e | ||
|
|
ff26173b1e | ||
|
|
4c11a57334 | ||
|
|
1d2d6e3587 | ||
|
|
c03913d32a | ||
|
|
37a41a66f9 | ||
|
|
4c2c8ecbab | ||
|
|
b84b6180ee | ||
|
|
58f55222d9 | ||
|
|
5cbaff8d0a | ||
|
|
a38e229e61 | ||
|
|
272abdd483 | ||
|
|
b4f6043abc | ||
|
|
a7f3f899ff | ||
|
|
3ec43eda36 | ||
|
|
ce8e1cccc4 | ||
|
|
70bfa4c8e1 | ||
|
|
f0d5ad503d | ||
|
|
b9ee57ee0f | ||
|
|
d5d6ed4be7 | ||
|
|
f4655074ae | ||
|
|
67d26ab7e2 | ||
|
|
70a7b45a03 | ||
|
|
4251590833 | ||
|
|
9347d34ce0 | ||
|
|
feb90ee50c | ||
|
|
0a6486bb7f | ||
|
|
39443bbcee | ||
|
|
fc636b1bd2 | ||
|
|
048bad1c88 | ||
|
|
4528eb1796 | ||
|
|
169c9ccd32 | ||
|
|
5f92fbddf2 | ||
|
|
617e466f72 | ||
|
|
435fac297b | ||
|
|
4bc12b4c94 | ||
|
|
91234994e2 | ||
|
|
ee9da30f20 | ||
|
|
2e67bc1341 | ||
|
|
18ab5cab4e | ||
|
|
60bb4e9fc8 | ||
|
|
52bee6b98d | ||
|
|
ecb1f379f5 | ||
|
|
e1cd2c22d4 | ||
|
|
1dea6b76d9 | ||
|
|
702f90fc9d | ||
|
|
c4d1eec6f3 | ||
|
|
b241c606c0 | ||
|
|
45c896d716 | ||
|
|
514595ea10 | ||
|
|
531194fa27 | ||
|
|
2aa67c992c | ||
|
|
37892afcfc | ||
|
|
e4e5e35552 | ||
|
|
b320c1f0ae | ||
|
|
280654bed6 | ||
|
|
ae675059c0 | ||
|
|
454ebabe89 | ||
|
|
d1d6ef8d12 | ||
|
|
5d6eab74f6 | ||
|
|
59b7453bbf | ||
|
|
bde8c7e29c | ||
|
|
bc6584a90d | ||
|
|
074d79b44f | ||
|
|
2eeb288573 | ||
|
|
48a2274b11 | ||
|
|
19bcfa7264 | ||
|
|
486877c3d5 | ||
|
|
9753bcc8c3 | ||
|
|
bd35b450da | ||
|
|
1f256d4d73 | ||
|
|
1524e2449f | ||
|
|
0cd2bd2e91 | ||
|
|
98b78df16c | ||
|
|
b946dce2f0 | ||
|
|
39234afcbf | ||
|
|
23569a19b1 | ||
|
|
c650fd3412 | ||
|
|
c30e65b3f2 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -47,6 +47,9 @@ lib*.pc
|
|||||||
# other
|
# other
|
||||||
/.lineno
|
/.lineno
|
||||||
*.dSYM
|
*.dSYM
|
||||||
|
*.orig
|
||||||
|
*.rej
|
||||||
|
|
||||||
# generated binaries
|
# generated binaries
|
||||||
repmgr
|
repmgr
|
||||||
repmgrd
|
repmgrd
|
||||||
|
|||||||
31
HISTORY
31
HISTORY
@@ -1,4 +1,8 @@
|
|||||||
4.3 2019-??
|
4.3.1 2019-12-??
|
||||||
|
repmgr: ensure an existing replication slot is not deleted if the
|
||||||
|
follow target is the node's current upstream (Ian)
|
||||||
|
|
||||||
|
4.3 2019-04-02
|
||||||
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
|
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
|
||||||
repmgr: add --version-number command line option (Ian)
|
repmgr: add --version-number command line option (Ian)
|
||||||
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
|
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
|
||||||
@@ -12,28 +16,21 @@
|
|||||||
data directory on the demotion candidate; GitHub #523 (Ian)
|
data directory on the demotion candidate; GitHub #523 (Ian)
|
||||||
repmgr: ensure "standby switchover" verifies replication connection
|
repmgr: ensure "standby switchover" verifies replication connection
|
||||||
exists; GitHub #519 (Ian)
|
exists; GitHub #519 (Ian)
|
||||||
repmgr: ensure "primary unregister" behaves correctly when executed
|
repmgr: add sanity check for correct extension version (Ian)
|
||||||
on a witness server; GitHub #548 (Ian)
|
repmgr: ensure "witness register --dry-run" does not attempt to read node
|
||||||
repmgr: when executing "standby follow" and "node rejoin", check that
|
tables if repmgr extension not installed; GitHub #513 (Ian)
|
||||||
it will actually be possible to stream from the target node (Ian)
|
repmgr: ensure "standby register" fails when --upstream-node-id is the
|
||||||
repmgr: "standby switchover": improve handling of connection URIs when
|
same as the local node ID (Ian)
|
||||||
executing "node rejoin" on the demotion candidate; GitHub #525 (Ian)
|
|
||||||
repmgr: fix long node ID display in "cluster show" (Ian)
|
|
||||||
repmgr: check for primary server before executing "witness register";
|
|
||||||
GitHub #538 (Ian)
|
|
||||||
repmgr: show "upstream last seen" interval in "daemon status" output (Ian)
|
|
||||||
repmgr: "node check" will only consider physical replication slots (Ian)
|
|
||||||
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
||||||
repmgrd: on a cascaded standby, don't fail over if "failover=manual";
|
repmgrd: on a cascaded standby, don't fail over if "failover=manual";
|
||||||
GitHub #531 (Ian)
|
GitHub #531 (Ian)
|
||||||
repmgrd: don't consider nodes where repmgrd is not running as promotion
|
repmgrd: don't consider nodes where repmgrd is not running as promotion
|
||||||
candidates (Ian)
|
candidates (Ian)
|
||||||
|
repmgrd: add option "connection_check_type" (Ian)
|
||||||
4.2.1 2018-??-??
|
|
||||||
repmgr: add sanity check for correct extension version (Ian)
|
|
||||||
repmgr: ensure "witness register --dry-run" does not attempt to read node
|
|
||||||
tables if repmgr extension not installed; GitHub #513 (Ian)
|
|
||||||
repmgrd: improve witness monitoring when primary node not available (Ian)
|
repmgrd: improve witness monitoring when primary node not available (Ian)
|
||||||
|
repmgrd: handle situation where a primary has unexpectedly appeared
|
||||||
|
during failover; GitHub #420 (Ian)
|
||||||
|
general: fix Makefile (John)
|
||||||
|
|
||||||
4.2 2018-10-24
|
4.2 2018-10-24
|
||||||
repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover";
|
repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover";
|
||||||
|
|||||||
30
Makefile.in
30
Makefile.in
@@ -50,8 +50,8 @@ $(info Building against PostgreSQL $(MAJORVERSION))
|
|||||||
REPMGR_CLIENT_OBJS = repmgr-client.o \
|
REPMGR_CLIENT_OBJS = repmgr-client.o \
|
||||||
repmgr-action-primary.o repmgr-action-standby.o repmgr-action-witness.o \
|
repmgr-action-primary.o repmgr-action-standby.o repmgr-action-witness.o \
|
||||||
repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o repmgr-action-daemon.o \
|
repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o repmgr-action-daemon.o \
|
||||||
configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o
|
configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o sysutils.o
|
||||||
REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o
|
REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o sysutils.o
|
||||||
DATE=$(shell date "+%Y-%m-%d")
|
DATE=$(shell date "+%Y-%m-%d")
|
||||||
|
|
||||||
repmgr_version.h: repmgr_version.h.in
|
repmgr_version.h: repmgr_version.h.in
|
||||||
@@ -86,29 +86,15 @@ clean: additional-clean
|
|||||||
maintainer-clean: additional-maintainer-clean
|
maintainer-clean: additional-maintainer-clean
|
||||||
|
|
||||||
additional-clean:
|
additional-clean:
|
||||||
rm -f repmgr-client.o
|
rm -f *.o
|
||||||
rm -f repmgr-action-primary.o
|
|
||||||
rm -f repmgr-action-standby.o
|
|
||||||
rm -f repmgr-action-witness.o
|
|
||||||
rm -f repmgr-action-bdr.o
|
|
||||||
rm -f repmgr-action-node.o
|
|
||||||
rm -f repmgr-action-cluster.o
|
|
||||||
rm -f repmgr-action-daemon.o
|
|
||||||
rm -f repmgrd.o
|
|
||||||
rm -f repmgrd-physical.o
|
|
||||||
rm -f repmgrd-bdr.o
|
|
||||||
rm -f compat.o
|
|
||||||
rm -f configfile.o
|
|
||||||
rm -f controldata.o
|
|
||||||
rm -f dbutils.o
|
|
||||||
rm -f dirutil.o
|
|
||||||
rm -f log.o
|
|
||||||
rm -f strutil.o
|
|
||||||
|
|
||||||
maintainer-additional-clean: clean
|
additional-maintainer-clean: clean
|
||||||
rm -f configure
|
$(MAKE) -C doc maintainer-clean
|
||||||
rm -f config.status config.log
|
rm -f config.status config.log
|
||||||
|
rm -f config.h
|
||||||
|
rm -f repmgr_version.h
|
||||||
rm -f Makefile
|
rm -f Makefile
|
||||||
|
rm -f Makefile.global
|
||||||
@rm -rf autom4te.cache/
|
@rm -rf autom4te.cache/
|
||||||
|
|
||||||
ifeq ($(MAJORVERSION),$(filter $(MAJORVERSION),9.3 9.4))
|
ifeq ($(MAJORVERSION),$(filter $(MAJORVERSION),9.3 9.4))
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ Documentation
|
|||||||
|
|
||||||
The main `repmgr` documentation is available here:
|
The main `repmgr` documentation is available here:
|
||||||
|
|
||||||
> [repmgr 4 documentation](https://repmgr.org/docs/4.2/index.html)
|
> [repmgr documentation](https://repmgr.org/docs/current/index.html)
|
||||||
|
|
||||||
The `README` file for `repmgr` 3.x is available here:
|
The `README` file for `repmgr` 3.x is available here:
|
||||||
|
|
||||||
@@ -72,7 +72,7 @@ Please report bugs and other issues to:
|
|||||||
|
|
||||||
* https://github.com/2ndQuadrant/repmgr
|
* https://github.com/2ndQuadrant/repmgr
|
||||||
|
|
||||||
Further information is available at https://www.repmgr.org/
|
Further information is available at https://repmgr.org/
|
||||||
|
|
||||||
We'd love to hear from you about how you use repmgr. Case studies and
|
We'd love to hear from you about how you use repmgr. Case studies and
|
||||||
news are always welcome. Send us an email at info@2ndQuadrant.com, or
|
news are always welcome. Send us an email at info@2ndQuadrant.com, or
|
||||||
@@ -97,6 +97,7 @@ Thanks from the repmgr core team.
|
|||||||
Further reading
|
Further reading
|
||||||
---------------
|
---------------
|
||||||
|
|
||||||
|
* [repmgr documentation](https://repmgr.org/docs/current/index.html)
|
||||||
* https://blog.2ndquadrant.com/repmgr-3-2-is-here-barman-support-brand-new-high-availability-features/
|
* https://blog.2ndquadrant.com/repmgr-3-2-is-here-barman-support-brand-new-high-availability-features/
|
||||||
* https://blog.2ndquadrant.com/improvements-in-repmgr-3-1-4/
|
* https://blog.2ndquadrant.com/improvements-in-repmgr-3-1-4/
|
||||||
* https://blog.2ndquadrant.com/managing-useful-clusters-repmgr/
|
* https://blog.2ndquadrant.com/managing-useful-clusters-repmgr/
|
||||||
|
|||||||
147
configfile.c
147
configfile.c
@@ -358,6 +358,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||||
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
|
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
|
||||||
memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
|
memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
|
||||||
|
options->standby_disconnect_on_failover = false;
|
||||||
|
options->sibling_nodes_disconnect_timeout = DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT;
|
||||||
|
options->connection_check_type = CHECK_PING;
|
||||||
|
options->primary_visibility_consensus = false;
|
||||||
|
memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
|
||||||
|
options->election_rerun_interval = DEFAULT_ELECTION_RERUN_INTERVAL;
|
||||||
|
|
||||||
/*-------------
|
/*-------------
|
||||||
* witness settings
|
* witness settings
|
||||||
@@ -478,7 +484,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
node_id_found = true;
|
node_id_found = true;
|
||||||
}
|
}
|
||||||
else if (strcmp(name, "node_name") == 0)
|
else if (strcmp(name, "node_name") == 0)
|
||||||
strncpy(options->node_name, value, MAXLEN);
|
{
|
||||||
|
if (strlen(value) < sizeof(options->node_name))
|
||||||
|
strncpy(options->node_name, value, sizeof(options->node_name));
|
||||||
|
else
|
||||||
|
item_list_append_format(error_list,
|
||||||
|
_("value for \"node_name\" must contain fewer than %lu characters"),
|
||||||
|
sizeof(options->node_name));
|
||||||
|
}
|
||||||
else if (strcmp(name, "conninfo") == 0)
|
else if (strcmp(name, "conninfo") == 0)
|
||||||
strncpy(options->conninfo, value, MAXLEN);
|
strncpy(options->conninfo, value, MAXLEN);
|
||||||
else if (strcmp(name, "data_directory") == 0)
|
else if (strcmp(name, "data_directory") == 0)
|
||||||
@@ -488,11 +501,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
|
|
||||||
else if (strcmp(name, "replication_user") == 0)
|
else if (strcmp(name, "replication_user") == 0)
|
||||||
{
|
{
|
||||||
if (strlen(value) < NAMEDATALEN)
|
if (strlen(value) < sizeof(options->replication_user))
|
||||||
strncpy(options->replication_user, value, NAMEDATALEN);
|
strncpy(options->replication_user, value, sizeof(options->replication_user));
|
||||||
else
|
else
|
||||||
item_list_append(error_list,
|
item_list_append_format(error_list,
|
||||||
_("value for \"replication_user\" must contain fewer than " STR(NAMEDATALEN) " characters"));
|
_("value for \"replication_user\" must contain fewer than %lu characters"),
|
||||||
|
sizeof(options->replication_user));
|
||||||
}
|
}
|
||||||
else if (strcmp(name, "pg_bindir") == 0)
|
else if (strcmp(name, "pg_bindir") == 0)
|
||||||
strncpy(options->pg_bindir, value, MAXPGPATH);
|
strncpy(options->pg_bindir, value, MAXPGPATH);
|
||||||
@@ -618,6 +632,36 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
|
options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
else if (strcmp(name, "repmgrd_pid_file") == 0)
|
else if (strcmp(name, "repmgrd_pid_file") == 0)
|
||||||
strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
|
strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
|
||||||
|
else if (strcmp(name, "standby_disconnect_on_failover") == 0)
|
||||||
|
options->standby_disconnect_on_failover = parse_bool(value, name, error_list);
|
||||||
|
else if (strcmp(name, "sibling_nodes_disconnect_timeout") == 0)
|
||||||
|
options->sibling_nodes_disconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
else if (strcmp(name, "connection_check_type") == 0)
|
||||||
|
{
|
||||||
|
if (strcasecmp(value, "ping") == 0)
|
||||||
|
{
|
||||||
|
options->connection_check_type = CHECK_PING;
|
||||||
|
}
|
||||||
|
else if (strcasecmp(value, "connection") == 0)
|
||||||
|
{
|
||||||
|
options->connection_check_type = CHECK_CONNECTION;
|
||||||
|
}
|
||||||
|
else if (strcasecmp(value, "query") == 0)
|
||||||
|
{
|
||||||
|
options->connection_check_type = CHECK_QUERY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
item_list_append(error_list,
|
||||||
|
_("value for \"connection_check_type\" must be \"ping\", \"connection\" or \"query\"\n"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (strcmp(name, "primary_visibility_consensus") == 0)
|
||||||
|
options->primary_visibility_consensus = parse_bool(value, name, error_list);
|
||||||
|
else if (strcmp(name, "failover_validation_command") == 0)
|
||||||
|
strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
|
||||||
|
else if (strcmp(name, "election_rerun_interval") == 0)
|
||||||
|
options->election_rerun_interval = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* witness settings */
|
/* witness settings */
|
||||||
else if (strcmp(name, "witness_sync_interval") == 0)
|
else if (strcmp(name, "witness_sync_interval") == 0)
|
||||||
@@ -792,15 +836,16 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
|
conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
|
||||||
if (conninfo_options == NULL)
|
if (conninfo_options == NULL)
|
||||||
{
|
{
|
||||||
char error_message_buf[MAXLEN] = "";
|
PQExpBufferData error_message_buf;
|
||||||
|
initPQExpBuffer(&error_message_buf);
|
||||||
|
|
||||||
snprintf(error_message_buf,
|
appendPQExpBuffer(&error_message_buf,
|
||||||
MAXLEN,
|
|
||||||
_("\"conninfo\": %s (provided: \"%s\")"),
|
_("\"conninfo\": %s (provided: \"%s\")"),
|
||||||
conninfo_errmsg,
|
conninfo_errmsg,
|
||||||
options->conninfo);
|
options->conninfo);
|
||||||
|
|
||||||
item_list_append(error_list, error_message_buf);
|
item_list_append(error_list, error_message_buf.data);
|
||||||
|
termPQExpBuffer(&error_message_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
PQconninfoFree(conninfo_options);
|
PQconninfoFree(conninfo_options);
|
||||||
@@ -1049,15 +1094,19 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
|||||||
* loop is started up; it therefore only needs to reload options required
|
* loop is started up; it therefore only needs to reload options required
|
||||||
* by repmgrd, which are as follows:
|
* by repmgrd, which are as follows:
|
||||||
*
|
*
|
||||||
* changeable options:
|
* changeable options (keep the list in "doc/repmgrd-configuration.sgml" in sync
|
||||||
|
* with these):
|
||||||
|
*
|
||||||
* - async_query_timeout
|
* - async_query_timeout
|
||||||
* - bdr_local_monitoring_only
|
* - bdr_local_monitoring_only
|
||||||
* - bdr_recovery_timeout
|
* - bdr_recovery_timeout
|
||||||
|
* - connection_check_type
|
||||||
* - conninfo
|
* - conninfo
|
||||||
* - degraded_monitoring_timeout
|
* - degraded_monitoring_timeout
|
||||||
* - event_notification_command
|
* - event_notification_command
|
||||||
* - event_notifications
|
* - event_notifications
|
||||||
* - failover
|
* - failover
|
||||||
|
* - failover_validation_command
|
||||||
* - follow_command
|
* - follow_command
|
||||||
* - log_facility
|
* - log_facility
|
||||||
* - log_file
|
* - log_file
|
||||||
@@ -1065,12 +1114,19 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
|||||||
* - log_status_interval
|
* - log_status_interval
|
||||||
* - monitor_interval_secs
|
* - monitor_interval_secs
|
||||||
* - monitoring_history
|
* - monitoring_history
|
||||||
|
* - primary_notification_timeout
|
||||||
|
* - primary_visibility_consensus
|
||||||
* - promote_command
|
* - promote_command
|
||||||
* - promote_delay
|
|
||||||
* - reconnect_attempts
|
* - reconnect_attempts
|
||||||
* - reconnect_interval
|
* - reconnect_interval
|
||||||
* - repmgrd_standby_startup_timeout
|
* - repmgrd_standby_startup_timeout
|
||||||
* - retry_promote_interval_secs
|
* - retry_promote_interval_secs
|
||||||
|
* - sibling_nodes_disconnect_timeout
|
||||||
|
* - standby_disconnect_on_failover
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Not publicly documented:
|
||||||
|
* - promote_delay
|
||||||
*
|
*
|
||||||
* non-changeable options (repmgrd references these from the "repmgr.nodes"
|
* non-changeable options (repmgrd references these from the "repmgr.nodes"
|
||||||
* table, not the configuration file)
|
* table, not the configuration file)
|
||||||
@@ -1149,13 +1205,12 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strncmp(new_options.node_name, orig_options->node_name, MAXLEN) != 0)
|
if (strncmp(new_options.node_name, orig_options->node_name, sizeof(orig_options->node_name)) != 0)
|
||||||
{
|
{
|
||||||
log_warning(_("\"node_name\" cannot be changed, keeping current configuration"));
|
log_warning(_("\"node_name\" cannot be changed, keeping current configuration"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* No configuration problems detected - copy any changed values
|
* No configuration problems detected - copy any changed values
|
||||||
*
|
*
|
||||||
@@ -1205,8 +1260,8 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
|||||||
{
|
{
|
||||||
strncpy(orig_options->conninfo, new_options.conninfo, MAXLEN);
|
strncpy(orig_options->conninfo, new_options.conninfo, MAXLEN);
|
||||||
log_info(_("\"conninfo\" is now \"%s\""), new_options.conninfo);
|
log_info(_("\"conninfo\" is now \"%s\""), new_options.conninfo);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1284,7 +1339,6 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
|||||||
config_changed = true;
|
config_changed = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* promote_command */
|
/* promote_command */
|
||||||
if (strncmp(orig_options->promote_command, new_options.promote_command, MAXLEN) != 0)
|
if (strncmp(orig_options->promote_command, new_options.promote_command, MAXLEN) != 0)
|
||||||
{
|
{
|
||||||
@@ -1330,6 +1384,51 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
|||||||
config_changed = true;
|
config_changed = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* standby_disconnect_on_failover */
|
||||||
|
if (orig_options->standby_disconnect_on_failover != new_options.standby_disconnect_on_failover)
|
||||||
|
{
|
||||||
|
orig_options->standby_disconnect_on_failover = new_options.standby_disconnect_on_failover;
|
||||||
|
log_info(_("\"standby_disconnect_on_failover\" is now \"%s\""),
|
||||||
|
new_options.standby_disconnect_on_failover == true ? "TRUE" : "FALSE");
|
||||||
|
config_changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* sibling_nodes_disconnect_timeout */
|
||||||
|
if (orig_options->sibling_nodes_disconnect_timeout != new_options.sibling_nodes_disconnect_timeout)
|
||||||
|
{
|
||||||
|
orig_options->sibling_nodes_disconnect_timeout = new_options.sibling_nodes_disconnect_timeout;
|
||||||
|
log_info(_("\"sibling_nodes_disconnect_timeout\" is now \"%i\""),
|
||||||
|
new_options.sibling_nodes_disconnect_timeout);
|
||||||
|
config_changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* connection_check_type */
|
||||||
|
if (orig_options->connection_check_type != new_options.connection_check_type)
|
||||||
|
{
|
||||||
|
orig_options->connection_check_type = new_options.connection_check_type;
|
||||||
|
log_info(_("\"connection_check_type\" is now \"%s\""),
|
||||||
|
print_connection_check_type(new_options.connection_check_type));
|
||||||
|
config_changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* primary_visibility_consensus */
|
||||||
|
if (orig_options->primary_visibility_consensus != new_options.primary_visibility_consensus)
|
||||||
|
{
|
||||||
|
orig_options->primary_visibility_consensus = new_options.primary_visibility_consensus;
|
||||||
|
log_info(_("\"primary_visibility_consensus\" is now \"%s\""),
|
||||||
|
new_options.primary_visibility_consensus == true ? "TRUE" : "FALSE");
|
||||||
|
config_changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* failover_validation_command */
|
||||||
|
if (strncmp(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH) != 0)
|
||||||
|
{
|
||||||
|
strncpy(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH);
|
||||||
|
log_info(_("\"failover_validation_command\" is now \"%s\""), new_options.failover_validation_command);
|
||||||
|
|
||||||
|
config_changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Handle changes to logging configuration
|
* Handle changes to logging configuration
|
||||||
*/
|
*/
|
||||||
@@ -1927,3 +2026,21 @@ parse_pg_basebackup_options(const char *pg_basebackup_options, t_basebackup_opti
|
|||||||
|
|
||||||
return backup_options_ok;
|
return backup_options_ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const char *
|
||||||
|
print_connection_check_type(ConnectionCheckType type)
|
||||||
|
{
|
||||||
|
switch (type)
|
||||||
|
{
|
||||||
|
case CHECK_PING:
|
||||||
|
return "ping";
|
||||||
|
case CHECK_QUERY:
|
||||||
|
return "query";
|
||||||
|
case CHECK_CONNECTION:
|
||||||
|
return "connection";
|
||||||
|
}
|
||||||
|
|
||||||
|
/* should never reach here */
|
||||||
|
return "UNKNOWN";
|
||||||
|
}
|
||||||
|
|||||||
19
configfile.h
19
configfile.h
@@ -37,6 +37,13 @@ typedef enum
|
|||||||
FAILOVER_AUTOMATIC
|
FAILOVER_AUTOMATIC
|
||||||
} failover_mode_opt;
|
} failover_mode_opt;
|
||||||
|
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
CHECK_PING,
|
||||||
|
CHECK_QUERY,
|
||||||
|
CHECK_CONNECTION
|
||||||
|
} ConnectionCheckType;
|
||||||
|
|
||||||
typedef struct EventNotificationListCell
|
typedef struct EventNotificationListCell
|
||||||
{
|
{
|
||||||
struct EventNotificationListCell *next;
|
struct EventNotificationListCell *next;
|
||||||
@@ -69,7 +76,7 @@ typedef struct
|
|||||||
{
|
{
|
||||||
/* node information */
|
/* node information */
|
||||||
int node_id;
|
int node_id;
|
||||||
char node_name[MAXLEN];
|
char node_name[NAMEDATALEN];
|
||||||
char conninfo[MAXLEN];
|
char conninfo[MAXLEN];
|
||||||
char replication_user[NAMEDATALEN];
|
char replication_user[NAMEDATALEN];
|
||||||
char data_directory[MAXPGPATH];
|
char data_directory[MAXPGPATH];
|
||||||
@@ -135,6 +142,12 @@ typedef struct
|
|||||||
int primary_notification_timeout;
|
int primary_notification_timeout;
|
||||||
int repmgrd_standby_startup_timeout;
|
int repmgrd_standby_startup_timeout;
|
||||||
char repmgrd_pid_file[MAXPGPATH];
|
char repmgrd_pid_file[MAXPGPATH];
|
||||||
|
bool standby_disconnect_on_failover;
|
||||||
|
int sibling_nodes_disconnect_timeout;
|
||||||
|
ConnectionCheckType connection_check_type;
|
||||||
|
bool primary_visibility_consensus;
|
||||||
|
char failover_validation_command[MAXPGPATH];
|
||||||
|
int election_rerun_interval;
|
||||||
|
|
||||||
/* BDR settings */
|
/* BDR settings */
|
||||||
bool bdr_local_monitoring_only;
|
bool bdr_local_monitoring_only;
|
||||||
@@ -206,7 +219,8 @@ typedef struct
|
|||||||
false, -1, \
|
false, -1, \
|
||||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||||
-1, "", \
|
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, \
|
||||||
|
CHECK_PING, true, "", DEFAULT_ELECTION_RERUN_INTERVAL, \
|
||||||
/* BDR settings */ \
|
/* BDR settings */ \
|
||||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||||
/* service settings */ \
|
/* service settings */ \
|
||||||
@@ -315,5 +329,6 @@ void free_parsed_argv(char ***argv_array);
|
|||||||
/* called by repmgr-client and repmgrd */
|
/* called by repmgr-client and repmgrd */
|
||||||
void exit_with_cli_errors(ItemList *error_list, const char *repmgr_command);
|
void exit_with_cli_errors(ItemList *error_list, const char *repmgr_command);
|
||||||
void print_item_list(ItemList *item_list);
|
void print_item_list(ItemList *item_list);
|
||||||
|
const char *print_connection_check_type(ConnectionCheckType type);
|
||||||
|
|
||||||
#endif /* _REPMGR_CONFIGFILE_H_ */
|
#endif /* _REPMGR_CONFIGFILE_H_ */
|
||||||
|
|||||||
@@ -301,6 +301,8 @@ get_controlfile(const char *DataDir)
|
|||||||
ControlFilePath);
|
ControlFilePath);
|
||||||
log_detail("%s", strerror(errno));
|
log_detail("%s", strerror(errno));
|
||||||
|
|
||||||
|
close(fd);
|
||||||
|
|
||||||
return control_file_info;
|
return control_file_info;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
386
dbutils.c
386
dbutils.c
@@ -43,6 +43,8 @@ int bdr_version_num = UNKNOWN_BDR_VERSION_NUM;
|
|||||||
static void log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
|
static void log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
|
||||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||||
|
|
||||||
|
static bool _is_server_available(const char *conninfo, bool quiet);
|
||||||
|
|
||||||
static PGconn *_establish_db_connection(const char *conninfo,
|
static PGconn *_establish_db_connection(const char *conninfo,
|
||||||
const bool exit_on_error,
|
const bool exit_on_error,
|
||||||
const bool log_notice,
|
const bool log_notice,
|
||||||
@@ -67,16 +69,19 @@ void
|
|||||||
log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
|
log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
|
||||||
{
|
{
|
||||||
va_list ap;
|
va_list ap;
|
||||||
|
char buf[MAXLEN];
|
||||||
|
int retval;
|
||||||
|
|
||||||
va_start(ap, fmt);
|
va_start(ap, fmt);
|
||||||
|
retval = vsnprintf(buf, MAXLEN, fmt, ap);
|
||||||
log_error(fmt, ap);
|
|
||||||
|
|
||||||
va_end(ap);
|
va_end(ap);
|
||||||
|
|
||||||
if (conn != NULL && PQstatus(conn) == CONNECTION_OK)
|
if (retval < MAXLEN)
|
||||||
|
log_error("%s", buf);
|
||||||
|
|
||||||
|
if (conn != NULL)
|
||||||
{
|
{
|
||||||
log_detail("%s", PQerrorMessage(conn));
|
log_detail("\n%s", PQerrorMessage(conn));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (query_text != NULL)
|
if (query_text != NULL)
|
||||||
@@ -190,13 +195,13 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
|
|||||||
{
|
{
|
||||||
if (log_notice)
|
if (log_notice)
|
||||||
{
|
{
|
||||||
log_notice(_("connection to database failed:\n %s"),
|
log_notice(_("connection to database failed"));
|
||||||
PQerrorMessage(conn));
|
log_detail("\n%s", PQerrorMessage(conn));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
log_error(_("connection to database failed:\n %s"),
|
log_error(_("connection to database failed"));
|
||||||
PQerrorMessage(conn));
|
log_detail("\n%s", PQerrorMessage(conn));
|
||||||
}
|
}
|
||||||
log_detail(_("attempted to connect using:\n %s"),
|
log_detail(_("attempted to connect using:\n %s"),
|
||||||
connection_string);
|
connection_string);
|
||||||
@@ -287,8 +292,9 @@ establish_db_connection_by_params(t_conninfo_param_list *param_list,
|
|||||||
/* Check to see that the backend connection was successfully made */
|
/* Check to see that the backend connection was successfully made */
|
||||||
if ((PQstatus(conn) != CONNECTION_OK))
|
if ((PQstatus(conn) != CONNECTION_OK))
|
||||||
{
|
{
|
||||||
log_error(_("connection to database failed:\n %s"),
|
log_error(_("connection to database failed"));
|
||||||
PQerrorMessage(conn));
|
log_detail("\n%s", PQerrorMessage(conn));
|
||||||
|
|
||||||
if (exit_on_error)
|
if (exit_on_error)
|
||||||
{
|
{
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
@@ -338,7 +344,9 @@ is_superuser_connection(PGconn *conn, t_connection_user *userinfo)
|
|||||||
|
|
||||||
if (userinfo != NULL)
|
if (userinfo != NULL)
|
||||||
{
|
{
|
||||||
strncpy(userinfo->username, current_user, MAXLEN);
|
snprintf(userinfo->username,
|
||||||
|
sizeof(userinfo->username),
|
||||||
|
"%s", current_user);
|
||||||
userinfo->is_superuser = is_superuser;
|
userinfo->is_superuser = is_superuser;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -821,8 +829,8 @@ begin_transaction(PGconn *conn)
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||||
{
|
{
|
||||||
log_error(_("unable to begin transaction:\n %s"),
|
log_error(_("unable to begin transaction"));
|
||||||
PQerrorMessage(conn));
|
log_detail("%s", PQerrorMessage(conn));
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return false;
|
return false;
|
||||||
@@ -845,8 +853,8 @@ commit_transaction(PGconn *conn)
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||||
{
|
{
|
||||||
log_error(_("unable to commit transaction:\n %s"),
|
log_error(_("unable to commit transaction"));
|
||||||
PQerrorMessage(conn));
|
log_detail("%s", PQerrorMessage(conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
@@ -869,8 +877,8 @@ rollback_transaction(PGconn *conn)
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||||
{
|
{
|
||||||
log_error(_("unable to rollback transaction:\n %s"),
|
log_error(_("unable to rollback transaction"));
|
||||||
PQerrorMessage(conn));
|
log_detail("%s", PQerrorMessage(conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
@@ -1073,13 +1081,13 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
|||||||
{
|
{
|
||||||
if (strcmp(PQgetvalue(res, i, 0), setting) == 0)
|
if (strcmp(PQgetvalue(res, i, 0), setting) == 0)
|
||||||
{
|
{
|
||||||
strncpy(output, PQgetvalue(res, i, 1), MAXLEN);
|
snprintf(output, MAXLEN, "%s", PQgetvalue(res, i, 1));
|
||||||
success = true;
|
success = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* XXX highly unlikely this would ever happen */
|
/* highly unlikely this would ever happen */
|
||||||
log_error(_("get_pg_setting(): unknown parameter \"%s\""), PQgetvalue(res, i, 0));
|
log_error(_("get_pg_setting(): unknown parameter \"%s\""), PQgetvalue(res, i, 0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1096,6 +1104,55 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
alter_system_int(PGconn *conn, const char *name, int value)
|
||||||
|
{
|
||||||
|
PQExpBufferData query;
|
||||||
|
PGresult *res = NULL;
|
||||||
|
bool success = true;
|
||||||
|
|
||||||
|
initPQExpBuffer(&query);
|
||||||
|
appendPQExpBuffer(&query,
|
||||||
|
"ALTER SYSTEM SET %s = %i",
|
||||||
|
name, value);
|
||||||
|
|
||||||
|
res = PQexec(conn, query.data);
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||||
|
{
|
||||||
|
log_db_error(conn, query.data, _("alter_system_int() - unable to execute query"));
|
||||||
|
|
||||||
|
success = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&query);
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
pg_reload_conf(PGconn *conn)
|
||||||
|
{
|
||||||
|
PGresult *res = NULL;
|
||||||
|
bool success = false;
|
||||||
|
|
||||||
|
res = PQexec(conn, "SELECT pg_catalog.pg_reload_conf()");
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_db_error(conn, NULL, _("pg_reload_conf() - unable to execute query"));
|
||||||
|
|
||||||
|
success = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* ============================ */
|
/* ============================ */
|
||||||
/* Server information functions */
|
/* Server information functions */
|
||||||
/* ============================ */
|
/* ============================ */
|
||||||
@@ -1124,7 +1181,7 @@ get_cluster_size(PGconn *conn, char *size)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
strncpy(size, PQgetvalue(res, 0, 0), MAXLEN);
|
snprintf(size, MAXLEN, "%s", PQgetvalue(res, 0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
@@ -1172,7 +1229,7 @@ get_server_version(PGconn *conn, char *server_version_buf)
|
|||||||
* first space.
|
* first space.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
strncpy(_server_version_buf, PQgetvalue(res, 0, 1), MAXVERSIONSTR);
|
snprintf(_server_version_buf, MAXVERSIONSTR, "%s", PQgetvalue(res, 0, 1));
|
||||||
|
|
||||||
for (i = 0; i < MAXVERSIONSTR; i++)
|
for (i = 0; i < MAXVERSIONSTR; i++)
|
||||||
{
|
{
|
||||||
@@ -1299,7 +1356,8 @@ _get_primary_connection(PGconn *conn,
|
|||||||
|
|
||||||
/* initialize with the values of the current node being processed */
|
/* initialize with the values of the current node being processed */
|
||||||
node_id = atoi(PQgetvalue(res, i, 0));
|
node_id = atoi(PQgetvalue(res, i, 0));
|
||||||
strncpy(remote_conninfo, PQgetvalue(res, i, 1), MAXCONNINFO);
|
snprintf(remote_conninfo, MAXCONNINFO, "%s", PQgetvalue(res, i, 1));
|
||||||
|
|
||||||
log_verbose(LOG_INFO,
|
log_verbose(LOG_INFO,
|
||||||
_("checking if node %i is primary"),
|
_("checking if node %i is primary"),
|
||||||
node_id);
|
node_id);
|
||||||
@@ -1463,10 +1521,10 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
|
|||||||
while ((arcdir_ent = readdir(arcdir)) != NULL)
|
while ((arcdir_ent = readdir(arcdir)) != NULL)
|
||||||
{
|
{
|
||||||
struct stat statbuf;
|
struct stat statbuf;
|
||||||
char file_path[MAXPGPATH] = "";
|
char file_path[MAXPGPATH + sizeof(arcdir_ent->d_name)];
|
||||||
int basenamelen = 0;
|
int basenamelen = 0;
|
||||||
|
|
||||||
snprintf(file_path, MAXPGPATH,
|
snprintf(file_path, sizeof(file_path),
|
||||||
"%s/%s",
|
"%s/%s",
|
||||||
archive_status_dir,
|
archive_status_dir,
|
||||||
arcdir_ent->d_name);
|
arcdir_ent->d_name);
|
||||||
@@ -1503,6 +1561,8 @@ identify_system(PGconn *repl_conn, t_system_identification *identification)
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res))
|
if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res))
|
||||||
{
|
{
|
||||||
|
log_db_error(repl_conn, NULL, _("unable to execute IDENTIFY_SYSTEM"));
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -1621,6 +1681,7 @@ repmgrd_set_local_node_id(PGconn *conn, int local_node_id)
|
|||||||
{
|
{
|
||||||
PQExpBufferData query;
|
PQExpBufferData query;
|
||||||
PGresult *res = NULL;
|
PGresult *res = NULL;
|
||||||
|
bool success = true;
|
||||||
|
|
||||||
initPQExpBuffer(&query);
|
initPQExpBuffer(&query);
|
||||||
|
|
||||||
@@ -1629,16 +1690,18 @@ repmgrd_set_local_node_id(PGconn *conn, int local_node_id)
|
|||||||
local_node_id);
|
local_node_id);
|
||||||
|
|
||||||
res = PQexec(conn, query.data);
|
res = PQexec(conn, query.data);
|
||||||
termPQExpBuffer(&query);
|
|
||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
PQclear(res);
|
log_db_error(conn, query.data, _("repmgrd_set_local_node_id(): unable to execute query"));
|
||||||
return false;
|
|
||||||
|
success = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&query);
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return true;
|
|
||||||
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1854,6 +1917,29 @@ repmgrd_pause(PGconn *conn, bool pause)
|
|||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pid_t
|
||||||
|
get_wal_receiver_pid(PGconn *conn)
|
||||||
|
{
|
||||||
|
PGresult *res = NULL;
|
||||||
|
pid_t wal_receiver_pid = UNKNOWN_PID;
|
||||||
|
|
||||||
|
res = PQexec(conn, "SELECT repmgr.get_wal_receiver_pid()");
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_error(_("unable to execute \"SELECT repmgr.get_wal_receiver_pid()\""));
|
||||||
|
log_detail("%s", PQerrorMessage(conn));
|
||||||
|
}
|
||||||
|
else if (!PQgetisnull(res, 0, 0))
|
||||||
|
{
|
||||||
|
wal_receiver_pid = atoi(PQgetvalue(res, 0, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
return wal_receiver_pid;
|
||||||
|
}
|
||||||
|
|
||||||
/* ================ */
|
/* ================ */
|
||||||
/* result functions */
|
/* result functions */
|
||||||
/* ================ */
|
/* ================ */
|
||||||
@@ -1916,9 +2002,13 @@ get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions)
|
|||||||
/* caller wants to know which versions are installed/available */
|
/* caller wants to know which versions are installed/available */
|
||||||
if (extversions != NULL)
|
if (extversions != NULL)
|
||||||
{
|
{
|
||||||
strncpy(extversions->default_version, PQgetvalue(res, 0, 2), 7);
|
snprintf(extversions->default_version,
|
||||||
|
sizeof(extversions->default_version),
|
||||||
|
"%s", PQgetvalue(res, 0, 2));
|
||||||
extversions->default_version_num = available_version;
|
extversions->default_version_num = available_version;
|
||||||
strncpy(extversions->installed_version, PQgetvalue(res, 0, 4), 7);
|
snprintf(extversions->installed_version,
|
||||||
|
sizeof(extversions->installed_version),
|
||||||
|
"%s", PQgetvalue(res, 0, 4));
|
||||||
extversions->installed_version_num = installed_version;
|
extversions->installed_version_num = installed_version;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2082,6 +2172,8 @@ _get_node_record(PGconn *conn, char *sqlquery, t_node_info *node_info, bool init
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
|
log_db_error(conn, sqlquery, _("_get_node_record(): unable to execute query"));
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return RECORD_ERROR;
|
return RECORD_ERROR;
|
||||||
}
|
}
|
||||||
@@ -2117,17 +2209,17 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row, bool init_
|
|||||||
node_info->upstream_node_id = atoi(PQgetvalue(res, row, 2));
|
node_info->upstream_node_id = atoi(PQgetvalue(res, row, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN);
|
snprintf(node_info->node_name, sizeof(node_info->node_name), "%s", PQgetvalue(res, row, 3));
|
||||||
strncpy(node_info->conninfo, PQgetvalue(res, row, 4), MAXLEN);
|
snprintf(node_info->conninfo, sizeof(node_info->conninfo), "%s", PQgetvalue(res, row, 4));
|
||||||
strncpy(node_info->repluser, PQgetvalue(res, row, 5), NAMEDATALEN);
|
snprintf(node_info->repluser, sizeof(node_info->repluser), "%s", PQgetvalue(res, row, 5));
|
||||||
strncpy(node_info->slot_name, PQgetvalue(res, row, 6), MAXLEN);
|
snprintf(node_info->slot_name, sizeof(node_info->slot_name), "%s", PQgetvalue(res, row, 6));
|
||||||
strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN);
|
snprintf(node_info->location, sizeof(node_info->location), "%s", PQgetvalue(res, row, 7));
|
||||||
node_info->priority = atoi(PQgetvalue(res, row, 8));
|
node_info->priority = atoi(PQgetvalue(res, row, 8));
|
||||||
node_info->active = atobool(PQgetvalue(res, row, 9));
|
node_info->active = atobool(PQgetvalue(res, row, 9));
|
||||||
strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXPGPATH);
|
snprintf(node_info->config_file, sizeof(node_info->config_file), "%s", PQgetvalue(res, row, 10));
|
||||||
|
|
||||||
/* This won't normally be set */
|
/* This won't normally be set */
|
||||||
strncpy(node_info->upstream_node_name, PQgetvalue(res, row, 11), MAXLEN);
|
snprintf(node_info->upstream_node_name, sizeof(node_info->upstream_node_name), "%s", PQgetvalue(res, row, 11));
|
||||||
|
|
||||||
/* Set remaining struct fields with default values */
|
/* Set remaining struct fields with default values */
|
||||||
|
|
||||||
@@ -2991,13 +3083,15 @@ update_node_record_conn_priority(PGconn *conn, t_configuration_options *options)
|
|||||||
options->node_id);
|
options->node_id);
|
||||||
|
|
||||||
res = PQexec(conn, query.data);
|
res = PQexec(conn, query.data);
|
||||||
termPQExpBuffer(&query);
|
|
||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||||
{
|
{
|
||||||
|
log_db_error(conn, query.data, _("update_node_record_conn_priority(): unable to execute query"));
|
||||||
success = false;
|
success = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&query);
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
return success;
|
return success;
|
||||||
@@ -3379,11 +3473,15 @@ config_file_list_add(t_configfile_list *list, const char *file, const char *file
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
strncpy(list->files[list->entries]->filepath, file, MAXPGPATH);
|
snprintf(list->files[list->entries]->filepath,
|
||||||
|
sizeof(list->files[list->entries]->filepath),
|
||||||
|
"%s", file);
|
||||||
canonicalize_path(list->files[list->entries]->filepath);
|
canonicalize_path(list->files[list->entries]->filepath);
|
||||||
|
|
||||||
|
snprintf(list->files[list->entries]->filename,
|
||||||
|
sizeof(list->files[list->entries]->filename),
|
||||||
|
"%s", filename);
|
||||||
|
|
||||||
strncpy(list->files[list->entries]->filename, filename, MAXPGPATH);
|
|
||||||
list->files[list->entries]->in_data_directory = in_data_dir;
|
list->files[list->entries]->in_data_directory = in_data_dir;
|
||||||
|
|
||||||
list->entries++;
|
list->entries++;
|
||||||
@@ -3463,13 +3561,10 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
|
|||||||
log_verbose(LOG_DEBUG, "_create_event(): event is \"%s\" for node %i", event, node_id);
|
log_verbose(LOG_DEBUG, "_create_event(): event is \"%s\" for node %i", event, node_id);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Only attempt to write a record if a connection handle was provided.
|
* Only attempt to write a record if a connection handle was provided,
|
||||||
* Also check that the repmgr schema has been properly initialised - if
|
* and the connection handle points to a node which is not in recovery.
|
||||||
* not it means no configuration file was provided, which can happen with
|
|
||||||
* e.g. `repmgr standby clone`, and we won't know which schema to write
|
|
||||||
* to.
|
|
||||||
*/
|
*/
|
||||||
if (conn != NULL && PQstatus(conn) == CONNECTION_OK)
|
if (conn != NULL && PQstatus(conn) == CONNECTION_OK && get_recovery_type(conn) == RECTYPE_PRIMARY)
|
||||||
{
|
{
|
||||||
int n_node_id = htonl(node_id);
|
int n_node_id = htonl(node_id);
|
||||||
char *t_successful = successful ? "TRUE" : "FALSE";
|
char *t_successful = successful ? "TRUE" : "FALSE";
|
||||||
@@ -3523,7 +3618,7 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* Store timestamp to send to the notification command */
|
/* Store timestamp to send to the notification command */
|
||||||
strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
|
snprintf(event_timestamp, MAXLEN, "%s", PQgetvalue(res, 0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
@@ -3958,8 +4053,12 @@ get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
strncpy(record->slot_name, PQgetvalue(res, 0, 0), MAXLEN);
|
snprintf(record->slot_name,
|
||||||
strncpy(record->slot_type, PQgetvalue(res, 0, 1), MAXLEN);
|
sizeof(record->slot_name),
|
||||||
|
"%s", PQgetvalue(res, 0, 0));
|
||||||
|
snprintf(record->slot_type,
|
||||||
|
sizeof(record->slot_type),
|
||||||
|
"%s", PQgetvalue(res, 0, 1));
|
||||||
record->active = atobool(PQgetvalue(res, 0, 2));
|
record->active = atobool(PQgetvalue(res, 0, 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4090,7 +4189,8 @@ get_tablespace_name_by_location(PGconn *conn, const char *location, char *name)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
strncpy(name, PQgetvalue(res, 0, 0), MAXLEN);
|
snprintf(name, MAXLEN,
|
||||||
|
"%s", PQgetvalue(res, 0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
@@ -4123,7 +4223,8 @@ cancel_query(PGconn *conn, int timeout)
|
|||||||
*/
|
*/
|
||||||
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
||||||
{
|
{
|
||||||
log_warning(_("unable to stop current query:\n %s"), errbuf);
|
log_warning(_("unable to cancel current query"));
|
||||||
|
log_detail("\n%s", errbuf);
|
||||||
PQfreeCancel(pgcancel);
|
PQfreeCancel(pgcancel);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -4141,7 +4242,7 @@ cancel_query(PGconn *conn, int timeout)
|
|||||||
* Returns 1 for success; 0 if any error ocurred; -1 if timeout reached.
|
* Returns 1 for success; 0 if any error ocurred; -1 if timeout reached.
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
wait_connection_availability(PGconn *conn, long long timeout)
|
wait_connection_availability(PGconn *conn, int timeout)
|
||||||
{
|
{
|
||||||
PGresult *res = NULL;
|
PGresult *res = NULL;
|
||||||
fd_set read_set;
|
fd_set read_set;
|
||||||
@@ -4150,16 +4251,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
|||||||
before,
|
before,
|
||||||
after;
|
after;
|
||||||
struct timezone tz;
|
struct timezone tz;
|
||||||
|
long long timeout_ms;
|
||||||
|
|
||||||
/* recalc to microseconds */
|
/* calculate timeout in microseconds */
|
||||||
timeout *= 1000000;
|
timeout_ms = (long long) timeout * 1000000;
|
||||||
|
|
||||||
while (timeout > 0)
|
while (timeout_ms > 0)
|
||||||
{
|
{
|
||||||
if (PQconsumeInput(conn) == 0)
|
if (PQconsumeInput(conn) == 0)
|
||||||
{
|
{
|
||||||
log_warning(_("wait_connection_availability(): could not receive data from connection:\n %s"),
|
log_warning(_("wait_connection_availability(): unable to receive data from connection"));
|
||||||
PQerrorMessage(conn));
|
log_detail("%s", PQerrorMessage(conn));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4190,17 +4292,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
|||||||
|
|
||||||
gettimeofday(&after, &tz);
|
gettimeofday(&after, &tz);
|
||||||
|
|
||||||
timeout -= (after.tv_sec * 1000000 + after.tv_usec) -
|
timeout_ms -= (after.tv_sec * 1000000 + after.tv_usec) -
|
||||||
(before.tv_sec * 1000000 + before.tv_usec);
|
(before.tv_sec * 1000000 + before.tv_usec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (timeout >= 0)
|
if (timeout_ms >= 0)
|
||||||
{
|
{
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
log_warning(_("wait_connection_availability(): timeout reached"));
|
log_warning(_("wait_connection_availability(): timeout (%i secs) reached"), timeout);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4211,13 +4313,33 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
|||||||
|
|
||||||
bool
|
bool
|
||||||
is_server_available(const char *conninfo)
|
is_server_available(const char *conninfo)
|
||||||
|
{
|
||||||
|
return _is_server_available(conninfo, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
is_server_available_quiet(const char *conninfo)
|
||||||
|
{
|
||||||
|
return _is_server_available(conninfo, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool
|
||||||
|
_is_server_available(const char *conninfo, bool quiet)
|
||||||
{
|
{
|
||||||
PGPing status = PQping(conninfo);
|
PGPing status = PQping(conninfo);
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "is_server_available(): ping status for %s is %i", conninfo, (int)status);
|
log_verbose(LOG_DEBUG, "is_server_available(): ping status for \"%s\" is %s", conninfo, print_pqping_status(status));
|
||||||
if (status == PQPING_OK)
|
if (status == PQPING_OK)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
if (quiet == false)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to ping \"%s\""), conninfo);
|
||||||
|
log_detail(_("PQping() returned \"%s\""), print_pqping_status(status));
|
||||||
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4230,10 +4352,17 @@ is_server_available_params(t_conninfo_param_list *param_list)
|
|||||||
false);
|
false);
|
||||||
|
|
||||||
/* deparsing the param_list adds overhead, so only do it if needed */
|
/* deparsing the param_list adds overhead, so only do it if needed */
|
||||||
if (log_level == LOG_DEBUG)
|
if (log_level == LOG_DEBUG || status != PQPING_OK)
|
||||||
{
|
{
|
||||||
char *conninfo_str = param_list_to_string(param_list);
|
char *conninfo_str = param_list_to_string(param_list);
|
||||||
log_verbose(LOG_DEBUG, "is_server_available_params(): ping status for %s is %i", conninfo_str, (int)status);
|
log_verbose(LOG_DEBUG, "is_server_available_params(): ping status for \"%s\" is %s", conninfo_str, print_pqping_status(status));
|
||||||
|
|
||||||
|
if (status != PQPING_OK)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to ping \"%s\""), conninfo_str);
|
||||||
|
log_detail(_("PQping() returned \"%s\""), print_pqping_status(status));
|
||||||
|
}
|
||||||
|
|
||||||
pfree(conninfo_str);
|
pfree(conninfo_str);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4263,6 +4392,25 @@ connection_ping(PGconn *conn)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ExecStatusType
|
||||||
|
connection_ping_reconnect(PGconn *conn)
|
||||||
|
{
|
||||||
|
ExecStatusType ping_result = connection_ping(conn);
|
||||||
|
|
||||||
|
if (PQstatus(conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_warning(_("connection error, attempting to reset"));
|
||||||
|
log_detail("\n%s", PQerrorMessage(conn));
|
||||||
|
PQreset(conn);
|
||||||
|
ping_result = connection_ping(conn);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "connection_ping_reconnect(): result is %s", PQresStatus(ping_result));
|
||||||
|
|
||||||
|
return ping_result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* ==================== */
|
/* ==================== */
|
||||||
/* monitoring functions */
|
/* monitoring functions */
|
||||||
@@ -4647,6 +4795,11 @@ get_primary_current_lsn(PGconn *conn)
|
|||||||
{
|
{
|
||||||
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_db_error(conn, NULL, _("unable to execute get_primary_current_lsn()"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
@@ -4673,6 +4826,10 @@ get_last_wal_receive_location(PGconn *conn)
|
|||||||
{
|
{
|
||||||
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_db_error(conn, NULL, _("unable to execute get_last_wal_receive_location()"));
|
||||||
|
}
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
@@ -4775,17 +4932,19 @@ void
|
|||||||
init_replication_info(ReplInfo *replication_info)
|
init_replication_info(ReplInfo *replication_info)
|
||||||
{
|
{
|
||||||
memset(replication_info->current_timestamp, 0, sizeof(replication_info->current_timestamp));
|
memset(replication_info->current_timestamp, 0, sizeof(replication_info->current_timestamp));
|
||||||
|
replication_info->in_recovery = false;
|
||||||
replication_info->last_wal_receive_lsn = InvalidXLogRecPtr;
|
replication_info->last_wal_receive_lsn = InvalidXLogRecPtr;
|
||||||
replication_info->last_wal_replay_lsn = InvalidXLogRecPtr;
|
replication_info->last_wal_replay_lsn = InvalidXLogRecPtr;
|
||||||
memset(replication_info->last_xact_replay_timestamp, 0, sizeof(replication_info->last_xact_replay_timestamp));
|
memset(replication_info->last_xact_replay_timestamp, 0, sizeof(replication_info->last_xact_replay_timestamp));
|
||||||
replication_info->replication_lag_time = 0;
|
replication_info->replication_lag_time = 0;
|
||||||
replication_info->receiving_streamed_wal = true;
|
replication_info->receiving_streamed_wal = true;
|
||||||
replication_info->wal_replay_paused = false;
|
replication_info->wal_replay_paused = false;
|
||||||
|
replication_info->upstream_last_seen = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool
|
bool
|
||||||
get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info)
|
||||||
{
|
{
|
||||||
PQExpBufferData query;
|
PQExpBufferData query;
|
||||||
PGresult *res = NULL;
|
PGresult *res = NULL;
|
||||||
@@ -4794,6 +4953,7 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
|||||||
initPQExpBuffer(&query);
|
initPQExpBuffer(&query);
|
||||||
appendPQExpBufferStr(&query,
|
appendPQExpBufferStr(&query,
|
||||||
" SELECT ts, "
|
" SELECT ts, "
|
||||||
|
" in_recovery, "
|
||||||
" last_wal_receive_lsn, "
|
" last_wal_receive_lsn, "
|
||||||
" last_wal_replay_lsn, "
|
" last_wal_replay_lsn, "
|
||||||
" last_xact_replay_timestamp, "
|
" last_xact_replay_timestamp, "
|
||||||
@@ -4807,9 +4967,11 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
|||||||
" END "
|
" END "
|
||||||
" END AS replication_lag_time, "
|
" END AS replication_lag_time, "
|
||||||
" last_wal_receive_lsn >= last_wal_replay_lsn AS receiving_streamed_wal, "
|
" last_wal_receive_lsn >= last_wal_replay_lsn AS receiving_streamed_wal, "
|
||||||
" wal_replay_paused "
|
" wal_replay_paused, "
|
||||||
|
" upstream_last_seen "
|
||||||
" FROM ( "
|
" FROM ( "
|
||||||
" SELECT CURRENT_TIMESTAMP AS ts, "
|
" SELECT CURRENT_TIMESTAMP AS ts, "
|
||||||
|
" pg_catalog.pg_is_in_recovery() AS in_recovery, "
|
||||||
" pg_catalog.pg_last_xact_replay_timestamp() AS last_xact_replay_timestamp, ");
|
" pg_catalog.pg_last_xact_replay_timestamp() AS last_xact_replay_timestamp, ");
|
||||||
|
|
||||||
|
|
||||||
@@ -4821,7 +4983,7 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
|||||||
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||||
" THEN FALSE "
|
" THEN FALSE "
|
||||||
" ELSE pg_catalog.pg_is_wal_replay_paused() "
|
" ELSE pg_catalog.pg_is_wal_replay_paused() "
|
||||||
" END AS wal_replay_paused ");
|
" END AS wal_replay_paused, ");
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -4843,7 +5005,21 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
|||||||
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||||
" THEN FALSE "
|
" THEN FALSE "
|
||||||
" ELSE pg_catalog.pg_is_xlog_replay_paused() "
|
" ELSE pg_catalog.pg_is_xlog_replay_paused() "
|
||||||
" END AS wal_replay_paused ");
|
" END AS wal_replay_paused, ");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node_type == WITNESS)
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&query,
|
||||||
|
" repmgr.get_upstream_last_seen() AS upstream_last_seen");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&query,
|
||||||
|
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||||
|
" THEN -1 "
|
||||||
|
" ELSE repmgr.get_upstream_last_seen() "
|
||||||
|
" END AS upstream_last_seen ");
|
||||||
}
|
}
|
||||||
|
|
||||||
appendPQExpBufferStr(&query,
|
appendPQExpBufferStr(&query,
|
||||||
@@ -4861,13 +5037,19 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
strncpy(replication_info->current_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
|
snprintf(replication_info->current_timestamp,
|
||||||
replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 1));
|
sizeof(replication_info->current_timestamp),
|
||||||
replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 2));
|
"%s", PQgetvalue(res, 0, 0));
|
||||||
strncpy(replication_info->last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
replication_info->in_recovery = atobool(PQgetvalue(res, 0, 1));
|
||||||
replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 4));
|
replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 2));
|
||||||
replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 5));
|
replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 3));
|
||||||
replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 6));
|
snprintf(replication_info->last_xact_replay_timestamp,
|
||||||
|
sizeof(replication_info->last_xact_replay_timestamp),
|
||||||
|
"%s", PQgetvalue(res, 0, 4));
|
||||||
|
replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 5));
|
||||||
|
replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 6));
|
||||||
|
replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 7));
|
||||||
|
replication_info->upstream_last_seen = atoi(PQgetvalue(res, 0, 8));
|
||||||
}
|
}
|
||||||
|
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
@@ -4913,13 +5095,12 @@ get_replication_lag_seconds(PGconn *conn)
|
|||||||
log_warning("%s", PQerrorMessage(conn));
|
log_warning("%s", PQerrorMessage(conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
/* XXX magic number */
|
return UNKNOWN_REPLICATION_LAG;
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!PQntuples(res))
|
if (!PQntuples(res))
|
||||||
{
|
{
|
||||||
return -1;
|
return UNKNOWN_REPLICATION_LAG;
|
||||||
}
|
}
|
||||||
|
|
||||||
lag_seconds = atoi(PQgetvalue(res, 0, 0));
|
lag_seconds = atoi(PQgetvalue(res, 0, 0));
|
||||||
@@ -5053,7 +5234,7 @@ is_downstream_node_attached(PGconn *conn, char *node_name)
|
|||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
set_primary_last_seen(PGconn *conn)
|
set_upstream_last_seen(PGconn *conn)
|
||||||
{
|
{
|
||||||
PQExpBufferData query;
|
PQExpBufferData query;
|
||||||
PGresult *res = NULL;
|
PGresult *res = NULL;
|
||||||
@@ -5061,51 +5242,58 @@ set_primary_last_seen(PGconn *conn)
|
|||||||
initPQExpBuffer(&query);
|
initPQExpBuffer(&query);
|
||||||
|
|
||||||
appendPQExpBufferStr(&query,
|
appendPQExpBufferStr(&query,
|
||||||
"SELECT repmgr.set_primary_last_seen()");
|
"SELECT repmgr.set_upstream_last_seen()");
|
||||||
|
|
||||||
res = PQexec(conn, query.data);
|
res = PQexec(conn, query.data);
|
||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_db_error(conn, query.data, _("unable to execute repmgr.set_primary_last_seen()"));
|
log_db_error(conn, query.data, _("unable to execute repmgr.set_upstream_last_seen()"));
|
||||||
}
|
}
|
||||||
|
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
get_primary_last_seen(PGconn *conn)
|
get_upstream_last_seen(PGconn *conn, t_server_type node_type)
|
||||||
{
|
{
|
||||||
PQExpBufferData query;
|
PQExpBufferData query;
|
||||||
PGresult *res = NULL;
|
PGresult *res = NULL;
|
||||||
int primary_last_seen = -1;
|
int upstream_last_seen = -1;
|
||||||
|
|
||||||
initPQExpBuffer(&query);
|
initPQExpBuffer(&query);
|
||||||
|
|
||||||
|
if (node_type == WITNESS)
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&query,
|
||||||
|
"SELECT repmgr.get_upstream_last_seen()");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
appendPQExpBufferStr(&query,
|
appendPQExpBufferStr(&query,
|
||||||
"SELECT CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
"SELECT CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
|
||||||
" THEN -1 "
|
" THEN -1 "
|
||||||
" ELSE repmgr.get_primary_last_seen() "
|
" ELSE repmgr.get_upstream_last_seen() "
|
||||||
" END AS primary_last_seen ");
|
" END AS upstream_last_seen ");
|
||||||
|
}
|
||||||
|
|
||||||
res = PQexec(conn, query.data);
|
res = PQexec(conn, query.data);
|
||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_db_error(conn, query.data, _("unable to execute repmgr.get_primary_last_seen()"));
|
log_db_error(conn, query.data, _("unable to execute repmgr.get_upstream_last_seen()"));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
primary_last_seen = atoi(PQgetvalue(res, 0, 0));
|
upstream_last_seen = atoi(PQgetvalue(res, 0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
return primary_last_seen;
|
return upstream_last_seen;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -5370,7 +5558,9 @@ get_default_bdr_replication_set(PGconn *conn)
|
|||||||
/* For BDR2, we use a custom replication set */
|
/* For BDR2, we use a custom replication set */
|
||||||
namelen = strlen(BDR2_REPLICATION_SET_NAME);
|
namelen = strlen(BDR2_REPLICATION_SET_NAME);
|
||||||
default_replication_set = pg_malloc0(namelen + 1);
|
default_replication_set = pg_malloc0(namelen + 1);
|
||||||
strncpy(default_replication_set, BDR2_REPLICATION_SET_NAME, namelen);
|
snprintf(default_replication_set,
|
||||||
|
namelen + 1,
|
||||||
|
"%s", BDR2_REPLICATION_SET_NAME);
|
||||||
|
|
||||||
return default_replication_set;
|
return default_replication_set;
|
||||||
}
|
}
|
||||||
@@ -5400,7 +5590,9 @@ get_default_bdr_replication_set(PGconn *conn)
|
|||||||
namelen = strlen(PQgetvalue(res, 0, 0));
|
namelen = strlen(PQgetvalue(res, 0, 0));
|
||||||
default_replication_set = pg_malloc0(namelen + 1);
|
default_replication_set = pg_malloc0(namelen + 1);
|
||||||
|
|
||||||
strncpy(default_replication_set, PQgetvalue(res, 0, 0), namelen);
|
snprintf(default_replication_set,
|
||||||
|
namelen,
|
||||||
|
"%s", PQgetvalue(res, 0, 0));
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
@@ -5621,7 +5813,9 @@ get_bdr_other_node_name(PGconn *conn, int node_id, char *node_name)
|
|||||||
|
|
||||||
if (PQresultStatus(res) == PGRES_TUPLES_OK)
|
if (PQresultStatus(res) == PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
strncpy(node_name, PQgetvalue(res, 0, 0), MAXLEN);
|
snprintf(node_name,
|
||||||
|
NAMEDATALEN,
|
||||||
|
"%s", PQgetvalue(res, 0, 0));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -5804,12 +5998,12 @@ _populate_bdr_node_records(PGresult *res, BdrNodeInfoList *node_list)
|
|||||||
static void
|
static void
|
||||||
_populate_bdr_node_record(PGresult *res, t_bdr_node_info *node_info, int row)
|
_populate_bdr_node_record(PGresult *res, t_bdr_node_info *node_info, int row)
|
||||||
{
|
{
|
||||||
strncpy(node_info->node_sysid, PQgetvalue(res, row, 0), MAXLEN);
|
snprintf(node_info->node_sysid, sizeof(node_info->node_sysid), "%s", PQgetvalue(res, row, 0));
|
||||||
node_info->node_timeline = atoi(PQgetvalue(res, row, 1));
|
node_info->node_timeline = atoi(PQgetvalue(res, row, 1));
|
||||||
node_info->node_dboid = atoi(PQgetvalue(res, row, 2));
|
node_info->node_dboid = atoi(PQgetvalue(res, row, 2));
|
||||||
strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN);
|
snprintf(node_info->node_name, sizeof(node_info->node_name), "%s", PQgetvalue(res, row, 3));
|
||||||
strncpy(node_info->node_local_dsn, PQgetvalue(res, row, 4), MAXLEN);
|
snprintf(node_info->node_local_dsn, sizeof(node_info->node_local_dsn), "%s", PQgetvalue(res, row, 4));
|
||||||
strncpy(node_info->peer_state_name, PQgetvalue(res, row, 5), MAXLEN);
|
snprintf(node_info->peer_state_name, sizeof(node_info->peer_state_name), "%s", PQgetvalue(res, row, 5));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
19
dbutils.h
19
dbutils.h
@@ -134,8 +134,8 @@ typedef struct s_node_info
|
|||||||
int node_id;
|
int node_id;
|
||||||
int upstream_node_id;
|
int upstream_node_id;
|
||||||
t_server_type type;
|
t_server_type type;
|
||||||
char node_name[MAXLEN];
|
char node_name[NAMEDATALEN];
|
||||||
char upstream_node_name[MAXLEN];
|
char upstream_node_name[NAMEDATALEN];
|
||||||
char conninfo[MAXLEN];
|
char conninfo[MAXLEN];
|
||||||
char repluser[NAMEDATALEN];
|
char repluser[NAMEDATALEN];
|
||||||
char location[MAXLEN];
|
char location[MAXLEN];
|
||||||
@@ -302,12 +302,14 @@ typedef struct BdrNodeInfoList
|
|||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
char current_timestamp[MAXLEN];
|
char current_timestamp[MAXLEN];
|
||||||
|
bool in_recovery;
|
||||||
XLogRecPtr last_wal_receive_lsn;
|
XLogRecPtr last_wal_receive_lsn;
|
||||||
XLogRecPtr last_wal_replay_lsn;
|
XLogRecPtr last_wal_replay_lsn;
|
||||||
char last_xact_replay_timestamp[MAXLEN];
|
char last_xact_replay_timestamp[MAXLEN];
|
||||||
int replication_lag_time;
|
int replication_lag_time;
|
||||||
bool receiving_streamed_wal;
|
bool receiving_streamed_wal;
|
||||||
bool wal_replay_paused;
|
bool wal_replay_paused;
|
||||||
|
int upstream_last_seen;
|
||||||
} ReplInfo;
|
} ReplInfo;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
@@ -414,6 +416,8 @@ bool set_config_bool(PGconn *conn, const char *config_param, bool state);
|
|||||||
int guc_set(PGconn *conn, const char *parameter, const char *op, const char *value);
|
int guc_set(PGconn *conn, const char *parameter, const char *op, const char *value);
|
||||||
int guc_set_typed(PGconn *conn, const char *parameter, const char *op, const char *value, const char *datatype);
|
int guc_set_typed(PGconn *conn, const char *parameter, const char *op, const char *value, const char *datatype);
|
||||||
bool get_pg_setting(PGconn *conn, const char *setting, char *output);
|
bool get_pg_setting(PGconn *conn, const char *setting, char *output);
|
||||||
|
bool alter_system_int(PGconn *conn, const char *name, int value);
|
||||||
|
bool pg_reload_conf(PGconn *conn);
|
||||||
|
|
||||||
/* server information functions */
|
/* server information functions */
|
||||||
bool get_cluster_size(PGconn *conn, char *size);
|
bool get_cluster_size(PGconn *conn, char *size);
|
||||||
@@ -435,6 +439,7 @@ pid_t repmgrd_get_pid(PGconn *conn);
|
|||||||
bool repmgrd_is_running(PGconn *conn);
|
bool repmgrd_is_running(PGconn *conn);
|
||||||
bool repmgrd_is_paused(PGconn *conn);
|
bool repmgrd_is_paused(PGconn *conn);
|
||||||
bool repmgrd_pause(PGconn *conn, bool pause);
|
bool repmgrd_pause(PGconn *conn, bool pause);
|
||||||
|
pid_t get_wal_receiver_pid(PGconn *conn);
|
||||||
|
|
||||||
/* extension functions */
|
/* extension functions */
|
||||||
ExtensionStatus get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions);
|
ExtensionStatus get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions);
|
||||||
@@ -509,12 +514,14 @@ bool get_tablespace_name_by_location(PGconn *conn, const char *location, char *
|
|||||||
|
|
||||||
/* asynchronous query functions */
|
/* asynchronous query functions */
|
||||||
bool cancel_query(PGconn *conn, int timeout);
|
bool cancel_query(PGconn *conn, int timeout);
|
||||||
int wait_connection_availability(PGconn *conn, long long timeout);
|
int wait_connection_availability(PGconn *conn, int timeout);
|
||||||
|
|
||||||
/* node availability functions */
|
/* node availability functions */
|
||||||
bool is_server_available(const char *conninfo);
|
bool is_server_available(const char *conninfo);
|
||||||
|
bool is_server_available_quiet(const char *conninfo);
|
||||||
bool is_server_available_params(t_conninfo_param_list *param_list);
|
bool is_server_available_params(t_conninfo_param_list *param_list);
|
||||||
ExecStatusType connection_ping(PGconn *conn);
|
ExecStatusType connection_ping(PGconn *conn);
|
||||||
|
ExecStatusType connection_ping_reconnect(PGconn *conn);
|
||||||
|
|
||||||
/* monitoring functions */
|
/* monitoring functions */
|
||||||
void
|
void
|
||||||
@@ -549,12 +556,12 @@ XLogRecPtr get_primary_current_lsn(PGconn *conn);
|
|||||||
XLogRecPtr get_node_current_lsn(PGconn *conn);
|
XLogRecPtr get_node_current_lsn(PGconn *conn);
|
||||||
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
|
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
|
||||||
void init_replication_info(ReplInfo *replication_info);
|
void init_replication_info(ReplInfo *replication_info);
|
||||||
bool get_replication_info(PGconn *conn, ReplInfo *replication_info);
|
bool get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info);
|
||||||
int get_replication_lag_seconds(PGconn *conn);
|
int get_replication_lag_seconds(PGconn *conn);
|
||||||
void get_node_replication_stats(PGconn *conn, t_node_info *node_info);
|
void get_node_replication_stats(PGconn *conn, t_node_info *node_info);
|
||||||
bool is_downstream_node_attached(PGconn *conn, char *node_name);
|
bool is_downstream_node_attached(PGconn *conn, char *node_name);
|
||||||
void set_primary_last_seen(PGconn *conn);
|
void set_upstream_last_seen(PGconn *conn);
|
||||||
int get_primary_last_seen(PGconn *conn);
|
int get_upstream_last_seen(PGconn *conn, t_server_type node_type);
|
||||||
bool is_wal_replay_paused(PGconn *conn, bool check_pending_wal);
|
bool is_wal_replay_paused(PGconn *conn, bool check_pending_wal);
|
||||||
|
|
||||||
/* BDR functions */
|
/* BDR functions */
|
||||||
|
|||||||
20
dirutil.c
20
dirutil.c
@@ -276,6 +276,8 @@ is_pg_running(const char *path)
|
|||||||
log_warning(_("invalid data in PostgreSQL PID file \"%s\""), path);
|
log_warning(_("invalid data in PostgreSQL PID file \"%s\""), path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fclose(pidf);
|
||||||
|
|
||||||
return PG_DIR_NOT_RUNNING;
|
return PG_DIR_NOT_RUNNING;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -334,6 +336,15 @@ create_pg_dir(const char *path, bool force)
|
|||||||
{
|
{
|
||||||
log_notice(_("-F/--force provided - deleting existing data directory \"%s\""), path);
|
log_notice(_("-F/--force provided - deleting existing data directory \"%s\""), path);
|
||||||
nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
||||||
|
|
||||||
|
/* recreate the directory ourselves to ensure permissions are correct */
|
||||||
|
if (!create_dir(path))
|
||||||
|
{
|
||||||
|
log_error(_("unable to create directory \"%s\"..."),
|
||||||
|
path);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -345,6 +356,15 @@ create_pg_dir(const char *path, bool force)
|
|||||||
{
|
{
|
||||||
log_notice(_("deleting existing directory \"%s\""), path);
|
log_notice(_("deleting existing directory \"%s\""), path);
|
||||||
nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
||||||
|
|
||||||
|
/* recreate the directory ourselves to ensure permissions are correct */
|
||||||
|
if (!create_dir(path))
|
||||||
|
{
|
||||||
|
log_error(_("unable to create directory \"%s\"..."),
|
||||||
|
path);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ clean:
|
|||||||
|
|
||||||
maintainer-clean:
|
maintainer-clean:
|
||||||
rm -rf html
|
rm -rf html
|
||||||
rm -rf Makefile
|
rm -f Makefile
|
||||||
|
|
||||||
zip: html
|
zip: html
|
||||||
cp -r html repmgr-docs-$(REPMGR_VERSION)
|
cp -r html repmgr-docs-$(REPMGR_VERSION)
|
||||||
|
|||||||
@@ -100,8 +100,7 @@
|
|||||||
and recloning standbys from this.
|
and recloning standbys from this.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
To minimize downtime during major upgrades, for more recent PostgreSQL
|
To minimize downtime during major upgrades from PostgreSQL 9.4 and later,
|
||||||
versions (PostgreSQL 9.4 and later),
|
|
||||||
<ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
|
<ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
|
||||||
can be used to set up a parallel cluster using the newer PostgreSQL version,
|
can be used to set up a parallel cluster using the newer PostgreSQL version,
|
||||||
which can be kept in sync with the existing production cluster until the
|
which can be kept in sync with the existing production cluster until the
|
||||||
|
|||||||
@@ -481,10 +481,29 @@ repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
|
|||||||
|
|
||||||
<sect2 id="packages-old-versions-rhel-centos" xreflabel="old RHEL/CentOS package versions">
|
<sect2 id="packages-old-versions-rhel-centos" xreflabel="old RHEL/CentOS package versions">
|
||||||
<title>RHEL/CentOS</title>
|
<title>RHEL/CentOS</title>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
Old RPM packages (<literal>3.2</literal> and later) can be retrieved from the
|
Old versions can be located with e.g.:
|
||||||
|
<programlisting>
|
||||||
|
yum --showduplicates list repmgr96</programlisting>
|
||||||
|
(substitute the appropriate package name; see <xref linkend="packages-centos">) and installed with:
|
||||||
|
<programlisting>
|
||||||
|
yum install {package_name}-{version}</programlisting>
|
||||||
|
where <literal>{package_name}</literal> is the base package name (e.g. <literal>repmgr96</literal>)
|
||||||
|
and <literal>{version}</literal> is the version listed by the
|
||||||
|
<command> yum --showduplicates list ...</command> command, e.g. <literal>4.0.6-1.rhel6</literal>.
|
||||||
|
</para>
|
||||||
|
<para>For example:
|
||||||
|
<programlisting>
|
||||||
|
yum install repmgr96-4.0.6-1.rhel6</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<sect3 id="packages-old-versions-rhel-centos-repmgr3">
|
||||||
|
<title>repmgr 3 packages</title>
|
||||||
|
<para>
|
||||||
|
Old &repmgr; 3 RPM packages (<literal>3.2</literal> and later) can be retrieved from the
|
||||||
(deprecated) 2ndQuadrant repository at
|
(deprecated) 2ndQuadrant repository at
|
||||||
<ulink url="http://packages.2ndquadrant.com/">http://packages.2ndquadrant.com/</ulink>
|
<ulink url="http://packages.2ndquadrant.com/repmgr/yum/">http://packages.2ndquadrant.com/repmgr/yum/</ulink>
|
||||||
by installing the appropriate repository RPM:
|
by installing the appropriate repository RPM:
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
@@ -503,22 +522,7 @@ repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
|
|||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
|
</sect3>
|
||||||
<para>
|
|
||||||
Old versions can be located with e.g.:
|
|
||||||
<programlisting>
|
|
||||||
yum --showduplicates list repmgr96</programlisting>
|
|
||||||
(substitute the appropriate package name; see <xref linkend="packages-centos">) and installed with:
|
|
||||||
<programlisting>
|
|
||||||
yum install {package_name}-{version}</programlisting>
|
|
||||||
where <literal>{package_name}</literal> is the base package name (e.g. <literal>repmgr96</literal>)
|
|
||||||
and <literal>{version}</literal> is the version listed by the
|
|
||||||
<command> yum --showduplicates list ...</command> command, e.g. <literal>4.0.6-1.rhel6</literal>.
|
|
||||||
</para>
|
|
||||||
<para>For example:
|
|
||||||
<programlisting>
|
|
||||||
yum install repmgr96-4.0.6-1.rhel6</programlisting>
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</sect2>
|
</sect2>
|
||||||
</sect1>
|
</sect1>
|
||||||
|
|||||||
@@ -14,13 +14,47 @@
|
|||||||
<para>
|
<para>
|
||||||
See also: <xref linkend="upgrading-repmgr">
|
See also: <xref linkend="upgrading-repmgr">
|
||||||
</para>
|
</para>
|
||||||
|
<sect1 id="release-4.3.1">
|
||||||
|
<title>Release 4.3.1</title>
|
||||||
|
<para><emphasis>??? December ??, 2019</emphasis></para>
|
||||||
|
<para>
|
||||||
|
&repmgr; 4.3.1 is a minor release.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<sect2>
|
||||||
|
<title>Bug fixes</title>
|
||||||
|
<para>
|
||||||
|
<itemizedlist>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>:
|
||||||
|
ensure an existing replication slot is not deleted if the
|
||||||
|
follow target is the node's current upstream.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
</sect2>
|
||||||
|
|
||||||
|
</sect1>
|
||||||
|
|
||||||
<sect1 id="release-4.3">
|
<sect1 id="release-4.3">
|
||||||
<title>Release 4.3</title>
|
<title>Release 4.3</title>
|
||||||
<para><emphasis>Mar ???, 2019</emphasis></para>
|
<para><emphasis>Tue April 2, 2019</emphasis></para>
|
||||||
<para>
|
<para>
|
||||||
&repmgr; 4.3 is a major release.
|
&repmgr; 4.3 is a major release.
|
||||||
</para>
|
</para>
|
||||||
|
<para>
|
||||||
|
For details on how to upgrade an existing &repmgr; instrallation, see
|
||||||
|
documentation section <link linkend="upgrading-major-version">Upgrading a major version release</link>.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
If <application>repmgrd</application> is in use, a PostgreSQL restart <emphasis>is</emphasis> required;
|
||||||
|
in that case we suggest combining this &repmgr; upgrade with the next PostgreSQL
|
||||||
|
minor release, which will require a PostgreSQL restart in any case.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
|
||||||
<important>
|
<important>
|
||||||
<para>
|
<para>
|
||||||
@@ -32,12 +66,12 @@
|
|||||||
REPMGRD_OPTS="--daemonize=false"</programlisting>
|
REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd daemon configuration on Debian/Ubuntu</link>.
|
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd configuration on Debian/Ubuntu</link>.
|
||||||
</para>
|
</para>
|
||||||
</important>
|
</important>
|
||||||
|
|
||||||
<sect2>
|
<sect2>
|
||||||
<title>repmgr enhancements</title>
|
<title>repmgr client enhancements</title>
|
||||||
<para>
|
<para>
|
||||||
<itemizedlist>
|
<itemizedlist>
|
||||||
|
|
||||||
@@ -72,9 +106,9 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
</para>
|
</para>
|
||||||
<note>
|
<note>
|
||||||
<para>
|
<para>
|
||||||
For these commands to work reliably, the configuration file settings
|
These commands require the configuration file settings
|
||||||
<varname>repmgrd_service_start_command</varname> and <varname>repmgrd_service_stop_command</varname>
|
<varname>repmgrd_service_start_command</varname> and <varname>repmgrd_service_stop_command</varname>
|
||||||
should be set in <filename>repmgr.conf</filename>.
|
in <filename>repmgr.conf</filename> to be set.
|
||||||
</para>
|
</para>
|
||||||
</note>
|
</note>
|
||||||
</listitem>
|
</listitem>
|
||||||
@@ -82,8 +116,8 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>
|
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>
|
||||||
displays the interval (in seconds) since the <application>repmgrd</application> instance
|
additionally displays the node priority and the interval (in seconds) since the
|
||||||
last verified its upstream node was available.
|
<application>repmgrd</application> instance last verified its upstream node was available.
|
||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
@@ -132,7 +166,7 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
Add check <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>
|
Add check to <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>
|
||||||
to ensure the data directory on the demotion candidate is configured correctly in <filename>repmgr.conf</filename>.
|
to ensure the data directory on the demotion candidate is configured correctly in <filename>repmgr.conf</filename>.
|
||||||
This is to ensure that &repmgr;, when remotely executed on the demotion candidate, can correctly verify
|
This is to ensure that &repmgr;, when remotely executed on the demotion candidate, can correctly verify
|
||||||
that PostgreSQL on the demotion candidate was shut down cleanly. GitHub #523.
|
that PostgreSQL on the demotion candidate was shut down cleanly. GitHub #523.
|
||||||
@@ -161,6 +195,41 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Add option <option>connection_check_type</option> to enable selection of the method
|
||||||
|
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Possible values are <literal>ping</literal> (default; uses <command>PQping()</command> to
|
||||||
|
determine server availability), <literal>connection</literal> (attempst to make a new connection to
|
||||||
|
the upstream node), and <literal>query</literal> (determines server availability
|
||||||
|
by executing an SQL statement on the node via the existing connection).
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
New configuration option <link linkend="repmgrd-failover-validation"><option>failover_validation_command</option></link>
|
||||||
|
to allow an external mechanism to validate the failover decision made by <application>repmgrd</application>.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
New configuration option <link linkend="repmgrd-standby-disconnection-on-failover"><option>standby_disconnect_on_failover</option></link>
|
||||||
|
to force standbys to disconnect their WAL receivers before making a failover decision.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
In a failover situation, <application>repmgrd</application> will not attempt to promote a
|
||||||
|
node if another primary has already appeared (e.g. by being promoted manually).
|
||||||
|
GitHub #420.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
</sect2>
|
</sect2>
|
||||||
@@ -170,44 +239,6 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
<para>
|
<para>
|
||||||
<itemizedlist>
|
<itemizedlist>
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
&repmgr;: when executing <command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>,
|
|
||||||
prevent escaping issues with connection URIs when executing <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>
|
|
||||||
on the demotion candidate. GitHub #525.
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
&repmgr;: when executing <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>,
|
|
||||||
chech the node to connected is actually the primary (i.e. not the witness server). GitHub #528.
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
|
||||||
avoid a potential race condition when comparing received WAL on the standby to the primary's shutdown location,
|
|
||||||
as the standby's walreceiver may not have yet flushed all received WAL to disk. GitHub #518.
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
|
||||||
verify the standby (promotion candidate) is currently attached to the primary (demotion candidate). GitHub #519.
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<para>
|
|
||||||
<application>repmgrd</application>: on a cascaded standby, don't fail over if
|
|
||||||
<literal>failover=manual</literal>. GitHub #531.
|
|
||||||
</para>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||||
@@ -222,6 +253,51 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
ensure <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>
|
||||||
|
fails when <option>--upstream-node-id</option> is the same as the local node ID.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
&repmgr;: when executing <link linkend="repmgr-standby-clone"><command>repmgr standby clone</command></link>,
|
||||||
|
recheck primary/upstream connection(s) after the data copy operation is complete, as these may
|
||||||
|
have gone away.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
&repmgr;: when executing <command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>,
|
||||||
|
prevent escaping issues with connection URIs when executing <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>
|
||||||
|
on the demotion candidate. GitHub #525.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||||
|
verify the standby (promotion candidate) is currently attached to the primary (demotion candidate). GitHub #519.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||||
|
avoid a potential race condition when comparing received WAL on the standby to the primary's shutdown location,
|
||||||
|
as the standby's walreceiver may not have yet flushed all received WAL to disk. GitHub #518.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
&repmgr;: when executing <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>,
|
||||||
|
check the node to connected is actually the primary (i.e. not the witness server). GitHub #528.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
<command><link linkend="repmgr-node-check">repmgr node check</link></command>
|
<command><link linkend="repmgr-node-check">repmgr node check</link></command>
|
||||||
@@ -231,6 +307,13 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
|||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<application>repmgrd</application>: on a cascaded standby, don't fail over if
|
||||||
|
<literal>failover=manual</literal>. GitHub #531.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|||||||
@@ -39,6 +39,10 @@
|
|||||||
called <varname>standby1</varname> (for example), things will be confusing
|
called <varname>standby1</varname> (for example), things will be confusing
|
||||||
to say the least.
|
to say the least.
|
||||||
</para>
|
</para>
|
||||||
|
<para>
|
||||||
|
The string's maximum length is 63 characters and it should
|
||||||
|
contain only printable ASCII characters.
|
||||||
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
|||||||
@@ -1,15 +1,15 @@
|
|||||||
<sect1 id="configuration-file" xreflabel="configuration file location">
|
<sect1 id="configuration-file" xreflabel="configuration file">
|
||||||
<indexterm>
|
<indexterm>
|
||||||
<primary>repmgr.conf</primary>
|
<primary>repmgr.conf</primary>
|
||||||
<secondary>location</secondary>
|
|
||||||
</indexterm>
|
</indexterm>
|
||||||
|
|
||||||
<indexterm>
|
<indexterm>
|
||||||
<primary>configuration</primary>
|
<primary>configuration</primary>
|
||||||
<secondary>repmgr.conf location</secondary>
|
<secondary>repmgr.conf</secondary>
|
||||||
</indexterm>
|
</indexterm>
|
||||||
|
|
||||||
<title>Configuration file location</title>
|
<title>Configuration file</title>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
<application>repmgr</application> and <application>repmgrd</application>
|
<application>repmgr</application> and <application>repmgrd</application>
|
||||||
use a common configuration file, by default called
|
use a common configuration file, by default called
|
||||||
@@ -21,6 +21,55 @@
|
|||||||
for more details.
|
for more details.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<sect2 id="configuration-file-format" xreflabel="configuration file format">
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgr.conf</primary>
|
||||||
|
<secondary>format</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>Configuration file format</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
<filename>repmgr.conf</filename> is a plain text file with one parameter/value
|
||||||
|
combination per line.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Whitespace is insignificant (except within a quoted parameter value) and blank lines are ignored.
|
||||||
|
Hash marks (<literal>#</literal>) designate the remainder of the line as a comment.
|
||||||
|
Parameter values that are not simple identifiers or numbers should be single-quoted.
|
||||||
|
Note that single quote cannot be embedded in a parameter value.
|
||||||
|
</para>
|
||||||
|
<important>
|
||||||
|
<para>
|
||||||
|
&repmgr; will interpret double-quotes as being part of a string value; only use single quotes
|
||||||
|
to quote parameter values.
|
||||||
|
</para>
|
||||||
|
</important>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Example of a valid <filename>repmgr.conf</filename> file:
|
||||||
|
<programlisting>
|
||||||
|
# repmgr.conf
|
||||||
|
|
||||||
|
node_id=1
|
||||||
|
node_name= node1
|
||||||
|
conninfo ='host=node1 dbname=repmgr user=repmgr connect_timeout=2'
|
||||||
|
data_directory = /var/lib/pgsql/11/data</programlisting>
|
||||||
|
|
||||||
|
</para>
|
||||||
|
</sect2>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<sect2 id="configuration-file-location" xreflabel="configuration file location">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgr.conf</primary>
|
||||||
|
<secondary>location</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>Configuration file location</title>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
The configuration file will be searched for in the following locations:
|
The configuration file will be searched for in the following locations:
|
||||||
<itemizedlist spacing="compact" mark="bullet">
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
@@ -50,7 +99,7 @@
|
|||||||
Note that if a file is explicitly specified with <literal>-f/--config-file</literal>,
|
Note that if a file is explicitly specified with <literal>-f/--config-file</literal>,
|
||||||
an error will be raised if it is not found or not readable, and no attempt will be made to
|
an error will be raised if it is not found or not readable, and no attempt will be made to
|
||||||
check default locations; this is to prevent <application>repmgr</application> unexpectedly
|
check default locations; this is to prevent <application>repmgr</application> unexpectedly
|
||||||
reading the wrong configuraton file.
|
reading the wrong configuration file.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<note>
|
<note>
|
||||||
@@ -66,4 +115,6 @@
|
|||||||
<filename>/path/to/repmgr.conf</filename>).
|
<filename>/path/to/repmgr.conf</filename>).
|
||||||
</para>
|
</para>
|
||||||
</note>
|
</note>
|
||||||
|
|
||||||
|
</sect2>
|
||||||
</sect1>
|
</sect1>
|
||||||
|
|||||||
@@ -1,93 +0,0 @@
|
|||||||
<chapter id="using-witness-server">
|
|
||||||
<indexterm>
|
|
||||||
<primary>witness server</primary>
|
|
||||||
<seealso>Using a witness server with repmgrd</seealso>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
|
|
||||||
<title>Using a witness server</title>
|
|
||||||
<para>
|
|
||||||
A <xref linkend="witness-server"> is a normal PostgreSQL instance which
|
|
||||||
is not part of the streaming replication cluster; its purpose is, if a
|
|
||||||
failover situation occurs, to provide proof that the primary server
|
|
||||||
itself is unavailable.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
A typical use case for a witness server is a two-node streaming replication
|
|
||||||
setup, where the primary and standby are in different locations (data centres).
|
|
||||||
By creating a witness server in the same location (data centre) as the primary,
|
|
||||||
if the primary becomes unavailable it's possible for the standby to decide whether
|
|
||||||
it can promote itself without risking a "split brain" scenario: if it can't see either the
|
|
||||||
witness or the primary server, it's likely there's a network-level interruption
|
|
||||||
and it should not promote itself. If it can see the witness but not the primary,
|
|
||||||
this proves there is no network interruption and the primary itself is unavailable,
|
|
||||||
and it can therefore promote itself (and ideally take action to fence the
|
|
||||||
former primary).
|
|
||||||
</para>
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
<emphasis>Never</emphasis> install a witness server on the same physical host
|
|
||||||
as another node in the replication cluster managed by &repmgr; - it's essential
|
|
||||||
the witness is not affected in any way by failure of another node.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
<para>
|
|
||||||
For more complex replication scenarios, e.g. with multiple datacentres, it may
|
|
||||||
be preferable to use location-based failover, which ensures that only nodes
|
|
||||||
in the same location as the primary will ever be promotion candidates;
|
|
||||||
see <xref linkend="repmgrd-network-split"> for more details.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<note>
|
|
||||||
<simpara>
|
|
||||||
A witness server will only be useful if <application>repmgrd</application>
|
|
||||||
is in use.
|
|
||||||
</simpara>
|
|
||||||
</note>
|
|
||||||
|
|
||||||
<sect1 id="creating-witness-server">
|
|
||||||
<title>Creating a witness server</title>
|
|
||||||
<para>
|
|
||||||
To create a witness server, set up a normal PostgreSQL instance on a server
|
|
||||||
in the same physical location as the cluster's primary server.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This instance should <emphasis>not</emphasis> be on the same physical host as the primary server,
|
|
||||||
as otherwise if the primary server fails due to hardware issues, the witness
|
|
||||||
server will be lost too.
|
|
||||||
</para>
|
|
||||||
<note>
|
|
||||||
<simpara>
|
|
||||||
&repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
|
|
||||||
command, which would automatically create a PostgreSQL instance. However
|
|
||||||
this often resulted in an unsatisfactory, hard-to-customise instance.
|
|
||||||
</simpara>
|
|
||||||
</note>
|
|
||||||
<para>
|
|
||||||
The witness server should be configured in the same way as a normal
|
|
||||||
&repmgr; node; see section <xref linkend="configuration">.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Register the witness server with <xref linkend="repmgr-witness-register">.
|
|
||||||
This will create the &repmgr; extension on the witness server, and make
|
|
||||||
a copy of the &repmgr; metadata.
|
|
||||||
</para>
|
|
||||||
<note>
|
|
||||||
<simpara>
|
|
||||||
As the witness server is not part of the replication cluster, further
|
|
||||||
changes to the &repmgr; metadata will be synchronised by
|
|
||||||
<application>repmgrd</application>.
|
|
||||||
</simpara>
|
|
||||||
</note>
|
|
||||||
<para>
|
|
||||||
Once the witness server has been configured, <application>repmgrd</application>
|
|
||||||
should be started; for more details see <xref linkend="repmgrd-witness-server">.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
@@ -88,7 +88,7 @@
|
|||||||
|
|
||||||
<para>
|
<para>
|
||||||
The values provided for <literal>%t</literal> and <literal>%d</literal>
|
The values provided for <literal>%t</literal> and <literal>%d</literal>
|
||||||
will probably contain spaces, so should be quoted in the provided command
|
may contain spaces, so should be quoted in the provided command
|
||||||
configuration, e.g.:
|
configuration, e.g.:
|
||||||
<programlisting>
|
<programlisting>
|
||||||
event_notification_command='/path/to/some/script %n %e %s "%t" "%d"'
|
event_notification_command='/path/to/some/script %n %e %s "%t" "%d"'
|
||||||
|
|||||||
@@ -45,21 +45,14 @@
|
|||||||
<!ENTITY promoting-standby SYSTEM "promoting-standby.sgml">
|
<!ENTITY promoting-standby SYSTEM "promoting-standby.sgml">
|
||||||
<!ENTITY follow-new-primary SYSTEM "follow-new-primary.sgml">
|
<!ENTITY follow-new-primary SYSTEM "follow-new-primary.sgml">
|
||||||
<!ENTITY switchover SYSTEM "switchover.sgml">
|
<!ENTITY switchover SYSTEM "switchover.sgml">
|
||||||
<!ENTITY configuring-witness-server SYSTEM "configuring-witness-server.sgml">
|
|
||||||
|
|
||||||
<!ENTITY event-notifications SYSTEM "event-notifications.sgml">
|
<!ENTITY event-notifications SYSTEM "event-notifications.sgml">
|
||||||
<!ENTITY upgrading-repmgr SYSTEM "upgrading-repmgr.sgml">
|
<!ENTITY upgrading-repmgr SYSTEM "upgrading-repmgr.sgml">
|
||||||
|
|
||||||
|
<!ENTITY repmgrd-overview SYSTEM "repmgrd-overview.sgml">
|
||||||
<!ENTITY repmgrd-automatic-failover SYSTEM "repmgrd-automatic-failover.sgml">
|
<!ENTITY repmgrd-automatic-failover SYSTEM "repmgrd-automatic-failover.sgml">
|
||||||
<!ENTITY repmgrd-configuration SYSTEM "repmgrd-configuration.sgml">
|
<!ENTITY repmgrd-configuration SYSTEM "repmgrd-configuration.sgml">
|
||||||
<!ENTITY repmgrd-demonstration SYSTEM "repmgrd-demonstration.sgml">
|
<!ENTITY repmgrd-operation SYSTEM "repmgrd-operation.sgml">
|
||||||
<!ENTITY repmgrd-monitoring SYSTEM "repmgrd-monitoring.sgml">
|
|
||||||
<!ENTITY repmgrd-degraded-monitoring SYSTEM "repmgrd-degraded-monitoring.sgml">
|
|
||||||
<!ENTITY repmgrd-cascading-replication SYSTEM "repmgrd-cascading-replication.sgml">
|
|
||||||
<!ENTITY repmgrd-network-split SYSTEM "repmgrd-network-split.sgml">
|
|
||||||
<!ENTITY repmgrd-witness-server SYSTEM "repmgrd-witness-server.sgml">
|
|
||||||
<!ENTITY repmgrd-pausing SYSTEM "repmgrd-pausing.sgml">
|
|
||||||
<!ENTITY repmgrd-notes SYSTEM "repmgrd-notes.sgml">
|
|
||||||
<!ENTITY repmgrd-bdr SYSTEM "repmgrd-bdr.sgml">
|
<!ENTITY repmgrd-bdr SYSTEM "repmgrd-bdr.sgml">
|
||||||
|
|
||||||
<!ENTITY repmgr-primary-register SYSTEM "repmgr-primary-register.sgml">
|
<!ENTITY repmgr-primary-register SYSTEM "repmgr-primary-register.sgml">
|
||||||
|
|||||||
@@ -61,28 +61,28 @@ deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisti
|
|||||||
<itemizedlist spacing="compact" mark="bullet">
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibedit-dev</literal></simpara>
|
<simpara><literal>libedit-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibkrb5-dev</literal></simpara>
|
<simpara><literal>libkrb5-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibpam0g-dev</literal></simpara>
|
<simpara><literal>libpam0g-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibreadline-dev</literal></simpara>
|
<simpara><literal>libreadline-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibselinux1-dev</literal></simpara>
|
<simpara><literal>libselinux1-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibssl-dev</literal></simpara>
|
<simpara><literal>libssl-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibxml2-dev</literal></simpara>
|
<simpara><literal>libxml2-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara><literal>llibxslt1-dev</literal></simpara>
|
<simpara><literal>libxslt1-dev</literal></simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
@@ -136,6 +136,16 @@ deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisti
|
|||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
</note>
|
</note>
|
||||||
|
|
||||||
|
<tip>
|
||||||
|
<para>
|
||||||
|
If building against PostgreSQL 11 or later configured with the <option>--with-llvm</option> option
|
||||||
|
(this is the case with the PGDG-provided packages) you'll also need to install the
|
||||||
|
<literal>llvm-toolset-7-clang</literal> package. This is available via the
|
||||||
|
<ulink url="https://wiki.centos.org/AdditionalResources/Repositories/SCL">Software Collections (SCL) Repository</ulink>.
|
||||||
|
</para>
|
||||||
|
</tip>
|
||||||
|
|
||||||
</listitem>
|
</listitem>
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
|
|||||||
@@ -76,19 +76,25 @@
|
|||||||
</para>
|
</para>
|
||||||
<programlisting>
|
<programlisting>
|
||||||
|
|
||||||
# Enable replication connections; set this figure to at least one more
|
# Enable replication connections; set this value to at least one more
|
||||||
# than the number of standbys which will connect to this server
|
# than the number of standbys which will connect to this server
|
||||||
# (note that repmgr will execute `pg_basebackup` in WAL streaming mode,
|
# (note that repmgr will execute "pg_basebackup" in WAL streaming mode,
|
||||||
# which requires two free WAL senders)
|
# which requires two free WAL senders).
|
||||||
|
#
|
||||||
|
# See: https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-MAX-WAL-SENDERS
|
||||||
|
|
||||||
max_wal_senders = 10
|
max_wal_senders = 10
|
||||||
|
|
||||||
# Enable replication slots; set this figure to at least one more
|
# If using replication slots, set this value to at least one more
|
||||||
# than the number of standbys which will connect to this server.
|
# than the number of standbys which will connect to this server.
|
||||||
# Note that repmgr will only make use of replication slots if
|
# Note that repmgr will only make use of replication slots if
|
||||||
# "use_replication_slots" is set to "true" in repmgr.conf
|
# "use_replication_slots" is set to "true" in "repmgr.conf".
|
||||||
|
# (If you are not intending to use replication slots, this value
|
||||||
|
# can be set to "0").
|
||||||
|
#
|
||||||
|
# See: https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-MAX-REPLICATION-SLOTS
|
||||||
|
|
||||||
max_replication_slots = 0
|
max_replication_slots = 10
|
||||||
|
|
||||||
# Ensure WAL files contain enough information to enable read-only queries
|
# Ensure WAL files contain enough information to enable read-only queries
|
||||||
# on the standby.
|
# on the standby.
|
||||||
@@ -103,16 +109,23 @@
|
|||||||
|
|
||||||
# Enable read-only queries on a standby
|
# Enable read-only queries on a standby
|
||||||
# (Note: this will be ignored on a primary but we recommend including
|
# (Note: this will be ignored on a primary but we recommend including
|
||||||
# it anyway)
|
# it anyway, in case the primary later becomes a standby)
|
||||||
|
#
|
||||||
|
# See: https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY
|
||||||
|
|
||||||
hot_standby = on
|
hot_standby = on
|
||||||
|
|
||||||
# Enable WAL file archiving
|
# Enable WAL file archiving
|
||||||
|
#
|
||||||
|
# See: https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-ARCHIVE-MODE
|
||||||
|
|
||||||
archive_mode = on
|
archive_mode = on
|
||||||
|
|
||||||
# Set archive command to a script or application that will safely store
|
# Set archive command to a dummy command; this can later be changed without
|
||||||
# you WALs in a secure place. /bin/true is an example of a command that
|
# needing to restart the PostgreSQL instance.
|
||||||
# ignores archiving. Use something more sensible.
|
#
|
||||||
|
# See: https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-ARCHIVE-COMMAND
|
||||||
|
|
||||||
archive_command = '/bin/true'
|
archive_command = '/bin/true'
|
||||||
</programlisting>
|
</programlisting>
|
||||||
<tip>
|
<tip>
|
||||||
@@ -120,7 +133,7 @@
|
|||||||
Rather than editing these settings in the default <filename>postgresql.conf</filename>
|
Rather than editing these settings in the default <filename>postgresql.conf</filename>
|
||||||
file, create a separate file such as <filename>postgresql.replication.conf</filename> and
|
file, create a separate file such as <filename>postgresql.replication.conf</filename> and
|
||||||
include it from the end of the main configuration file with:
|
include it from the end of the main configuration file with:
|
||||||
<command>include 'postgresql.replication.conf</command>.
|
<command>include 'postgresql.replication.conf'</command>.
|
||||||
</simpara>
|
</simpara>
|
||||||
</tip>
|
</tip>
|
||||||
<para>
|
<para>
|
||||||
@@ -129,7 +142,8 @@
|
|||||||
<varname>wal_log_hints</varname>; for more details see <xref linkend="repmgr-node-rejoin-pg-rewind">.
|
<varname>wal_log_hints</varname>; for more details see <xref linkend="repmgr-node-rejoin-pg-rewind">.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
See also the <link linkend="configuration-postgresql">PostgreSQL configuration</link> section in the <link linkend="configuration">repmgr configuaration guide</link>.
|
See also the <link linkend="configuration-postgresql">PostgreSQL configuration</link> section in the
|
||||||
|
<link linkend="configuration">repmgr configuration guide</link>.
|
||||||
</para>
|
</para>
|
||||||
</sect1>
|
</sect1>
|
||||||
|
|
||||||
|
|||||||
@@ -196,11 +196,31 @@
|
|||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<term><option>ERR_BAD_CONFIG (1)</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
An issue was encountered while attempting to retrieve
|
||||||
|
&repmgr; metadata.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<term><option>ERR_DB_CONN (6)</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
&repmgr; was unable to connect to the local PostgreSQL instance.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
<varlistentry>
|
<varlistentry>
|
||||||
<term><option>ERR_NODE_STATUS (25)</option></term>
|
<term><option>ERR_NODE_STATUS (25)</option></term>
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
One or more issues were detected.
|
One or more issues were detected with the replication configuration,
|
||||||
|
e.g. a node was not in its expected state.
|
||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|||||||
@@ -33,7 +33,10 @@
|
|||||||
<command>repmgr daemon status</command> can be executed on any active node in the
|
<command>repmgr daemon status</command> can be executed on any active node in the
|
||||||
replication cluster. A valid <filename>repmgr.conf</filename> file is required.
|
replication cluster. A valid <filename>repmgr.conf</filename> file is required.
|
||||||
</para>
|
</para>
|
||||||
|
<para>
|
||||||
|
If PostgreSQL is not running on a node, &repmgr; will not be able to determine the
|
||||||
|
status of that node's <application>repmgrd</application> instance.
|
||||||
|
</para>
|
||||||
<note>
|
<note>
|
||||||
<para>
|
<para>
|
||||||
After restarting PostgreSQL on any node, the <application>repmgrd</application> instance
|
After restarting PostgreSQL on any node, the <application>repmgrd</application> instance
|
||||||
@@ -126,19 +129,19 @@
|
|||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
<application>repmgrd</application> running (1 = running, 0 = not running)
|
<application>repmgrd</application> running (1 = running, 0 = not running, -1 = unknown)
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
<application>repmgrd</application> PID (-1 if not running)
|
<application>repmgrd</application> PID (-1 if not running or status unknown)
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
<application>repmgrd</application> paused (1 = paused, 0 = not paused)
|
<application>repmgrd</application> paused (1 = paused, 0 = not paused, -1 = unknown)
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
@@ -150,7 +153,7 @@
|
|||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
interval in seconds since the node's upstream was last seen
|
interval in seconds since the node's upstream was last seen (this will be -1 if the value could not be retrieved, or the node is primary)
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
|||||||
@@ -99,7 +99,7 @@
|
|||||||
</indexterm>
|
</indexterm>
|
||||||
<simpara>
|
<simpara>
|
||||||
<literal>promote_check_interval</literal>:
|
<literal>promote_check_interval</literal>:
|
||||||
interval (in seconds, default: 1 seconds) to wait between each check
|
interval (in seconds, default: 1 second) to wait between each check
|
||||||
to determine whether the standby has been promoted.
|
to determine whether the standby has been promoted.
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|||||||
@@ -22,10 +22,10 @@
|
|||||||
passwordless SSH connection to the current primary.
|
passwordless SSH connection to the current primary.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
If other standbys are connected to the demotion candidate, &repmgr; can instruct
|
If other nodes are connected to the demotion candidate, &repmgr; can instruct
|
||||||
these to follow the new primary if the option <literal>--siblings-follow</literal>
|
these to follow the new primary if the option <literal>--siblings-follow</literal>
|
||||||
is specified. This requires a passwordless SSH connection between the promotion
|
is specified. This requires a passwordless SSH connection between the promotion
|
||||||
candidate (new primary) and the standbys attached to the demotion candidate
|
candidate (new primary) and the nodes attached to the demotion candidate
|
||||||
(existing primary).
|
(existing primary).
|
||||||
</para>
|
</para>
|
||||||
<note>
|
<note>
|
||||||
@@ -150,8 +150,18 @@
|
|||||||
<term><option>--siblings-follow</option></term>
|
<term><option>--siblings-follow</option></term>
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
Have standbys attached to the old primary follow the new primary.
|
Have nodes attached to the old primary follow the new primary.
|
||||||
</para>
|
</para>
|
||||||
|
<para>
|
||||||
|
This will also ensure that a witness node, if in use, is updated
|
||||||
|
with the new primary's data.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
In a future &repmgr; release, <option>--siblings-follow</option> will be applied
|
||||||
|
by default.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
</variablelist>
|
</variablelist>
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
%filelist;
|
%filelist;
|
||||||
|
|
||||||
<!ENTITY repmgr "<productname>repmgr</productname>">
|
<!ENTITY repmgr "<productname>repmgr</productname>">
|
||||||
|
<!ENTITY repmgrd "<productname>repmgrd</productname>">
|
||||||
<!ENTITY postgres "<productname>PostgreSQL</productname>">
|
<!ENTITY postgres "<productname>PostgreSQL</productname>">
|
||||||
]>
|
]>
|
||||||
|
|
||||||
@@ -25,25 +26,31 @@
|
|||||||
<para>
|
<para>
|
||||||
This is the official documentation of &repmgr; &repmgrversion; for
|
This is the official documentation of &repmgr; &repmgrversion; for
|
||||||
use with PostgreSQL 9.3 - PostgreSQL 11.
|
use with PostgreSQL 9.3 - PostgreSQL 11.
|
||||||
It describes the functionality supported by the current version of &repmgr;.
|
</para>
|
||||||
|
<para>
|
||||||
|
&repmgr; is being continually developed and we strongly recommend using the
|
||||||
|
latest version. Please check the
|
||||||
|
<ulink url="https://repmgr.org/">repmgr website</ulink> for details
|
||||||
|
about the current &repmgr; version as well as the
|
||||||
|
<ulink url="https://repmgr.org/docs/current/index.html">current repmgr documentation</ulink>.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
&repmgr; was developed by
|
&repmgr; is developed by
|
||||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
||||||
along with contributions from other individuals and companies.
|
along with contributions from other individuals and companies.
|
||||||
Contributions from the community are appreciated and welcome - get
|
Contributions from the community are appreciated and welcome - get
|
||||||
in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</>
|
in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</ulink>
|
||||||
or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</>.
|
or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</ulink>.
|
||||||
Multiple 2ndQuadrant customers contribute funding
|
Multiple 2ndQuadrant customers contribute funding
|
||||||
to make repmgr development possible.
|
to make repmgr development possible.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
2ndQuadrant, a Platinum sponsor of the PostgreSQL project,
|
&repmgr; is fully supported by 2ndQuadrant's
|
||||||
continues to develop repmgr to meet internal needs and those of customers.
|
<ulink url="https://www.2ndquadrant.com/en/support/support-postgresql/">24/7 Production Support</ulink>.
|
||||||
Other companies as well as individual developers
|
2ndQuadrant, a Major Sponsor of the PostgreSQL project, continues to develop and maintain &repmgr;.
|
||||||
are welcome to participate in the efforts.
|
Other companies as well as individual developers are welcome to participate in the efforts.
|
||||||
</para>
|
</para>
|
||||||
</abstract>
|
</abstract>
|
||||||
|
|
||||||
@@ -73,23 +80,16 @@
|
|||||||
&promoting-standby;
|
&promoting-standby;
|
||||||
&follow-new-primary;
|
&follow-new-primary;
|
||||||
&switchover;
|
&switchover;
|
||||||
&configuring-witness-server;
|
|
||||||
&event-notifications;
|
&event-notifications;
|
||||||
&upgrading-repmgr;
|
&upgrading-repmgr;
|
||||||
</part>
|
</part>
|
||||||
|
|
||||||
<part id="using-repmgrd">
|
<part id="using-repmgrd">
|
||||||
<title>Using repmgrd</title>
|
<title>Using repmgrd</title>
|
||||||
|
&repmgrd-overview;
|
||||||
&repmgrd-automatic-failover;
|
&repmgrd-automatic-failover;
|
||||||
&repmgrd-configuration;
|
&repmgrd-configuration;
|
||||||
&repmgrd-demonstration;
|
&repmgrd-operation;
|
||||||
&repmgrd-cascading-replication;
|
|
||||||
&repmgrd-network-split;
|
|
||||||
&repmgrd-witness-server;
|
|
||||||
&repmgrd-pausing;
|
|
||||||
&repmgrd-degraded-monitoring;
|
|
||||||
&repmgrd-monitoring;
|
|
||||||
&repmgrd-notes;
|
|
||||||
&repmgrd-bdr;
|
&repmgrd-bdr;
|
||||||
</part>
|
</part>
|
||||||
|
|
||||||
|
|||||||
@@ -13,5 +13,285 @@
|
|||||||
providing monitoring information about the state of each standby.
|
providing monitoring information about the state of each standby.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>witness server</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>witness server</primary>
|
||||||
|
<secondary>repmgrd</secondary>
|
||||||
|
</indexterm>
|
||||||
|
<title>Using a witness server</title>
|
||||||
|
<para>
|
||||||
|
A <xref linkend="witness-server"> is a normal PostgreSQL instance which
|
||||||
|
is not part of the streaming replication cluster; its purpose is, if a
|
||||||
|
failover situation occurs, to provide proof that it is the primary server
|
||||||
|
itself which is unavailable, rather than e.g. a network split between
|
||||||
|
different physical locations.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
A typical use case for a witness server is a two-node streaming replication
|
||||||
|
setup, where the primary and standby are in different locations (data centres).
|
||||||
|
By creating a witness server in the same location (data centre) as the primary,
|
||||||
|
if the primary becomes unavailable it's possible for the standby to decide whether
|
||||||
|
it can promote itself without risking a "split brain" scenario: if it can't see either the
|
||||||
|
witness or the primary server, it's likely there's a network-level interruption
|
||||||
|
and it should not promote itself. If it can see the witness but not the primary,
|
||||||
|
this proves there is no network interruption and the primary itself is unavailable,
|
||||||
|
and it can therefore promote itself (and ideally take action to fence the
|
||||||
|
former primary).
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
<emphasis>Never</emphasis> install a witness server on the same physical host
|
||||||
|
as another node in the replication cluster managed by &repmgr; - it's essential
|
||||||
|
the witness is not affected in any way by failure of another node.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
<para>
|
||||||
|
For more complex replication scenarios, e.g. with multiple datacentres, it may
|
||||||
|
be preferable to use location-based failover, which ensures that only nodes
|
||||||
|
in the same location as the primary will ever be promotion candidates;
|
||||||
|
see <xref linkend="repmgrd-network-split"> for more details.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
|
<simpara>
|
||||||
|
A witness server will only be useful if <application>repmgrd</application>
|
||||||
|
is in use.
|
||||||
|
</simpara>
|
||||||
|
</note>
|
||||||
|
|
||||||
|
<sect2 id="creating-witness-server">
|
||||||
|
<title>Creating a witness server</title>
|
||||||
|
<para>
|
||||||
|
To create a witness server, set up a normal PostgreSQL instance on a server
|
||||||
|
in the same physical location as the cluster's primary server.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This instance should <emphasis>not</emphasis> be on the same physical host as the primary server,
|
||||||
|
as otherwise if the primary server fails due to hardware issues, the witness
|
||||||
|
server will be lost too.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<simpara>
|
||||||
|
&repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
|
||||||
|
command, which would automatically create a PostgreSQL instance. However
|
||||||
|
this often resulted in an unsatisfactory, hard-to-customise instance.
|
||||||
|
</simpara>
|
||||||
|
</note>
|
||||||
|
<para>
|
||||||
|
The witness server should be configured in the same way as a normal
|
||||||
|
&repmgr; node; see section <xref linkend="configuration">.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Register the witness server with <xref linkend="repmgr-witness-register">.
|
||||||
|
This will create the &repmgr; extension on the witness server, and make
|
||||||
|
a copy of the &repmgr; metadata.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<simpara>
|
||||||
|
As the witness server is not part of the replication cluster, further
|
||||||
|
changes to the &repmgr; metadata will be synchronised by
|
||||||
|
<application>repmgrd</application>.
|
||||||
|
</simpara>
|
||||||
|
</note>
|
||||||
|
<para>
|
||||||
|
Once the witness server has been configured, <application>repmgrd</application>
|
||||||
|
should be started.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
</sect2>
|
||||||
|
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>network splits</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>network splits</primary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>Handling network splits with repmgrd</title>
|
||||||
|
<para>
|
||||||
|
A common pattern for replication cluster setups is to spread servers over
|
||||||
|
more than one datacentre. This can provide benefits such as geographically-
|
||||||
|
distributed read replicas and DR (disaster recovery capability). However
|
||||||
|
this also means there is a risk of disconnection at network level between
|
||||||
|
datacentre locations, which would result in a split-brain scenario if
|
||||||
|
servers in a secondary data centre were no longer able to see the primary
|
||||||
|
in the main data centre and promoted a standby among themselves.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
&repmgr; enables provision of "<xref linkend="witness-server">" to
|
||||||
|
artificially create a quorum of servers in a particular location, ensuring
|
||||||
|
that nodes in another location will not elect a new primary if they
|
||||||
|
are unable to see the majority of nodes. However this approach does not
|
||||||
|
scale well, particularly with more complex replication setups, e.g.
|
||||||
|
where the majority of nodes are located outside of the primary datacentre.
|
||||||
|
It also means the <literal>witness</literal> node needs to be managed as an
|
||||||
|
extra PostgreSQL instance outside of the main replication cluster, which
|
||||||
|
adds administrative and programming complexity.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
<literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
|
||||||
|
each node is associated with an arbitrary location string (default is
|
||||||
|
<literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
|
||||||
|
<programlisting>
|
||||||
|
node_id=1
|
||||||
|
node_name=node1
|
||||||
|
conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
|
||||||
|
data_directory='/var/lib/postgresql/data'
|
||||||
|
location='dc1'</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
||||||
|
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
||||||
|
will assume a network interruption and not promote any node in any
|
||||||
|
other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
|
||||||
|
mode until a primary becomes visible).
|
||||||
|
</para>
|
||||||
|
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-standby-disconnection-on-failover" xreflabel="Standby disconnection on failover">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>standby disconnection on failover</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>standby disconnection on failover</primary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>Standby disconnection on failover</title>
|
||||||
|
<para>
|
||||||
|
If <option>standby_disconnect_on_failover</option> is set to <literal>true</literal> in
|
||||||
|
<filename>repmgr.conf</filename>, in a failover situation <application>repmgrd</application> will forcibly disconnect
|
||||||
|
the local node's WAL receiver before making a failover decision.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
<option>standby_disconnect_on_failover</option> is available from PostgreSQL 9.5 and later.
|
||||||
|
Additionally this requires that the <literal>repmgr</literal> database user is a superuser.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
<para>
|
||||||
|
By doing this, it's possible to ensure that, at the point the failover decision is made, no nodes
|
||||||
|
are receiving data from the primary and their LSN location will be static.
|
||||||
|
</para>
|
||||||
|
<important>
|
||||||
|
<para>
|
||||||
|
<option>standby_disconnect_on_failover</option> <emphasis>must</emphasis> be set to the same value on
|
||||||
|
all nodes.
|
||||||
|
</para>
|
||||||
|
</important>
|
||||||
|
<para>
|
||||||
|
Note that when using <option>standby_disconnect_on_failover</option> there will be a delay of 5 seconds
|
||||||
|
plus however many seconds it takes to confirm the WAL receiver is disconnected before
|
||||||
|
<application>repmgrd</application> proceeds with the failover decision.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Following the failover operation, no matter what the outcome, each node will reconnect its WAL receiver.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-failover-validation" xreflabel="Failover validation">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>failover validation</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>failover validation</primary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>Failover validation</title>
|
||||||
|
<para>
|
||||||
|
From <link linkend="release-4.3">repmgr 4.3</link>, &repmgr; makes it possible to provide a script
|
||||||
|
to <application>repmgrd</application> which, in a failover situation,
|
||||||
|
will be executed by the promotion candidate (the node which has been selected
|
||||||
|
to be the new primary) to confirm whether the node should actually be promoted.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
To use this, set <option>failover_validation_command</option> in <filename>repmgr.conf</filename>
|
||||||
|
to a script executable by the <literal>postgres</literal> system user, e.g.:
|
||||||
|
<programlisting>
|
||||||
|
failover_validation_command=/path/to/script.sh %n %a</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The <literal>%n</literal> parameter will be replaced with the node ID, and the
|
||||||
|
<literal>%a</literal> parameter will be replaced by the node name when the script is executed.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This script must return an exit code of <literal>0</literal> to indicate the node should promote itself.
|
||||||
|
Any other value will result in the promotion being aborted and the election rerun.
|
||||||
|
There is a pause of <option>election_rerun_interval</option> seconds before the election is rerun.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Sample <application>repmgrd</application> log file output during which the failover validation
|
||||||
|
script rejects the proposed promotion candidate:
|
||||||
|
<programlisting>
|
||||||
|
[2019-03-13 21:01:30] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
|
||||||
|
[2019-03-13 21:01:30] [NOTICE] promotion candidate is "node2" (ID: 2)
|
||||||
|
[2019-03-13 21:01:30] [NOTICE] executing "failover_validation_command"
|
||||||
|
[2019-03-13 21:01:30] [DETAIL] /usr/local/bin/failover-validation.sh 2
|
||||||
|
[2019-03-13 21:01:30] [INFO] output returned by failover validation command:
|
||||||
|
Node ID: 2
|
||||||
|
|
||||||
|
[2019-03-13 21:01:30] [NOTICE] failover validation command returned a non-zero value: "1"
|
||||||
|
[2019-03-13 21:01:30] [NOTICE] promotion candidate election will be rerun
|
||||||
|
[2019-03-13 21:01:30] [INFO] 1 followers to notify
|
||||||
|
[2019-03-13 21:01:30] [NOTICE] notifying node "node3" (node ID: 3) to rerun promotion candidate selection
|
||||||
|
INFO: node 3 received notification to rerun promotion candidate election
|
||||||
|
[2019-03-13 21:01:30] [NOTICE] rerunning election after 15 seconds ("election_rerun_interval")</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
<sect1 id="cascading-replication" xreflabel="Cascading replication">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>cascading replication</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>cascading replication</primary>
|
||||||
|
<secondary>repmgrd</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>repmgrd and cascading replication</title>
|
||||||
|
<para>
|
||||||
|
Cascading replication - where a standby can connect to an upstream node and not
|
||||||
|
the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
|
||||||
|
<application>repmgrd</application> support cascading replication by keeping track of the relationship
|
||||||
|
between standby servers - each node record is stored with the node id of its
|
||||||
|
upstream ("parent") server (except of course the primary server).
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
In a failover situation where the primary node fails and a top-level standby
|
||||||
|
is promoted, a standby connected to another standby will not be affected
|
||||||
|
and continue working as normal (even if the upstream standby it's connected
|
||||||
|
to becomes the primary node). If however the node's direct upstream fails,
|
||||||
|
the "cascaded standby" will attempt to reconnect to that node's parent
|
||||||
|
(unless <varname>failover</varname> is set to <literal>manual</literal> in
|
||||||
|
<filename>repmgr.conf</filename>).
|
||||||
|
</para>
|
||||||
|
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
|
||||||
</chapter>
|
</chapter>
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
<title>BDR failover with repmgrd</title>
|
<title>BDR failover with repmgrd</title>
|
||||||
<para>
|
<para>
|
||||||
&repmgr; 4.x provides support for monitoring BDR nodes and taking action in
|
&repmgr; 4.x provides support for monitoring a pair of BDR 2.x nodes and taking action in
|
||||||
case one of the nodes fails.
|
case one of the nodes fails.
|
||||||
</para>
|
</para>
|
||||||
<note>
|
<note>
|
||||||
@@ -31,8 +31,21 @@
|
|||||||
reconfigure a proxy server/connection pooler such as <application>PgBouncer</application>.
|
reconfigure a proxy server/connection pooler such as <application>PgBouncer</application>.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
|
<simpara>
|
||||||
|
This &repmgr; functionality is for BDR 2.x only running on PostgreSQL 9.4/9.6.
|
||||||
|
It is <emphasis>not</emphasis> required for later BDR versions.
|
||||||
|
</simpara>
|
||||||
|
</note>
|
||||||
|
|
||||||
<sect1 id="bdr-prerequisites" xreflabel="BDR prerequisites">
|
<sect1 id="bdr-prerequisites" xreflabel="BDR prerequisites">
|
||||||
<title>Prerequisites</title>
|
<title>Prerequisites</title>
|
||||||
|
<important>
|
||||||
|
<para>
|
||||||
|
This &repmgr; functionality is for BDR 2.x only running on PostgreSQL 9.4/9.6.
|
||||||
|
It is <emphasis>not</emphasis> required for later BDR versions.
|
||||||
|
</para>
|
||||||
|
</important>
|
||||||
<para>
|
<para>
|
||||||
&repmgr; 4 requires PostgreSQL 9.4 or 9.6 with the BDR 2 extension
|
&repmgr; 4 requires PostgreSQL 9.4 or 9.6 with the BDR 2 extension
|
||||||
enabled and configured for a two-node BDR network. &repmgr; 4 packages
|
enabled and configured for a two-node BDR network. &repmgr; 4 packages
|
||||||
|
|||||||
@@ -1,24 +0,0 @@
|
|||||||
<chapter id="repmgrd-cascading-replication">
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>cascading replication</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<title>repmgrd and cascading replication</title>
|
|
||||||
<para>
|
|
||||||
Cascading replication - where a standby can connect to an upstream node and not
|
|
||||||
the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
|
|
||||||
<application>repmgrd</application> support cascading replication by keeping track of the relationship
|
|
||||||
between standby servers - each node record is stored with the node id of its
|
|
||||||
upstream ("parent") server (except of course the primary server).
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
In a failover situation where the primary node fails and a top-level standby
|
|
||||||
is promoted, a standby connected to another standby will not be affected
|
|
||||||
and continue working as normal (even if the upstream standby it's connected
|
|
||||||
to becomes the primary node). If however the node's direct upstream fails,
|
|
||||||
the "cascaded standby" will attempt to reconnect to that node's parent
|
|
||||||
(unless <varname>failover</varname> is set to <literal>manual</literal> in
|
|
||||||
<filename>repmgr.conf</filename>).
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
<secondary>configuration</secondary>
|
<secondary>configuration</secondary>
|
||||||
</indexterm>
|
</indexterm>
|
||||||
|
|
||||||
<title>repmgrd configuration</title>
|
<title>repmgrd setup and configuration</title>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
<application>repmgrd</application> is a daemon which runs on each PostgreSQL node,
|
<application>repmgrd</application> is a daemon which runs on each PostgreSQL node,
|
||||||
@@ -20,7 +20,7 @@
|
|||||||
</para>
|
</para>
|
||||||
|
|
||||||
<sect1 id="repmgrd-basic-configuration">
|
<sect1 id="repmgrd-basic-configuration">
|
||||||
<title>repmgrd basic configuration</title>
|
<title>repmgrd configuration</title>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
To use <application>repmgrd</application>, its associated function library <emphasis>must</emphasis> be
|
To use <application>repmgrd</application>, its associated function library <emphasis>must</emphasis> be
|
||||||
@@ -34,21 +34,206 @@
|
|||||||
the <ulink url="https://www.postgresql.org/docs/current/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
the <ulink url="https://www.postgresql.org/docs/current/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
The following configuration options apply to <application>repmgrd</application> in all circumstances:
|
||||||
|
</para>
|
||||||
|
<variablelist>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>monitor_interval_secs</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>monitor_interval_secs</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
The interval (in seconds, default: <literal>2</literal>) to check the availability of the upstream node.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry id="connection-check-type">
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>connection_check_type</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>connection_check_type</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
The option <option>connection_check_type</option> is used to select the method
|
||||||
|
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Possible values are:
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>ping</literal> (default) - uses <command>PQping()</command> to
|
||||||
|
determine server availability
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>connection</literal> - determines server availability
|
||||||
|
by attempting to make a new connection to the upstream node
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<literal>query</literal> - determines server availability
|
||||||
|
by executing an SQL statement on the node via the existing connection
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>reconnect_attempts</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>reconnect_attempts</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
The number of attempts (default: <literal>6</literal>) that will be made to reconnect to an unreachable
|
||||||
|
upstream node before initiating a failover.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
There will be an interval of <option>reconnect_interval</option> seconds between each reconnection
|
||||||
|
attempt.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>reconnect_interval</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>reconnect_interval</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Interval (in seconds, default: <literal>10</literal>) between attempts to reconnect to an unreachable
|
||||||
|
upstream node.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The number of reconnection attempts is defined by the parameter <option>reconnect_attempts</option>.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>degraded_monitoring_timeout</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>degraded_monitoring_timeout</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Interval (in seconds) after which <application>repmgrd</application> will terminate if
|
||||||
|
either of the servers (local node and or upstream node) being monitored is no longer available
|
||||||
|
(<link linkend="repmgrd-degraded-monitoring">degraded monitoring mode</link>).
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
<literal>-1</literal> (default) disables this timeout completely.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
</variablelist>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
See also <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename> for an annotated sample configuration file.
|
||||||
|
</para>
|
||||||
|
|
||||||
<sect2 id="repmgrd-automatic-failover-configuration">
|
<sect2 id="repmgrd-automatic-failover-configuration">
|
||||||
<title>Automatic failover configuration</title>
|
<title>Required configuration for automatic failover</title>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
If using automatic failover, the following <application>repmgrd</application> options *must* be set in
|
The following <application>repmgrd</application> options <emphasis>must</emphasis> be set in
|
||||||
<filename>repmgr.conf</filename>:
|
<filename>repmgr.conf</filename>:
|
||||||
|
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
<listitem>
|
||||||
|
<simpara><option>failover</option></simpara>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<simpara><option>promote_command</option></simpara>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<simpara><option>follow_command</option></simpara>
|
||||||
|
</listitem>
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Example:
|
||||||
<programlisting>
|
<programlisting>
|
||||||
failover=automatic
|
failover=automatic
|
||||||
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'
|
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'
|
||||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
Adjust file paths as appropriate; alway specify the full path to the &repmgr; binary.
|
Details of each option are as follows:
|
||||||
|
</para>
|
||||||
|
<variablelist>
|
||||||
|
<varlistentry>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>failover</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>failover</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<option>failover</option> can be one of <literal>automatic</literal> or <literal>manual</literal>.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
If <option>failover</option> is set to <literal>manual</literal>, <application>repmgrd</application>
|
||||||
|
will not take any action if a failover situation is detected, and the node may need to
|
||||||
|
be modified manually (e.g. by executing <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>).
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>promote_command</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>promote_command</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
The program or script defined in <option>promote_command</option> will be executed
|
||||||
|
in a failover situation when <application>repmgrd</application> determines that
|
||||||
|
the current node is to become the new primary node.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Normally <option>promote_command</option> is set as &repmgr;'s
|
||||||
|
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> command.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
It is also possible to provide a shell script to e.g. perform user-defined tasks
|
||||||
|
before promoting the current node. In this case the script <emphasis>must</emphasis>
|
||||||
|
at some point execute <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
||||||
|
to promote the node; if this is not done, &repmgr; metadata will not be updated and
|
||||||
|
&repmgr; will no longer function reliably.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Example:
|
||||||
|
<programlisting>
|
||||||
|
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'</programlisting>
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Note that the <literal>--log-to-file</literal> option will cause
|
||||||
|
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||||
|
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||||
|
</para>
|
||||||
<note>
|
<note>
|
||||||
<para>
|
<para>
|
||||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||||
@@ -56,51 +241,204 @@
|
|||||||
specified with the full path.
|
specified with the full path.
|
||||||
</para>
|
</para>
|
||||||
</note>
|
</note>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>follow_command</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>follow_command</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
The program or script defined in <option>follow_command</option> will be executed
|
||||||
|
in a failover situation when <application>repmgrd</application> determines that
|
||||||
|
the current node is to follow the new primary node.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Normally <option>follow_command</option> is set as &repmgr;'s
|
||||||
|
<command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command> command.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The <option>follow_command</option> parameter
|
||||||
|
should provide the <literal>--upstream-node-id=%n</literal>
|
||||||
|
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
||||||
|
<application>repmgrd</application> with the ID of the new primary node. If this is not provided,
|
||||||
|
<command>repmgr standby follow</command> will attempt to determine the new primary by itself, but if the
|
||||||
|
original primary comes back online after the new primary is promoted, there is a risk that
|
||||||
|
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||||
|
the original primary.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
It is also possible to provide a shell script to e.g. perform user-defined tasks
|
||||||
|
before following the new primary node. In this case the script <emphasis>must</emphasis>
|
||||||
|
at some point execute <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>
|
||||||
|
to follow the new primary; if this is not done, &repmgr; metadata will not be updated and
|
||||||
|
&repmgr; will no longer function reliably.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Example:
|
||||||
|
<programlisting>
|
||||||
|
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
Note that the <literal>--log-to-file</literal> option will cause
|
Note that the <literal>--log-to-file</literal> option will cause
|
||||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||||
See <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename>
|
|
||||||
for further <application>repmgrd</application>-specific settings.
|
|
||||||
</para>
|
</para>
|
||||||
<para>
|
|
||||||
When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure
|
|
||||||
of the current primary, <application>repmgrd</application> will execute one of:
|
|
||||||
</para>
|
|
||||||
<itemizedlist spacing="compact" mark="bullet">
|
|
||||||
<listitem>
|
|
||||||
<simpara>
|
|
||||||
<varname>promote_command</varname> (if the current server is to become the new primary)
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
<listitem>
|
|
||||||
<simpara>
|
|
||||||
<varname>follow_command</varname> (if the current server needs to follow another server which has
|
|
||||||
become the new primary)
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
<note>
|
<note>
|
||||||
<para>
|
<para>
|
||||||
These commands can be any valid shell script which results in one of these
|
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||||
two actions happening, but if &repmgr;'s <command>standby follow</command> or
|
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||||
<command>standby promote</command>
|
specified with the full path.
|
||||||
commands are not executed (either directly as shown here, or from a script which
|
</para>
|
||||||
performs other actions), the &repmgr; metadata will not be updated and
|
</note>
|
||||||
&repmgr; will no longer function reliably.
|
</listitem>
|
||||||
|
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
</variablelist>
|
||||||
|
|
||||||
|
|
||||||
|
</sect2>
|
||||||
|
|
||||||
|
<sect2 id="repmgrd-automatic-failover-configuration-optional">
|
||||||
|
<title>Optional configuration for automatic failover</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
The following configuration options can be used to fine-tune automatic failover:
|
||||||
|
</para>
|
||||||
|
<variablelist>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>priority</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>priority</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Indicates a preferred priority (default: <literal>100</literal>) for promoting nodes;
|
||||||
|
a value of zero prevents the node being promoted to primary.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Note that the priority setting is only applied if two or more nodes are
|
||||||
|
determined as promotion candidates; in that case the node with the
|
||||||
|
higher priority is selected.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>failover_validation_command</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>failover_validation_command</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
User-defined script to execute for an external mechanism to validate the failover
|
||||||
|
decision made by <application>repmgrd</application>.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
This option <emphasis>must</emphasis> be identically configured
|
||||||
|
on all nodes.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
<para>
|
||||||
|
One or both of the following parameter placeholders
|
||||||
|
should be provided, which will be replaced by repmgrd with the appropriate
|
||||||
|
value:
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
<listitem>
|
||||||
|
<simpara><literal>%n</literal>: node ID</simpara>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<simpara><literal>%a</literal>: node name</simpara>
|
||||||
|
</listitem>
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
See also: <link linkend="repmgrd-failover-validation">Failover validation</link>.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>standby_disconnect_on_failover</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>standby_disconnect_on_failover</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
In a failover situation, disconnect the local node's WAL receiver.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This option is available from PostgreSQL 9.5 and later.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
This option <emphasis>must</emphasis> be identically configured
|
||||||
|
on all nodes.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Additionally the &repmgr; user <emphasis>must</emphasis> be a superuser
|
||||||
|
for this option.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
<application>repmgrd</application> will refuse to start if this option is set
|
||||||
|
but either of these prerequisites is not met.
|
||||||
</para>
|
</para>
|
||||||
</note>
|
</note>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal>
|
See also: <link linkend="repmgrd-standby-disconnection-on-failover">Standby disconnection on failover</link>.
|
||||||
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
|
||||||
<application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr;
|
|
||||||
will attempt to determine the new primary by itself, but if the
|
|
||||||
original primary comes back online after the new primary is promoted, there is a risk that
|
|
||||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
|
||||||
the original primary.
|
|
||||||
</para>
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
</variablelist>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
The following options can be used to further fine-tune failover behaviour.
|
||||||
|
In practice it's unlikely these will need to be changed from their default
|
||||||
|
values, but are available as configuration options should the need arise.
|
||||||
|
</para>
|
||||||
|
<variablelist>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>election_rerun_interval</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>election_rerun_interval</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
If <option>failover_validation_command</option> is set, and the command returns
|
||||||
|
an error, pause the specified amount of seconds (default: 15) before rerunning the election.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>sibling_nodes_disconnect_timeout</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>sibling_nodes_disconnect_timeout</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
If <option>standby_disconnect_on_failover</option> is <literal>true</literal>, the
|
||||||
|
maximum length of time (in seconds, default: <literal>30</literal>)
|
||||||
|
to wait for other standbys to confirm they have disconnected their
|
||||||
|
WAL receivers.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
</variablelist>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|
||||||
<sect2 id="postgresql-service-configuration">
|
<sect2 id="postgresql-service-configuration">
|
||||||
@@ -175,10 +513,8 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
|||||||
in <filename>repmgr.conf</filename>.
|
in <filename>repmgr.conf</filename>.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
The default monitoring interval is 2 seconds; this value can be explicitly set using:
|
Monitoring data is written at the interval defined by
|
||||||
<programlisting>
|
the option <option>monitor_interval_secs</option> (see above).
|
||||||
monitor_interval_secs=<seconds></programlisting>
|
|
||||||
in <filename>repmgr.conf</filename>.
|
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
For more details on monitoring, see <xref linkend="repmgrd-monitoring">.
|
For more details on monitoring, see <xref linkend="repmgrd-monitoring">.
|
||||||
@@ -228,6 +564,13 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
|||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<varname>connection_check_type</varname>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
<varname>conninfo</varname>
|
<varname>conninfo</varname>
|
||||||
@@ -252,6 +595,12 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
|||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<varname>failover_validation_command</varname>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
<varname>failover</varname>
|
<varname>failover</varname>
|
||||||
@@ -324,12 +673,30 @@ repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
|||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<varname>retry_promote_interval_secs</varname>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
<varname>repmgrd_standby_startup_timeout</varname>
|
<varname>repmgrd_standby_startup_timeout</varname>
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<varname>sibling_nodes_disconnect_timeout</varname>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<varname>standby_disconnect_on_failover</varname>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
|
|||||||
@@ -1,83 +0,0 @@
|
|||||||
<chapter id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>degraded monitoring</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<title>"degraded monitoring" mode</title>
|
|
||||||
<para>
|
|
||||||
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
|
||||||
of monitoring the node's upstream server. In these cases it enters "degraded monitoring"
|
|
||||||
mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
|
||||||
to be resolved.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Situations where this happens are:
|
|
||||||
<itemizedlist spacing="compact" mark="bullet">
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>a failover situation has occurred, no nodes in the primary node's location are visible</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>a failover situation has occurred, but no primary has become available</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
|
|
||||||
</listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
|
|
||||||
and the primary node is unavailable (but is later restarted):
|
|
||||||
<programlisting>
|
|
||||||
[2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
|
|
||||||
[2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
|
||||||
[2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
|
|
||||||
[2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
|
|
||||||
(...)
|
|
||||||
[2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
|
|
||||||
[2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
|
|
||||||
[2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
|
|
||||||
[2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
|
|
||||||
[2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
|
|
||||||
[2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
|
||||||
[2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
|
||||||
[2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
|
|
||||||
[2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
|
|
||||||
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
|
|
||||||
However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
|
|
||||||
after which <application>repmgrd</application> will terminate.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
If <application>repmgrd</application> is monitoring a primary node which has been stopped
|
|
||||||
and manually restarted as a standby attached to a new primary, it will automatically detect
|
|
||||||
the status change and update the node record to reflect the node's new status
|
|
||||||
as an active standby. It will then resume monitoring the node as a standby.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
@@ -1,96 +0,0 @@
|
|||||||
<chapter id="repmgrd-demonstration">
|
|
||||||
<title>repmgrd demonstration</title>
|
|
||||||
<para>
|
|
||||||
To demonstrate automatic failover, set up a 3-node replication cluster (one primary
|
|
||||||
and two standbys streaming directly from the primary) so that the cluster looks
|
|
||||||
something like this:
|
|
||||||
<programlisting>
|
|
||||||
$ repmgr -f /etc/repmgr.conf cluster show
|
|
||||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
|
||||||
----+-------+---------+-----------+----------+----------+--------------------------------------
|
|
||||||
1 | node1 | primary | * running | | default | host=node1 dbname=repmgr user=repmgr
|
|
||||||
2 | node2 | standby | running | node1 | default | host=node2 dbname=repmgr user=repmgr
|
|
||||||
3 | node3 | standby | running | node1 | default | host=node3 dbname=repmgr user=repmgr</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Start <application>repmgrd</application> on each standby and verify that it's running by examining the
|
|
||||||
log output, which at log level <literal>INFO</literal> will look like this:
|
|
||||||
<programlisting>
|
|
||||||
[2017-08-24 17:31:00] [NOTICE] using configuration file "/etc/repmgr.conf"
|
|
||||||
[2017-08-24 17:31:00] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr"
|
|
||||||
[2017-08-24 17:31:00] [NOTICE] starting monitoring of node <literal>node2</literal> (ID: 2)
|
|
||||||
[2017-08-24 17:31:00] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Each <application>repmgrd</application> should also have recorded its successful startup as an event:
|
|
||||||
<programlisting>
|
|
||||||
$ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
|
|
||||||
Node ID | Name | Event | OK | Timestamp | Details
|
|
||||||
---------+-------+---------------+----+---------------------+-------------------------------------------------------------
|
|
||||||
3 | node3 | repmgrd_start | t | 2017-08-24 17:35:54 | monitoring connection to upstream node "node1" (node ID: 1)
|
|
||||||
2 | node2 | repmgrd_start | t | 2017-08-24 17:35:50 | monitoring connection to upstream node "node1" (node ID: 1)
|
|
||||||
1 | node1 | repmgrd_start | t | 2017-08-24 17:35:46 | monitoring cluster primary "node1" (node ID: 1) </programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Now stop the current primary server with e.g.:
|
|
||||||
<programlisting>
|
|
||||||
pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This will force the primary to shut down straight away, aborting all processes
|
|
||||||
and transactions. This will cause a flurry of activity in the <application>repmgrd</application> log
|
|
||||||
files as each <application>repmgrd</application> detects the failure of the primary and a failover
|
|
||||||
decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
|
|
||||||
which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
|
|
||||||
<programlisting>
|
|
||||||
[2017-08-24 23:32:01] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state
|
|
||||||
[2017-08-24 23:32:08] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
|
||||||
[2017-08-24 23:32:08] [INFO] checking state of node 1, 1 of 5 attempts
|
|
||||||
[2017-08-24 23:32:08] [INFO] sleeping 1 seconds until next reconnection attempt
|
|
||||||
[2017-08-24 23:32:09] [INFO] checking state of node 1, 2 of 5 attempts
|
|
||||||
[2017-08-24 23:32:09] [INFO] sleeping 1 seconds until next reconnection attempt
|
|
||||||
[2017-08-24 23:32:10] [INFO] checking state of node 1, 3 of 5 attempts
|
|
||||||
[2017-08-24 23:32:10] [INFO] sleeping 1 seconds until next reconnection attempt
|
|
||||||
[2017-08-24 23:32:11] [INFO] checking state of node 1, 4 of 5 attempts
|
|
||||||
[2017-08-24 23:32:11] [INFO] sleeping 1 seconds until next reconnection attempt
|
|
||||||
[2017-08-24 23:32:12] [INFO] checking state of node 1, 5 of 5 attempts
|
|
||||||
[2017-08-24 23:32:12] [WARNING] unable to reconnect to node 1 after 5 attempts
|
|
||||||
INFO: setting voting term to 1
|
|
||||||
INFO: node 2 is candidate
|
|
||||||
INFO: node 3 has received request from node 2 for electoral term 1 (our term: 0)
|
|
||||||
[2017-08-24 23:32:12] [NOTICE] this node is the winner, will now promote self and inform other nodes
|
|
||||||
INFO: connecting to standby database
|
|
||||||
NOTICE: promoting standby
|
|
||||||
DETAIL: promoting server using 'pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' promote'
|
|
||||||
INFO: reconnecting to promoted server
|
|
||||||
NOTICE: STANDBY PROMOTE successful
|
|
||||||
DETAIL: node 2 was successfully promoted to primary
|
|
||||||
INFO: node 3 received notification to follow node 2
|
|
||||||
[2017-08-24 23:32:13] [INFO] switching to primary monitoring mode</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The cluster status will now look like this, with the original primary (<literal>node1</literal>)
|
|
||||||
marked as inactive, and standby <literal>node3</literal> now following the new primary
|
|
||||||
(<literal>node2</literal>):
|
|
||||||
<programlisting>
|
|
||||||
$ repmgr -f /etc/repmgr.conf cluster show
|
|
||||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
|
||||||
----+-------+---------+-----------+----------+----------+----------------------------------------------------
|
|
||||||
1 | node1 | primary | - failed | | default | host=node1 dbname=repmgr user=repmgr
|
|
||||||
2 | node2 | primary | * running | | default | host=node2 dbname=repmgr user=repmgr
|
|
||||||
3 | node3 | standby | running | node2 | default | host=node3 dbname=repmgr user=repmgr</programlisting>
|
|
||||||
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
<command>repmgr cluster event</command> will display a summary of what happened to each server
|
|
||||||
during the failover:
|
|
||||||
<programlisting>
|
|
||||||
$ repmgr -f /etc/repmgr.conf cluster event
|
|
||||||
Node ID | Name | Event | OK | Timestamp | Details
|
|
||||||
---------+-------+--------------------------+----+---------------------+-----------------------------------------------------------------------------------
|
|
||||||
3 | node3 | repmgrd_failover_follow | t | 2017-08-24 23:32:16 | node 3 now following new upstream node 2
|
|
||||||
3 | node3 | standby_follow | t | 2017-08-24 23:32:16 | node 3 is now attached to node 2
|
|
||||||
2 | node2 | repmgrd_failover_promote | t | 2017-08-24 23:32:13 | node 2 promoted to primary; old primary 1 marked as failed
|
|
||||||
2 | node2 | standby_promote | t | 2017-08-24 23:32:13 | node 2 was successfully promoted to primary</programlisting>
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
<chapter id="repmgrd-monitoring" xreflabel="Monitoring with repmgrd">
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>monitoring</secondary>
|
|
||||||
</indexterm>
|
|
||||||
<indexterm>
|
|
||||||
<primary>monitoring</primary>
|
|
||||||
<secondary>with repmgrd</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<title>Monitoring with repmgrd</title>
|
|
||||||
<para>
|
|
||||||
When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
|
|
||||||
it will constantly write standby node status information to the
|
|
||||||
<varname>monitoring_history</varname> table, providing a near-real time
|
|
||||||
overview of replication status on all nodes
|
|
||||||
in the cluster.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The view <literal>replication_status</literal> shows the most recent state
|
|
||||||
for each node, e.g.:
|
|
||||||
<programlisting>
|
|
||||||
repmgr=# select * from repmgr.replication_status;
|
|
||||||
-[ RECORD 1 ]-------------+------------------------------
|
|
||||||
primary_node_id | 1
|
|
||||||
standby_node_id | 2
|
|
||||||
standby_name | node2
|
|
||||||
node_type | standby
|
|
||||||
active | t
|
|
||||||
last_monitor_time | 2017-08-24 16:28:41.260478+09
|
|
||||||
last_wal_primary_location | 0/6D57A00
|
|
||||||
last_wal_standby_location | 0/5000000
|
|
||||||
replication_lag | 29 MB
|
|
||||||
replication_time_lag | 00:00:11.736163
|
|
||||||
apply_lag | 15 MB
|
|
||||||
communication_time_lag | 00:00:01.365643</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The interval in which monitoring history is written is controlled by the
|
|
||||||
configuration parameter <varname>monitor_interval_secs</varname>;
|
|
||||||
default is 2.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
As this can generate a large amount of monitoring data in the table
|
|
||||||
<literal>repmgr.monitoring_history</literal>, it's advisable to regularly
|
|
||||||
purge historical data using the <xref linkend="repmgr-cluster-cleanup">
|
|
||||||
command; use the <literal>-k/--keep-history</literal> option to
|
|
||||||
specify how many days' worth of data should be retained.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
It's possible to use <application>repmgrd</application> to run in monitoring
|
|
||||||
mode only (without automatic failover capability) for some or all
|
|
||||||
nodes by setting <literal>failover=manual</literal> in the node's
|
|
||||||
<filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
|
|
||||||
no failover action will be taken and the node will require manual intervention to
|
|
||||||
be reattached to replication. If this occurs, an
|
|
||||||
<link linkend="event-notifications">event notification</link>
|
|
||||||
<varname>standby_disconnect_manual</varname> will be created.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
Note that when a standby node is not streaming directly from its upstream
|
|
||||||
node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
|
|
||||||
<literal>0 bytes</literal>.
|
|
||||||
</para>
|
|
||||||
<tip>
|
|
||||||
<para>
|
|
||||||
If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
|
|
||||||
table will be replicated to attached standbys. This means there will be a small but
|
|
||||||
constant stream of replication activity which may not be desirable. To prevent
|
|
||||||
this, convert the table to an <literal>UNLOGGED</literal> one with:
|
|
||||||
<programlisting>
|
|
||||||
ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This will however mean that monitoring history will not be available on
|
|
||||||
another node following a failover, and the view <literal>repmgr.replication_status</literal>
|
|
||||||
will not work on standbys.
|
|
||||||
</para>
|
|
||||||
</tip>
|
|
||||||
</chapter>
|
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
<chapter id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>network splits</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<title>Handling network splits with repmgrd</title>
|
|
||||||
<para>
|
|
||||||
A common pattern for replication cluster setups is to spread servers over
|
|
||||||
more than one datacentre. This can provide benefits such as geographically-
|
|
||||||
distributed read replicas and DR (disaster recovery capability). However
|
|
||||||
this also means there is a risk of disconnection at network level between
|
|
||||||
datacentre locations, which would result in a split-brain scenario if
|
|
||||||
servers in a secondary data centre were no longer able to see the primary
|
|
||||||
in the main data centre and promoted a standby among themselves.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
&repmgr; enables provision of "<xref linkend="witness-server">" to
|
|
||||||
artificially create a quorum of servers in a particular location, ensuring
|
|
||||||
that nodes in another location will not elect a new primary if they
|
|
||||||
are unable to see the majority of nodes. However this approach does not
|
|
||||||
scale well, particularly with more complex replication setups, e.g.
|
|
||||||
where the majority of nodes are located outside of the primary datacentre.
|
|
||||||
It also means the <literal>witness</literal> node needs to be managed as an
|
|
||||||
extra PostgreSQL instance outside of the main replication cluster, which
|
|
||||||
adds administrative and programming complexity.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
<literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
|
|
||||||
each node is associated with an arbitrary location string (default is
|
|
||||||
<literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
|
|
||||||
<programlisting>
|
|
||||||
node_id=1
|
|
||||||
node_name=node1
|
|
||||||
conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
|
|
||||||
data_directory='/var/lib/postgresql/data'
|
|
||||||
location='dc1'</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
|
||||||
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
|
||||||
will assume a network interruption and not promote any node in any
|
|
||||||
other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
|
|
||||||
mode until a primary becomes visible).
|
|
||||||
</para>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
<chapter id="repmgrd-notes" xreflabel="repmgrd notes">
|
|
||||||
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>notes</secondary>
|
|
||||||
</indexterm>
|
|
||||||
<title>repmgrd notes</title>
|
|
||||||
|
|
||||||
<sect1 id="repmgrd-wal-replay-pause">
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>paused WAL replay</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<title>repmgrd and paused WAL replay</title>
|
|
||||||
<para>
|
|
||||||
If WAL replay has been paused (using <command>pg_wal_replay_pause()</command>,
|
|
||||||
on PostgreSQL 9.6 and earlier <command>pg_xlog_replay_pause()</command>),
|
|
||||||
in a failover situation <application>repmgrd</application> will
|
|
||||||
automatically resume WAL replay.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This is because if WAL replay is paused, but WAL is pending replay,
|
|
||||||
PostgreSQL cannot be promoted until WAL replay is resumed.
|
|
||||||
</para>
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
|
||||||
will refuse to promote a node in this state, as the PostgreSQL
|
|
||||||
<command>promote</command> command will not be acted on until
|
|
||||||
WAL replay is resumed, leaving the cluster in a potentially
|
|
||||||
unstable state. In this case it is up to the user to
|
|
||||||
decide whether to resume WAL replay.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
</chapter>
|
|
||||||
386
doc/repmgrd-operation.sgml
Normal file
386
doc/repmgrd-operation.sgml
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
<chapter id="repmgrd-operation" xreflabel="repmgrd operation">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>operation</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>repmgrd operation</title>
|
||||||
|
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-pausing">
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>pausing</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>pausing repmgrd</primary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>Pausing repmgrd</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
In normal operation, <application>repmgrd</application> monitors the state of the
|
||||||
|
PostgreSQL node it is running on, and will take appropriate action if problems
|
||||||
|
are detected, e.g. (if so configured) promote the node to primary, if the existing
|
||||||
|
primary has been determined as failed.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
However, <application>repmgrd</application> is unable to distinguish between
|
||||||
|
planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
|
||||||
|
or installing PostgreSQL maintenance releases), and an actual server outage. In versions prior to
|
||||||
|
&repmgr; 4.2 it was necessary to stop <application>repmgrd</application> on all nodes (or at least
|
||||||
|
on all nodes where <application>repmgrd</application> is
|
||||||
|
<link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
|
||||||
|
to prevent <application>repmgrd</application> from making unintentional changes to the
|
||||||
|
replication cluster.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
|
||||||
|
can now be "paused", i.e. instructed not to take any action such as performing a failover.
|
||||||
|
This can be done from any node in the cluster, removing the need to stop/restart
|
||||||
|
each <application>repmgrd</application> individually.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
For major PostgreSQL upgrades, e.g. from PostgreSQL 10 to PostgreSQL 11,
|
||||||
|
<application>repmgrd</application> should be shut down completely and only started up
|
||||||
|
once the &repmgr; packages for the new PostgreSQL major version have been installed.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
|
||||||
|
<sect2 id="repmgrd-pausing-prerequisites">
|
||||||
|
<title>Prerequisites for pausing <application>repmgrd</application></title>
|
||||||
|
<para>
|
||||||
|
In order to be able to pause/unpause <application>repmgrd</application>, the following
|
||||||
|
prerequisites must be met:
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
PostgreSQL on all nodes must be accessible from the node where the
|
||||||
|
<literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
|
||||||
|
<varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
These conditions are required for normal &repmgr; operation in any case.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
|
||||||
|
</sect2>
|
||||||
|
|
||||||
|
<sect2 id="repmgrd-pausing-execution">
|
||||||
|
<title>Pausing/unpausing <application>repmgrd</application></title>
|
||||||
|
<para>
|
||||||
|
To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
|
||||||
|
<programlisting>
|
||||||
|
$ repmgr -f /etc/repmgr.conf daemon pause
|
||||||
|
NOTICE: node 1 (node1) paused
|
||||||
|
NOTICE: node 2 (node2) paused
|
||||||
|
NOTICE: node 3 (node3) paused</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The state of <application>repmgrd</application> on each node can be checked with
|
||||||
|
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
|
||||||
|
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
||||||
|
ID | Name | Role | Status | repmgrd | PID | Paused?
|
||||||
|
----+-------+---------+---------+---------+------+---------
|
||||||
|
1 | node1 | primary | running | running | 7851 | yes
|
||||||
|
2 | node2 | standby | running | running | 7889 | yes
|
||||||
|
3 | node3 | standby | running | running | 7918 | yes</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
If executing a switchover with <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||||
|
&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
|
||||||
|
running on one of the standbys (here: <literal>node2</literal>) will react like this:
|
||||||
|
<programlisting>
|
||||||
|
[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||||
|
[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
|
||||||
|
[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||||
|
...
|
||||||
|
[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||||
|
[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
|
||||||
|
[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||||
|
[2018-09-20 12:22:25] [NOTICE] node is paused
|
||||||
|
[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
|
||||||
|
[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
|
||||||
|
[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
|
||||||
|
will automatically reconnect, e.g.:
|
||||||
|
<programlisting>
|
||||||
|
[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
|
||||||
|
<programlisting>
|
||||||
|
$ repmgr -f /etc/repmgr.conf daemon unpause
|
||||||
|
NOTICE: node 1 (node1) unpaused
|
||||||
|
NOTICE: node 2 (node2) unpaused
|
||||||
|
NOTICE: node 3 (node3) unpaused</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
If the previous primary is no longer accessible when <application>repmgrd</application>
|
||||||
|
is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
|
||||||
|
<link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
|
||||||
|
and any standbys attached to the new primary with
|
||||||
|
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||||
|
resulting in the automatic promotion of a new primary, which may be a problem particularly
|
||||||
|
in larger clusters, where <application>repmgrd</application> could select a different promotion
|
||||||
|
candidate to the one intended by the administrator.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
</sect2>
|
||||||
|
<sect2 id="repmgrd-pausing-details">
|
||||||
|
<title>Details on the <application>repmgrd</application> pausing mechanism</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
The pause state of each node will be preserved across a PostgreSQL restart.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||||
|
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
|
||||||
|
executed even if <application>repmgrd</application> is not running; in this case,
|
||||||
|
<application>repmgrd</application> will start up in whichever pause state has been set.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||||
|
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||||
|
<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
</sect2>
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-wal-replay-pause">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>paused WAL replay</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>repmgrd and paused WAL replay</title>
|
||||||
|
<para>
|
||||||
|
If WAL replay has been paused (using <command>pg_wal_replay_pause()</command>,
|
||||||
|
on PostgreSQL 9.6 and earlier <command>pg_xlog_replay_pause()</command>),
|
||||||
|
in a failover situation <application>repmgrd</application> will
|
||||||
|
automatically resume WAL replay.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This is because if WAL replay is paused, but WAL is pending replay,
|
||||||
|
PostgreSQL cannot be promoted until WAL replay is resumed.
|
||||||
|
</para>
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
||||||
|
will refuse to promote a node in this state, as the PostgreSQL
|
||||||
|
<command>promote</command> command will not be acted on until
|
||||||
|
WAL replay is resumed, leaving the cluster in a potentially
|
||||||
|
unstable state. In this case it is up to the user to
|
||||||
|
decide whether to resume WAL replay.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>degraded monitoring</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>degraded monitoring</primary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>"degraded monitoring" mode</title>
|
||||||
|
<para>
|
||||||
|
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
||||||
|
of monitoring the node's upstream server. In these cases it enters "degraded monitoring"
|
||||||
|
mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||||
|
to be resolved.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Situations where this happens are:
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>a failover situation has occurred, no nodes in the primary node's location are visible</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>a failover situation has occurred, but no primary has become available</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
|
||||||
|
</listitem>
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
|
||||||
|
and the primary node is unavailable (but is later restarted):
|
||||||
|
<programlisting>
|
||||||
|
[2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
|
||||||
|
[2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||||
|
[2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
|
||||||
|
[2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||||
|
(...)
|
||||||
|
[2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
|
||||||
|
[2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||||
|
[2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
|
||||||
|
[2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
|
||||||
|
[2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
|
||||||
|
[2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||||
|
[2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||||
|
[2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
|
||||||
|
[2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
|
||||||
|
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
|
||||||
|
However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
|
||||||
|
after which <application>repmgrd</application> will terminate.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
If <application>repmgrd</application> is monitoring a primary node which has been stopped
|
||||||
|
and manually restarted as a standby attached to a new primary, it will automatically detect
|
||||||
|
the status change and update the node record to reflect the node's new status
|
||||||
|
as an active standby. It will then resume monitoring the node as a standby.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-monitoring" xreflabel="Storing monitoring data">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>monitoring</secondary>
|
||||||
|
</indexterm>
|
||||||
|
<indexterm>
|
||||||
|
<primary>monitoring</primary>
|
||||||
|
<secondary>with repmgrd</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>Storing monitoring data</title>
|
||||||
|
<para>
|
||||||
|
When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
|
||||||
|
it will constantly write standby node status information to the
|
||||||
|
<varname>monitoring_history</varname> table, providing a near-real time
|
||||||
|
overview of replication status on all nodes
|
||||||
|
in the cluster.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The view <literal>replication_status</literal> shows the most recent state
|
||||||
|
for each node, e.g.:
|
||||||
|
<programlisting>
|
||||||
|
repmgr=# select * from repmgr.replication_status;
|
||||||
|
-[ RECORD 1 ]-------------+------------------------------
|
||||||
|
primary_node_id | 1
|
||||||
|
standby_node_id | 2
|
||||||
|
standby_name | node2
|
||||||
|
node_type | standby
|
||||||
|
active | t
|
||||||
|
last_monitor_time | 2017-08-24 16:28:41.260478+09
|
||||||
|
last_wal_primary_location | 0/6D57A00
|
||||||
|
last_wal_standby_location | 0/5000000
|
||||||
|
replication_lag | 29 MB
|
||||||
|
replication_time_lag | 00:00:11.736163
|
||||||
|
apply_lag | 15 MB
|
||||||
|
communication_time_lag | 00:00:01.365643</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The interval in which monitoring history is written is controlled by the
|
||||||
|
configuration parameter <varname>monitor_interval_secs</varname>;
|
||||||
|
default is 2.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
As this can generate a large amount of monitoring data in the table
|
||||||
|
<literal>repmgr.monitoring_history</literal>, it's advisable to regularly
|
||||||
|
purge historical data using the <xref linkend="repmgr-cluster-cleanup">
|
||||||
|
command; use the <literal>-k/--keep-history</literal> option to
|
||||||
|
specify how many days' worth of data should be retained.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
It's possible to use <application>repmgrd</application> to run in monitoring
|
||||||
|
mode only (without automatic failover capability) for some or all
|
||||||
|
nodes by setting <literal>failover=manual</literal> in the node's
|
||||||
|
<filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
|
||||||
|
no failover action will be taken and the node will require manual intervention to
|
||||||
|
be reattached to replication. If this occurs, an
|
||||||
|
<link linkend="event-notifications">event notification</link>
|
||||||
|
<varname>standby_disconnect_manual</varname> will be created.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Note that when a standby node is not streaming directly from its upstream
|
||||||
|
node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
|
||||||
|
<literal>0 bytes</literal>.
|
||||||
|
</para>
|
||||||
|
<tip>
|
||||||
|
<para>
|
||||||
|
If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
|
||||||
|
table will be replicated to attached standbys. This means there will be a small but
|
||||||
|
constant stream of replication activity which may not be desirable. To prevent
|
||||||
|
this, convert the table to an <literal>UNLOGGED</literal> one with:
|
||||||
|
<programlisting>
|
||||||
|
ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This will however mean that monitoring history will not be available on
|
||||||
|
another node following a failover, and the view <literal>repmgr.replication_status</literal>
|
||||||
|
will not work on standbys.
|
||||||
|
</para>
|
||||||
|
</tip>
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
|
||||||
|
</chapter>
|
||||||
187
doc/repmgrd-overview.sgml
Normal file
187
doc/repmgrd-overview.sgml
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
<chapter id="repmgrd-overview" xreflabel="repmgrd overview">
|
||||||
|
<indexterm>
|
||||||
|
<primary>repmgrd</primary>
|
||||||
|
<secondary>overview</secondary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<title>repmgrd overview</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
<application>repmgrd</application> ("<literal>replication manager daemon</literal>")
|
||||||
|
is a management and monitoring daemon which runs
|
||||||
|
on each node in a replication cluster. It can automate actions such as
|
||||||
|
failover and updating standbys to follow the new primary, as well as
|
||||||
|
providing monitoring information about the state of each standby.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
<application>repmgrd</application> is designed to be straightforward to set up
|
||||||
|
and does not require additional external infrastructure.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Functionality provided by <application>repmgrd</application> includes:
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
wide range of <link linkend="repmgrd-basic-configuration">configuration options</link>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
option to execute custom scripts ("<link linkend="event-notifications">event notifications</link>")
|
||||||
|
at different points in the failover sequence
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
ability to <link linkend="repmgrd-pausing">pause repmgrd</link>
|
||||||
|
operation on all nodes with a
|
||||||
|
<link linkend="repmgr-daemon-pause"><command>single command</command></link>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
optional <link linkend="repmgrd-witness-server">witness server</link>
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
"location" configuration option to restrict
|
||||||
|
potential promotion candidates to a single location
|
||||||
|
(e.g. when nodes are spread over multiple data centres)
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
<link linkend="connection-check-type">choice of method</link> to determine node availability
|
||||||
|
(PostgreSQL ping, query execution or new connection)
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
retention of monitoring statistics (optional)
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
|
||||||
|
</itemizedlist>
|
||||||
|
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<sect1 id="repmgrd-demonstration">
|
||||||
|
|
||||||
|
<title>repmgrd demonstration</title>
|
||||||
|
<para>
|
||||||
|
To demonstrate automatic failover, set up a 3-node replication cluster (one primary
|
||||||
|
and two standbys streaming directly from the primary) so that the cluster looks
|
||||||
|
something like this:
|
||||||
|
<programlisting>
|
||||||
|
$ repmgr -f /etc/repmgr.conf cluster show --compact
|
||||||
|
ID | Name | Role | Status | Upstream | Location | Prio.
|
||||||
|
----+-------+---------+-----------+----------+----------+-------
|
||||||
|
1 | node1 | primary | * running | | default | 100
|
||||||
|
2 | node2 | standby | running | node1 | default | 100
|
||||||
|
3 | node3 | standby | running | node1 | default | 100</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<tip>
|
||||||
|
<para>
|
||||||
|
See section <link linkend="repmgrd-automatic-failover-configuration">Required configuration for automatic failover</link>
|
||||||
|
for an example of minimal <filename>repmgr.conf</filename> file settings suitable for use with <application>repmgrd</application>.
|
||||||
|
</para>
|
||||||
|
</tip>
|
||||||
|
<para>
|
||||||
|
Start <application>repmgrd</application> on each standby and verify that it's running by examining the
|
||||||
|
log output, which at log level <literal>INFO</literal> will look like this:
|
||||||
|
<programlisting>
|
||||||
|
[2019-03-15 06:32:05] [NOTICE] repmgrd (repmgrd 4.3) starting up
|
||||||
|
[2019-03-15 06:32:05] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr connect_timeout=2"
|
||||||
|
INFO: set_repmgrd_pid(): provided pidfile is /var/run/repmgr/repmgrd-11.pid
|
||||||
|
[2019-03-15 06:32:05] [NOTICE] starting monitoring of node "node2" (ID: 2)
|
||||||
|
[2019-03-15 06:32:05] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Each <application>repmgrd</application> should also have recorded its successful startup as an event:
|
||||||
|
<programlisting>
|
||||||
|
$ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
|
||||||
|
Node ID | Name | Event | OK | Timestamp | Details
|
||||||
|
---------+-------+---------------+----+---------------------+-------------------------------------------------------------
|
||||||
|
3 | node3 | repmgrd_start | t | 2019-03-14 04:17:30 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||||
|
2 | node2 | repmgrd_start | t | 2019-03-14 04:11:47 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||||
|
1 | node1 | repmgrd_start | t | 2019-03-14 04:04:31 | monitoring cluster primary "node1" (node ID: 1)</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
Now stop the current primary server with e.g.:
|
||||||
|
<programlisting>
|
||||||
|
pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This will force the primary to shut down straight away, aborting all processes
|
||||||
|
and transactions. This will cause a flurry of activity in the <application>repmgrd</application> log
|
||||||
|
files as each <application>repmgrd</application> detects the failure of the primary and a failover
|
||||||
|
decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
|
||||||
|
which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
|
||||||
|
<programlisting>
|
||||||
|
[2019-03-15 06:37:50] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||||
|
[2019-03-15 06:37:50] [INFO] checking state of node 1, 1 of 3 attempts
|
||||||
|
[2019-03-15 06:37:50] [INFO] sleeping 5 seconds until next reconnection attempt
|
||||||
|
[2019-03-15 06:37:55] [INFO] checking state of node 1, 2 of 3 attempts
|
||||||
|
[2019-03-15 06:37:55] [INFO] sleeping 5 seconds until next reconnection attempt
|
||||||
|
[2019-03-15 06:38:00] [INFO] checking state of node 1, 3 of 3 attempts
|
||||||
|
[2019-03-15 06:38:00] [WARNING] unable to reconnect to node 1 after 3 attempts
|
||||||
|
[2019-03-15 06:38:00] [INFO] primary and this node have the same location ("default")
|
||||||
|
[2019-03-15 06:38:00] [INFO] local node's last receive lsn: 0/900CBF8
|
||||||
|
[2019-03-15 06:38:00] [INFO] node 3 last saw primary node 12 second(s) ago
|
||||||
|
[2019-03-15 06:38:00] [INFO] last receive LSN for sibling node "node3" (ID: 3) is: 0/900CBF8
|
||||||
|
[2019-03-15 06:38:00] [INFO] node "node3" (ID: 3) has same LSN as current candidate "node2" (ID: 2)
|
||||||
|
[2019-03-15 06:38:00] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
|
||||||
|
[2019-03-15 06:38:00] [NOTICE] promotion candidate is "node2" (ID: 2)
|
||||||
|
[2019-03-15 06:38:00] [NOTICE] this node is the winner, will now promote itself and inform other nodes
|
||||||
|
[2019-03-15 06:38:00] [INFO] promote_command is:
|
||||||
|
"/usr/pgsql-11/bin/repmgr -f /etc/repmgr/11/repmgr.conf standby promote"
|
||||||
|
NOTICE: promoting standby to primary
|
||||||
|
DETAIL: promoting server "node2" (ID: 2) using "/usr/pgsql-11/bin/pg_ctl -w -D '/var/lib/pgsql/11/data' promote"
|
||||||
|
NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
|
||||||
|
NOTICE: STANDBY PROMOTE successful
|
||||||
|
DETAIL: server "node2" (ID: 2) was successfully promoted to primary
|
||||||
|
[2019-03-15 06:38:01] [INFO] 3 followers to notify
|
||||||
|
[2019-03-15 06:38:01] [NOTICE] notifying node "node3" (node ID: 3) to follow node 2
|
||||||
|
INFO: node 3 received notification to follow node 2
|
||||||
|
[2019-03-15 06:38:01] [INFO] switching to primary monitoring mode
|
||||||
|
[2019-03-15 06:38:01] [NOTICE] monitoring cluster primary "node2" (node ID: 2)</programlisting>
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
The cluster status will now look like this, with the original primary (<literal>node1</literal>)
|
||||||
|
marked as inactive, and standby <literal>node3</literal> now following the new primary
|
||||||
|
(<literal>node2</literal>):
|
||||||
|
<programlisting>
|
||||||
|
$ repmgr -f /etc/repmgr.conf cluster show --compact
|
||||||
|
ID | Name | Role | Status | Upstream | Location | Prio.
|
||||||
|
----+-------+---------+-----------+----------+----------+-------
|
||||||
|
1 | node1 | primary | - failed | | default | 100
|
||||||
|
2 | node2 | primary | * running | | default | 100
|
||||||
|
3 | node3 | standby | running | node2 | default | 100</programlisting>
|
||||||
|
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
<link linkend="repmgr-cluster-event"><command>repmgr cluster event</command></link> will display a summary of
|
||||||
|
what happened to each server during the failover:
|
||||||
|
<programlisting>
|
||||||
|
$ repmgr -f /etc/repmgr.conf cluster event
|
||||||
|
Node ID | Name | Event | OK | Timestamp | Details
|
||||||
|
---------+-------+----------------------------+----+---------------------+-------------------------------------------------------------
|
||||||
|
3 | node3 | repmgrd_failover_follow | t | 2019-03-15 06:38:03 | node 3 now following new upstream node 2
|
||||||
|
3 | node3 | standby_follow | t | 2019-03-15 06:38:02 | standby attached to upstream node "node2" (node ID: 2)
|
||||||
|
2 | node2 | repmgrd_reload | t | 2019-03-15 06:38:01 | monitoring cluster primary "node2" (node ID: 2)
|
||||||
|
2 | node2 | repmgrd_failover_promote | t | 2019-03-15 06:38:01 | node 2 promoted to primary; old primary 1 marked as failed
|
||||||
|
2 | node2 | standby_promote | t | 2019-03-15 06:38:01 | server "node2" (ID: 2) was successfully promoted to primary</programlisting>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
</sect1>
|
||||||
|
</chapter>
|
||||||
@@ -1,178 +0,0 @@
|
|||||||
<chapter id="repmgrd-pausing" xreflabel="Pausing repmgrd">
|
|
||||||
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>pausing</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<indexterm>
|
|
||||||
<primary>pausing repmgrd</primary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<title>Pausing repmgrd</title>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
In normal operation, <application>repmgrd</application> monitors the state of the
|
|
||||||
PostgreSQL node it is running on, and will take appropriate action if problems
|
|
||||||
are detected, e.g. (if so configured) promote the node to primary, if the existing
|
|
||||||
primary has been determined as failed.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
However, <application>repmgrd</application> is unable to distinguish between
|
|
||||||
planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
|
|
||||||
or installing PostgreSQL maintenance releases), and an actual server outage. In versions prior to
|
|
||||||
&repmgr; 4.2 it was necessary to stop <application>repmgrd</application> on all nodes (or at least
|
|
||||||
on all nodes where <application>repmgrd</application> is
|
|
||||||
<link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
|
|
||||||
to prevent <application>repmgrd</application> from making unintentional changes to the
|
|
||||||
replication cluster.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
|
|
||||||
can now be "paused", i.e. instructed not to take any action such as performing a failover.
|
|
||||||
This can be done from any node in the cluster, removing the need to stop/restart
|
|
||||||
each <application>repmgrd</application> individually.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
For major PostgreSQL upgrades, e.g. from PostgreSQL 10 to PostgreSQL 11,
|
|
||||||
<application>repmgrd</application> should be shut down completely and only started up
|
|
||||||
once the &repmgr; packages for the new PostgreSQL major version have been installed.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
|
|
||||||
<sect1 id="repmgrd-pausing-prerequisites">
|
|
||||||
<title>Prerequisites for pausing <application>repmgrd</application></title>
|
|
||||||
<para>
|
|
||||||
In order to be able to pause/unpause <application>repmgrd</application>, the following
|
|
||||||
prerequisites must be met:
|
|
||||||
<itemizedlist spacing="compact" mark="bullet">
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>
|
|
||||||
PostgreSQL on all nodes must be accessible from the node where the
|
|
||||||
<literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
|
|
||||||
<varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
</itemizedlist>
|
|
||||||
</para>
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
These conditions are required for normal &repmgr; operation in any case.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
<sect1 id="repmgrd-pausing-execution">
|
|
||||||
<title>Pausing/unpausing <application>repmgrd</application></title>
|
|
||||||
<para>
|
|
||||||
To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
|
|
||||||
<programlisting>
|
|
||||||
$ repmgr -f /etc/repmgr.conf daemon pause
|
|
||||||
NOTICE: node 1 (node1) paused
|
|
||||||
NOTICE: node 2 (node2) paused
|
|
||||||
NOTICE: node 3 (node3) paused</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
The state of <application>repmgrd</application> on each node can be checked with
|
|
||||||
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
|
|
||||||
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
|
||||||
ID | Name | Role | Status | repmgrd | PID | Paused?
|
|
||||||
----+-------+---------+---------+---------+------+---------
|
|
||||||
1 | node1 | primary | running | running | 7851 | yes
|
|
||||||
2 | node2 | standby | running | running | 7889 | yes
|
|
||||||
3 | node3 | standby | running | running | 7918 | yes</programlisting>
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
If executing a switchover with <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
|
||||||
&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
|
|
||||||
running on one of the standbys (here: <literal>node2</literal>) will react like this:
|
|
||||||
<programlisting>
|
|
||||||
[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
|
||||||
[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
|
|
||||||
[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
|
|
||||||
...
|
|
||||||
[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
|
|
||||||
[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
|
|
||||||
[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
|
|
||||||
[2018-09-20 12:22:25] [NOTICE] node is paused
|
|
||||||
[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
|
|
||||||
[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
|
|
||||||
[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
|
|
||||||
will automatically reconnect, e.g.:
|
|
||||||
<programlisting>
|
|
||||||
[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
|
|
||||||
<programlisting>
|
|
||||||
$ repmgr -f /etc/repmgr.conf daemon unpause
|
|
||||||
NOTICE: node 1 (node1) unpaused
|
|
||||||
NOTICE: node 2 (node2) unpaused
|
|
||||||
NOTICE: node 3 (node3) unpaused</programlisting>
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
If the previous primary is no longer accessible when <application>repmgrd</application>
|
|
||||||
is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
|
|
||||||
<link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
|
|
||||||
and any standbys attached to the new primary with
|
|
||||||
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
|
||||||
resulting in the automatic promotion of a new primary, which may be a problem particularly
|
|
||||||
in larger clusters, where <application>repmgrd</application> could select a different promotion
|
|
||||||
candidate to the one intended by the administrator.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
|
|
||||||
<sect2 id="repmgrd-pausing-details">
|
|
||||||
<title>Details on the <application>repmgrd</application> pausing mechanism</title>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
The pause state of each node will be stored over a PostgreSQL restart.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<para>
|
|
||||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
|
||||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
|
|
||||||
executed even if <application>repmgrd</application> is not running; in this case,
|
|
||||||
<application>repmgrd</application> will start up in whichever pause state has been set.
|
|
||||||
</para>
|
|
||||||
<note>
|
|
||||||
<para>
|
|
||||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
|
||||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
|
||||||
<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
|
|
||||||
</para>
|
|
||||||
</note>
|
|
||||||
</sect2>
|
|
||||||
</sect1>
|
|
||||||
</chapter>
|
|
||||||
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
<chapter id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
|
|
||||||
<indexterm>
|
|
||||||
<primary>repmgrd</primary>
|
|
||||||
<secondary>witness server</secondary>
|
|
||||||
</indexterm>
|
|
||||||
|
|
||||||
<title>Using a witness server with repmgrd</title>
|
|
||||||
<para>
|
|
||||||
In a situation caused e.g. by a network interruption between two
|
|
||||||
data centres, it's important to avoid a "split-brain" situation where
|
|
||||||
both sides of the network assume they are the active segment and the
|
|
||||||
side without an active primary unilaterally promotes one of its standbys.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
To prevent this situation happening, it's essential to ensure that one
|
|
||||||
network segment has a "voting majority", so other segments will know
|
|
||||||
they're in the minority and not attempt to promote a new primary. Where
|
|
||||||
an odd number of servers exists, this is not an issue. However, if each
|
|
||||||
network has an even number of nodes, it's necessary to provide some way
|
|
||||||
of ensuring a majority, which is where the witness server becomes useful.
|
|
||||||
</para>
|
|
||||||
<para>
|
|
||||||
This is not a fully-fledged standby node and is not integrated into
|
|
||||||
replication, but it effectively represents the "casting vote" when
|
|
||||||
deciding which network segment has a majority. A witness server can
|
|
||||||
be set up using <xref linkend="repmgr-witness-register">. Note that it only
|
|
||||||
makes sense to create a witness server in conjunction with running
|
|
||||||
<application>repmgrd</application>; the witness server will require its own
|
|
||||||
<application>repmgrd</application> instance.
|
|
||||||
</para>
|
|
||||||
</chapter>
|
|
||||||
@@ -72,7 +72,8 @@
|
|||||||
Ensure that a passwordless SSH connection is possible from the promotion candidate
|
Ensure that a passwordless SSH connection is possible from the promotion candidate
|
||||||
(standby) to the demotion candidate (current primary). If <literal>--siblings-follow</literal>
|
(standby) to the demotion candidate (current primary). If <literal>--siblings-follow</literal>
|
||||||
will be used, ensure that passwordless SSH connections are possible from the
|
will be used, ensure that passwordless SSH connections are possible from the
|
||||||
promotion candidate to all standbys attached to the demotion candidate.
|
promotion candidate to all nodes attached to the demotion candidate
|
||||||
|
(including the witness server, if in use).
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<note>
|
<note>
|
||||||
|
|||||||
@@ -1 +0,0 @@
|
|||||||
<!ENTITY repmgrversion "4.3dev">
|
|
||||||
2
log.c
2
log.c
@@ -85,7 +85,7 @@ _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_li
|
|||||||
|
|
||||||
time(&t);
|
time(&t);
|
||||||
tm = localtime(&t);
|
tm = localtime(&t);
|
||||||
strftime(buf, 100, "[%Y-%m-%d %H:%M:%S]", tm);
|
strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", tm);
|
||||||
fprintf(stderr, "%s [%s] ", buf, level_name);
|
fprintf(stderr, "%s [%s] ", buf, level_name);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|||||||
@@ -1,12 +1,17 @@
|
|||||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||||
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
|
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
|
||||||
|
|
||||||
CREATE FUNCTION set_primary_last_seen()
|
CREATE FUNCTION set_upstream_last_seen()
|
||||||
RETURNS VOID
|
RETURNS VOID
|
||||||
AS 'MODULE_PATHNAME', 'set_primary_last_seen'
|
AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
|
||||||
LANGUAGE C STRICT;
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
CREATE FUNCTION get_primary_last_seen()
|
CREATE FUNCTION get_upstream_last_seen()
|
||||||
RETURNS INT
|
RETURNS INT
|
||||||
AS 'MODULE_PATHNAME', 'get_primary_last_seen'
|
AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
|
||||||
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
|
CREATE FUNCTION get_wal_receiver_pid()
|
||||||
|
RETURNS INT
|
||||||
|
AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
|
||||||
LANGUAGE C STRICT;
|
LANGUAGE C STRICT;
|
||||||
|
|||||||
@@ -118,16 +118,17 @@ CREATE FUNCTION standby_get_last_updated()
|
|||||||
AS 'MODULE_PATHNAME', 'standby_get_last_updated'
|
AS 'MODULE_PATHNAME', 'standby_get_last_updated'
|
||||||
LANGUAGE C STRICT;
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
CREATE FUNCTION set_primary_last_seen()
|
CREATE FUNCTION set_upstream_last_seen()
|
||||||
RETURNS VOID
|
RETURNS VOID
|
||||||
AS 'MODULE_PATHNAME', 'set_primary_last_seen'
|
AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
|
||||||
LANGUAGE C STRICT;
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
CREATE FUNCTION get_primary_last_seen()
|
CREATE FUNCTION get_upstream_last_seen()
|
||||||
RETURNS INT
|
RETURNS INT
|
||||||
AS 'MODULE_PATHNAME', 'get_primary_last_seen'
|
AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
|
||||||
LANGUAGE C STRICT;
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
|
|
||||||
/* failover functions */
|
/* failover functions */
|
||||||
|
|
||||||
CREATE FUNCTION notify_follow_primary(INT)
|
CREATE FUNCTION notify_follow_primary(INT)
|
||||||
@@ -185,6 +186,15 @@ CREATE FUNCTION repmgrd_is_paused()
|
|||||||
AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
|
AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
|
||||||
LANGUAGE C STRICT;
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
|
CREATE FUNCTION get_wal_receiver_pid()
|
||||||
|
RETURNS INT
|
||||||
|
AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
|
||||||
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* views */
|
||||||
|
|
||||||
CREATE VIEW repmgr.replication_status AS
|
CREATE VIEW repmgr.replication_status AS
|
||||||
SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
|
SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
|
||||||
|
|||||||
@@ -93,6 +93,15 @@ do_bdr_register(void)
|
|||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (get_bdr_version_num() > 2)
|
||||||
|
{
|
||||||
|
log_error(_("\"repmgr bdr register\" is for BDR 2.x only"));
|
||||||
|
PQfinish(conn);
|
||||||
|
pfree(dbname);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* check for a matching BDR node */
|
/* check for a matching BDR node */
|
||||||
{
|
{
|
||||||
PQExpBufferData bdr_local_node_name;
|
PQExpBufferData bdr_local_node_name;
|
||||||
@@ -216,7 +225,7 @@ do_bdr_register(void)
|
|||||||
ExtensionStatus other_node_extension_status = REPMGR_UNKNOWN;
|
ExtensionStatus other_node_extension_status = REPMGR_UNKNOWN;
|
||||||
|
|
||||||
/* skip the local node */
|
/* skip the local node */
|
||||||
if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, MAXLEN) == 0)
|
if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, sizeof(node_info.node_name)) == 0)
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -304,9 +313,9 @@ do_bdr_register(void)
|
|||||||
node_info.active = true;
|
node_info.active = true;
|
||||||
node_info.priority = config_file_options.priority;
|
node_info.priority = config_file_options.priority;
|
||||||
|
|
||||||
strncpy(node_info.node_name, config_file_options.node_name, MAXLEN);
|
strncpy(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name));
|
||||||
strncpy(node_info.location, config_file_options.location, MAXLEN);
|
strncpy(node_info.location, config_file_options.location, sizeof(node_info.location));
|
||||||
strncpy(node_info.conninfo, config_file_options.conninfo, MAXLEN);
|
strncpy(node_info.conninfo, config_file_options.conninfo, sizeof(node_info.conninfo));
|
||||||
|
|
||||||
if (record_status == RECORD_FOUND)
|
if (record_status == RECORD_FOUND)
|
||||||
{
|
{
|
||||||
@@ -330,7 +339,7 @@ do_bdr_register(void)
|
|||||||
* name set when the node was registered.
|
* name set when the node was registered.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (strncmp(node_info.node_name, config_file_options.node_name, MAXLEN) != 0)
|
if (strncmp(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name)) != 0)
|
||||||
{
|
{
|
||||||
log_error(_("a record for node %i is already registered with node_name \"%s\""),
|
log_error(_("a record for node %i is already registered with node_name \"%s\""),
|
||||||
config_file_options.node_id, node_info.node_name);
|
config_file_options.node_id, node_info.node_name);
|
||||||
|
|||||||
@@ -156,7 +156,7 @@ do_cluster_show(void)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* check if node is reachable, but just not letting us in */
|
/* check if node is reachable, but just not letting us in */
|
||||||
if (is_server_available(cell->node_info->conninfo))
|
if (is_server_available_quiet(cell->node_info->conninfo))
|
||||||
cell->node_info->node_status = NODE_STATUS_REJECTED;
|
cell->node_info->node_status = NODE_STATUS_REJECTED;
|
||||||
else
|
else
|
||||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||||
@@ -1063,7 +1063,9 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
|
|||||||
matrix_rec_list[i] = (t_node_matrix_rec *) pg_malloc0(sizeof(t_node_matrix_rec));
|
matrix_rec_list[i] = (t_node_matrix_rec *) pg_malloc0(sizeof(t_node_matrix_rec));
|
||||||
|
|
||||||
matrix_rec_list[i]->node_id = cell->node_info->node_id;
|
matrix_rec_list[i]->node_id = cell->node_info->node_id;
|
||||||
strncpy(matrix_rec_list[i]->node_name, cell->node_info->node_name, MAXLEN);
|
strncpy(matrix_rec_list[i]->node_name,
|
||||||
|
cell->node_info->node_name,
|
||||||
|
sizeof(matrix_rec_list[i]->node_name));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Find the maximum length of a node name
|
* Find the maximum length of a node name
|
||||||
@@ -1161,6 +1163,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
|
|||||||
(void) remote_command(host,
|
(void) remote_command(host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
command.data,
|
command.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
p = command_output.data;
|
p = command_output.data;
|
||||||
@@ -1277,7 +1280,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
|
|||||||
|
|
||||||
cube[h] = (t_node_status_cube *) pg_malloc(sizeof(t_node_status_cube));
|
cube[h] = (t_node_status_cube *) pg_malloc(sizeof(t_node_status_cube));
|
||||||
cube[h]->node_id = cell->node_info->node_id;
|
cube[h]->node_id = cell->node_info->node_id;
|
||||||
strncpy(cube[h]->node_name, cell->node_info->node_name, MAXLEN);
|
strncpy(cube[h]->node_name, cell->node_info->node_name, sizeof(cube[h]->node_name));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Find the maximum length of a node name
|
* Find the maximum length of a node name
|
||||||
@@ -1299,7 +1302,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
|
|||||||
/* we don't need the name here */
|
/* we don't need the name here */
|
||||||
cube[h]->matrix_list_rec[i]->node_name[0] = '\0';
|
cube[h]->matrix_list_rec[i]->node_name[0] = '\0';
|
||||||
|
|
||||||
cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec) * nodes.node_count);
|
cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec *) * nodes.node_count);
|
||||||
|
|
||||||
j = 0;
|
j = 0;
|
||||||
|
|
||||||
@@ -1373,6 +1376,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
|
|||||||
(void) remote_command(host,
|
(void) remote_command(host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
quoted_command.data,
|
quoted_command.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
free_conninfo_params(&remote_conninfo);
|
free_conninfo_params(&remote_conninfo);
|
||||||
|
|||||||
@@ -30,14 +30,14 @@ typedef struct
|
|||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
int node_id;
|
int node_id;
|
||||||
char node_name[MAXLEN];
|
char node_name[NAMEDATALEN];
|
||||||
t_node_status_rec **node_status_list;
|
t_node_status_rec **node_status_list;
|
||||||
} t_node_matrix_rec;
|
} t_node_matrix_rec;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
int node_id;
|
int node_id;
|
||||||
char node_name[MAXLEN];
|
char node_name[NAMEDATALEN];
|
||||||
t_node_matrix_rec **matrix_list_rec;
|
t_node_matrix_rec **matrix_list_rec;
|
||||||
} t_node_status_cube;
|
} t_node_status_cube;
|
||||||
|
|
||||||
|
|||||||
@@ -201,8 +201,7 @@ do_daemon_status(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
repmgrd_info[i]->upstream_last_seen = get_primary_last_seen(cell->node_info->conn);
|
repmgrd_info[i]->upstream_last_seen = get_upstream_last_seen(cell->node_info->conn, cell->node_info->type);
|
||||||
|
|
||||||
if (repmgrd_info[i]->upstream_last_seen < 0)
|
if (repmgrd_info[i]->upstream_last_seen < 0)
|
||||||
{
|
{
|
||||||
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, "%s", _("n/a"));
|
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, "%s", _("n/a"));
|
||||||
@@ -260,14 +259,24 @@ do_daemon_status(void)
|
|||||||
{
|
{
|
||||||
if (runtime_options.output_mode == OM_CSV)
|
if (runtime_options.output_mode == OM_CSV)
|
||||||
{
|
{
|
||||||
|
int running = repmgrd_info[i]->running ? 1 : 0;
|
||||||
|
int paused = repmgrd_info[i]->paused ? 1 : 0;
|
||||||
|
|
||||||
|
/* If PostgreSQL is not running, repmgrd status is unknown */
|
||||||
|
if (repmgrd_info[i]->pg_running == false)
|
||||||
|
{
|
||||||
|
running = -1;
|
||||||
|
paused = -1;
|
||||||
|
}
|
||||||
|
|
||||||
printf("%i,%s,%s,%i,%i,%i,%i,%i,%i\n",
|
printf("%i,%s,%s,%i,%i,%i,%i,%i,%i\n",
|
||||||
cell->node_info->node_id,
|
cell->node_info->node_id,
|
||||||
cell->node_info->node_name,
|
cell->node_info->node_name,
|
||||||
get_node_type_string(cell->node_info->type),
|
get_node_type_string(cell->node_info->type),
|
||||||
repmgrd_info[i]->pg_running ? 1 : 0,
|
repmgrd_info[i]->pg_running ? 1 : 0,
|
||||||
repmgrd_info[i]->running ? 1 : 0,
|
running,
|
||||||
repmgrd_info[i]->pid,
|
repmgrd_info[i]->pid,
|
||||||
repmgrd_info[i]->paused ? 1 : 0,
|
paused,
|
||||||
cell->node_info->priority,
|
cell->node_info->priority,
|
||||||
repmgrd_info[i]->pid == UNKNOWN_PID
|
repmgrd_info[i]->pid == UNKNOWN_PID
|
||||||
? -1
|
? -1
|
||||||
@@ -344,18 +353,9 @@ _do_repmgr_pause(bool pause)
|
|||||||
PGconn *conn = NULL;
|
PGconn *conn = NULL;
|
||||||
NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
|
NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||||
NodeInfoListCell *cell = NULL;
|
NodeInfoListCell *cell = NULL;
|
||||||
RepmgrdInfo **repmgrd_info;
|
|
||||||
int i;
|
int i;
|
||||||
int error_nodes = 0;
|
int error_nodes = 0;
|
||||||
|
|
||||||
repmgrd_info = (RepmgrdInfo **) pg_malloc0(sizeof(RepmgrdInfo *) * nodes.node_count);
|
|
||||||
|
|
||||||
if (repmgrd_info == NULL)
|
|
||||||
{
|
|
||||||
log_error(_("unable to allocate memory"));
|
|
||||||
exit(ERR_OUT_OF_MEMORY);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Connect to local database to obtain cluster connection data */
|
/* Connect to local database to obtain cluster connection data */
|
||||||
log_verbose(LOG_INFO, _("connecting to database"));
|
log_verbose(LOG_INFO, _("connecting to database"));
|
||||||
|
|
||||||
@@ -370,9 +370,6 @@ _do_repmgr_pause(bool pause)
|
|||||||
|
|
||||||
for (cell = nodes.head; cell; cell = cell->next)
|
for (cell = nodes.head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
|
|
||||||
repmgrd_info[i]->node_id = cell->node_info->node_id;
|
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "pausing node %i (%s)",
|
log_verbose(LOG_DEBUG, "pausing node %i (%s)",
|
||||||
cell->node_info->node_id,
|
cell->node_info->node_id,
|
||||||
cell->node_info->node_name);
|
cell->node_info->node_name);
|
||||||
|
|||||||
@@ -413,7 +413,7 @@ do_node_status(void)
|
|||||||
node_info.upstream_node_name,
|
node_info.upstream_node_name,
|
||||||
node_info.upstream_node_id);
|
node_info.upstream_node_id);
|
||||||
|
|
||||||
get_replication_info(conn, &replication_info);
|
get_replication_info(conn, node_info.type, &replication_info);
|
||||||
|
|
||||||
key_value_list_set_format(&node_status,
|
key_value_list_set_format(&node_status,
|
||||||
"Replication lag",
|
"Replication lag",
|
||||||
@@ -1408,7 +1408,7 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_i
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (lag_seconds < 0)
|
else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
|
||||||
{
|
{
|
||||||
status = CHECK_STATUS_UNKNOWN;
|
status = CHECK_STATUS_UNKNOWN;
|
||||||
|
|
||||||
@@ -2476,6 +2476,8 @@ do_node_rejoin(void)
|
|||||||
|
|
||||||
termPQExpBuffer(&slotdir_ent_path);
|
termPQExpBuffer(&slotdir_ent_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
closedir(slotdir);
|
||||||
}
|
}
|
||||||
termPQExpBuffer(&slotdir_path);
|
termPQExpBuffer(&slotdir_path);
|
||||||
}
|
}
|
||||||
@@ -2681,6 +2683,48 @@ do_node_rejoin(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Currently for testing purposes only, not documented;
|
||||||
|
* use at own risk!
|
||||||
|
*/
|
||||||
|
|
||||||
|
void
|
||||||
|
do_node_control(void)
|
||||||
|
{
|
||||||
|
PGconn *conn = NULL;
|
||||||
|
pid_t wal_receiver_pid = UNKNOWN_PID;
|
||||||
|
conn = establish_db_connection(config_file_options.conninfo, true);
|
||||||
|
|
||||||
|
if (runtime_options.disable_wal_receiver == true)
|
||||||
|
{
|
||||||
|
wal_receiver_pid = disable_wal_receiver(conn);
|
||||||
|
|
||||||
|
PQfinish(conn);
|
||||||
|
|
||||||
|
if (wal_receiver_pid == UNKNOWN_PID)
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
|
||||||
|
exit(SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (runtime_options.enable_wal_receiver == true)
|
||||||
|
{
|
||||||
|
wal_receiver_pid = enable_wal_receiver(conn, true);
|
||||||
|
|
||||||
|
PQfinish(conn);
|
||||||
|
|
||||||
|
if (wal_receiver_pid == UNKNOWN_PID)
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
|
||||||
|
exit(SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_error(_("no option provided"));
|
||||||
|
|
||||||
|
PQfinish(conn);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For "internal" use by `node rejoin` on the local node when
|
* For "internal" use by `node rejoin` on the local node when
|
||||||
* called by "standby switchover" from the remote node.
|
* called by "standby switchover" from the remote node.
|
||||||
@@ -2742,6 +2786,7 @@ _do_node_archive_config(void)
|
|||||||
|
|
||||||
arcdir = opendir(archive_dir.data);
|
arcdir = opendir(archive_dir.data);
|
||||||
|
|
||||||
|
/* always attempt to open the directory */
|
||||||
if (arcdir == NULL)
|
if (arcdir == NULL)
|
||||||
{
|
{
|
||||||
log_error(_("unable to open archive directory \"%s\""),
|
log_error(_("unable to open archive directory \"%s\""),
|
||||||
@@ -2787,9 +2832,10 @@ _do_node_archive_config(void)
|
|||||||
|
|
||||||
termPQExpBuffer(&arcdir_ent_path);
|
termPQExpBuffer(&arcdir_ent_path);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
closedir(arcdir);
|
closedir(arcdir);
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* extract list of config files from --config-files
|
* extract list of config files from --config-files
|
||||||
@@ -3062,11 +3108,12 @@ copy_file(const char *src_file, const char *dest_file)
|
|||||||
int a = 0;
|
int a = 0;
|
||||||
|
|
||||||
ptr_old = fopen(src_file, "r");
|
ptr_old = fopen(src_file, "r");
|
||||||
ptr_new = fopen(dest_file, "w");
|
|
||||||
|
|
||||||
if (ptr_old == NULL)
|
if (ptr_old == NULL)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
ptr_new = fopen(dest_file, "w");
|
||||||
|
|
||||||
if (ptr_new == NULL)
|
if (ptr_new == NULL)
|
||||||
{
|
{
|
||||||
fclose(ptr_old);
|
fclose(ptr_old);
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ extern void do_node_check(void);
|
|||||||
|
|
||||||
extern void do_node_rejoin(void);
|
extern void do_node_rejoin(void);
|
||||||
extern void do_node_service(void);
|
extern void do_node_service(void);
|
||||||
|
extern void do_node_control(void);
|
||||||
|
|
||||||
extern void do_node_help(void);
|
extern void do_node_help(void);
|
||||||
|
|
||||||
|
|||||||
@@ -96,28 +96,6 @@ do_primary_register(void)
|
|||||||
|
|
||||||
initialize_voting_term(conn);
|
initialize_voting_term(conn);
|
||||||
|
|
||||||
/* Ensure there isn't another registered node which is primary */
|
|
||||||
primary_conn = get_primary_connection(conn, ¤t_primary_id, NULL);
|
|
||||||
|
|
||||||
if (primary_conn != NULL)
|
|
||||||
{
|
|
||||||
if (current_primary_id != config_file_options.node_id)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* it's impossible to add a second primary to a streaming
|
|
||||||
* replication cluster
|
|
||||||
*/
|
|
||||||
log_error(_("there is already an active registered primary (node ID: %i) in this cluster"), current_primary_id);
|
|
||||||
PQfinish(primary_conn);
|
|
||||||
PQfinish(conn);
|
|
||||||
exit(ERR_BAD_CONFIG);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* we've probably connected to ourselves */
|
|
||||||
PQfinish(primary_conn);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
begin_transaction(conn);
|
begin_transaction(conn);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -128,14 +106,34 @@ do_primary_register(void)
|
|||||||
current_primary_id = get_primary_node_id(conn);
|
current_primary_id = get_primary_node_id(conn);
|
||||||
if (current_primary_id != NODE_NOT_FOUND && current_primary_id != config_file_options.node_id)
|
if (current_primary_id != NODE_NOT_FOUND && current_primary_id != config_file_options.node_id)
|
||||||
{
|
{
|
||||||
log_error(_("another node with id %i is already registered as primary"), current_primary_id);
|
log_debug("XXX %i", current_primary_id);
|
||||||
|
primary_conn = establish_primary_db_connection(conn, false);
|
||||||
|
|
||||||
|
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
if (get_recovery_type(primary_conn) == RECTYPE_PRIMARY)
|
||||||
|
{
|
||||||
|
log_error(_("there is already an active registered primary (node ID: %i) in this cluster"),
|
||||||
|
current_primary_id);
|
||||||
log_detail(_("a streaming replication cluster can have only one primary node"));
|
log_detail(_("a streaming replication cluster can have only one primary node"));
|
||||||
|
|
||||||
|
log_hint(_("ensure this node is shut down before registering a new primary"));
|
||||||
|
PQfinish(primary_conn);
|
||||||
rollback_transaction(conn);
|
rollback_transaction(conn);
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log_warning(_("node %is is registered as primary but running as a standby"),
|
||||||
|
current_primary_id);
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_notice(_("setting node %i's node record to inactive"),
|
||||||
|
current_primary_id);
|
||||||
|
update_node_record_set_active(conn, current_primary_id, false);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check whether there's an existing record for this node, and update it
|
* Check whether there's an existing record for this node, and update it
|
||||||
* if --force set
|
* if --force set
|
||||||
|
|||||||
@@ -213,7 +213,7 @@ do_standby_clone(void)
|
|||||||
param_set(&recovery_conninfo, "application_name", config_file_options.node_name);
|
param_set(&recovery_conninfo, "application_name", config_file_options.node_name);
|
||||||
|
|
||||||
get_conninfo_value(config_file_options.conninfo, "application_name", application_name);
|
get_conninfo_value(config_file_options.conninfo, "application_name", application_name);
|
||||||
if (strlen(application_name) && strncmp(application_name, config_file_options.node_name, MAXLEN) != 0)
|
if (strlen(application_name) && strncmp(application_name, config_file_options.node_name, sizeof(config_file_options.node_name)) != 0)
|
||||||
{
|
{
|
||||||
log_notice(_("\"application_name\" is set in repmgr.conf but will be replaced by the node name"));
|
log_notice(_("\"application_name\" is set in repmgr.conf but will be replaced by the node name"));
|
||||||
}
|
}
|
||||||
@@ -605,7 +605,6 @@ do_standby_clone(void)
|
|||||||
log_error(_("unknown clone mode"));
|
log_error(_("unknown clone mode"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* If the backup failed then exit */
|
/* If the backup failed then exit */
|
||||||
if (r != SUCCESS)
|
if (r != SUCCESS)
|
||||||
{
|
{
|
||||||
@@ -771,31 +770,48 @@ do_standby_clone(void)
|
|||||||
void
|
void
|
||||||
check_barman_config(void)
|
check_barman_config(void)
|
||||||
{
|
{
|
||||||
char command[MAXLEN];
|
PQExpBufferData command;
|
||||||
bool command_ok = false;
|
bool command_ok = false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check that there is at least one valid backup
|
* Check that there is at least one valid backup
|
||||||
*/
|
*/
|
||||||
|
|
||||||
log_info(_("connecting to Barman server to verify backup for %s"), config_file_options.barman_server);
|
log_info(_("connecting to Barman server to verify backup for \"%s\""), config_file_options.barman_server);
|
||||||
|
|
||||||
maxlen_snprintf(command, "%s show-backup %s latest > /dev/null",
|
initPQExpBuffer(&command);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&command, "%s show-backup %s latest > /dev/null",
|
||||||
make_barman_ssh_command(barman_command_buf),
|
make_barman_ssh_command(barman_command_buf),
|
||||||
config_file_options.barman_server);
|
config_file_options.barman_server);
|
||||||
|
|
||||||
command_ok = local_command(command, NULL);
|
command_ok = local_command(command.data, NULL);
|
||||||
|
|
||||||
if (command_ok == false)
|
if (command_ok == false)
|
||||||
{
|
{
|
||||||
log_error(_("no valid backup for server %s was found in the Barman catalogue"),
|
log_error(_("no valid backup for server \"%s\" was found in the Barman catalogue"),
|
||||||
config_file_options.barman_server);
|
config_file_options.barman_server);
|
||||||
|
log_detail(_("command executed was:\n %s"), command.data),
|
||||||
log_hint(_("refer to the Barman documentation for more information"));
|
log_hint(_("refer to the Barman documentation for more information"));
|
||||||
|
|
||||||
|
termPQExpBuffer(&command);
|
||||||
exit(ERR_BARMAN);
|
exit(ERR_BARMAN);
|
||||||
}
|
}
|
||||||
|
else if (runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
log_info(_("valid backup for server \"%s\" found in the Barman catalogue"),
|
||||||
|
config_file_options.barman_server);
|
||||||
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&command);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Attempt to create data directory (unless --dry-run specified,
|
||||||
|
* in which case do nothing; warnings will be emitted elsewhere about
|
||||||
|
* any issues with the data directory)
|
||||||
|
*/
|
||||||
|
if (runtime_options.dry_run == false)
|
||||||
|
{
|
||||||
if (!create_pg_dir(local_data_directory, runtime_options.force))
|
if (!create_pg_dir(local_data_directory, runtime_options.force))
|
||||||
{
|
{
|
||||||
log_error(_("unable to use directory %s"),
|
log_error(_("unable to use directory %s"),
|
||||||
@@ -804,7 +820,6 @@ check_barman_config(void)
|
|||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create the local repmgr subdirectory
|
* Create the local repmgr subdirectory
|
||||||
*/
|
*/
|
||||||
@@ -822,26 +837,44 @@ check_barman_config(void)
|
|||||||
|
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fetch server parameters from Barman
|
* Fetch server parameters from Barman
|
||||||
*/
|
*/
|
||||||
log_info(_("connecting to Barman server to fetch server parameters"));
|
log_info(_("connecting to Barman server to fetch server parameters"));
|
||||||
|
|
||||||
maxlen_snprintf(command, "%s show-server %s > %s/show-server.txt",
|
initPQExpBuffer(&command);
|
||||||
|
|
||||||
|
if (runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&command, "%s show-server %s > /dev/null",
|
||||||
|
make_barman_ssh_command(barman_command_buf),
|
||||||
|
config_file_options.barman_server);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&command, "%s show-server %s > %s/show-server.txt",
|
||||||
make_barman_ssh_command(barman_command_buf),
|
make_barman_ssh_command(barman_command_buf),
|
||||||
config_file_options.barman_server,
|
config_file_options.barman_server,
|
||||||
local_repmgr_tmp_directory);
|
local_repmgr_tmp_directory);
|
||||||
|
}
|
||||||
|
|
||||||
command_ok = local_command(command, NULL);
|
command_ok = local_command(command.data, NULL);
|
||||||
|
|
||||||
if (command_ok == false)
|
if (command_ok == false)
|
||||||
{
|
{
|
||||||
log_error(_("unable to fetch server parameters from Barman server"));
|
log_error(_("unable to fetch server parameters from Barman server"));
|
||||||
|
log_detail(_("command executed was:\n %s"), command.data),
|
||||||
|
termPQExpBuffer(&command);
|
||||||
exit(ERR_BARMAN);
|
exit(ERR_BARMAN);
|
||||||
}
|
}
|
||||||
|
else if (runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
log_info(_("server parameters were successfully fetched from Barman server"));
|
||||||
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&command);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -873,7 +906,7 @@ _do_create_recovery_conf(void)
|
|||||||
t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;
|
t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;
|
||||||
|
|
||||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||||
char recovery_file_path[MAXPGPATH] = "";
|
char recovery_file_path[MAXPGPATH + sizeof(RECOVERY_COMMAND_FILE)] = "";
|
||||||
struct stat st;
|
struct stat st;
|
||||||
bool node_is_running = false;
|
bool node_is_running = false;
|
||||||
bool slot_creation_required = false;
|
bool slot_creation_required = false;
|
||||||
@@ -1118,7 +1151,10 @@ _do_create_recovery_conf(void)
|
|||||||
|
|
||||||
/* check if recovery.conf exists */
|
/* check if recovery.conf exists */
|
||||||
|
|
||||||
snprintf(recovery_file_path, MAXPGPATH, "%s/%s", local_data_directory, RECOVERY_COMMAND_FILE);
|
snprintf(recovery_file_path, sizeof(recovery_file_path),
|
||||||
|
"%s/%s",
|
||||||
|
local_data_directory,
|
||||||
|
RECOVERY_COMMAND_FILE);
|
||||||
|
|
||||||
if (stat(recovery_file_path, &st) == -1)
|
if (stat(recovery_file_path, &st) == -1)
|
||||||
{
|
{
|
||||||
@@ -1306,8 +1342,7 @@ do_standby_register(void)
|
|||||||
log_error(_("unable to connect to local node \"%s\" (ID: %i)"),
|
log_error(_("unable to connect to local node \"%s\" (ID: %i)"),
|
||||||
config_file_options.node_name,
|
config_file_options.node_name,
|
||||||
config_file_options.node_id);
|
config_file_options.node_id);
|
||||||
log_detail("%s",
|
log_detail("\n%s", PQerrorMessage(conn));
|
||||||
PQerrorMessage(conn));
|
|
||||||
log_hint(_("to register a standby which is not running, provide primary connection parameters and use option -F/--force"));
|
log_hint(_("to register a standby which is not running, provide primary connection parameters and use option -F/--force"));
|
||||||
|
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
@@ -1437,6 +1472,17 @@ do_standby_register(void)
|
|||||||
RecordStatus upstream_record_status = RECORD_NOT_FOUND;
|
RecordStatus upstream_record_status = RECORD_NOT_FOUND;
|
||||||
t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;
|
t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;
|
||||||
|
|
||||||
|
if (runtime_options.upstream_node_id == config_file_options.node_id)
|
||||||
|
{
|
||||||
|
log_error(_("provided node ID for --upstream-node-id (%i) is the same as the configured local node ID (%i)"),
|
||||||
|
runtime_options.upstream_node_id,
|
||||||
|
config_file_options.node_id);
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
if (PQstatus(conn) == CONNECTION_OK)
|
||||||
|
PQfinish(conn);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
upstream_record_status = get_node_record(primary_conn,
|
upstream_record_status = get_node_record(primary_conn,
|
||||||
runtime_options.upstream_node_id,
|
runtime_options.upstream_node_id,
|
||||||
&upstream_node_record);
|
&upstream_node_record);
|
||||||
@@ -1888,7 +1934,7 @@ do_standby_unregister(void)
|
|||||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_error(_("unable to connect to primary server"));
|
log_error(_("unable to connect to primary server"));
|
||||||
log_detail("%s", PQerrorMessage(conn));
|
log_detail("\n%s", PQerrorMessage(conn));
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2010,7 +2056,7 @@ do_standby_promote(void)
|
|||||||
|
|
||||||
init_replication_info(&replication_info);
|
init_replication_info(&replication_info);
|
||||||
|
|
||||||
if (get_replication_info(conn, &replication_info) == false)
|
if (get_replication_info(conn, STANDBY, &replication_info) == false)
|
||||||
{
|
{
|
||||||
log_error(_("unable to retrieve replication information from local node"));
|
log_error(_("unable to retrieve replication information from local node"));
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
@@ -2270,6 +2316,7 @@ void
|
|||||||
do_standby_follow(void)
|
do_standby_follow(void)
|
||||||
{
|
{
|
||||||
PGconn *local_conn = NULL;
|
PGconn *local_conn = NULL;
|
||||||
|
t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
|
||||||
|
|
||||||
PGconn *primary_conn = NULL;
|
PGconn *primary_conn = NULL;
|
||||||
int primary_node_id = UNKNOWN_NODE_ID;
|
int primary_node_id = UNKNOWN_NODE_ID;
|
||||||
@@ -2308,6 +2355,19 @@ do_standby_follow(void)
|
|||||||
if (PQserverVersion(local_conn) < 90400)
|
if (PQserverVersion(local_conn) < 90400)
|
||||||
check_93_config();
|
check_93_config();
|
||||||
|
|
||||||
|
/* attempt to retrieve local node record */
|
||||||
|
record_status = get_node_record(local_conn,
|
||||||
|
config_file_options.node_id,
|
||||||
|
&local_node_record);
|
||||||
|
|
||||||
|
if (record_status != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve record for local node %i"),
|
||||||
|
config_file_options.node_id);
|
||||||
|
PQfinish(local_conn);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* --upstream-node-id provided - attempt to follow that node
|
* --upstream-node-id provided - attempt to follow that node
|
||||||
*/
|
*/
|
||||||
@@ -2552,6 +2612,9 @@ do_standby_follow(void)
|
|||||||
|
|
||||||
conn_to_param_list(local_conn, &local_repl_conninfo);
|
conn_to_param_list(local_conn, &local_repl_conninfo);
|
||||||
|
|
||||||
|
/* Set the replication user from the node record */
|
||||||
|
param_set(&local_repl_conninfo, "user", local_node_record.repluser);
|
||||||
|
|
||||||
param_set(&local_repl_conninfo, "replication", "1");
|
param_set(&local_repl_conninfo, "replication", "1");
|
||||||
|
|
||||||
local_repl_conn = establish_db_connection_by_params(&local_repl_conninfo, false);
|
local_repl_conn = establish_db_connection_by_params(&local_repl_conninfo, false);
|
||||||
@@ -2838,8 +2901,8 @@ do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_n
|
|||||||
free_conninfo_params(&local_node_conninfo);
|
free_conninfo_params(&local_node_conninfo);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* store the original upstream node id so we can delete the
|
* Store the original upstream node id so we can delete the
|
||||||
* replication slot, if exists
|
* replication slot, if it exists.
|
||||||
*/
|
*/
|
||||||
if (local_node_record.upstream_node_id != UNKNOWN_NODE_ID)
|
if (local_node_record.upstream_node_id != UNKNOWN_NODE_ID)
|
||||||
{
|
{
|
||||||
@@ -2851,11 +2914,19 @@ do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_n
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (config_file_options.use_replication_slots && runtime_options.host_param_provided == false && original_upstream_node_id != UNKNOWN_NODE_ID)
|
if (config_file_options.use_replication_slots && runtime_options.host_param_provided == false)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Only attempt to delete the old replication slot if the old upstream
|
||||||
|
* node is known and is different to the follow target node.
|
||||||
|
*/
|
||||||
|
if (original_upstream_node_id != UNKNOWN_NODE_ID
|
||||||
|
&& original_upstream_node_id != follow_target_node_record->node_id)
|
||||||
{
|
{
|
||||||
remove_old_replication_slot = true;
|
remove_old_replication_slot = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Fetch original upstream's record */
|
/* Fetch original upstream's record */
|
||||||
if (remove_old_replication_slot == true)
|
if (remove_old_replication_slot == true)
|
||||||
@@ -3000,8 +3071,6 @@ do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_n
|
|||||||
* Note that if this function is called by do_standby_switchover(), the
|
* Note that if this function is called by do_standby_switchover(), the
|
||||||
* "repmgr node rejoin" command executed on the demotion candidate may already
|
* "repmgr node rejoin" command executed on the demotion candidate may already
|
||||||
* have removed the slot, so there may be nothing to do.
|
* have removed the slot, so there may be nothing to do.
|
||||||
*
|
|
||||||
* XXX check if former upstream is current primary?
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (remove_old_replication_slot == true)
|
if (remove_old_replication_slot == true)
|
||||||
@@ -3263,7 +3332,7 @@ do_standby_switchover(void)
|
|||||||
ReplInfo replication_info;
|
ReplInfo replication_info;
|
||||||
init_replication_info(&replication_info);
|
init_replication_info(&replication_info);
|
||||||
|
|
||||||
if (get_replication_info(local_conn, &replication_info) == false)
|
if (get_replication_info(local_conn, STANDBY, &replication_info) == false)
|
||||||
{
|
{
|
||||||
log_error(_("unable to retrieve replication information from local node"));
|
log_error(_("unable to retrieve replication information from local node"));
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
@@ -3403,6 +3472,7 @@ do_standby_switchover(void)
|
|||||||
command_success = remote_command(remote_host,
|
command_success = remote_command(remote_host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -3466,6 +3536,7 @@ do_standby_switchover(void)
|
|||||||
command_success = remote_command(remote_host,
|
command_success = remote_command(remote_host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -3533,9 +3604,26 @@ do_standby_switchover(void)
|
|||||||
{
|
{
|
||||||
if (sibling_nodes.node_count > 0)
|
if (sibling_nodes.node_count > 0)
|
||||||
{
|
{
|
||||||
|
PQExpBufferData nodes;
|
||||||
|
NodeInfoListCell *cell;
|
||||||
|
|
||||||
|
initPQExpBuffer(&nodes);
|
||||||
|
|
||||||
|
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&nodes,
|
||||||
|
" %s (node ID: %i)",
|
||||||
|
cell->node_info->node_name,
|
||||||
|
cell->node_info->node_id);
|
||||||
|
if (cell->next)
|
||||||
|
appendPQExpBufferStr(&nodes, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
||||||
sibling_nodes.node_count);
|
sibling_nodes.node_count);
|
||||||
log_detail(_("these nodes will remain attached to the current primary"));
|
log_detail(_("these nodes will remain attached to the current primary:\n%s"), nodes.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&nodes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -3693,6 +3781,7 @@ do_standby_switchover(void)
|
|||||||
command_success = remote_command(remote_host,
|
command_success = remote_command(remote_host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -3745,6 +3834,7 @@ do_standby_switchover(void)
|
|||||||
command_success = remote_command(remote_host,
|
command_success = remote_command(remote_host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -3881,7 +3971,7 @@ do_standby_switchover(void)
|
|||||||
log_detail(_("lag is %i seconds (warning threshold: %i)"),
|
log_detail(_("lag is %i seconds (warning threshold: %i)"),
|
||||||
lag_seconds, config_file_options.replication_lag_warning);
|
lag_seconds, config_file_options.replication_lag_warning);
|
||||||
}
|
}
|
||||||
else if (lag_seconds < 0)
|
else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
|
||||||
{
|
{
|
||||||
if (runtime_options.force == false)
|
if (runtime_options.force == false)
|
||||||
{
|
{
|
||||||
@@ -3983,13 +4073,14 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
for (cell = all_nodes.head; cell; cell = cell->next)
|
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
|
||||||
|
|
||||||
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
|
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
|
||||||
repmgrd_info[i]->node_id = cell->node_info->node_id;
|
repmgrd_info[i]->node_id = cell->node_info->node_id;
|
||||||
repmgrd_info[i]->pid = UNKNOWN_PID;
|
repmgrd_info[i]->pid = UNKNOWN_PID;
|
||||||
repmgrd_info[i]->paused = false;
|
repmgrd_info[i]->paused = false;
|
||||||
repmgrd_info[i]->running = false;
|
repmgrd_info[i]->running = false;
|
||||||
|
repmgrd_info[i]->pg_running = true;
|
||||||
|
|
||||||
|
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||||
|
|
||||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
@@ -3999,12 +4090,25 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
repmgrd_info[i]->pg_running = false;
|
repmgrd_info[i]->pg_running = false;
|
||||||
|
|
||||||
item_list_append_format(&repmgrd_connection_errors,
|
/*
|
||||||
_("unable to connect to node \"%s\" (ID %i)"),
|
* Only worry about unreachable nodes if they're marked as active
|
||||||
cell->node_info->node_name,
|
* in the repmgr metadata.
|
||||||
cell->node_info->node_id);
|
*/
|
||||||
|
if (cell->node_info->active == true)
|
||||||
|
{
|
||||||
unreachable_node_count++;
|
unreachable_node_count++;
|
||||||
|
|
||||||
|
item_list_append_format(&repmgrd_connection_errors,
|
||||||
|
_("unable to connect to node \"%s\" (ID %i):\n%s"),
|
||||||
|
cell->node_info->node_name,
|
||||||
|
cell->node_info->node_id,
|
||||||
|
PQerrorMessage(cell->node_info->conn));
|
||||||
|
}
|
||||||
|
|
||||||
|
PQfinish(cell->node_info->conn);
|
||||||
|
cell->node_info->conn = NULL;
|
||||||
|
|
||||||
|
i++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4066,11 +4170,37 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* pause repmgrd on all reachable nodes */
|
||||||
if (repmgrd_running_count > 0)
|
if (repmgrd_running_count > 0)
|
||||||
{
|
{
|
||||||
i = 0;
|
i = 0;
|
||||||
for (cell = all_nodes.head; cell; cell = cell->next)
|
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Skip if node was unreachable
|
||||||
|
*/
|
||||||
|
if (repmgrd_info[i]->pg_running == false)
|
||||||
|
{
|
||||||
|
log_warning(_("node %s (ID %i) unreachable, unable to pause repmgrd"),
|
||||||
|
cell->node_info->node_name,
|
||||||
|
cell->node_info->node_id);
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Skip if repmgrd not running on node
|
||||||
|
*/
|
||||||
|
if (repmgrd_info[i]->running == false)
|
||||||
|
{
|
||||||
|
log_warning(_("repmgrd not running on node %s (ID %i)"),
|
||||||
|
cell->node_info->node_name,
|
||||||
|
cell->node_info->node_id);
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* Skip if node is already paused. Note we won't unpause these, to
|
* Skip if node is already paused. Note we won't unpause these, to
|
||||||
* leave the repmgrd instances in the cluster in the same state they
|
* leave the repmgrd instances in the cluster in the same state they
|
||||||
@@ -4109,12 +4239,15 @@ do_standby_switchover(void)
|
|||||||
{
|
{
|
||||||
/* close all connections - we'll reestablish later */
|
/* close all connections - we'll reestablish later */
|
||||||
for (cell = all_nodes.head; cell; cell = cell->next)
|
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
if (cell->node_info->conn != NULL)
|
||||||
{
|
{
|
||||||
PQfinish(cell->node_info->conn);
|
PQfinish(cell->node_info->conn);
|
||||||
cell->node_info->conn = NULL;
|
cell->node_info->conn = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -4174,6 +4307,7 @@ do_standby_switchover(void)
|
|||||||
(void) remote_command(remote_host,
|
(void) remote_command(remote_host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -4184,6 +4318,7 @@ do_standby_switchover(void)
|
|||||||
*/
|
*/
|
||||||
if (runtime_options.dry_run == true)
|
if (runtime_options.dry_run == true)
|
||||||
{
|
{
|
||||||
|
/* we use a buffer here as it will be modified by string_remove_trailing_newlines() */
|
||||||
char shutdown_command[MAXLEN] = "";
|
char shutdown_command[MAXLEN] = "";
|
||||||
|
|
||||||
strncpy(shutdown_command, command_output.data, MAXLEN);
|
strncpy(shutdown_command, command_output.data, MAXLEN);
|
||||||
@@ -4242,6 +4377,7 @@ do_standby_switchover(void)
|
|||||||
command_success = remote_command(remote_host,
|
command_success = remote_command(remote_host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -4299,6 +4435,9 @@ do_standby_switchover(void)
|
|||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_warning(_("connection to local node lost, reconnecting..."));
|
log_warning(_("connection to local node lost, reconnecting..."));
|
||||||
|
log_detail("\n%s", PQerrorMessage(local_conn));
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
@@ -4321,7 +4460,7 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
for (i = 0; i < config_file_options.wal_receive_check_timeout; i++)
|
for (i = 0; i < config_file_options.wal_receive_check_timeout; i++)
|
||||||
{
|
{
|
||||||
get_replication_info(local_conn, &replication_info);
|
get_replication_info(local_conn, STANDBY, &replication_info);
|
||||||
if (replication_info.last_wal_receive_lsn >= remote_last_checkpoint_lsn)
|
if (replication_info.last_wal_receive_lsn >= remote_last_checkpoint_lsn)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -4462,6 +4601,7 @@ do_standby_switchover(void)
|
|||||||
command_success = remote_command(remote_host,
|
command_success = remote_command(remote_host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -4570,6 +4710,7 @@ do_standby_switchover(void)
|
|||||||
success = remote_command(host,
|
success = remote_command(host,
|
||||||
runtime_options.remote_user,
|
runtime_options.remote_user,
|
||||||
remote_command_str.data,
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
&command_output);
|
&command_output);
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
termPQExpBuffer(&remote_command_str);
|
||||||
@@ -4712,9 +4853,10 @@ do_standby_switchover(void)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
item_list_append_format(&repmgrd_unpause_errors,
|
item_list_append_format(&repmgrd_unpause_errors,
|
||||||
_("unable to connect to node \"%s\" (ID %i)"),
|
_("unable to connect to node \"%s\" (ID %i):\n%s"),
|
||||||
cell->node_info->node_name,
|
cell->node_info->node_name,
|
||||||
cell->node_info->node_id);
|
cell->node_info->node_id,
|
||||||
|
PQerrorMessage(cell->node_info->conn));
|
||||||
error_node_count++;
|
error_node_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4726,6 +4868,8 @@ do_standby_switchover(void)
|
|||||||
PQExpBufferData detail;
|
PQExpBufferData detail;
|
||||||
ItemListCell *cell;
|
ItemListCell *cell;
|
||||||
|
|
||||||
|
initPQExpBuffer(&detail);
|
||||||
|
|
||||||
for (cell = repmgrd_unpause_errors.head; cell; cell = cell->next)
|
for (cell = repmgrd_unpause_errors.head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
appendPQExpBuffer(&detail,
|
appendPQExpBuffer(&detail,
|
||||||
@@ -4906,19 +5050,41 @@ check_source_server()
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In the default pg_basebackup mode, we'll cowardly refuse to overwrite
|
* Check the local directory to see if it appears to be a PostgreSQL
|
||||||
* an existing data directory
|
* data directory.
|
||||||
|
*
|
||||||
|
* Note: a previous call to check_dir() will have checked whether it contains
|
||||||
|
* a running PostgreSQL instance.
|
||||||
*/
|
*/
|
||||||
if (mode == pg_basebackup)
|
if (is_pg_dir(local_data_directory))
|
||||||
{
|
{
|
||||||
if (is_pg_dir(local_data_directory) && runtime_options.force != true)
|
const char *msg = _("target data directory appears to be a PostgreSQL data directory");
|
||||||
|
const char *hint = _("use -F/--force to overwrite the existing data directory");
|
||||||
|
|
||||||
|
if (runtime_options.force == false && runtime_options.dry_run == false)
|
||||||
{
|
{
|
||||||
log_error(_("target data directory appears to be a PostgreSQL data directory"));
|
log_error("%s", msg);
|
||||||
log_detail(_("target data directory is \"%s\""), local_data_directory);
|
log_detail(_("target data directory is \"%s\""), local_data_directory);
|
||||||
log_hint(_("use -F/--force to overwrite the existing data directory"));
|
log_hint("%s", hint);
|
||||||
PQfinish(source_conn);
|
PQfinish(source_conn);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
if (runtime_options.force == true)
|
||||||
|
{
|
||||||
|
log_warning("%s and will be overwritten", msg);
|
||||||
|
log_detail(_("target data directory is \"%s\""), local_data_directory);
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_warning("%s", msg);
|
||||||
|
log_detail(_("target data directory is \"%s\""), local_data_directory);
|
||||||
|
log_hint("%s", hint);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -5794,6 +5960,12 @@ run_basebackup(t_node_info *node_record)
|
|||||||
if (r != 0)
|
if (r != 0)
|
||||||
return ERR_BAD_BASEBACKUP;
|
return ERR_BAD_BASEBACKUP;
|
||||||
|
|
||||||
|
/* check connections are still available */
|
||||||
|
(void)connection_ping_reconnect(primary_conn);
|
||||||
|
|
||||||
|
if (source_conn != primary_conn)
|
||||||
|
(void)connection_ping_reconnect(source_conn);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If replication slots in use, check the created slot is on the correct
|
* If replication slots in use, check the created slot is on the correct
|
||||||
* node; the slot will initially get created on the source node, and will
|
* node; the slot will initially get created on the source node, and will
|
||||||
@@ -6002,10 +6174,11 @@ run_file_backup(t_node_info *node_record)
|
|||||||
* Remove prefix
|
* Remove prefix
|
||||||
*/
|
*/
|
||||||
p = string_skip_prefix(prefix, output);
|
p = string_skip_prefix(prefix, output);
|
||||||
|
|
||||||
if (p == NULL)
|
if (p == NULL)
|
||||||
{
|
{
|
||||||
log_error("unexpected output from \"barman list-files\": %s",
|
log_error("unexpected output from \"barman list-files\"");
|
||||||
output);
|
log_detail("%s", output);
|
||||||
exit(ERR_BARMAN);
|
exit(ERR_BARMAN);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -6023,6 +6196,14 @@ run_file_backup(t_node_info *node_record)
|
|||||||
strncat(prefix, backup_id, MAXLEN - 1);
|
strncat(prefix, backup_id, MAXLEN - 1);
|
||||||
strncat(prefix, "/", MAXLEN - 1);
|
strncat(prefix, "/", MAXLEN - 1);
|
||||||
p = string_skip_prefix(backup_id, p);
|
p = string_skip_prefix(backup_id, p);
|
||||||
|
|
||||||
|
if (p == NULL)
|
||||||
|
{
|
||||||
|
log_error("unexpected output from \"barman list-files\"");
|
||||||
|
log_detail("%s", output);
|
||||||
|
exit(ERR_BARMAN);
|
||||||
|
}
|
||||||
|
|
||||||
p = string_skip_prefix("/", p);
|
p = string_skip_prefix("/", p);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -6034,8 +6215,8 @@ run_file_backup(t_node_info *node_record)
|
|||||||
basebackups_directory,
|
basebackups_directory,
|
||||||
backup_id,
|
backup_id,
|
||||||
local_repmgr_tmp_directory);
|
local_repmgr_tmp_directory);
|
||||||
(void) local_command(
|
|
||||||
command,
|
(void) local_command(command,
|
||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -6359,6 +6540,8 @@ run_file_backup(t_node_info *node_record)
|
|||||||
|
|
||||||
if (fputs(tablespace_map.data, tablespace_map_file) == EOF)
|
if (fputs(tablespace_map.data, tablespace_map_file) == EOF)
|
||||||
{
|
{
|
||||||
|
fclose(tablespace_map_file);
|
||||||
|
|
||||||
log_error(_("unable to write to tablespace_map file \"%s\""), tablespace_map_filename.data);
|
log_error(_("unable to write to tablespace_map file \"%s\""), tablespace_map_filename.data);
|
||||||
|
|
||||||
r = ERR_BAD_BASEBACKUP;
|
r = ERR_BAD_BASEBACKUP;
|
||||||
@@ -6396,6 +6579,15 @@ stop_backup:
|
|||||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||||
PGconn *upstream_conn = NULL;
|
PGconn *upstream_conn = NULL;
|
||||||
|
|
||||||
|
|
||||||
|
/* check connections are still available */
|
||||||
|
(void)connection_ping_reconnect(primary_conn);
|
||||||
|
|
||||||
|
if (source_conn != primary_conn)
|
||||||
|
(void)connection_ping_reconnect(source_conn);
|
||||||
|
|
||||||
|
(void)connection_ping_reconnect(source_conn);
|
||||||
|
|
||||||
record_status = get_node_record(source_conn, upstream_node_id, &upstream_node_record);
|
record_status = get_node_record(source_conn, upstream_node_id, &upstream_node_record);
|
||||||
|
|
||||||
if (record_status != RECORD_FOUND)
|
if (record_status != RECORD_FOUND)
|
||||||
|
|||||||
@@ -56,8 +56,7 @@ do_witness_register(void)
|
|||||||
log_error(_("unable to connect to witness node \"%s\" (ID: %i)"),
|
log_error(_("unable to connect to witness node \"%s\" (ID: %i)"),
|
||||||
config_file_options.node_name,
|
config_file_options.node_name,
|
||||||
config_file_options.node_id);
|
config_file_options.node_id);
|
||||||
log_detail("%s",
|
log_detail("\n%s", PQerrorMessage(witness_conn));
|
||||||
PQerrorMessage(witness_conn));
|
|
||||||
log_hint(_("the witness node must be running before it can be registered"));
|
log_hint(_("the witness node must be running before it can be registered"));
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
@@ -411,7 +410,7 @@ do_witness_unregister(void)
|
|||||||
log_error(_("unable to connect to node \"%s\" (ID: %i)"),
|
log_error(_("unable to connect to node \"%s\" (ID: %i)"),
|
||||||
config_file_options.node_name,
|
config_file_options.node_name,
|
||||||
config_file_options.node_id);
|
config_file_options.node_id);
|
||||||
log_detail("%s", PQerrorMessage(local_conn));
|
log_detail("\n%s", PQerrorMessage(local_conn));
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -437,7 +436,7 @@ do_witness_unregister(void)
|
|||||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_error(_("unable to connect to primary"));
|
log_error(_("unable to connect to primary"));
|
||||||
log_detail("%s", PQerrorMessage(primary_conn));
|
log_detail("\n%s", PQerrorMessage(primary_conn));
|
||||||
|
|
||||||
if (local_node_available == true)
|
if (local_node_available == true)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ typedef struct
|
|||||||
|
|
||||||
/* general node options */
|
/* general node options */
|
||||||
int node_id;
|
int node_id;
|
||||||
char node_name[MAXLEN];
|
char node_name[NAMEDATALEN];
|
||||||
char data_dir[MAXPGPATH];
|
char data_dir[MAXPGPATH];
|
||||||
int remote_node_id;
|
int remote_node_id;
|
||||||
|
|
||||||
@@ -135,6 +135,8 @@ typedef struct
|
|||||||
/* following options for internal use */
|
/* following options for internal use */
|
||||||
char config_archive_dir[MAXPGPATH];
|
char config_archive_dir[MAXPGPATH];
|
||||||
OutputMode output_mode;
|
OutputMode output_mode;
|
||||||
|
bool disable_wal_receiver;
|
||||||
|
bool enable_wal_receiver;
|
||||||
} t_runtime_options;
|
} t_runtime_options;
|
||||||
|
|
||||||
#define T_RUNTIME_OPTIONS_INITIALIZER { \
|
#define T_RUNTIME_OPTIONS_INITIALIZER { \
|
||||||
@@ -174,7 +176,7 @@ typedef struct
|
|||||||
/* "cluster cleanup" options */ \
|
/* "cluster cleanup" options */ \
|
||||||
0, \
|
0, \
|
||||||
/* following options for internal use */ \
|
/* following options for internal use */ \
|
||||||
"/tmp", OM_TEXT \
|
"/tmp", OM_TEXT, false, false \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -224,8 +226,6 @@ extern int check_server_version(PGconn *conn, char *server_type, bool exit_on_er
|
|||||||
extern void check_93_config(void);
|
extern void check_93_config(void);
|
||||||
extern bool create_repmgr_extension(PGconn *conn);
|
extern bool create_repmgr_extension(PGconn *conn);
|
||||||
extern int test_ssh_connection(char *host, char *remote_user);
|
extern int test_ssh_connection(char *host, char *remote_user);
|
||||||
extern bool local_command(const char *command, PQExpBufferData *outputbuf);
|
|
||||||
extern bool local_command_simple(const char *command, PQExpBufferData *outputbuf);
|
|
||||||
|
|
||||||
extern standy_clone_mode get_standby_clone_mode(void);
|
extern standy_clone_mode get_standby_clone_mode(void);
|
||||||
|
|
||||||
@@ -238,8 +238,6 @@ extern char *make_pg_path(const char *file);
|
|||||||
|
|
||||||
extern void get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privileged_conn);
|
extern void get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privileged_conn);
|
||||||
|
|
||||||
extern bool remote_command(const char *host, const char *user, const char *command, PQExpBufferData *outputbuf);
|
|
||||||
|
|
||||||
extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *remote_node_record);
|
extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *remote_node_record);
|
||||||
extern void make_repmgrd_path(PQExpBufferData *output_buf);
|
extern void make_repmgrd_path(PQExpBufferData *output_buf);
|
||||||
|
|
||||||
|
|||||||
210
repmgr-client.c
210
repmgr-client.c
@@ -31,6 +31,7 @@
|
|||||||
* NODE CHECK
|
* NODE CHECK
|
||||||
* NODE REJOIN
|
* NODE REJOIN
|
||||||
* NODE SERVICE
|
* NODE SERVICE
|
||||||
|
* NODE CONTROL
|
||||||
*
|
*
|
||||||
* DAEMON STATUS
|
* DAEMON STATUS
|
||||||
* DAEMON PAUSE
|
* DAEMON PAUSE
|
||||||
@@ -97,8 +98,6 @@ t_node_info target_node_info = T_NODE_INFO_INITIALIZER;
|
|||||||
static ItemList cli_errors = {NULL, NULL};
|
static ItemList cli_errors = {NULL, NULL};
|
||||||
static ItemList cli_warnings = {NULL, NULL};
|
static ItemList cli_warnings = {NULL, NULL};
|
||||||
|
|
||||||
static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple);
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char **argv)
|
main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
@@ -357,9 +356,15 @@ main(int argc, char **argv)
|
|||||||
|
|
||||||
/* --node-name */
|
/* --node-name */
|
||||||
case OPT_NODE_NAME:
|
case OPT_NODE_NAME:
|
||||||
strncpy(runtime_options.node_name, optarg, MAXLEN);
|
{
|
||||||
|
if (strlen(optarg) < sizeof(runtime_options.node_name))
|
||||||
|
strncpy(runtime_options.node_name, optarg, sizeof(runtime_options.node_name));
|
||||||
|
else
|
||||||
|
item_list_append_format(&cli_errors,
|
||||||
|
_("value for \"--node-name\" must contain fewer than %lu characters"),
|
||||||
|
sizeof(runtime_options.node_name));
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
/* --remote-node-id */
|
/* --remote-node-id */
|
||||||
case OPT_REMOTE_NODE_ID:
|
case OPT_REMOTE_NODE_ID:
|
||||||
runtime_options.remote_node_id = repmgr_atoi(optarg, "--remote-node-id", &cli_errors, MIN_NODE_ID);
|
runtime_options.remote_node_id = repmgr_atoi(optarg, "--remote-node-id", &cli_errors, MIN_NODE_ID);
|
||||||
@@ -626,7 +631,7 @@ main(int argc, char **argv)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
/*--------------
|
/*---------------
|
||||||
* output options
|
* output options
|
||||||
*---------------
|
*---------------
|
||||||
*/
|
*/
|
||||||
@@ -642,6 +647,19 @@ main(int argc, char **argv)
|
|||||||
runtime_options.optformat = true;
|
runtime_options.optformat = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
/*---------------------------------
|
||||||
|
* undocumented options for testing
|
||||||
|
*----------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
case OPT_DISABLE_WAL_RECEIVER:
|
||||||
|
runtime_options.disable_wal_receiver = true;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case OPT_ENABLE_WAL_RECEIVER:
|
||||||
|
runtime_options.enable_wal_receiver = true;
|
||||||
|
break;
|
||||||
|
|
||||||
/*-----------------------------
|
/*-----------------------------
|
||||||
* options deprecated since 3.3
|
* options deprecated since 3.3
|
||||||
*-----------------------------
|
*-----------------------------
|
||||||
@@ -914,6 +932,8 @@ main(int argc, char **argv)
|
|||||||
action = NODE_REJOIN;
|
action = NODE_REJOIN;
|
||||||
else if (strcasecmp(repmgr_action, "SERVICE") == 0)
|
else if (strcasecmp(repmgr_action, "SERVICE") == 0)
|
||||||
action = NODE_SERVICE;
|
action = NODE_SERVICE;
|
||||||
|
else if (strcasecmp(repmgr_action, "CONTROL") == 0)
|
||||||
|
action = NODE_CONTROL;
|
||||||
}
|
}
|
||||||
|
|
||||||
else if (strcasecmp(repmgr_command, "CLUSTER") == 0)
|
else if (strcasecmp(repmgr_command, "CLUSTER") == 0)
|
||||||
@@ -1337,6 +1357,9 @@ main(int argc, char **argv)
|
|||||||
case NODE_SERVICE:
|
case NODE_SERVICE:
|
||||||
do_node_service();
|
do_node_service();
|
||||||
break;
|
break;
|
||||||
|
case NODE_CONTROL:
|
||||||
|
do_node_control();
|
||||||
|
break;
|
||||||
|
|
||||||
/* CLUSTER */
|
/* CLUSTER */
|
||||||
case CLUSTER_SHOW:
|
case CLUSTER_SHOW:
|
||||||
@@ -1657,6 +1680,8 @@ check_cli_parameters(const int action)
|
|||||||
item_list_append_format(&cli_warnings,
|
item_list_append_format(&cli_warnings,
|
||||||
_("--replication-user ignored when executing %s"),
|
_("--replication-user ignored when executing %s"),
|
||||||
action_name(action));
|
action_name(action));
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
item_list_append_format(&cli_warnings,
|
item_list_append_format(&cli_warnings,
|
||||||
_("--replication-user not required when executing %s"),
|
_("--replication-user not required when executing %s"),
|
||||||
@@ -1905,6 +1930,28 @@ check_cli_parameters(const int action)
|
|||||||
action_name(action));
|
action_name(action));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --disable-wal-receiver / --enable-wal-receiver */
|
||||||
|
if (runtime_options.disable_wal_receiver == true || runtime_options.enable_wal_receiver == true)
|
||||||
|
{
|
||||||
|
switch (action)
|
||||||
|
{
|
||||||
|
case NODE_CONTROL:
|
||||||
|
{
|
||||||
|
if (runtime_options.disable_wal_receiver == true && runtime_options.enable_wal_receiver == true)
|
||||||
|
{
|
||||||
|
item_list_append(&cli_errors,
|
||||||
|
_("provide either --disable-wal-receiver or --enable-wal-receiver"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
item_list_append_format(&cli_warnings,
|
||||||
|
_("--disable-wal-receiver / --enable-wal-receiver not effective when executing %s"),
|
||||||
|
action_name(action));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -2172,7 +2219,7 @@ create_repmgr_extension(PGconn *conn)
|
|||||||
log_detail(_("version %s is installed but newer version %s is available"),
|
log_detail(_("version %s is installed but newer version %s is available"),
|
||||||
extversions.installed_version,
|
extversions.installed_version,
|
||||||
extversions.default_version);
|
extversions.default_version);
|
||||||
log_hint(_("execute \"ALTER EXTENSION repmgr UPGRADE\""));
|
log_hint(_("update the installed extension version by executing \"ALTER EXTENSION repmgr UPDATE\""));
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
case REPMGR_INSTALLED:
|
case REPMGR_INSTALLED:
|
||||||
@@ -2399,75 +2446,6 @@ test_ssh_connection(char *host, char *remote_user)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Execute a command locally. "outputbuf" should either be an
|
|
||||||
* initialised PQexpbuffer, or NULL
|
|
||||||
*/
|
|
||||||
bool
|
|
||||||
local_command(const char *command, PQExpBufferData *outputbuf)
|
|
||||||
{
|
|
||||||
return _local_command(command, outputbuf, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
bool
|
|
||||||
local_command_simple(const char *command, PQExpBufferData *outputbuf)
|
|
||||||
{
|
|
||||||
return _local_command(command, outputbuf, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static bool
|
|
||||||
_local_command(const char *command, PQExpBufferData *outputbuf, bool simple)
|
|
||||||
{
|
|
||||||
FILE *fp = NULL;
|
|
||||||
char output[MAXLEN];
|
|
||||||
int retval = 0;
|
|
||||||
bool success;
|
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "executing:\n %s", command);
|
|
||||||
|
|
||||||
if (outputbuf == NULL)
|
|
||||||
{
|
|
||||||
retval = system(command);
|
|
||||||
return (retval == 0) ? true : false;
|
|
||||||
}
|
|
||||||
|
|
||||||
fp = popen(command, "r");
|
|
||||||
|
|
||||||
if (fp == NULL)
|
|
||||||
{
|
|
||||||
log_error(_("unable to execute local command:\n%s"), command);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
while (fgets(output, MAXLEN, fp) != NULL)
|
|
||||||
{
|
|
||||||
appendPQExpBuffer(outputbuf, "%s", output);
|
|
||||||
|
|
||||||
if (!feof(fp) && simple == false)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
retval = pclose(fp);
|
|
||||||
|
|
||||||
/* */
|
|
||||||
success = (WEXITSTATUS(retval) == 0 || WEXITSTATUS(retval) == 141) ? true : false;
|
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "result of command was %i (%i)", WEXITSTATUS(retval), retval);
|
|
||||||
|
|
||||||
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
|
||||||
log_verbose(LOG_DEBUG, "local_command(): output returned was:\n%s", outputbuf->data);
|
|
||||||
else
|
|
||||||
log_verbose(LOG_DEBUG, "local_command(): no output returned");
|
|
||||||
|
|
||||||
return success;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* get_superuser_connection()
|
* get_superuser_connection()
|
||||||
*
|
*
|
||||||
@@ -2487,6 +2465,7 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
|
|||||||
if (PQstatus(*conn) != CONNECTION_OK)
|
if (PQstatus(*conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_error(_("no database connection available"));
|
log_error(_("no database connection available"));
|
||||||
|
log_detail("\n%s", PQerrorMessage(*conn));
|
||||||
exit(ERR_INTERNAL);
|
exit(ERR_INTERNAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2674,78 +2653,6 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Execute a command via ssh on the remote host.
|
|
||||||
*
|
|
||||||
* TODO: implement SSH calls using libssh2.
|
|
||||||
*/
|
|
||||||
bool
|
|
||||||
remote_command(const char *host, const char *user, const char *command, PQExpBufferData *outputbuf)
|
|
||||||
{
|
|
||||||
FILE *fp;
|
|
||||||
char ssh_command[MAXLEN] = "";
|
|
||||||
PQExpBufferData ssh_host;
|
|
||||||
|
|
||||||
char output[MAXLEN] = "";
|
|
||||||
|
|
||||||
initPQExpBuffer(&ssh_host);
|
|
||||||
|
|
||||||
if (*user != '\0')
|
|
||||||
{
|
|
||||||
appendPQExpBuffer(&ssh_host, "%s@", user);
|
|
||||||
}
|
|
||||||
|
|
||||||
appendPQExpBuffer(&ssh_host, "%s", host);
|
|
||||||
|
|
||||||
maxlen_snprintf(ssh_command,
|
|
||||||
"ssh -o Batchmode=yes %s %s %s",
|
|
||||||
config_file_options.ssh_options,
|
|
||||||
ssh_host.data,
|
|
||||||
command);
|
|
||||||
|
|
||||||
termPQExpBuffer(&ssh_host);
|
|
||||||
|
|
||||||
log_debug("remote_command():\n %s", ssh_command);
|
|
||||||
|
|
||||||
fp = popen(ssh_command, "r");
|
|
||||||
|
|
||||||
if (fp == NULL)
|
|
||||||
{
|
|
||||||
log_error(_("unable to execute remote command:\n %s"), ssh_command);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (outputbuf != NULL)
|
|
||||||
{
|
|
||||||
/* TODO: better error handling */
|
|
||||||
while (fgets(output, MAXLEN, fp) != NULL)
|
|
||||||
{
|
|
||||||
appendPQExpBuffer(outputbuf, "%s", output);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
while (fgets(output, MAXLEN, fp) != NULL)
|
|
||||||
{
|
|
||||||
if (!feof(fp))
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pclose(fp);
|
|
||||||
|
|
||||||
if (outputbuf != NULL)
|
|
||||||
{
|
|
||||||
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
|
||||||
log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
|
|
||||||
else
|
|
||||||
log_verbose(LOG_DEBUG, "remote_command(): no output returned");
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -3102,7 +3009,7 @@ init_node_record(t_node_info *node_record)
|
|||||||
strncpy(node_record->location, "default", MAXLEN);
|
strncpy(node_record->location, "default", MAXLEN);
|
||||||
|
|
||||||
|
|
||||||
strncpy(node_record->node_name, config_file_options.node_name, MAXLEN);
|
strncpy(node_record->node_name, config_file_options.node_name, sizeof(node_record->node_name));
|
||||||
strncpy(node_record->conninfo, config_file_options.conninfo, MAXLEN);
|
strncpy(node_record->conninfo, config_file_options.conninfo, MAXLEN);
|
||||||
strncpy(node_record->config_file, config_file_path, MAXPGPATH);
|
strncpy(node_record->config_file, config_file_path, MAXPGPATH);
|
||||||
|
|
||||||
@@ -3156,9 +3063,6 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
|
|||||||
/* "full_page_writes" must be on in any case */
|
/* "full_page_writes" must be on in any case */
|
||||||
if (guc_set(conn, "full_page_writes", "=", "off"))
|
if (guc_set(conn, "full_page_writes", "=", "off"))
|
||||||
{
|
{
|
||||||
if (can_use == false)
|
|
||||||
appendPQExpBuffer(reason, "; ");
|
|
||||||
|
|
||||||
appendPQExpBuffer(reason,
|
appendPQExpBuffer(reason,
|
||||||
_("\"full_page_writes\" must be set to \"on\""));
|
_("\"full_page_writes\" must be set to \"on\""));
|
||||||
|
|
||||||
@@ -3245,6 +3149,8 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
|||||||
/*
|
/*
|
||||||
* Here we'll perform some timeline sanity checks to ensure the follow target
|
* Here we'll perform some timeline sanity checks to ensure the follow target
|
||||||
* can actually be followed.
|
* can actually be followed.
|
||||||
|
*
|
||||||
|
* See also comment for check_node_can_follow() in repmgrd-physical.c .
|
||||||
*/
|
*/
|
||||||
bool
|
bool
|
||||||
check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin)
|
check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin)
|
||||||
@@ -3335,6 +3241,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* timelines are the same - check relative positions */
|
||||||
if (follow_target_identification.timeline == local_tli)
|
if (follow_target_identification.timeline == local_tli)
|
||||||
{
|
{
|
||||||
XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
|
XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
|
||||||
@@ -3346,7 +3253,6 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* timeline is the same - check relative positions */
|
|
||||||
if (local_xlogpos <= follow_target_xlogpos)
|
if (local_xlogpos <= follow_target_xlogpos)
|
||||||
{
|
{
|
||||||
log_info(_("timelines are same, this server is not ahead"));
|
log_info(_("timelines are same, this server is not ahead"));
|
||||||
|
|||||||
@@ -40,16 +40,17 @@
|
|||||||
#define NODE_CHECK 14
|
#define NODE_CHECK 14
|
||||||
#define NODE_SERVICE 15
|
#define NODE_SERVICE 15
|
||||||
#define NODE_REJOIN 16
|
#define NODE_REJOIN 16
|
||||||
#define CLUSTER_SHOW 17
|
#define NODE_CONTROL 17
|
||||||
#define CLUSTER_CLEANUP 18
|
#define CLUSTER_SHOW 18
|
||||||
#define CLUSTER_MATRIX 19
|
#define CLUSTER_CLEANUP 19
|
||||||
#define CLUSTER_CROSSCHECK 20
|
#define CLUSTER_MATRIX 20
|
||||||
#define CLUSTER_EVENT 21
|
#define CLUSTER_CROSSCHECK 21
|
||||||
#define DAEMON_STATUS 22
|
#define CLUSTER_EVENT 22
|
||||||
#define DAEMON_PAUSE 23
|
#define DAEMON_STATUS 23
|
||||||
#define DAEMON_UNPAUSE 24
|
#define DAEMON_PAUSE 24
|
||||||
#define DAEMON_START 25
|
#define DAEMON_UNPAUSE 25
|
||||||
#define DAEMON_STOP 26
|
#define DAEMON_START 26
|
||||||
|
#define DAEMON_STOP 27
|
||||||
|
|
||||||
/* command line options without short versions */
|
/* command line options without short versions */
|
||||||
#define OPT_HELP 1001
|
#define OPT_HELP 1001
|
||||||
@@ -97,7 +98,8 @@
|
|||||||
#define OPT_VERSION_NUMBER 1043
|
#define OPT_VERSION_NUMBER 1043
|
||||||
#define OPT_DATA_DIRECTORY_CONFIG 1044
|
#define OPT_DATA_DIRECTORY_CONFIG 1044
|
||||||
#define OPT_COMPACT 1045
|
#define OPT_COMPACT 1045
|
||||||
|
#define OPT_DISABLE_WAL_RECEIVER 1046
|
||||||
|
#define OPT_ENABLE_WAL_RECEIVER 1047
|
||||||
|
|
||||||
/* deprecated since 3.3 */
|
/* deprecated since 3.3 */
|
||||||
#define OPT_DATA_DIR 999
|
#define OPT_DATA_DIR 999
|
||||||
@@ -202,6 +204,10 @@ static struct option long_options[] =
|
|||||||
/* "cluster cleanup" options */
|
/* "cluster cleanup" options */
|
||||||
{"keep-history", required_argument, NULL, 'k'},
|
{"keep-history", required_argument, NULL, 'k'},
|
||||||
|
|
||||||
|
/* undocumented options for testing */
|
||||||
|
{"disable-wal-receiver", no_argument, NULL, OPT_DISABLE_WAL_RECEIVER},
|
||||||
|
{"enable-wal-receiver", no_argument, NULL, OPT_ENABLE_WAL_RECEIVER},
|
||||||
|
|
||||||
/* deprecated */
|
/* deprecated */
|
||||||
{"check-upstream-config", no_argument, NULL, OPT_CHECK_UPSTREAM_CONFIG},
|
{"check-upstream-config", no_argument, NULL, OPT_CHECK_UPSTREAM_CONFIG},
|
||||||
{"no-conninfo-password", no_argument, NULL, OPT_NO_CONNINFO_PASSWORD},
|
{"no-conninfo-password", no_argument, NULL, OPT_NO_CONNINFO_PASSWORD},
|
||||||
|
|||||||
52
repmgr.c
52
repmgr.c
@@ -53,6 +53,7 @@
|
|||||||
#include "voting.h"
|
#include "voting.h"
|
||||||
|
|
||||||
#define UNKNOWN_NODE_ID -1
|
#define UNKNOWN_NODE_ID -1
|
||||||
|
#define ELECTION_RERUN_NOTIFICATION -2
|
||||||
#define UNKNOWN_PID -1
|
#define UNKNOWN_PID -1
|
||||||
|
|
||||||
#define TRANCHE_NAME "repmgrd"
|
#define TRANCHE_NAME "repmgrd"
|
||||||
@@ -77,7 +78,7 @@ typedef struct repmgrdSharedState
|
|||||||
char repmgrd_pidfile[MAXPGPATH];
|
char repmgrd_pidfile[MAXPGPATH];
|
||||||
bool repmgrd_paused;
|
bool repmgrd_paused;
|
||||||
/* streaming failover */
|
/* streaming failover */
|
||||||
TimestampTz primary_last_seen;
|
TimestampTz upstream_last_seen;
|
||||||
NodeVotingStatus voting_status;
|
NodeVotingStatus voting_status;
|
||||||
int current_electoral_term;
|
int current_electoral_term;
|
||||||
int candidate_node_id;
|
int candidate_node_id;
|
||||||
@@ -108,11 +109,11 @@ PG_FUNCTION_INFO_V1(standby_set_last_updated);
|
|||||||
Datum standby_get_last_updated(PG_FUNCTION_ARGS);
|
Datum standby_get_last_updated(PG_FUNCTION_ARGS);
|
||||||
PG_FUNCTION_INFO_V1(standby_get_last_updated);
|
PG_FUNCTION_INFO_V1(standby_get_last_updated);
|
||||||
|
|
||||||
Datum set_primary_last_seen(PG_FUNCTION_ARGS);
|
Datum set_upstream_last_seen(PG_FUNCTION_ARGS);
|
||||||
PG_FUNCTION_INFO_V1(set_primary_last_seen);
|
PG_FUNCTION_INFO_V1(set_upstream_last_seen);
|
||||||
|
|
||||||
Datum get_primary_last_seen(PG_FUNCTION_ARGS);
|
Datum get_upstream_last_seen(PG_FUNCTION_ARGS);
|
||||||
PG_FUNCTION_INFO_V1(get_primary_last_seen);
|
PG_FUNCTION_INFO_V1(get_upstream_last_seen);
|
||||||
|
|
||||||
Datum notify_follow_primary(PG_FUNCTION_ARGS);
|
Datum notify_follow_primary(PG_FUNCTION_ARGS);
|
||||||
PG_FUNCTION_INFO_V1(notify_follow_primary);
|
PG_FUNCTION_INFO_V1(notify_follow_primary);
|
||||||
@@ -147,6 +148,8 @@ PG_FUNCTION_INFO_V1(repmgrd_pause);
|
|||||||
Datum repmgrd_is_paused(PG_FUNCTION_ARGS);
|
Datum repmgrd_is_paused(PG_FUNCTION_ARGS);
|
||||||
PG_FUNCTION_INFO_V1(repmgrd_is_paused);
|
PG_FUNCTION_INFO_V1(repmgrd_is_paused);
|
||||||
|
|
||||||
|
Datum get_wal_receiver_pid(PG_FUNCTION_ARGS);
|
||||||
|
PG_FUNCTION_INFO_V1(get_wal_receiver_pid);
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -226,7 +229,7 @@ repmgr_shmem_startup(void)
|
|||||||
shared_state->repmgrd_paused = false;
|
shared_state->repmgrd_paused = false;
|
||||||
shared_state->current_electoral_term = 0;
|
shared_state->current_electoral_term = 0;
|
||||||
/* arbitrary "magic" date to indicate this field hasn't been updated */
|
/* arbitrary "magic" date to indicate this field hasn't been updated */
|
||||||
shared_state->primary_last_seen = POSTGRES_EPOCH_JDATE;
|
shared_state->upstream_last_seen = POSTGRES_EPOCH_JDATE;
|
||||||
shared_state->voting_status = VS_NO_VOTE;
|
shared_state->voting_status = VS_NO_VOTE;
|
||||||
shared_state->candidate_node_id = UNKNOWN_NODE_ID;
|
shared_state->candidate_node_id = UNKNOWN_NODE_ID;
|
||||||
shared_state->follow_new_primary = false;
|
shared_state->follow_new_primary = false;
|
||||||
@@ -363,17 +366,14 @@ standby_get_last_updated(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
|
|
||||||
Datum
|
Datum
|
||||||
set_primary_last_seen(PG_FUNCTION_ARGS)
|
set_upstream_last_seen(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
if (!shared_state)
|
if (!shared_state)
|
||||||
PG_RETURN_VOID();
|
PG_RETURN_VOID();
|
||||||
|
|
||||||
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
||||||
|
|
||||||
shared_state->primary_last_seen = GetCurrentTimestamp();
|
shared_state->upstream_last_seen = GetCurrentTimestamp();
|
||||||
elog(INFO,
|
|
||||||
"primary_last_seen: %s",
|
|
||||||
timestamptz_to_str( shared_state->primary_last_seen));
|
|
||||||
|
|
||||||
LWLockRelease(shared_state->lock);
|
LWLockRelease(shared_state->lock);
|
||||||
|
|
||||||
@@ -382,7 +382,7 @@ set_primary_last_seen(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
|
|
||||||
Datum
|
Datum
|
||||||
get_primary_last_seen(PG_FUNCTION_ARGS)
|
get_upstream_last_seen(PG_FUNCTION_ARGS)
|
||||||
{
|
{
|
||||||
long secs;
|
long secs;
|
||||||
int microsecs;
|
int microsecs;
|
||||||
@@ -391,13 +391,9 @@ get_primary_last_seen(PG_FUNCTION_ARGS)
|
|||||||
if (!shared_state)
|
if (!shared_state)
|
||||||
PG_RETURN_INT32(-1);
|
PG_RETURN_INT32(-1);
|
||||||
|
|
||||||
/* A primary is always visible */
|
|
||||||
if (!RecoveryInProgress())
|
|
||||||
PG_RETURN_INT32(0);
|
|
||||||
|
|
||||||
LWLockAcquire(shared_state->lock, LW_SHARED);
|
LWLockAcquire(shared_state->lock, LW_SHARED);
|
||||||
|
|
||||||
last_seen = shared_state->primary_last_seen;
|
last_seen = shared_state->upstream_last_seen;
|
||||||
|
|
||||||
LWLockRelease(shared_state->lock);
|
LWLockRelease(shared_state->lock);
|
||||||
|
|
||||||
@@ -441,10 +437,18 @@ notify_follow_primary(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
/* only do something if local_node_id is initialised */
|
/* only do something if local_node_id is initialised */
|
||||||
if (shared_state->local_node_id != UNKNOWN_NODE_ID)
|
if (shared_state->local_node_id != UNKNOWN_NODE_ID)
|
||||||
|
{
|
||||||
|
if (primary_node_id == ELECTION_RERUN_NOTIFICATION)
|
||||||
|
{
|
||||||
|
elog(INFO, "node %i received notification to rerun promotion candidate election",
|
||||||
|
shared_state->local_node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
elog(INFO, "node %i received notification to follow node %i",
|
elog(INFO, "node %i received notification to follow node %i",
|
||||||
shared_state->local_node_id,
|
shared_state->local_node_id,
|
||||||
primary_node_id);
|
primary_node_id);
|
||||||
|
}
|
||||||
|
|
||||||
LWLockRelease(shared_state->lock);
|
LWLockRelease(shared_state->lock);
|
||||||
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
||||||
@@ -743,3 +747,17 @@ repmgrd_is_paused(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
PG_RETURN_BOOL(is_paused);
|
PG_RETURN_BOOL(is_paused);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Datum
|
||||||
|
get_wal_receiver_pid(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
int wal_receiver_pid;
|
||||||
|
|
||||||
|
if (!shared_state)
|
||||||
|
PG_RETURN_NULL();
|
||||||
|
|
||||||
|
wal_receiver_pid = WalRcv->pid;
|
||||||
|
|
||||||
|
PG_RETURN_INT32(wal_receiver_pid);
|
||||||
|
}
|
||||||
|
|||||||
@@ -5,7 +5,14 @@
|
|||||||
# Some configuration items will be set with a default value; this
|
# Some configuration items will be set with a default value; this
|
||||||
# is noted for each item. Where no default value is shown, the
|
# is noted for each item. Where no default value is shown, the
|
||||||
# parameter will be treated as empty or false.
|
# parameter will be treated as empty or false.
|
||||||
|
#
|
||||||
|
# IMPORTANT: string values can be provided as-is, or enclosed in single quotes
|
||||||
|
# (but not double-quotes, which will be interpreted as part of the string),
|
||||||
|
# e.g.:
|
||||||
|
#
|
||||||
|
# node_name=foo
|
||||||
|
# node_name = 'foo'
|
||||||
|
#
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Required configuration items
|
# Required configuration items
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -18,9 +25,11 @@
|
|||||||
# using the server's hostname or another identifier
|
# using the server's hostname or another identifier
|
||||||
# unambiguously associated with the server to avoid
|
# unambiguously associated with the server to avoid
|
||||||
# confusion. Avoid choosing names which reflect the
|
# confusion. Avoid choosing names which reflect the
|
||||||
# node's current role, e.g. "primary" or "standby1",
|
# node's current role, e.g. 'primary' or 'standby1',
|
||||||
# as roles can change and it will be confusing if
|
# as roles can change and it will be confusing if
|
||||||
# the current primary is called "standby1".
|
# the current primary is called 'standby1'.
|
||||||
|
# The string's maximum length is 63 characters and it should
|
||||||
|
# contain only printable ASCII characters.
|
||||||
|
|
||||||
#conninfo='' # Database connection information as a conninfo string.
|
#conninfo='' # Database connection information as a conninfo string.
|
||||||
# All servers in the cluster must be able to connect to
|
# All servers in the cluster must be able to connect to
|
||||||
@@ -63,6 +72,7 @@
|
|||||||
# to the user defined in "conninfo".
|
# to the user defined in "conninfo".
|
||||||
|
|
||||||
#replication_type=physical # Must be one of 'physical' or 'bdr'.
|
#replication_type=physical # Must be one of 'physical' or 'bdr'.
|
||||||
|
# NOTE: "bdr" can only be used with BDR 2.x
|
||||||
|
|
||||||
#location=default # arbitrary string defining the location of the node; this
|
#location=default # arbitrary string defining the location of the node; this
|
||||||
# is used during failover to check visibilty of the
|
# is used during failover to check visibilty of the
|
||||||
@@ -281,10 +291,13 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
# manual attention to reattach it to replication
|
# manual attention to reattach it to replication
|
||||||
# (does not apply to BDR mode)
|
# (does not apply to BDR mode)
|
||||||
|
|
||||||
#priority=100 # indicate a preferred priority for promoting nodes;
|
#priority=100 # indicates a preferred priority for promoting nodes;
|
||||||
# a value of zero prevents the node being promoted to primary
|
# a value of zero prevents the node being promoted to primary
|
||||||
# (default: 100)
|
# (default: 100)
|
||||||
|
|
||||||
|
#connection_check_type=ping # How to check availability of the upstream node; valid options:
|
||||||
|
# 'ping': use PQping() to check if the node is accepting connections
|
||||||
|
# 'connection': execute a throwaway query on the current connection
|
||||||
#reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable
|
#reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable
|
||||||
# primary (or other upstream node)
|
# primary (or other upstream node)
|
||||||
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
||||||
@@ -308,7 +321,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
#monitoring_history=no # Whether to write monitoring data to the "montoring_history" table
|
#monitoring_history=no # Whether to write monitoring data to the "montoring_history" table
|
||||||
#monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data
|
#monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data
|
||||||
#degraded_monitoring_timeout=-1 # Interval (in seconds) after which repmgrd will terminate if the
|
#degraded_monitoring_timeout=-1 # Interval (in seconds) after which repmgrd will terminate if the
|
||||||
# server being monitored is no longer available. -1 (default)
|
# server(s) being monitored are no longer available. -1 (default)
|
||||||
# disables the timeout completely.
|
# disables the timeout completely.
|
||||||
#async_query_timeout=60 # Interval (in seconds) which repmgrd will wait before
|
#async_query_timeout=60 # Interval (in seconds) which repmgrd will wait before
|
||||||
# cancelling an asynchronous query.
|
# cancelling an asynchronous query.
|
||||||
@@ -319,6 +332,18 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
# "--no-pid-file" will force PID file creation to be skipped.
|
# "--no-pid-file" will force PID file creation to be skipped.
|
||||||
# Note: there is normally no need to set this, particularly if
|
# Note: there is normally no need to set this, particularly if
|
||||||
# repmgr was installed from packages.
|
# repmgr was installed from packages.
|
||||||
|
#standby_disconnect_on_failover=false # If "true", in a failover situation wait for all standbys to
|
||||||
|
# disconnect their WAL receivers before electing a new primary
|
||||||
|
# (PostgreSQL 9.5 and later only; repmgr user must be a superuser for this)
|
||||||
|
#sibling_nodes_disconnect_timeout=30 # If "standby_disconnect_on_failover" is true, the maximum length of time
|
||||||
|
# (in seconds) to wait for other standbys to confirm they have disconnected their
|
||||||
|
# WAL receivers
|
||||||
|
#failover_validation_command= # Script to execute for an external mechanism to validate the failover
|
||||||
|
# decision made by repmgrd. One or both of the following parameter placeholders
|
||||||
|
# should be provided, which will be replaced by repmgrd with the appropriate
|
||||||
|
# value: %n (node_id), %a (node_name). *Must* be the same on all nodes.
|
||||||
|
#election_rerun_interval=15 # if "failover_validation_command" is set, and the command returns
|
||||||
|
# an error, pause the specified amount of seconds before rerunning the election.
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# service control commands
|
# service control commands
|
||||||
|
|||||||
8
repmgr.h
8
repmgr.h
@@ -41,6 +41,7 @@
|
|||||||
#include "configfile.h"
|
#include "configfile.h"
|
||||||
#include "dbutils.h"
|
#include "dbutils.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
#include "sysutils.h"
|
||||||
|
|
||||||
#define MIN_SUPPORTED_VERSION "9.3"
|
#define MIN_SUPPORTED_VERSION "9.3"
|
||||||
#define MIN_SUPPORTED_VERSION_NUM 90300
|
#define MIN_SUPPORTED_VERSION_NUM 90300
|
||||||
@@ -54,13 +55,16 @@
|
|||||||
#define UNKNOWN_TIMELINE_ID -1
|
#define UNKNOWN_TIMELINE_ID -1
|
||||||
#define UNKNOWN_SYSTEM_IDENTIFIER 0
|
#define UNKNOWN_SYSTEM_IDENTIFIER 0
|
||||||
#define UNKNOWN_PID -1
|
#define UNKNOWN_PID -1
|
||||||
|
#define UNKNOWN_REPLICATION_LAG -1
|
||||||
|
|
||||||
#define NODE_NOT_FOUND -1
|
#define NODE_NOT_FOUND -1
|
||||||
#define NO_UPSTREAM_NODE -1
|
#define NO_UPSTREAM_NODE -1
|
||||||
#define UNKNOWN_NODE_ID -1
|
#define UNKNOWN_NODE_ID -1
|
||||||
#define MIN_NODE_ID 1
|
#define MIN_NODE_ID 1
|
||||||
|
#define ELECTION_RERUN_NOTIFICATION -2
|
||||||
#define VOTING_TERM_NOT_SET -1
|
#define VOTING_TERM_NOT_SET -1
|
||||||
#define ARCHIVE_STATUS_DIR_ERROR -1
|
#define ARCHIVE_STATUS_DIR_ERROR -1
|
||||||
|
#define NO_DEGRADED_MONITORING_ELAPSED -1
|
||||||
|
|
||||||
#define BDR2_REPLICATION_SET_NAME "repmgr"
|
#define BDR2_REPLICATION_SET_NAME "repmgr"
|
||||||
|
|
||||||
@@ -90,6 +94,10 @@
|
|||||||
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
||||||
|
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
|
||||||
|
#define DEFAULT_ELECTION_RERUN_INTERVAL 15 /* seconds */
|
||||||
|
|
||||||
|
#define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */
|
||||||
|
|
||||||
#ifndef RECOVERY_COMMAND_FILE
|
#ifndef RECOVERY_COMMAND_FILE
|
||||||
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
#define RECOVERY_COMMAND_FILE "recovery.conf"
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
#define REPMGR_VERSION_DATE ""
|
#define REPMGR_VERSION_DATE ""
|
||||||
#define REPMGR_VERSION "4.3dev"
|
#define REPMGR_VERSION "4.3.1"
|
||||||
#define REPMGR_VERSION_NUM 40300
|
#define REPMGR_VERSION_NUM 40301
|
||||||
|
|||||||
@@ -68,7 +68,6 @@ monitor_bdr(void)
|
|||||||
t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
|
t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
|
||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
PQExpBufferData event_details;
|
|
||||||
instr_time log_status_interval_start;
|
instr_time log_status_interval_start;
|
||||||
|
|
||||||
/* sanity check local database */
|
/* sanity check local database */
|
||||||
@@ -97,9 +96,21 @@ monitor_bdr(void)
|
|||||||
if (!is_bdr_db(local_conn, NULL))
|
if (!is_bdr_db(local_conn, NULL))
|
||||||
{
|
{
|
||||||
log_error(_("database is not BDR-enabled"));
|
log_error(_("database is not BDR-enabled"));
|
||||||
|
PQfinish(local_conn);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check this is a supported BDR version (basically BDR 2.x)
|
||||||
|
*/
|
||||||
|
if (get_bdr_version_num() > 2)
|
||||||
|
{
|
||||||
|
log_error(_("\"bdr\" mode is for BDR 2.x only"));
|
||||||
|
log_hint(_("for BDR 3 and later, use \"replication_type=physical\""));
|
||||||
|
log_error(_("database is not BDR-enabled"));
|
||||||
|
exit(ERR_DB_CONN);
|
||||||
|
}
|
||||||
|
|
||||||
if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false)
|
if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false)
|
||||||
{
|
{
|
||||||
log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
|
log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
|
||||||
@@ -229,6 +240,7 @@ monitor_bdr(void)
|
|||||||
if (cell->node_info->node_status == NODE_STATUS_UP)
|
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
|
int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
@@ -366,7 +378,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
{
|
{
|
||||||
PGconn *next_node_conn = NULL;
|
PGconn *next_node_conn = NULL;
|
||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
PQExpBufferData event_details;
|
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
||||||
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
|
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
|
||||||
@@ -460,6 +471,9 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
log_debug("this node is the failover handler");
|
log_debug("this node is the failover handler");
|
||||||
|
|
||||||
|
{
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
event_info.conninfo_str = target_node.conninfo;
|
event_info.conninfo_str = target_node.conninfo;
|
||||||
@@ -499,6 +513,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
log_info("%s", event_details.data);
|
log_info("%s", event_details.data);
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
}
|
||||||
|
|
||||||
unset_bdr_failover_handler(next_node_conn);
|
unset_bdr_failover_handler(next_node_conn);
|
||||||
|
|
||||||
@@ -513,7 +528,6 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
{
|
{
|
||||||
PGconn *recovered_node_conn;
|
PGconn *recovered_node_conn;
|
||||||
|
|
||||||
PQExpBufferData event_details;
|
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
int i;
|
int i;
|
||||||
bool slot_reactivated = false;
|
bool slot_reactivated = false;
|
||||||
@@ -543,6 +557,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
*/
|
*/
|
||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
local_conn = NULL;
|
local_conn = NULL;
|
||||||
log_warning(_("unable to reconnect to local node"));
|
log_warning(_("unable to reconnect to local node"));
|
||||||
|
|
||||||
@@ -613,6 +629,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||||
monitored_node->monitoring_state = MS_NORMAL;
|
monitored_node->monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
|
{
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
@@ -641,8 +659,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
event_info.conninfo_str = monitored_node->conninfo;
|
event_info.conninfo_str = monitored_node->conninfo;
|
||||||
event_info.node_name = monitored_node->node_name;
|
event_info.node_name = monitored_node->node_name;
|
||||||
|
|
||||||
create_event_notification_extended(
|
create_event_notification_extended(local_conn,
|
||||||
local_conn,
|
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"bdr_recovery",
|
"bdr_recovery",
|
||||||
@@ -651,11 +668,11 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
&event_info);
|
&event_info);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
}
|
||||||
|
|
||||||
update_node_record_set_active(local_conn, monitored_node->node_id, true);
|
update_node_record_set_active(local_conn, monitored_node->node_id, true);
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
PQfinish(recovered_node_conn);
|
PQfinish(recovered_node_conn);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
|||||||
1137
repmgrd-physical.c
1137
repmgrd-physical.c
File diff suppressed because it is too large
Load Diff
98
repmgrd.c
98
repmgrd.c
@@ -383,6 +383,15 @@ main(int argc, char **argv)
|
|||||||
* repmgr has not been properly configured.
|
* repmgr has not been properly configured.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/* warn about any settings which might not be relevant for the current PostgreSQL version */
|
||||||
|
if (config_file_options.standby_disconnect_on_failover == true && PQserverVersion(local_conn) < 90500)
|
||||||
|
{
|
||||||
|
log_warning(_("\"standby_disconnect_on_failover\" specified, but not available for this PostgreSQL version"));
|
||||||
|
/* TODO: format server version */
|
||||||
|
log_detail(_("available from PostgreSQL 9.5, this PostgreSQL version is %i"), PQserverVersion(local_conn));
|
||||||
|
}
|
||||||
|
|
||||||
/* Check "repmgr" the extension is installed */
|
/* Check "repmgr" the extension is installed */
|
||||||
extension_status = get_repmgr_extension_status(local_conn, &extversions);
|
extension_status = get_repmgr_extension_status(local_conn, &extversions);
|
||||||
|
|
||||||
@@ -400,8 +409,8 @@ main(int argc, char **argv)
|
|||||||
log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
|
log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
|
||||||
REPMGR_VERSION,
|
REPMGR_VERSION,
|
||||||
extversions.installed_version);
|
extversions.installed_version);
|
||||||
|
log_hint(_("update the repmgr binaries to match the installed extension version"));
|
||||||
|
|
||||||
log_hint(_("verify the repmgr installation on this server is updated properly before continuing"));
|
|
||||||
close_connection(&local_conn);
|
close_connection(&local_conn);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
@@ -412,8 +421,8 @@ main(int argc, char **argv)
|
|||||||
log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
|
log_detail(_("\"repmgr\" version %s is installed but extension is version %s"),
|
||||||
REPMGR_VERSION,
|
REPMGR_VERSION,
|
||||||
extversions.installed_version);
|
extversions.installed_version);
|
||||||
|
log_hint(_("update the installed extension version by executing \"ALTER EXTENSION repmgr UPDATE\""));
|
||||||
|
|
||||||
log_hint(_("verify the repmgr extension is updated properly before continuing"));
|
|
||||||
close_connection(&local_conn);
|
close_connection(&local_conn);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
@@ -424,7 +433,7 @@ main(int argc, char **argv)
|
|||||||
if (extension_status == REPMGR_UNKNOWN)
|
if (extension_status == REPMGR_UNKNOWN)
|
||||||
{
|
{
|
||||||
log_error(_("unable to determine status of \"repmgr\" extension"));
|
log_error(_("unable to determine status of \"repmgr\" extension"));
|
||||||
log_detail("%s", PQerrorMessage(local_conn));
|
log_detail("\n%s", PQerrorMessage(local_conn));
|
||||||
close_connection(&local_conn);
|
close_connection(&local_conn);
|
||||||
exit(ERR_DB_QUERY);
|
exit(ERR_DB_QUERY);
|
||||||
}
|
}
|
||||||
@@ -552,6 +561,8 @@ start_monitoring(void)
|
|||||||
local_node_info.node_name,
|
local_node_info.node_name,
|
||||||
local_node_info.node_id);
|
local_node_info.node_id);
|
||||||
|
|
||||||
|
log_info(_("\"connection_check_type\" set to \"%s\""), print_connection_check_type(config_file_options.connection_check_type));
|
||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
switch (local_node_info.type)
|
switch (local_node_info.type)
|
||||||
@@ -818,6 +829,82 @@ show_help(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
check_upstream_connection(PGconn **conn, const char *conninfo)
|
||||||
|
{
|
||||||
|
/* Check the connection status twice in case it changes after reset */
|
||||||
|
bool twice = false;
|
||||||
|
|
||||||
|
if (config_file_options.connection_check_type == CHECK_PING)
|
||||||
|
return is_server_available(conninfo);
|
||||||
|
|
||||||
|
if (config_file_options.connection_check_type == CHECK_CONNECTION)
|
||||||
|
{
|
||||||
|
bool success = true;
|
||||||
|
PGconn *test_conn = PQconnectdb(conninfo);
|
||||||
|
|
||||||
|
log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo);
|
||||||
|
|
||||||
|
if (PQstatus(test_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to connect to \"%s\""), conninfo);
|
||||||
|
log_detail("\n%s", PQerrorMessage(test_conn));
|
||||||
|
success = false;
|
||||||
|
}
|
||||||
|
PQfinish(test_conn);
|
||||||
|
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
if (PQstatus(*conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_debug("check_upstream_connection(): connection not OK");
|
||||||
|
if (twice)
|
||||||
|
return false;
|
||||||
|
/* reconnect */
|
||||||
|
PQfinish(*conn);
|
||||||
|
*conn = PQconnectdb(conninfo);
|
||||||
|
twice = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!cancel_query(*conn, config_file_options.async_query_timeout))
|
||||||
|
goto failed;
|
||||||
|
|
||||||
|
if (wait_connection_availability(*conn, config_file_options.async_query_timeout) != 1)
|
||||||
|
goto failed;
|
||||||
|
|
||||||
|
/* execute a simple query to verify connection availability */
|
||||||
|
if (PQsendQuery(*conn, "SELECT 1") == 0)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to send query to upstream"));
|
||||||
|
log_detail("%s", PQerrorMessage(*conn));
|
||||||
|
goto failed;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wait_connection_availability(*conn, config_file_options.async_query_timeout) != 1)
|
||||||
|
goto failed;
|
||||||
|
|
||||||
|
break;
|
||||||
|
|
||||||
|
failed:
|
||||||
|
/* retry once */
|
||||||
|
if (twice)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* reconnect */
|
||||||
|
PQfinish(*conn);
|
||||||
|
*conn = PQconnectdb(conninfo);
|
||||||
|
twice = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
try_reconnect(PGconn **conn, t_node_info *node_info)
|
try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||||
{
|
{
|
||||||
@@ -843,8 +930,7 @@ try_reconnect(PGconn **conn, t_node_info *node_info)
|
|||||||
node_info->node_id, i + 1, max_attempts);
|
node_info->node_id, i + 1, max_attempts);
|
||||||
if (is_server_available_params(&conninfo_params) == true)
|
if (is_server_available_params(&conninfo_params) == true)
|
||||||
{
|
{
|
||||||
|
log_notice(_("node %i has recovered, reconnecting"), node_info->node_id);
|
||||||
log_notice(_("node has recovered, reconnecting"));
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* XXX we should also handle the case where node is pingable but
|
* XXX we should also handle the case where node is pingable but
|
||||||
@@ -874,7 +960,7 @@ try_reconnect(PGconn **conn, t_node_info *node_info)
|
|||||||
|
|
||||||
if (ping_result != PGRES_TUPLES_OK)
|
if (ping_result != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_info("original connnection no longer available, using new connection");
|
log_info("original connection no longer available, using new connection");
|
||||||
close_connection(conn);
|
close_connection(conn);
|
||||||
*conn = our_conn;
|
*conn = our_conn;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ extern PGconn *local_conn;
|
|||||||
extern bool startup_event_logged;
|
extern bool startup_event_logged;
|
||||||
extern char pid_file[MAXPGPATH];
|
extern char pid_file[MAXPGPATH];
|
||||||
|
|
||||||
|
bool check_upstream_connection(PGconn **conn, const char *conninfo);
|
||||||
void try_reconnect(PGconn **conn, t_node_info *node_info);
|
void try_reconnect(PGconn **conn, t_node_info *node_info);
|
||||||
|
|
||||||
int calculate_elapsed(instr_time start_time);
|
int calculate_elapsed(instr_time start_time);
|
||||||
@@ -31,5 +32,4 @@ const char *print_monitoring_state(MonitoringState monitoring_state);
|
|||||||
void update_registration(PGconn *conn);
|
void update_registration(PGconn *conn);
|
||||||
void terminate(int retval);
|
void terminate(int retval);
|
||||||
|
|
||||||
|
|
||||||
#endif /* _REPMGRD_H_ */
|
#endif /* _REPMGRD_H_ */
|
||||||
|
|||||||
366
sysutils.c
Normal file
366
sysutils.c
Normal file
@@ -0,0 +1,366 @@
|
|||||||
|
/*
|
||||||
|
* sysutils.c
|
||||||
|
*
|
||||||
|
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <signal.h>
|
||||||
|
|
||||||
|
#include "repmgr.h"
|
||||||
|
|
||||||
|
static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple, int *return_value);
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Execute a command locally. "outputbuf" should either be an
|
||||||
|
* initialised PQExpPuffer, or NULL
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
local_command(const char *command, PQExpBufferData *outputbuf)
|
||||||
|
{
|
||||||
|
return _local_command(command, outputbuf, false, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
local_command_return_value(const char *command, PQExpBufferData *outputbuf, int *return_value)
|
||||||
|
{
|
||||||
|
return _local_command(command, outputbuf, false, return_value);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool
|
||||||
|
local_command_simple(const char *command, PQExpBufferData *outputbuf)
|
||||||
|
{
|
||||||
|
return _local_command(command, outputbuf, true, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool
|
||||||
|
_local_command(const char *command, PQExpBufferData *outputbuf, bool simple, int *return_value)
|
||||||
|
{
|
||||||
|
FILE *fp = NULL;
|
||||||
|
char output[MAXLEN];
|
||||||
|
int retval = 0;
|
||||||
|
bool success;
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "executing:\n %s", command);
|
||||||
|
|
||||||
|
if (outputbuf == NULL)
|
||||||
|
{
|
||||||
|
retval = system(command);
|
||||||
|
|
||||||
|
if (return_value != NULL)
|
||||||
|
*return_value = WEXITSTATUS(retval);
|
||||||
|
|
||||||
|
return (retval == 0) ? true : false;
|
||||||
|
}
|
||||||
|
|
||||||
|
fp = popen(command, "r");
|
||||||
|
|
||||||
|
if (fp == NULL)
|
||||||
|
{
|
||||||
|
log_error(_("unable to execute local command:\n%s"), command);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
while (fgets(output, MAXLEN, fp) != NULL)
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(outputbuf, output);
|
||||||
|
|
||||||
|
if (!feof(fp) && simple == false)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
retval = pclose(fp);
|
||||||
|
|
||||||
|
/* */
|
||||||
|
success = (WEXITSTATUS(retval) == 0 || WEXITSTATUS(retval) == 141) ? true : false;
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "result of command was %i (%i)", WEXITSTATUS(retval), retval);
|
||||||
|
|
||||||
|
if (return_value != NULL)
|
||||||
|
*return_value = WEXITSTATUS(retval);
|
||||||
|
|
||||||
|
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
||||||
|
log_verbose(LOG_DEBUG, "local_command(): output returned was:\n%s", outputbuf->data);
|
||||||
|
else
|
||||||
|
log_verbose(LOG_DEBUG, "local_command(): no output returned");
|
||||||
|
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Execute a command via ssh on the remote host.
|
||||||
|
*
|
||||||
|
* TODO: implement SSH calls using libssh2.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
remote_command(const char *host, const char *user, const char *command, const char *ssh_options, PQExpBufferData *outputbuf)
|
||||||
|
{
|
||||||
|
FILE *fp;
|
||||||
|
char ssh_command[MAXLEN] = "";
|
||||||
|
PQExpBufferData ssh_host;
|
||||||
|
|
||||||
|
char output[MAXLEN] = "";
|
||||||
|
|
||||||
|
initPQExpBuffer(&ssh_host);
|
||||||
|
|
||||||
|
if (*user != '\0')
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&ssh_host, "%s@", user);
|
||||||
|
}
|
||||||
|
|
||||||
|
appendPQExpBufferStr(&ssh_host, host);
|
||||||
|
|
||||||
|
maxlen_snprintf(ssh_command,
|
||||||
|
"ssh -o Batchmode=yes %s %s %s",
|
||||||
|
ssh_options,
|
||||||
|
ssh_host.data,
|
||||||
|
command);
|
||||||
|
|
||||||
|
termPQExpBuffer(&ssh_host);
|
||||||
|
|
||||||
|
log_debug("remote_command():\n %s", ssh_command);
|
||||||
|
|
||||||
|
fp = popen(ssh_command, "r");
|
||||||
|
|
||||||
|
if (fp == NULL)
|
||||||
|
{
|
||||||
|
log_error(_("unable to execute remote command:\n %s"), ssh_command);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outputbuf != NULL)
|
||||||
|
{
|
||||||
|
/* TODO: better error handling */
|
||||||
|
while (fgets(output, MAXLEN, fp) != NULL)
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(outputbuf, output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
while (fgets(output, MAXLEN, fp) != NULL)
|
||||||
|
{
|
||||||
|
if (!feof(fp))
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pclose(fp);
|
||||||
|
|
||||||
|
if (outputbuf != NULL)
|
||||||
|
{
|
||||||
|
if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
|
||||||
|
log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
|
||||||
|
else
|
||||||
|
log_verbose(LOG_DEBUG, "remote_command(): no output returned");
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
pid_t
|
||||||
|
disable_wal_receiver(PGconn *conn)
|
||||||
|
{
|
||||||
|
char buf[MAXLEN];
|
||||||
|
int wal_retrieve_retry_interval, new_wal_retrieve_retry_interval;
|
||||||
|
pid_t wal_receiver_pid = UNKNOWN_PID;
|
||||||
|
int kill_ret;
|
||||||
|
int i, j;
|
||||||
|
int max_retries = 2;
|
||||||
|
|
||||||
|
if (is_superuser_connection(conn, NULL) == false)
|
||||||
|
{
|
||||||
|
log_error(_("superuser connection required"));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (get_recovery_type(conn) == RECTYPE_PRIMARY)
|
||||||
|
{
|
||||||
|
log_error(_("node is not in recovery"));
|
||||||
|
log_detail(_("wal receiver can only run on standby nodes"));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);
|
||||||
|
|
||||||
|
if (wal_receiver_pid == UNKNOWN_PID)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to retrieve wal receiver PID"));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
get_pg_setting(conn, "wal_retrieve_retry_interval", buf);
|
||||||
|
|
||||||
|
/* TODO: potentially handle atoi error, though unlikely at this point */
|
||||||
|
wal_retrieve_retry_interval = atoi(buf);
|
||||||
|
|
||||||
|
new_wal_retrieve_retry_interval = wal_retrieve_retry_interval + WALRECEIVER_DISABLE_TIMEOUT_VALUE;
|
||||||
|
|
||||||
|
if (wal_retrieve_retry_interval < WALRECEIVER_DISABLE_TIMEOUT_VALUE)
|
||||||
|
{
|
||||||
|
log_notice(_("setting \"wal_retrieve_retry_interval\" to %i milliseconds"),
|
||||||
|
new_wal_retrieve_retry_interval);
|
||||||
|
alter_system_int(conn, "wal_retrieve_retry_interval", new_wal_retrieve_retry_interval);
|
||||||
|
pg_reload_conf(conn);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If, at this point, the WAL receiver is not running, we don't need to (and indeed can't)
|
||||||
|
* kill it.
|
||||||
|
*/
|
||||||
|
if (wal_receiver_pid == 0)
|
||||||
|
{
|
||||||
|
log_warning(_("wal receiver not running"));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* why 5? */
|
||||||
|
log_info(_("sleeping 5 seconds"));
|
||||||
|
sleep(5);
|
||||||
|
|
||||||
|
/* see comment below as to why we need a loop here */
|
||||||
|
for (i = 0; i < max_retries; i++)
|
||||||
|
{
|
||||||
|
log_notice(_("killing WAL receiver with PID %i"), (int)wal_receiver_pid);
|
||||||
|
|
||||||
|
kill((int)wal_receiver_pid, SIGTERM);
|
||||||
|
|
||||||
|
for (j = 0; j < 30; j++)
|
||||||
|
{
|
||||||
|
kill_ret = kill(wal_receiver_pid, 0);
|
||||||
|
|
||||||
|
if (kill_ret != 0)
|
||||||
|
{
|
||||||
|
log_info(_("WAL receiver with pid %i killed"), (int)wal_receiver_pid);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Wait briefly to check that the WAL receiver has indeed gone away -
|
||||||
|
* for reasons as yet unclear, after a server start/restart, immediately
|
||||||
|
* after the first time a WAL receiver is killed, a new one is started
|
||||||
|
* straight away, so we'll need to kill that too.
|
||||||
|
*/
|
||||||
|
sleep(1);
|
||||||
|
wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);
|
||||||
|
if (wal_receiver_pid == UNKNOWN_PID || wal_receiver_pid == 0)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return wal_receiver_pid;
|
||||||
|
}
|
||||||
|
|
||||||
|
pid_t
|
||||||
|
enable_wal_receiver(PGconn *conn, bool wait_startup)
|
||||||
|
{
|
||||||
|
char buf[MAXLEN];
|
||||||
|
int wal_retrieve_retry_interval;
|
||||||
|
pid_t wal_receiver_pid = UNKNOWN_PID;
|
||||||
|
|
||||||
|
/* make timeout configurable */
|
||||||
|
int i, timeout = 30;
|
||||||
|
|
||||||
|
if (is_superuser_connection(conn, NULL) == false)
|
||||||
|
{
|
||||||
|
log_error(_("superuser connection required"));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (get_recovery_type(conn) == RECTYPE_PRIMARY)
|
||||||
|
{
|
||||||
|
log_error(_("node is not in recovery"));
|
||||||
|
log_detail(_("wal receiver can only run on standby nodes"));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (get_pg_setting(conn, "wal_retrieve_retry_interval", buf) == false)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve \"wal_retrieve_retry_interval\""));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* TODO: potentially handle atoi error, though unlikely at this point */
|
||||||
|
wal_retrieve_retry_interval = atoi(buf);
|
||||||
|
|
||||||
|
if (wal_retrieve_retry_interval > WALRECEIVER_DISABLE_TIMEOUT_VALUE)
|
||||||
|
{
|
||||||
|
int new_wal_retrieve_retry_interval = wal_retrieve_retry_interval - WALRECEIVER_DISABLE_TIMEOUT_VALUE;
|
||||||
|
bool success;
|
||||||
|
|
||||||
|
log_notice(_("setting \"wal_retrieve_retry_interval\" to %i ms"),
|
||||||
|
new_wal_retrieve_retry_interval);
|
||||||
|
|
||||||
|
success = alter_system_int(conn,
|
||||||
|
"wal_retrieve_retry_interval",
|
||||||
|
new_wal_retrieve_retry_interval);
|
||||||
|
|
||||||
|
if (success == false)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to change \"wal_retrieve_retry_interval\""));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
pg_reload_conf(conn);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* TODO: add threshold sanity check */
|
||||||
|
log_info(_("\"wal_retrieve_retry_interval\" is %i, not changing"),
|
||||||
|
wal_retrieve_retry_interval);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wait_startup == false)
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
|
||||||
|
for (i = 0; i < timeout; i++)
|
||||||
|
{
|
||||||
|
wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);
|
||||||
|
|
||||||
|
if (wal_receiver_pid > 0)
|
||||||
|
break;
|
||||||
|
|
||||||
|
log_info(_("sleeping %i of maximum %i seconds waiting for WAL receiver to start up"),
|
||||||
|
i + 1, timeout)
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wal_receiver_pid == UNKNOWN_PID)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to retrieve WAL receiver PID"));
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
else if (wal_receiver_pid == 0)
|
||||||
|
{
|
||||||
|
log_error(_("WAL receiver did not start up after %i seconds"), timeout);
|
||||||
|
return UNKNOWN_PID;
|
||||||
|
}
|
||||||
|
|
||||||
|
log_info(_("WAL receiver started up with PID %i"), (int)wal_receiver_pid);
|
||||||
|
|
||||||
|
return wal_receiver_pid;
|
||||||
|
}
|
||||||
32
sysutils.h
Normal file
32
sysutils.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
/*
|
||||||
|
* sysutils.h
|
||||||
|
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||||
|
*
|
||||||
|
* This program is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef _SYSUTILS_H_
|
||||||
|
#define _SYSUTILS_H_
|
||||||
|
|
||||||
|
extern bool local_command(const char *command, PQExpBufferData *outputbuf);
|
||||||
|
extern bool local_command_return_value(const char *command, PQExpBufferData *outputbuf, int *return_value);
|
||||||
|
extern bool local_command_simple(const char *command, PQExpBufferData *outputbuf);
|
||||||
|
|
||||||
|
extern bool remote_command(const char *host, const char *user, const char *command, const char *ssh_options, PQExpBufferData *outputbuf);
|
||||||
|
|
||||||
|
extern pid_t disable_wal_receiver(PGconn *conn);
|
||||||
|
extern pid_t enable_wal_receiver(PGconn *conn, bool wait_startup);
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* _SYSUTILS_H_ */
|
||||||
Reference in New Issue
Block a user