mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 15:16:29 +00:00
Compare commits
73 Commits
v4.4.0beta
...
REL4_1_STA
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
222f7e6080 | ||
|
|
446695e328 | ||
|
|
ec3da13e22 | ||
|
|
1488c014ff | ||
|
|
f471316504 | ||
|
|
726299f7ef | ||
|
|
7fda2a1bcf | ||
|
|
d26141b8ab | ||
|
|
4a6b5fe913 | ||
|
|
a71e644255 | ||
|
|
8646fd6004 | ||
|
|
3e1bb1a523 | ||
|
|
f5e58fc062 | ||
|
|
6b95a96f3a | ||
|
|
bd146ae9ac | ||
|
|
c7f8e48d12 | ||
|
|
322190516c | ||
|
|
31a49ff781 | ||
|
|
a6f99b58dd | ||
|
|
09b041433e | ||
|
|
058c8168e1 | ||
|
|
0468e47ef3 | ||
|
|
216326f316 | ||
|
|
3fb20ce774 | ||
|
|
e468ca859e | ||
|
|
623c84c022 | ||
|
|
c2dded1d7b | ||
|
|
457dbbd267 | ||
|
|
5485c06bc1 | ||
|
|
00ae42eb07 | ||
|
|
33525491ae | ||
|
|
8c84f7a214 | ||
|
|
efe4bed88e | ||
|
|
9ba8dcbac3 | ||
|
|
a8996a5bfa | ||
|
|
4cbba98193 | ||
|
|
23e6b85de3 | ||
|
|
d5ecb09f22 | ||
|
|
719dd93676 | ||
|
|
5747f1d446 | ||
|
|
9313b43cb1 | ||
|
|
5aeb1b0589 | ||
|
|
6c93388848 | ||
|
|
d4ad8ce20c | ||
|
|
bacab8d31c | ||
|
|
14856e3a4d | ||
|
|
ca9242badb | ||
|
|
ff0929e882 | ||
|
|
8cd1811edb | ||
|
|
bf15c0d40f | ||
|
|
9ae9d31165 | ||
|
|
d5064bdc02 | ||
|
|
9d0524a008 | ||
|
|
5398fd2d22 | ||
|
|
4c44c01380 | ||
|
|
5113ab0274 | ||
|
|
25f68bb283 | ||
|
|
730f67258c | ||
|
|
ca0e4de1ee | ||
|
|
2fb0f056fe | ||
|
|
3a789d53e0 | ||
|
|
fb67b2cd4f | ||
|
|
9f07804b6a | ||
|
|
d5b2fa2309 | ||
|
|
d696c4019e | ||
|
|
e6ffbcc67a | ||
|
|
e1410831e0 | ||
|
|
cb4f6f6e3f | ||
|
|
75e5d79654 | ||
|
|
55fbe12971 | ||
|
|
db4199e08f | ||
|
|
0d9ed02729 | ||
|
|
8e9f0b802b |
20
HISTORY
20
HISTORY
@@ -1,4 +1,22 @@
|
||||
4.1.0 2018-??-??
|
||||
4.1.1 2018-09-05
|
||||
logging: explicitly log the text of failed queries as ERRORs to
|
||||
assist logfile analysis; GitHub #498
|
||||
repmgr: truncate version string, if necessary; GitHub #490 (Ian)
|
||||
repmgr: improve messages emitted during "standby promote" (Ian)
|
||||
repmgr: "standby clone" - don't copy external config files in --dry-run
|
||||
mode; GitHub #491 (Ian)
|
||||
repmgr: add "cluster_cleanup" event; GitHub #492 (Ian)
|
||||
repmgr: (standby switchover) improve detection of free walsenders;
|
||||
GitHub #495 (Ian)
|
||||
repmgr: (node rejoin) improve replication slot handling; GitHub #499 (Ian)
|
||||
repmgrd: ensure that sending SIGHUP always results in the log file
|
||||
being reopened; GitHub #485 (Ian)
|
||||
repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian)
|
||||
repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian)
|
||||
repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian)
|
||||
repmgrd: improve reconnection handling (Ian)
|
||||
|
||||
4.1.0 2018-07-31
|
||||
repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian)
|
||||
repmgr: add "--missing-slots" check to "repmgr node check" (Ian)
|
||||
repmgr: improve command line error handling; GitHub #464 (Ian)
|
||||
|
||||
26
configfile.c
26
configfile.c
@@ -28,6 +28,7 @@ char config_file_path[MAXPGPATH] = "";
|
||||
static bool config_file_provided = false;
|
||||
bool config_file_found = false;
|
||||
|
||||
static void parse_config(t_configuration_options *options, bool terse);
|
||||
static void _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *warning_list);
|
||||
|
||||
static void _parse_line(char *buf, char *name, char *value);
|
||||
@@ -238,7 +239,7 @@ end_search:
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
static void
|
||||
parse_config(t_configuration_options *options, bool terse)
|
||||
{
|
||||
/* Collate configuration file errors here for friendlier reporting */
|
||||
@@ -785,7 +786,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
PQconninfoFree(conninfo_options);
|
||||
}
|
||||
|
||||
|
||||
/* set values for parameters which default to other parameters */
|
||||
|
||||
/*
|
||||
@@ -1052,11 +1052,13 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
* - repmgrd_standby_startup_timeout
|
||||
* - retry_promote_interval_secs
|
||||
*
|
||||
* non-changeable options
|
||||
* non-changeable options (repmgrd references these from the "repmgr.nodes"
|
||||
* table, not the configuration file)
|
||||
*
|
||||
* - node_id
|
||||
* - node_name
|
||||
* - data_directory
|
||||
* - location
|
||||
* - priority
|
||||
* - replication_type
|
||||
*
|
||||
@@ -1065,7 +1067,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
|
||||
*/
|
||||
bool
|
||||
reload_config(t_configuration_options *orig_options)
|
||||
reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
{
|
||||
PGconn *conn;
|
||||
t_configuration_options new_options = T_CONFIGURATION_OPTIONS_INITIALIZER;
|
||||
@@ -1081,6 +1083,20 @@ reload_config(t_configuration_options *orig_options)
|
||||
|
||||
_parse_config(&new_options, &config_errors, &config_warnings);
|
||||
|
||||
|
||||
if (server_type == PRIMARY || server_type == STANDBY)
|
||||
{
|
||||
if (new_options.promote_command[0] == '\0')
|
||||
{
|
||||
item_list_append(&config_errors, _("\"promote_command\": required parameter was not found"));
|
||||
}
|
||||
|
||||
if (new_options.follow_command[0] == '\0')
|
||||
{
|
||||
item_list_append(&config_errors, _("\"follow_command\": required parameter was not found"));
|
||||
}
|
||||
}
|
||||
|
||||
if (config_errors.head != NULL)
|
||||
{
|
||||
ItemListCell *cell = NULL;
|
||||
@@ -1258,7 +1274,7 @@ reload_config(t_configuration_options *orig_options)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* promote_delay */
|
||||
/* promote_delay (for testing use only; not documented */
|
||||
if (orig_options->promote_delay != new_options.promote_delay)
|
||||
{
|
||||
orig_options->promote_delay = new_options.promote_delay;
|
||||
|
||||
@@ -273,13 +273,13 @@ typedef struct
|
||||
"", "", "", "" \
|
||||
}
|
||||
|
||||
#include "dbutils.h"
|
||||
|
||||
void set_progname(const char *argv0);
|
||||
const char *progname(void);
|
||||
|
||||
void load_config(const char *config_file, bool verbose, bool terse, t_configuration_options *options, char *argv0);
|
||||
void parse_config(t_configuration_options *options, bool terse);
|
||||
bool reload_config(t_configuration_options *orig_options);
|
||||
bool reload_config(t_configuration_options *orig_options, t_server_type server_type);
|
||||
|
||||
bool parse_recovery_conf(const char *data_dir, t_recovery_conf *conf);
|
||||
|
||||
|
||||
18
configure
vendored
18
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.1.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.1.2.
|
||||
#
|
||||
# Report bugs to <pgsql-bugs@postgresql.org>.
|
||||
#
|
||||
@@ -582,8 +582,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='repmgr'
|
||||
PACKAGE_TARNAME='repmgr'
|
||||
PACKAGE_VERSION='4.1'
|
||||
PACKAGE_STRING='repmgr 4.1'
|
||||
PACKAGE_VERSION='4.1.2'
|
||||
PACKAGE_STRING='repmgr 4.1.2'
|
||||
PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org'
|
||||
PACKAGE_URL='https://2ndquadrant.com/en/resources/repmgr/'
|
||||
|
||||
@@ -1178,7 +1178,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures repmgr 4.1 to adapt to many kinds of systems.
|
||||
\`configure' configures repmgr 4.1.2 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1239,7 +1239,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of repmgr 4.1:";;
|
||||
short | recursive ) echo "Configuration of repmgr 4.1.2:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1313,7 +1313,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
repmgr configure 4.1
|
||||
repmgr configure 4.1.2
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -1332,7 +1332,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by repmgr $as_me 4.1, which was
|
||||
It was created by repmgr $as_me 4.1.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2359,7 +2359,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by repmgr $as_me 4.1, which was
|
||||
This file was extended by repmgr $as_me 4.1.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -2422,7 +2422,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
repmgr config.status 4.1
|
||||
repmgr config.status 4.1.2
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
AC_INIT([repmgr], [4.1], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
|
||||
AC_INIT([repmgr], [4.1.2], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
|
||||
|
||||
AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.])
|
||||
|
||||
|
||||
@@ -227,7 +227,15 @@ get_controlfile(const char *DataDir)
|
||||
|
||||
control_file_info->control_file_processed = true;
|
||||
|
||||
if (version_num >= 90500)
|
||||
if (version_num >= 110000)
|
||||
{
|
||||
ControlFileData11 *ptr = (struct ControlFileData11 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
}
|
||||
else if (version_num >= 90500)
|
||||
{
|
||||
ControlFileData95 *ptr = (struct ControlFileData95 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
|
||||
@@ -265,6 +265,71 @@ typedef struct ControlFileData95
|
||||
|
||||
} ControlFileData95;
|
||||
|
||||
/*
|
||||
* Following field removed in 11:
|
||||
*
|
||||
* XLogRecPtr prevCheckPoint;
|
||||
*
|
||||
* In 10, following field appended *after* "data_checksum_version":
|
||||
*
|
||||
* char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN];
|
||||
*
|
||||
* (but we don't care about that)
|
||||
*/
|
||||
|
||||
typedef struct ControlFileData11
|
||||
{
|
||||
uint64 system_identifier;
|
||||
|
||||
uint32 pg_control_version; /* PG_CONTROL_VERSION */
|
||||
uint32 catalog_version_no; /* see catversion.h */
|
||||
|
||||
DBState state; /* see enum above */
|
||||
pg_time_t time; /* time stamp of last pg_control update */
|
||||
XLogRecPtr checkPoint; /* last check point record ptr */
|
||||
|
||||
CheckPoint95 checkPointCopy; /* copy of last check point record */
|
||||
|
||||
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
|
||||
|
||||
XLogRecPtr minRecoveryPoint;
|
||||
TimeLineID minRecoveryPointTLI;
|
||||
XLogRecPtr backupStartPoint;
|
||||
XLogRecPtr backupEndPoint;
|
||||
bool backupEndRequired;
|
||||
|
||||
int wal_level;
|
||||
bool wal_log_hints;
|
||||
int MaxConnections;
|
||||
int max_worker_processes;
|
||||
int max_prepared_xacts;
|
||||
int max_locks_per_xact;
|
||||
bool track_commit_timestamp;
|
||||
|
||||
uint32 maxAlign; /* alignment requirement for tuples */
|
||||
double floatFormat; /* constant 1234567.0 */
|
||||
|
||||
uint32 blcksz; /* data block size for this DB */
|
||||
uint32 relseg_size; /* blocks per segment of large relation */
|
||||
|
||||
uint32 xlog_blcksz; /* block size within WAL files */
|
||||
uint32 xlog_seg_size; /* size of each WAL segment */
|
||||
|
||||
uint32 nameDataLen; /* catalog name field width */
|
||||
uint32 indexMaxKeys; /* max number of columns in an index */
|
||||
|
||||
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
|
||||
uint32 loblksize; /* chunk size in pg_largeobject */
|
||||
|
||||
bool enableIntTimes; /* int64 storage enabled? */
|
||||
|
||||
bool float4ByVal; /* float4 pass-by-value? */
|
||||
bool float8ByVal; /* float8, int8, etc pass-by-value? */
|
||||
|
||||
uint32 data_checksum_version;
|
||||
|
||||
} ControlFileData11;
|
||||
|
||||
|
||||
|
||||
extern DBState get_db_state(const char *data_directory);
|
||||
|
||||
@@ -475,7 +475,7 @@ int wait_connection_availability(PGconn *conn, long long timeout);
|
||||
/* node availability functions */
|
||||
bool is_server_available(const char *conninfo);
|
||||
bool is_server_available_params(t_conninfo_param_list *param_list);
|
||||
void connection_ping(PGconn *conn);
|
||||
ExecStatusType connection_ping(PGconn *conn);
|
||||
|
||||
/* monitoring functions */
|
||||
void
|
||||
|
||||
@@ -108,6 +108,14 @@
|
||||
is not possible, contact your vendor for assistance.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-old-packages">
|
||||
<title>How can I obtain old versions of &repmgr; packages?</title>
|
||||
<para>
|
||||
See appendix <xref linkend="packages-old-versions"> for details.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="faq-repmgr" xreflabel="repmgr">
|
||||
@@ -239,11 +247,22 @@
|
||||
Under some circumstances event notifications can be generated for servers
|
||||
which have not yet been registered; it's also useful to retain a record
|
||||
of events which includes servers removed from the replication cluster
|
||||
which no longer have an entry in the <literal>repmrg.nodes</literal> table.
|
||||
which no longer have an entry in the <literal>repmgr.nodes</literal> table.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2 id="faq-repmgr-recovery-conf-quoted-values" xreflabel="Quoted values in recovery.conf">
|
||||
<title>Why are some values in <filename>recovery.conf</filename> surrounded by pairs of single quotes?</title>
|
||||
<para>
|
||||
This is to ensure that user-supplied values which are written as parameter values in <filename>recovery.conf</filename>
|
||||
are escaped correctly and do not cause errors when <filename>recovery.conf</filename> is parsed.
|
||||
</para>
|
||||
<para>
|
||||
The escaping is performed by an internal PostgreSQL routine, which leaves strings consisting
|
||||
of digits and alphabetical characters only as-is, but wraps everything else in pairs of single quotes,
|
||||
even if the string does not contain any characters which need escaping.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
@@ -255,7 +274,7 @@
|
||||
<sect2 id="faq-repmgrd-prevent-promotion" xreflabel="Prevent standby from being promoted to primary">
|
||||
<title>How can I prevent a node from ever being promoted to primary?</title>
|
||||
<para>
|
||||
In `repmgr.conf`, set its priority to a value of 0 or less; apply the changed setting with
|
||||
In <filename>repmgr.conf</filename>, set its priority to a value of <literal>0</literal>; apply the changed setting with
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>.
|
||||
</para>
|
||||
<para>
|
||||
@@ -303,5 +322,36 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgrd-pg-bindir" xreflabel="repmgrd does not apply pg_bindir to promote_command or follow_command">
|
||||
<title>
|
||||
<application>repmgrd</application> ignores pg_bindir when executing <varname>promote_command</varname> or <varname>follow_command</varname>
|
||||
</title>
|
||||
<para>
|
||||
<varname>promote_command</varname> or <varname>follow_command</varname> can be user-defined scripts,
|
||||
so &repmgr; will not apply <option>pg_bindir</option> even if excuting &repmgr;. Always provide the full
|
||||
path; see <xref linkend="repmgrd-automatic-failover-configuration"> for more details.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgrd-startup-no-upstream" xreflabel="repmgrd does not start if upstream node is not running">
|
||||
<title>
|
||||
<application>repmgrd</application> aborts startup with the error "<literal>upstream node must be running before repmgrd can start</literal>"
|
||||
</title>
|
||||
<para>
|
||||
<application>repmgrd</application> does this to avoid starting up on a replication cluster
|
||||
which is not in a healthy state. If the upstream is unavailable, <application>repmgrd</application>
|
||||
may initiate a failover immediately after starting up, which could have unintended side-effects,
|
||||
particularly if <application>repmgrd</application> is not running on other nodes.
|
||||
</para>
|
||||
<para>
|
||||
In particular, it's possible that the node's local copy of the <literal>repmgr.nodes</literal> copy
|
||||
is out-of-date, which may lead to incorrect failover behaviour.
|
||||
</para>
|
||||
<para>
|
||||
The onus is therefore on the adminstrator to manually set the cluster to a stable, healthy state before
|
||||
starting <application>repmgrd</application>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
</appendix>
|
||||
|
||||
@@ -53,11 +53,11 @@
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink></entry>
|
||||
<entry><ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
@@ -253,6 +253,23 @@
|
||||
</para>
|
||||
|
||||
|
||||
<table id="apt-2ndquadrant-repository">
|
||||
<title>2ndQuadrant public repository</title>
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-DEBIAN">https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-DEBIAN</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
|
||||
<table id="apt-repository">
|
||||
<title>PostgreSQL Community APT repository (PGDG)</title>
|
||||
<tgroup cols="2">
|
||||
@@ -365,6 +382,127 @@
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="packages-snapshot" xreflabel="Snapshot packages">
|
||||
<title>Snapshot packages</title>
|
||||
<indexterm>
|
||||
<primary>snapshot packages</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>snaphots</secondary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
For testing new features and bug fixes, from time to time 2ndQuadrant provides
|
||||
so-called "snapshot packages" via its public repository. These packages
|
||||
are built from the &repmgr; source at a particular point in time, and are not formal
|
||||
releases.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
We do not recommend installing these packages in a production environment
|
||||
unless specifically advised.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
To install a snapshot package, it's necessary to install the 2ndQuadrant public snapshot repository,
|
||||
following the instructions here: <ulink url="https://dl.2ndquadrant.com/default/release/site/">https://dl.2ndquadrant.com/default/release/site/</ulink> but replace <literal>release</literal> with <literal>snapshot</literal>
|
||||
in the appropriate URL.
|
||||
</para>
|
||||
<para>
|
||||
For example, to install the snapshot RPM repository for PostgreSQL 9.6, execute (as <literal>root</literal>):
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/snapshot/get/9.6/rpm | bash</programlisting>
|
||||
|
||||
or as a normal user with root sudo access:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/snapshot/get/9.6/rpm | sudo bash</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Alternatively you can browse the repository here:
|
||||
<ulink url="https://dl.2ndquadrant.com/default/snapshot/browse/">https://dl.2ndquadrant.com/default/snapshot/browse/</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
Once the repository is installed, installing or updating &repmgr; will result in the latest snapshot
|
||||
package being installed.
|
||||
</para>
|
||||
<para>
|
||||
The package name will be formatted like this:
|
||||
<programlisting>
|
||||
repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
|
||||
containg the snapshot build number (here: <literal>320</literal>) and the hash
|
||||
of the <application>git</application> commit it was built from (here: <literal>g5113ab0</literal>).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that the next formal release (in the above example <literal>4.1.1</literal>), once available,
|
||||
will install in place of any snapshot builds.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="packages-old-versions" xreflabel="Installing old package versions">
|
||||
<title>Installing old package versions</title>
|
||||
<indexterm>
|
||||
<primary>old packages</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>old versions</secondary>
|
||||
</indexterm>
|
||||
|
||||
<sect2 id="packages-old-versions-debian" xreflabel="old Debian package versions">
|
||||
<title>Debian/Ubuntu</title>
|
||||
<para>
|
||||
An archive of old packages (<literal>3.3.2</literal> and later) for Debian/Ubuntu-based systems is available here:
|
||||
<ulink url="http://atalia.postgresql.org/morgue/r/repmgr/">http://atalia.postgresql.org/morgue/r/repmgr/</ulink>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="packages-old-versions-rhel-centos" xreflabel="old RHEL/CentOS package versions">
|
||||
<title>RHEL/CentOS</title>
|
||||
<para>
|
||||
Old RPM packages (<literal>3.2</literal> and later) can be retrieved from the
|
||||
(deprecated) 2ndQuadrant repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/">http://packages.2ndquadrant.com/</ulink>
|
||||
by installing the appropriate repository RPM:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
Old versions can be located with e.g.:
|
||||
<programlisting>
|
||||
yum --showduplicates list repmgr96</programlisting>
|
||||
(substitute the appropriate package name; see <xref linkend="packages-centos">) and installed with:
|
||||
<programlisting>
|
||||
yum install {package_name}-{version}</programlisting>
|
||||
where <literal>{package_name}</literal> is the base package name (e.g. <literal>repmgr96</literal>)
|
||||
and <literal>{version}</literal> is the version listed by the
|
||||
<command> yum --showduplicates list ...</command> command, e.g. <literal>4.0.6-1.rhel6</literal>.
|
||||
</para>
|
||||
<para>For example:
|
||||
<programlisting>
|
||||
yum install repmgr96-4.0.6-1.rhel6</programlisting>
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="packages-packager-info" xreflabel="Information for packagers">
|
||||
<title>Information for packagers</title>
|
||||
@@ -373,7 +511,7 @@
|
||||
<secondary>information for packagers</secondary>
|
||||
</indexterm>
|
||||
<para>
|
||||
We recommend patching the following parameters when
|
||||
We recommend patching the following parameters when
|
||||
building the package as built-in default values for user convenience.
|
||||
These values can nevertheless be overridden by the user, if desired.
|
||||
</para>
|
||||
|
||||
@@ -15,9 +15,164 @@
|
||||
See also: <xref linkend="upgrading-repmgr">
|
||||
</para>
|
||||
|
||||
<sect1 id="release-4.1.1">
|
||||
<title>Release 4.1.1</title>
|
||||
<para><emphasis>Wed September 5, 2018</emphasis></para>
|
||||
<para>
|
||||
repmgr 4.1.1 contains a number of usability enhancements and bug fixes.
|
||||
</para>
|
||||
<para>
|
||||
We recommend upgrading to this version as soon as possible.
|
||||
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.1.0;
|
||||
<application>repmgrd</application> (if running) should be restarted.
|
||||
See <xref linkend="upgrading-repmgr"> for more details.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>repmgr enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-switchover">repmgr standby switchover --dry-run</link></command>
|
||||
no longer copies external configuration files to test they can be copied; this avoids making
|
||||
any changes to the target system. (GitHub #491).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-cleanup">repmgr cluster cleanup</link></command>:
|
||||
add <literal>cluster_cleanup</literal> event. (GitHub #492).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>:
|
||||
improve detection of free walsenders. (GitHub #495).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve messages emitted during
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2>
|
||||
<title>repmgrd enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Always reopen the log file after
|
||||
receiving <literal>SIGHUP</literal>. Previously this only happened if
|
||||
a configuration file change was detected.
|
||||
(GitHub #485).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Report version number <emphasis>after</emphasis>
|
||||
logger initialisation. (GitHub #487).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve cascaded standby failover handling. (GitHub #480).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve reconnection handling after brief network outages; if
|
||||
monitoring data being collected, this could lead to orphaned
|
||||
sessions on the primary. (GitHub #480).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Check <varname>promote_command</varname> and <varname>follow_command</varname>
|
||||
are defined when reloading configuration. These were checked on startup but
|
||||
not reload by <application>repmgrd</application>, which made it possible to
|
||||
make <application>repmgrd</application> with invalid values. It's unlikely
|
||||
anyone would want to do this, but we should make it impossible anyway.
|
||||
(GitHub #486).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Other</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Text of any failed queries will now be logged as <literal>ERROR</literal> to assist
|
||||
logfile analysis at log levels higher than <literal>DEBUG</literal>.
|
||||
(GitHub #498).
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>:
|
||||
remove new upstream's replication slot if it still exists on the rejoined
|
||||
standby. (GitHub #499).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: fix startup on witness node when local data is stale. (GitHub #488, #489).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Truncate version string reported by PostgreSQL if necessary; some
|
||||
distributions insert additional detail after the actual version.
|
||||
(GitHub #490).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
|
||||
<sect1 id="release-4.1.0">
|
||||
<title>Release 4.1.0</title>
|
||||
<para><emphasis>???? ??, 2018</emphasis></para>
|
||||
<para><emphasis>Tue July 31, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.1.0 introduces some changes to <application>repmgrd</application>
|
||||
behaviour and some additional configuration parameters.
|
||||
@@ -29,19 +184,20 @@
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> (if running) must be restarted.
|
||||
Execute <command>ALTER EXTENSION repmgr UPDATE</command>
|
||||
on the primary server in the database where &repmgr; is installed.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Execute <command>ALTER EXTENSION repmgr UPGRADE</command>
|
||||
on the primary server in the database where &repmgr; is installed.
|
||||
<application>repmgrd</application> must be restarted on all nodes where it is running.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
A restart of the PostgreSQL server is <emphasis>not</emphasis> required
|
||||
for this release.
|
||||
for this release (unless upgrading from repmgr 3.x).
|
||||
</para>
|
||||
<para>
|
||||
See <xref linkend="upgrading-repmgr-extension"> for more details.
|
||||
@@ -53,6 +209,17 @@
|
||||
review the changes listed below.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Repository changes</emphasis>
|
||||
</para>
|
||||
<para>
|
||||
Coinciding with this release, the 2ndQuadrant repository structure has changed.
|
||||
See section <xref linkend="installation-packages"> for details, particularly
|
||||
if you are using a RPM-based system.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2>
|
||||
<title>Configuration file changes</title>
|
||||
|
||||
@@ -214,7 +381,7 @@
|
||||
|
||||
<sect1 id="release-4.0.6">
|
||||
<title>Release 4.0.6</title>
|
||||
<para><emphasis>June 14, 2018</emphasis></para>
|
||||
<para><emphasis>Thu June 14, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.0.6 contains a number of bug fixes and usability enhancements.
|
||||
</para>
|
||||
|
||||
@@ -5,14 +5,14 @@
|
||||
<title>repmgr source code signing key</title>
|
||||
<para>
|
||||
The signing key ID used for <application>repmgr</application> source code bundles is:
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/SOURCE-GPG-KEY-repmgr">
|
||||
<ulink url="https://repmgr.org/download/SOURCE-GPG-KEY-repmgr">
|
||||
<literal>0x297F1DCC</literal></ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To download the <application>repmgr</application> source key to your computer:
|
||||
<programlisting>
|
||||
curl -s http://packages.2ndquadrant.com/repmgr/SOURCE-GPG-KEY-repmgr | gpg --import
|
||||
curl -s https://repmgr.org/download/SOURCE-GPG-KEY-repmgr | gpg --import
|
||||
gpg --fingerprint 0x297F1DCC
|
||||
</programlisting>
|
||||
then verify that the fingerprint is the expected value:
|
||||
|
||||
@@ -17,15 +17,15 @@
|
||||
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
|
||||
</para>
|
||||
<para>
|
||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> to control the PostgreSQL
|
||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> utility to control the PostgreSQL
|
||||
server. However this can lead to various problems, particularly when PostgreSQL has been
|
||||
installed from packages, and expecially so if <application>systemd</application> is in use.
|
||||
installed from packages, and especially so if <application>systemd</application> is in use.
|
||||
</para>
|
||||
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If using <application>systemd</application>, ensure you have <varname>RemoteIPC</varname> set to <literal>off</literal>.
|
||||
If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
|
||||
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
|
||||
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
|
||||
</para>
|
||||
@@ -48,6 +48,13 @@
|
||||
service_reload_command</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing any of these commands;
|
||||
these can be user-defined scripts so must always be specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
It's also possible to specify a <varname>service_promote_command</varname>.
|
||||
@@ -92,7 +99,7 @@
|
||||
Defaults:postgres !requiretty
|
||||
postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-9.6, \
|
||||
/usr/bin/systemctl start postgresql-9.6, \
|
||||
/usr/bin/systemctl restart postgresql-9.6 \
|
||||
/usr/bin/systemctl restart postgresql-9.6, \
|
||||
/usr/bin/systemctl reload postgresql-9.6</programlisting>
|
||||
</para>
|
||||
|
||||
|
||||
@@ -16,15 +16,22 @@
|
||||
<para>
|
||||
A typical use case for a witness server is a two-node streaming replication
|
||||
setup, where the primary and standby are in different locations (data centres).
|
||||
By creating a witness server in the same location as the primary, if the primary
|
||||
becomes unavailable it's possible for the standby to decide whether it can
|
||||
promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
By creating a witness server in the same location (data centre) as the primary,
|
||||
if the primary becomes unavailable it's possible for the standby to decide whether
|
||||
it can promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
witness or the primary server, it's likely there's a network-level interruption
|
||||
and it should not promote itself. If it can see the witness but not the primary,
|
||||
this proves there is no network interruption and the primary itself is unavailable,
|
||||
and it can therefore promote itself (and ideally take action to fence the
|
||||
former primary).
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Never</emphasis> install a witness server on the same physical host
|
||||
as another node in the replication cluster managed by &repmgr; - it's essential
|
||||
the witness is not affected in any way by failure of another node.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
For more complex replication scenarios, e.g. with multiple data centres, it may
|
||||
be preferable to use location-based failover, which ensures that only nodes
|
||||
|
||||
@@ -147,58 +147,76 @@
|
||||
<para>
|
||||
By default, all notification types will be passed to the designated script;
|
||||
the notification types can be filtered to explicitly named ones using the
|
||||
<varname>event_notifications</varname> parameter:
|
||||
<varname>event_notifications</varname> parameter.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by the &repmgr; command:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>primary_register</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-register-events">cluster_created</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>primary_unregister</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-register-events">primary_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_register</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-unregister-events">primary_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-standby-clone-events">standby_clone</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_register_sync</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-register-events">standby_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_unregister</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-register-events">standby_register_sync</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_clone</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-unregister-events">standby_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-standby-promote-events">standby_promote</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_promote</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-follow-events">standby_follow</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_follow</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-switchover-events">standby_switchover</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-witness-register-events">witness_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_disconnect_manual</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-witness-unregister-events">witness_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_failure</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-node-rejoin-events">node_rejoin</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_recovery</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>witness_register</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>witness_unregister</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>node_rejoin</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-cluster-cleanup-events">cluster_cleanup</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by <application>repmgrd</application> (streaming replication mode):
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_start</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_shutdown</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_reload</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_promote</literal></simpara>
|
||||
</listitem>
|
||||
@@ -208,15 +226,41 @@
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_aborted</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_standby_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_promote_error</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_local_disconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_local_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_upstream_disconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_upstream_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_promote_error</literal></simpara>
|
||||
<simpara><literal>standby_disconnect_manual</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_failure</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_recovery</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by <application>repmgrd</application> (BDR mode):
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>bdr_failover</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
<para>
|
||||
&repmgr; RPM packages for RedHat/CentOS variants and Fedora are available from the
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>; see following
|
||||
<ulink url="https://dl.2ndquadrant.com/">public repository</ulink>; see following
|
||||
section for details.
|
||||
</para>
|
||||
<para>
|
||||
@@ -46,26 +46,15 @@
|
||||
<sect3 id="installation-packages-redhat-2ndq">
|
||||
<title>2ndQuadrant public RPM yum repository</title>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> previously provided a dedicated
|
||||
&repmgr; repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/">http://packages.2ndquadrant.com/repmgr/</ulink>.
|
||||
This repository will be deprecated in a future release as it is now replaced by
|
||||
the <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>
|
||||
documented below.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.1/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a dedicated <literal>yum</literal>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink> for 2ndQuadrant software,
|
||||
<ulink url="https://dl.2ndquadrant.com/">public repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;. We recommend using this for all future &repmgr; releases.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://rpm.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
<ulink url="https://dl.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
<para>
|
||||
@@ -75,20 +64,19 @@
|
||||
<listitem>
|
||||
<para>
|
||||
Locate the repository RPM for your PostgreSQL version from the list at:
|
||||
<ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink>
|
||||
<ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the repository RPM for your distribution and PostgreSQL version
|
||||
Install the repository definition for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages).
|
||||
</para>
|
||||
<para>
|
||||
For example, for PostgreSQL 10 on CentOS, execute:
|
||||
<programlisting>
|
||||
sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-1.el7.noarch.rpm
|
||||
</programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/release/get/10/rpm | sudo bash</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Verify that the repository is installed with:
|
||||
@@ -96,8 +84,8 @@ sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-
|
||||
sudo yum repolist</programlisting>
|
||||
The output should contain two entries like this:
|
||||
<programlisting>
|
||||
2ndquadrant-repo-10/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 1
|
||||
2ndquadrant-repo-10-debug/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 - Debug 1</programlisting>
|
||||
2ndquadrant-dl-default-release-pg10/7/x86_64 2ndQuadrant packages (PG10) for 7 - x86_64 4
|
||||
2ndquadrant-dl-default-release-pg10-debug/7/x86_64 2ndQuadrant packages (PG10) for 7 - x86_64 - Debug 3</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
@@ -177,52 +165,43 @@ $ yum install repmgr10</programlisting>
|
||||
<para>
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a
|
||||
<ulink url="https://apt.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
|
||||
<ulink url="https://dl.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://apt.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
<ulink url="https://dl.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
<emphasis>Installation</emphasis>
|
||||
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
If not already present, install the <application>apt-transport-https</application> package:
|
||||
<programlisting>
|
||||
sudo apt-get install apt-transport-https</programlisting>
|
||||
Install the repository definition for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages) by executing:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/release/get/deb | sudo bash</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
<note>
|
||||
<para>
|
||||
This will automatically install the following additional packages, if not already present:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>lsb-release</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>apt-transport-https</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Create <filename>/etc/apt/sources.list.d/2ndquadrant.list</filename> as follows:
|
||||
<programlisting>
|
||||
sudo sh -c 'echo "deb https://apt.2ndquadrant.com/ $(lsb_release -cs)-2ndquadrant main" > /etc/apt/sources.list.d/2ndquadrant.list'</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the 2ndQuadrant <ulink url="https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc">repository key</ulink>:
|
||||
<programlisting>
|
||||
sudo apt-get install curl ca-certificates
|
||||
curl https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc | sudo apt-key add -</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Update the package list
|
||||
<programlisting>
|
||||
sudo apt-get update</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
|
||||
@@ -12,8 +12,8 @@
|
||||
To install &repmgr; the prerequisites for compiling
|
||||
&postgres; must be installed. These are described in &postgres;'s
|
||||
documentation
|
||||
on <ulink url="https://www.postgresql.org/docs/current/install-requirements.html">build requirements</ulink>
|
||||
and <ulink url="https://www.postgresql.org/docs/current/docguide-toolsets.html">build requirements for documentation</ulink>.
|
||||
on <ulink url="https://www.postgresql.org/docs/current/static/install-requirements.html">build requirements</ulink>
|
||||
and <ulink url="https://www.postgresql.org/docs/current/static/docguide-toolsets.html">build requirements for documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
@@ -240,11 +240,28 @@
|
||||
<tip>
|
||||
<simpara>
|
||||
For Debian-based distributions we recommend explicitly setting
|
||||
<literal>pg_bindir</literal> to the directory where <command>pg_ctl</command> and other binaries
|
||||
<option>pg_bindir</option> to the directory where <command>pg_ctl</command> and other binaries
|
||||
not in the standard path are located. For PostgreSQL 9.6 this would be <filename>/usr/lib/postgresql/9.6/bin/</filename>.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; only uses <option>pg_bindir</option> when it executes
|
||||
PostgreSQL binaries directly.
|
||||
</para>
|
||||
<para>
|
||||
For user-defined scripts such as <option>promote_command</option> and the
|
||||
various <option>service_*_command</option>s, you <emphasis>must</emphasis>
|
||||
always explicitly provide the full path to the binary or script being
|
||||
executed, even if it is &repmgr; itself.
|
||||
</para>
|
||||
<para>
|
||||
This is because these options can contain user-defined scripts in arbitrary
|
||||
locations, so prepending <option>pg_bindir</option> may break them.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
See the file
|
||||
<ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</>
|
||||
|
||||
@@ -15,9 +15,14 @@
|
||||
<title>Description</title>
|
||||
<para>
|
||||
Purges monitoring history from the <literal>repmgr.monitoring_history</literal> table to
|
||||
prevent excessive table growth. Use the <literal>-k/--keep-history</literal> to specify the
|
||||
number of days of monitoring history to retain. This command can be used
|
||||
manually or as a cronjob.
|
||||
prevent excessive table growth.
|
||||
</para>
|
||||
<para>
|
||||
By default <emphasis>all</emphasis> data will be removed; Use the <option>-k/--keep-history</option>
|
||||
option to specify the number of days of monitoring history to retain.
|
||||
</para>
|
||||
<para>
|
||||
This command can be executed manually or as a cronjob.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
@@ -38,4 +43,21 @@
|
||||
<filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-cluster-cleanup-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>cluster_cleanup</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
For more details see the sections <xref linkend="repmgrd-monitoring"> and
|
||||
<xref linkend="repmgrd-monitoring-configuration">.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
|
||||
@@ -81,36 +81,56 @@
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
<para>
|
||||
<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts:
|
||||
<programlisting>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--csv</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show --csv
|
||||
1,-1,-1
|
||||
2,0,0
|
||||
3,0,1</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
availability (0 = available, -1 = unavailable)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
recovery state (0 = not in recovery, 1 = in recovery, -1 = unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--verbose</option></term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
recovery state (0 = not in recovery, 1 = in recovery, -1 = unknown)
|
||||
</simpara>
|
||||
<para>
|
||||
Display the full text of any database connection error messages
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
|
||||
@@ -28,6 +28,10 @@
|
||||
If the node is running and needs to be attached to the current primary, use
|
||||
<xref linkend="repmgr-standby-follow">.
|
||||
</para>
|
||||
<para>
|
||||
Note <xref linkend="repmgr-standby-follow"> can only be used for standbys which have not diverged
|
||||
from the rest of the cluster.
|
||||
</para>
|
||||
</tip>
|
||||
</refsect1>
|
||||
|
||||
@@ -63,10 +67,10 @@
|
||||
<term><option>--force-rewind[=/path/to/pg_rewind]</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Execute <application>pg_rewind</application> if necessary.
|
||||
Execute <application>pg_rewind</application>.
|
||||
</para>
|
||||
<para>
|
||||
It is only necessary to provide the <application>pg_rewind</application>
|
||||
It is only necessary to provide the <application>pg_rewind</application> path
|
||||
if using PostgreSQL 9.3 or 9.4, and <application>pg_rewind</application>
|
||||
is not installed in the PostgreSQL <filename>bin</filename> directory.
|
||||
</para>
|
||||
@@ -115,7 +119,7 @@
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
<refsect1>
|
||||
<title>Configuration file settings</title>
|
||||
|
||||
<para>
|
||||
@@ -132,8 +136,9 @@
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-node-rejoin-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>node_rejoin</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
@@ -188,7 +193,7 @@
|
||||
</note>
|
||||
|
||||
<para>
|
||||
To have <command>repmgr node rejoin</command> use <command>pg_rewind</command> if required,
|
||||
To have <command>repmgr node rejoin</command> use <command>pg_rewind</command>,
|
||||
pass the command line option <literal>--force-rewind</literal>, which will tell &repmgr;
|
||||
to execute <command>pg_rewind</command> to ensure the node can be rejoined successfully.
|
||||
</para>
|
||||
@@ -221,6 +226,15 @@
|
||||
INFO: pg_rewind would now be executed
|
||||
DETAIL: pg_rewind command is:
|
||||
pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node1 dbname=repmgr user=repmgr'</programlisting>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <option>--force-rewind</option> is used with the <option>--dry-run</option> option,
|
||||
this checks the prerequisites for using <application>pg_rewind</application>, but cannot
|
||||
predict the outcome of actually executing <application>pg_rewind</application>.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<programlisting>
|
||||
$ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \
|
||||
--force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose
|
||||
|
||||
@@ -75,10 +75,18 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-primary-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>primary_register</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
Following <link linkend="event-notifications">event notifications</link> will be generated:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>cluster_created</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>primary_register</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-primary-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>primary_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -49,7 +49,7 @@
|
||||
not be copied by default. &repmgr; can copy these files, either to the same
|
||||
location on the standby server (provided appropriate directory and file permissions
|
||||
are available), or into the standby's data directory. This requires passwordless
|
||||
SSH access to the primary server. Add the option <literal>--copy-external-config-files</literal>
|
||||
SSH access to the primary server. Add the option <option>--copy-external-config-files</option>
|
||||
to the <command>repmgr standby clone</command> command; by default files will be copied to
|
||||
the same path as on the upstream server. Note that the user executing <command>repmgr</command>
|
||||
must have write access to those directories.
|
||||
@@ -59,12 +59,29 @@
|
||||
<literal>--copy-external-config-files=pgdata</literal>, but note that
|
||||
any include directives in the copied files may need to be updated.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
When executing <command>repmgr standby clone</command> with the
|
||||
<option>--copy-external-config-files</option> and <option>--dry-run</option>
|
||||
options, &repmgr; will check the SSH connection to the source node, but
|
||||
will not verify whether the files can actually be copied.
|
||||
</para>
|
||||
<para>
|
||||
During the actual clone operation, a check will be made before the database itself
|
||||
is cloned to determine whether the files can actually be copied; if any problems are
|
||||
encountered, the clone operation will be aborted, enabling the user to fix
|
||||
any issues before retrying the clone operation.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<tip>
|
||||
<simpara>
|
||||
For reliable configuration file management we recommend using a
|
||||
configuration management tool such as Ansible, Chef, Puppet or Salt.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-standby-clone-recovery-conf">
|
||||
@@ -333,7 +350,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-clone-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_clone</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -94,7 +94,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-follow-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-promote-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_promote</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -159,7 +159,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_register</literal> <link linkend="event-notifications">event notification</link>
|
||||
|
||||
@@ -196,7 +196,7 @@
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-switchover-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
<literal>standby_switchover</literal> and <literal>standby_promote</literal>
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-witness-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>witness_register</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -92,7 +92,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-witness-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>witness_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -25,7 +25,13 @@
|
||||
<para>
|
||||
This is the official documentation of &repmgr; &repmgrversion; for
|
||||
use with PostgreSQL 9.3 - PostgreSQL 10.
|
||||
It describes the functionality supported by the current version of &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; is being continually developed and we strongly recommend using the
|
||||
latest version. Please check the
|
||||
<ulink url="https://repmgr.org/">repmgr website</ulink> for details
|
||||
about the current &repmgr; version as well as the
|
||||
<ulink url="https://repmgr.org/docs/current/index.html">current documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
Due to the nature of BDR, it's only safe to use this solution for
|
||||
Due to the nature of BDR 1.x/2.x, it's only safe to use this solution for
|
||||
a two-node scenario. Introducing additional nodes will create an inherent
|
||||
risk of node desynchronisation if a node goes down without being cleanly
|
||||
removed from the cluster.
|
||||
|
||||
@@ -34,24 +34,6 @@
|
||||
the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To apply configuration file changes to a running <application>repmgrd</application>
|
||||
daemon, execute the operating system's r<application>repmgrd</application> service reload command
|
||||
(see <xref linkend="appendix-packages"> for examples),
|
||||
or for instances which were manually started, execute <command>kill -HUP</command>, e.g.
|
||||
<command>kill -HUP `cat /tmp/repmgrd.pid`</command>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
Check the <application>repmgrd</application> log to see what changes were
|
||||
applied, or if any issues were encountered when reloading the configuration.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
Note that only a subset of configuration file parameters can be changed on a
|
||||
running <application>repmgrd</application> daemon.
|
||||
</para>
|
||||
|
||||
|
||||
<sect2 id="repmgrd-automatic-failover-configuration">
|
||||
<title>automatic failover configuration</title>
|
||||
@@ -64,8 +46,17 @@
|
||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Adjust file paths as appropriate; we recomment specifying the full path to the &repmgr; binary.
|
||||
Adjust file paths as appropriate; alway specify the full path to the &repmgr; binary.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||
specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
@@ -135,7 +126,7 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-monitoring-configuration">
|
||||
<sect2 id="repmgrd-monitoring-configuration" xreflabel="repmgrd monitoring configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring configuration</secondary>
|
||||
@@ -158,6 +149,203 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-reloading-configuration"xreflabel="reloading repmgrd configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>applying configuration changes</secondary>
|
||||
</indexterm>
|
||||
<title>Applying configuration changes to repmgrd</title>
|
||||
<para>
|
||||
To apply configuration file changes to a running <application>repmgrd</application>
|
||||
daemon, execute the operating system's <application>repmgrd</application> service reload command
|
||||
(see <xref linkend="appendix-packages"> for examples),
|
||||
or for instances which were manually started, execute <command>kill -HUP</command>, e.g.
|
||||
<command>kill -HUP `cat /tmp/repmgrd.pid`</command>.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
Check the <application>repmgrd</application> log to see what changes were
|
||||
applied, or if any issues were encountered when reloading the configuration.
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
Note that only the following subset of configuration file parameters can be changed on a
|
||||
running <application>repmgrd</application> daemon:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>async_query_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>bdr_local_monitoring_only</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>bdr_recovery_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>conninfo</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>degraded_monitoring_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>event_notification_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>event_notifications</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>failover</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>follow_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_facility</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_file</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_level</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_status_interval</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>monitor_interval_secs</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>monitoring_history</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>primary_notification_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>promote_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>reconnect_attempts</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>reconnect_interval</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>repmgrd_standby_startup_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
The following set of configuration file parameters must be updated via
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>,
|
||||
as they require changes to the <literal>repmgr.nodes</literal> table so they are visible to
|
||||
all nodes in the replication cluster:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>node_id</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>node_name</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>data_directory</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>location</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>priority</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
After executing <command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>,
|
||||
<application>repmgrd</application> <emphasis>must</emphasis> be restarted for the changes to take effect.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-daemon">
|
||||
@@ -323,25 +511,34 @@ REPMGRD_ENABLED=no
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>log rotation</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd log rotation</title>
|
||||
<para>
|
||||
To ensure the current <application>repmgrd</application> logfile
|
||||
(specified in <filename>repmgr.conf</filename> with the parameter
|
||||
<option>log_file</option> does not grow indefinitely, configure your
|
||||
<option>log_file</option>) does not grow indefinitely, configure your
|
||||
system's <command>logrotate</command> to regularly rotate it.
|
||||
</para>
|
||||
<para>
|
||||
Sample configuration to rotate logfiles weekly with retention for
|
||||
up to 52 weeks and rotation forced if a file grows beyond 100Mb:
|
||||
<programlisting>
|
||||
/var/log/postgresql/repmgr-9.6.log {
|
||||
/var/log/repmgr/repmgrd.log {
|
||||
missingok
|
||||
compress
|
||||
rotate 52
|
||||
maxsize 100M
|
||||
weekly
|
||||
create 0600 postgres postgres
|
||||
postrotate
|
||||
/usr/bin/killall -HUP repmgrd
|
||||
endscript
|
||||
}</programlisting>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<chapter id="repmgrd-degraded-monitoring">
|
||||
<chapter id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>degraded monitoring</secondary>
|
||||
@@ -7,8 +7,8 @@
|
||||
<title>"degraded monitoring" mode</title>
|
||||
<para>
|
||||
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
||||
of monitoring the nodes' upstream server. In these cases it enters "degraded
|
||||
monitoring" mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
of monitoring the node's upstream server. In these cases it enters "degraded monitoring"
|
||||
mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
to be resolved.
|
||||
</para>
|
||||
<para>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<chapter id="repmgrd-monitoring">
|
||||
<chapter id="repmgrd-monitoring" xreflabel="Monitoring with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring</secondary>
|
||||
|
||||
@@ -40,8 +40,8 @@
|
||||
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
||||
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
||||
will assume a network interruption and not promote any node in any
|
||||
other location (it will however enter <xref linkend="repmgrd-degraded-monitoring"> mode until
|
||||
a primary becomes visible).
|
||||
other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
|
||||
mode until a primary becomes visible).
|
||||
</para>
|
||||
|
||||
</chapter>
|
||||
|
||||
@@ -60,6 +60,13 @@
|
||||
&repmgr; being able to shut down the current primary server quickly and cleanly.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Ensure that the promotion candidate has sufficient free walsenders available
|
||||
(PostgreSQL configuration item <varname>max_wal_senders</varname>), and if replication
|
||||
slots are in use, at least one free slot is available for the demotion candidate (
|
||||
PostgreSQL configuration item <varname>max_replication_slots</varname>).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Ensure that a passwordless SSH connection is possible from the promotion candidate
|
||||
(standby) to the demotion candidate (current primary). If <literal>--siblings-follow</literal>
|
||||
|
||||
@@ -1 +1 @@
|
||||
<!ENTITY repmgrversion "4.1dev">
|
||||
<!ENTITY repmgrversion "4.1.1">
|
||||
|
||||
@@ -84,6 +84,7 @@ do_cluster_show(void)
|
||||
ItemList warnings = {NULL, NULL};
|
||||
bool success = false;
|
||||
bool error_found = false;
|
||||
bool connection_error_found = false;
|
||||
|
||||
/* Connect to local database to obtain cluster connection data */
|
||||
log_verbose(LOG_INFO, _("connecting to database"));
|
||||
@@ -141,14 +142,26 @@ do_cluster_show(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
char error[MAXLEN];
|
||||
|
||||
strncpy(error, PQerrorMessage(cell->node_info->conn), MAXLEN);
|
||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||
cell->node_info->recovery_type = RECTYPE_UNKNOWN;
|
||||
item_list_append_format(&warnings,
|
||||
"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
|
||||
cell->node_info->node_name, cell->node_info->node_id, trim(error));
|
||||
|
||||
connection_error_found = true;
|
||||
|
||||
if (runtime_options.verbose)
|
||||
{
|
||||
char error[MAXLEN];
|
||||
|
||||
strncpy(error, PQerrorMessage(cell->node_info->conn), MAXLEN);
|
||||
item_list_append_format(&warnings,
|
||||
"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
|
||||
cell->node_info->node_name, cell->node_info->node_id, trim(error));
|
||||
}
|
||||
else
|
||||
{
|
||||
item_list_append_format(&warnings,
|
||||
"unable to connect to node \"%s\" (ID: %i)",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
|
||||
initPQExpBuffer(&details);
|
||||
@@ -437,6 +450,11 @@ do_cluster_show(void)
|
||||
{
|
||||
printf(_(" - %s\n"), cell->string);
|
||||
}
|
||||
|
||||
if (runtime_options.verbose == false && connection_error_found == true)
|
||||
{
|
||||
log_hint(_("execute with --verbose option to see connection error messages"));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1332,6 +1350,7 @@ do_cluster_cleanup(void)
|
||||
PGconn *conn = NULL;
|
||||
PGconn *primary_conn = NULL;
|
||||
int entries_to_delete = 0;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
@@ -1345,7 +1364,13 @@ do_cluster_cleanup(void)
|
||||
|
||||
entries_to_delete = get_number_of_monitoring_records_to_delete(primary_conn, runtime_options.keep_history);
|
||||
|
||||
if (entries_to_delete == 0)
|
||||
if (entries_to_delete < 0)
|
||||
{
|
||||
log_error(_("unable to query number of monitoring records to clean up"));
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
else if (entries_to_delete == 0)
|
||||
{
|
||||
log_info(_("no monitoring records to delete"));
|
||||
PQfinish(primary_conn);
|
||||
@@ -1355,10 +1380,23 @@ do_cluster_cleanup(void)
|
||||
log_debug("at least %i monitoring records for deletion",
|
||||
entries_to_delete);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
if (delete_monitoring_records(primary_conn, runtime_options.keep_history) == false)
|
||||
{
|
||||
log_error(_("unable to delete monitoring records"));
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to delete monitoring records"));
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
log_detail("%s", PQerrorMessage(primary_conn));
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"cluster_cleanup",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
@@ -1370,7 +1408,22 @@ do_cluster_cleanup(void)
|
||||
log_detail("%s", PQerrorMessage(primary_conn));
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("monitoring records deleted"));
|
||||
|
||||
if (runtime_options.keep_history > 0)
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("; records newer than %i day(s) retained"),
|
||||
runtime_options.keep_history);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"cluster_cleanup",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(primary_conn);
|
||||
|
||||
if (runtime_options.keep_history > 0)
|
||||
|
||||
@@ -2417,6 +2417,54 @@ do_node_rejoin(void)
|
||||
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle replication slots:
|
||||
* - if a slot for the new upstream exists, delete that
|
||||
* - warn about any other inactive replication slots
|
||||
*/
|
||||
if (runtime_options.force_rewind_used == false && config_file_options.use_replication_slots)
|
||||
{
|
||||
PGconn *local_conn = NULL;
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("unable to connect to local node to check replication slot status"));
|
||||
log_hint(_("execute \"repmgr node check\" to check inactive slots and drop manually if necessary"));
|
||||
}
|
||||
else
|
||||
{
|
||||
KeyValueList inactive_replication_slots = {NULL, NULL};
|
||||
KeyValueListCell *cell = NULL;
|
||||
int inactive_count = 0;
|
||||
PQExpBufferData slotinfo;
|
||||
|
||||
drop_replication_slot_if_exists(local_conn,
|
||||
config_file_options.node_id,
|
||||
primary_node_record.slot_name);
|
||||
|
||||
(void) get_inactive_replication_slots(local_conn, &inactive_replication_slots);
|
||||
|
||||
initPQExpBuffer(&slotinfo);
|
||||
for (cell = inactive_replication_slots.head; cell; cell = cell->next)
|
||||
{
|
||||
appendPQExpBuffer(&slotinfo,
|
||||
" - %s (%s)", cell->key, cell->value);
|
||||
inactive_count++;
|
||||
}
|
||||
|
||||
if (inactive_count > 0)
|
||||
{
|
||||
log_warning(_("%i inactive replication slots detected"), inactive_count);
|
||||
log_detail(_("inactive replication slots:\n%s"), slotinfo.data);
|
||||
log_hint(_("these replication slots may need to be removed manually"));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&slotinfo);
|
||||
|
||||
PQfinish(local_conn);
|
||||
}
|
||||
}
|
||||
|
||||
if (success == true)
|
||||
{
|
||||
@@ -2426,7 +2474,8 @@ do_node_rejoin(void)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* if we reach here, no record found in upstream node's pg_stat_replication */
|
||||
* if we reach here, no record found in upstream node's pg_stat_replication
|
||||
*/
|
||||
log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
|
||||
log_hint(_("you will need to manually check the node's replication status"));
|
||||
}
|
||||
|
||||
@@ -64,12 +64,10 @@ do_primary_register(void)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("connection to node lost"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
log_error(_("unable to determine server's recovery type"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
log_verbose(LOG_INFO, _("server is not in recovery"));
|
||||
|
||||
@@ -89,8 +89,6 @@ static int run_file_backup(t_node_info *node_record);
|
||||
|
||||
static void copy_configuration_files(bool delete_after_copy);
|
||||
|
||||
static void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||
|
||||
static void tablespace_data_append(TablespaceDataList *list, const char *name, const char *oid, const char *location);
|
||||
|
||||
static void get_barman_property(char *dst, char *name, char *local_repmgr_directory);
|
||||
@@ -471,6 +469,7 @@ do_standby_clone(void)
|
||||
termPQExpBuffer(&msg);
|
||||
|
||||
r = test_ssh_connection(runtime_options.host, runtime_options.remote_user);
|
||||
|
||||
if (r != 0)
|
||||
{
|
||||
log_error(_("remote host \"%s\" is not reachable via SSH - unable to copy external configuration files"),
|
||||
@@ -498,32 +497,41 @@ do_standby_clone(void)
|
||||
|
||||
termPQExpBuffer(&msg);
|
||||
|
||||
|
||||
/*
|
||||
* Here we'll attempt an initial test copy of the detected external
|
||||
* files, to detect any issues before we run the base backup.
|
||||
*
|
||||
* Note this will exit with an error, unless -F/--force supplied.
|
||||
*
|
||||
* We don't do this during a --dry-run as it may introduce unexpected changes
|
||||
* on the local node; during an actual clone operation, any problems with
|
||||
* copying files will be detected early and the operation aborted before
|
||||
* the actual database cloning commences.
|
||||
*
|
||||
* TODO: put the files in a temporary directory and move to their final
|
||||
* destination once the database has been cloned.
|
||||
*/
|
||||
|
||||
if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_SAMEPATH)
|
||||
if (runtime_options.dry_run == false)
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the same path as on the source server;
|
||||
* don't delete after copying.
|
||||
*/
|
||||
copy_configuration_files(false);
|
||||
if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_SAMEPATH)
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the same path as on the source server;
|
||||
* don't delete after copying.
|
||||
*/
|
||||
copy_configuration_files(false);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the data directory - delete after copying.
|
||||
* They'll be copied again later; see TODO above.
|
||||
*/
|
||||
copy_configuration_files(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the data directory - delete after copying.
|
||||
* They'll be copied again later; see TODO above.
|
||||
*/
|
||||
copy_configuration_files(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1558,8 +1566,8 @@ do_standby_register(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
log_warning(_("this node does not appear to be attached to upstream node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
upstream_node_record.node_name,
|
||||
upstream_node_record.node_id);
|
||||
}
|
||||
PQfinish(upstream_conn);
|
||||
}
|
||||
@@ -2050,6 +2058,8 @@ _do_standby_promote_internal(PGconn *conn)
|
||||
local_node_record.node_name,
|
||||
local_node_record.node_id,
|
||||
script);
|
||||
log_detail(_("waiting up to %i seconds (parameter \"promote_check_timeout\") for promotion to complete"),
|
||||
config_file_options.promote_check_timeout);
|
||||
|
||||
r = system(script);
|
||||
if (r != 0)
|
||||
@@ -2075,6 +2085,8 @@ _do_standby_promote_internal(PGconn *conn)
|
||||
if (recovery_type == RECTYPE_STANDBY)
|
||||
{
|
||||
log_error(_("STANDBY PROMOTE failed, node is still a standby"));
|
||||
log_detail(_("node still in recovery after %i seconds"), config_file_options.promote_check_timeout);
|
||||
log_hint(_("the node may need more time to promote itself, check the PostgreSQL log for details"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_PROMOTION_FAIL);
|
||||
}
|
||||
@@ -2720,6 +2732,10 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
|
||||
* If replication slots are in use, and an inactive one for this node
|
||||
* exists on the former upstream, drop it.
|
||||
*
|
||||
* Note that if this function is called by do_standby_switchover(), the
|
||||
* "repmgr node rejoin" command executed on the demotion candidate may already
|
||||
* have removed the slot, so there may be nothing to do.
|
||||
*
|
||||
* XXX check if former upstream is current primary?
|
||||
*/
|
||||
|
||||
@@ -2827,6 +2843,12 @@ do_standby_switchover(void)
|
||||
int reachable_sibling_nodes_with_slot_count = 0;
|
||||
int unreachable_sibling_node_count = 0;
|
||||
|
||||
/* number of free walsenders required on promotion candidate */
|
||||
int min_required_wal_senders = 1;
|
||||
|
||||
/* this will be calculated as max_wal_senders - COUNT(*) FROM pg_stat_replication */
|
||||
int available_wal_senders = 0;
|
||||
|
||||
/* number of free replication slots required on promotion candidate */
|
||||
int min_required_free_slots = 0;
|
||||
|
||||
@@ -3096,6 +3118,176 @@ do_standby_switchover(void)
|
||||
}
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
/*
|
||||
* populate local node record with current state of various replication-related
|
||||
* values, so we can check for sufficient walsenders and replication slots
|
||||
*/
|
||||
get_node_replication_stats(local_conn, server_version_num, &local_node_record);
|
||||
|
||||
available_wal_senders = local_node_record.max_wal_senders -
|
||||
local_node_record.attached_wal_receivers;
|
||||
|
||||
/*
|
||||
* If --siblings-follow specified, get list and check they're reachable
|
||||
* (if not just issue a warning)
|
||||
*/
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_record.node_id,
|
||||
local_node_record.upstream_node_id,
|
||||
&sibling_nodes);
|
||||
|
||||
if (runtime_options.siblings_follow == false)
|
||||
{
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
||||
sibling_nodes.node_count);
|
||||
log_detail(_("these nodes will remain attached to the current primary"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char host[MAXLEN] = "";
|
||||
NodeInfoListCell *cell;
|
||||
|
||||
log_verbose(LOG_INFO, _("%i active sibling nodes found"),
|
||||
sibling_nodes.node_count);
|
||||
|
||||
if (sibling_nodes.node_count == 0)
|
||||
{
|
||||
log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist"));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* include walsender for promotion candidate in total */
|
||||
|
||||
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
/* get host from node record */
|
||||
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
||||
r = test_ssh_connection(host, runtime_options.remote_user);
|
||||
|
||||
if (r != 0)
|
||||
{
|
||||
cell->node_info->reachable = false;
|
||||
unreachable_sibling_node_count++;
|
||||
}
|
||||
else
|
||||
{
|
||||
cell->node_info->reachable = true;
|
||||
reachable_sibling_node_count++;
|
||||
min_required_wal_senders++;
|
||||
|
||||
if (cell->node_info->slot_name[0] != '\0')
|
||||
{
|
||||
reachable_sibling_nodes_with_slot_count++;
|
||||
min_required_free_slots++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (unreachable_sibling_node_count > 0)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
|
||||
/* display list of unreachable sibling nodes */
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
if (cell->node_info->reachable == true)
|
||||
continue;
|
||||
log_detail(" %s (ID: %i)",
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
}
|
||||
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_hint(_("use -F/--force to proceed in any case"));
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_detail(_("F/--force specified, would proceed anyway"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_detail(_("F/--force specified, proceeding anyway"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char *msg = _("all sibling nodes are reachable via SSH");
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info("%s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_verbose(LOG_INFO, "%s", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* check there are sufficient free walsenders - obviously there's potential
|
||||
* for a later race condition if some walsenders come into use before the
|
||||
* switchover operation gets around to attaching the sibling nodes, but
|
||||
* this should catch any actual existing configuration issue (and if anyone's
|
||||
* performing a switchover in such an unstable environment, they only have
|
||||
* themselves to blame).
|
||||
*/
|
||||
if (available_wal_senders < min_required_wal_senders)
|
||||
{
|
||||
if (runtime_options.force == false || runtime_options.dry_run == true)
|
||||
{
|
||||
log_error(_("insufficient free walsenders on promotion candidate"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case"));
|
||||
|
||||
if (runtime_options.dry_run == false)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("insufficient free walsenders on promotion candidate"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("%i walsenders required, %i available"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* check demotion candidate can make replication connection to promotion candidate */
|
||||
{
|
||||
initPQExpBuffer(&remote_command_str);
|
||||
@@ -3339,171 +3531,6 @@ do_standby_switchover(void)
|
||||
|
||||
PQfinish(remote_conn);
|
||||
|
||||
/*
|
||||
* populate local node record with current state of various replication-related
|
||||
* values, so we can check for sufficient walsenders and replication slots
|
||||
*/
|
||||
get_node_replication_stats(local_conn, server_version_num, &local_node_record);
|
||||
|
||||
/*
|
||||
* If --siblings-follow specified, get list and check they're reachable
|
||||
* (if not just issue a warning)
|
||||
*/
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_record.node_id,
|
||||
local_node_record.upstream_node_id,
|
||||
&sibling_nodes);
|
||||
|
||||
if (runtime_options.siblings_follow == false)
|
||||
{
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
||||
sibling_nodes.node_count);
|
||||
log_detail(_("these nodes will remain attached to the current primary"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char host[MAXLEN] = "";
|
||||
NodeInfoListCell *cell;
|
||||
|
||||
log_verbose(LOG_INFO, _("%i active sibling nodes found"),
|
||||
sibling_nodes.node_count);
|
||||
|
||||
if (sibling_nodes.node_count == 0)
|
||||
{
|
||||
log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist"));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* include walsender for promotion candidate in total */
|
||||
int min_required_wal_senders = 1;
|
||||
int available_wal_senders = local_node_record.max_wal_senders -
|
||||
local_node_record.attached_wal_receivers;
|
||||
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
/* get host from node record */
|
||||
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
||||
r = test_ssh_connection(host, runtime_options.remote_user);
|
||||
|
||||
if (r != 0)
|
||||
{
|
||||
cell->node_info->reachable = false;
|
||||
unreachable_sibling_node_count++;
|
||||
}
|
||||
else
|
||||
{
|
||||
cell->node_info->reachable = true;
|
||||
reachable_sibling_node_count++;
|
||||
min_required_wal_senders++;
|
||||
|
||||
if (cell->node_info->slot_name[0] != '\0')
|
||||
{
|
||||
reachable_sibling_nodes_with_slot_count++;
|
||||
min_required_free_slots++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (unreachable_sibling_node_count > 0)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
|
||||
/* display list of unreachable sibling nodes */
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
if (cell->node_info->reachable == true)
|
||||
continue;
|
||||
log_detail(" %s (ID: %i)",
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
}
|
||||
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_hint(_("use -F/--force to proceed in any case"));
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_detail(_("F/--force specified, would proceed anyway"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_detail(_("F/--force specified, proceeding anyway"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char *msg = _("all sibling nodes are reachable via SSH");
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info("%s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_verbose(LOG_INFO, "%s", msg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* check there are sufficient free walsenders - obviously there's potential
|
||||
* for a later race condition if some walsenders come into use before the
|
||||
* switchover operation gets around to attaching the sibling nodes, but
|
||||
* this should catch any actual existing configuration issue.
|
||||
*/
|
||||
if (available_wal_senders < min_required_wal_senders)
|
||||
{
|
||||
if (runtime_options.force == false || runtime_options.dry_run == true)
|
||||
{
|
||||
log_error(_("insufficient free walsenders to attach all sibling nodes"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case"));
|
||||
|
||||
if (runtime_options.dry_run == false)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("insufficient free walsenders to attach all sibling nodes"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("%i walsenders required, %i available"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* if replication slots are required by demotion candidate and/or siblings,
|
||||
@@ -5111,65 +5138,81 @@ run_basebackup(t_node_info *node_record)
|
||||
{
|
||||
PGconn *upstream_conn = NULL;
|
||||
|
||||
upstream_conn = establish_db_connection(upstream_node_record.conninfo, true);
|
||||
upstream_conn = establish_db_connection(upstream_node_record.conninfo, false);
|
||||
|
||||
record_status = get_slot_record(upstream_conn, node_record->slot_name, &slot_info);
|
||||
|
||||
if (record_status == RECORD_FOUND)
|
||||
/*
|
||||
* It's possible the upstream node is not yet running, in which case we'll
|
||||
* have to rely on the user taking action to create the slot
|
||||
*/
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_verbose(LOG_INFO,
|
||||
_("replication slot \"%s\" aleady exists on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
slot_exists_on_upstream = true;
|
||||
log_warning(_("unable to connect to upstream node to create replication slot"));
|
||||
/*
|
||||
* TODO: if slot creation also handled by "standby register", update warning
|
||||
*/
|
||||
log_hint(_("you may need to create the replication slot manually"));
|
||||
}
|
||||
else
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
record_status = get_slot_record(upstream_conn, node_record->slot_name, &slot_info);
|
||||
|
||||
log_notice(_("creating replication slot \"%s\" on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
|
||||
get_superuser_connection(&upstream_conn, &superuser_conn, &privileged_conn);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
if (create_replication_slot(privileged_conn, node_record->slot_name, source_server_version_num, &event_details) == false)
|
||||
if (record_status == RECORD_FOUND)
|
||||
{
|
||||
log_error("%s", event_details.data);
|
||||
log_verbose(LOG_INFO,
|
||||
_("replication slot \"%s\" aleady exists on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
slot_exists_on_upstream = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"standby_clone",
|
||||
false,
|
||||
event_details.data);
|
||||
log_notice(_("creating replication slot \"%s\" on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
|
||||
PQfinish(source_conn);
|
||||
get_superuser_connection(&upstream_conn, &superuser_conn, &privileged_conn);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
if (create_replication_slot(privileged_conn, node_record->slot_name, source_server_version_num, &event_details) == false)
|
||||
{
|
||||
log_error("%s", event_details.data);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"standby_clone",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(source_conn);
|
||||
|
||||
if (superuser_conn != NULL)
|
||||
PQfinish(superuser_conn);
|
||||
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
if (superuser_conn != NULL)
|
||||
PQfinish(superuser_conn);
|
||||
|
||||
exit(ERR_DB_QUERY);
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
|
||||
if (superuser_conn != NULL)
|
||||
PQfinish(superuser_conn);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(upstream_conn);
|
||||
}
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
}
|
||||
|
||||
/* delete slot on source server */
|
||||
|
||||
get_superuser_connection(&source_conn, &superuser_conn, &privileged_conn);
|
||||
|
||||
if (slot_info.active == false)
|
||||
{
|
||||
if (slot_exists_on_upstream == false)
|
||||
{
|
||||
if (drop_replication_slot(source_conn, node_record->slot_name) == true)
|
||||
if (drop_replication_slot(privileged_conn, node_record->slot_name) == true)
|
||||
{
|
||||
log_notice(_("replication slot \"%s\" deleted on source node"), node_record->slot_name);
|
||||
}
|
||||
@@ -5827,7 +5870,7 @@ get_barman_property(char *dst, char *name, char *local_repmgr_directory)
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
maxlen_snprintf(command,
|
||||
"grep \"^\t%s:\" %s/show-server.txt",
|
||||
"grep \"^[[:space:]]%s:\" %s/show-server.txt",
|
||||
name, local_repmgr_tmp_directory);
|
||||
(void) local_command(command, &command_output);
|
||||
|
||||
@@ -6024,45 +6067,6 @@ check_recovery_type(PGconn *conn)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
||||
{
|
||||
t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER;
|
||||
RecordStatus record_status = get_slot_record(conn, slot_name, &slot_info);
|
||||
|
||||
log_verbose(LOG_DEBUG, "attempting to delete slot \"%s\" on node %i",
|
||||
slot_name, node_id);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
log_info(_("no slot record found for slot \"%s\" on node %i"),
|
||||
slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (slot_info.active == false)
|
||||
{
|
||||
if (drop_replication_slot(conn, slot_name) == true)
|
||||
{
|
||||
log_notice(_("replication slot \"%s\" deleted on node %i"), slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to delete replication slot \"%s\" on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if active replication slot exists, call Houston as we have a
|
||||
* problem
|
||||
*/
|
||||
else
|
||||
{
|
||||
log_warning(_("replication slot \"%s\" is still active on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Creates a recovery.conf file for a standby
|
||||
|
||||
@@ -237,5 +237,6 @@ extern void get_node_config_directory(char *config_dir_buf);
|
||||
extern void get_node_data_directory(char *data_dir_buf);
|
||||
extern void init_node_record(t_node_info *node_record);
|
||||
extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
||||
extern void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||
|
||||
#endif /* _REPMGR_CLIENT_GLOBAL_H_ */
|
||||
|
||||
@@ -2978,3 +2978,46 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
|
||||
|
||||
return can_use;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
||||
{
|
||||
t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER;
|
||||
RecordStatus record_status = get_slot_record(conn, slot_name, &slot_info);
|
||||
|
||||
log_verbose(LOG_DEBUG, "attempting to delete slot \"%s\" on node %i",
|
||||
slot_name, node_id);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
/* this is a good thing */
|
||||
log_verbose(LOG_INFO,
|
||||
_("slot \"%s\" does not exist on node %i, nothing to remove"),
|
||||
slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (slot_info.active == false)
|
||||
{
|
||||
if (drop_replication_slot(conn, slot_name) == true)
|
||||
{
|
||||
log_notice(_("replication slot \"%s\" deleted on node %i"), slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to delete replication slot \"%s\" on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if active replication slot exists, call Houston as we have a
|
||||
* problem
|
||||
*/
|
||||
else
|
||||
{
|
||||
log_warning(_("replication slot \"%s\" is still active on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
4
repmgr.c
4
repmgr.c
@@ -416,9 +416,9 @@ unset_bdr_failover_handler(PG_FUNCTION_ARGS)
|
||||
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
||||
|
||||
shared_state->bdr_failover_handler = UNKNOWN_NODE_ID;
|
||||
|
||||
LWLockRelease(shared_state->lock);
|
||||
}
|
||||
|
||||
LWLockRelease(shared_state->lock);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
@@ -143,6 +143,11 @@
|
||||
# Debian/Ubuntu users: you will probably need to
|
||||
# set this to the directory where `pg_ctl` is located,
|
||||
# e.g. /usr/lib/postgresql/9.6/bin/
|
||||
#
|
||||
# *NOTE* "pg_bindir" is only used when repmgr directly
|
||||
# executes PostgreSQL binaries; any user-defined scripts
|
||||
# *must* be specified with the full path
|
||||
#
|
||||
#use_primary_conninfo_password=false # explicitly set "password" in recovery.conf's
|
||||
# "primary_conninfo" parameter using the value contained
|
||||
# in the environment variable PGPASSWORD
|
||||
@@ -156,7 +161,7 @@
|
||||
# Examples:
|
||||
#
|
||||
# pg_ctl_options='-s'
|
||||
# pg_basebackup_options='--label=repmgr_backup
|
||||
# pg_basebackup_options='--label=repmgr_backup'
|
||||
# rsync_options=--archive --checksum --compress --progress --rsh="ssh -o \"StrictHostKeyChecking no\""
|
||||
# ssh_options=-o "StrictHostKeyChecking no"
|
||||
|
||||
@@ -183,11 +188,11 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# parameter can be provided multiple times.
|
||||
|
||||
#restore_command='' # This will be placed in the recovery.conf file generated
|
||||
# by repmgr.
|
||||
# by repmgr.
|
||||
|
||||
#archive_cleanup_command='' # This will be placed in the recovery.conf file generated
|
||||
# by repmgr. Note we recommend using Barman for managing
|
||||
# WAL archives (see: https://www.pgbarman.org )
|
||||
# by repmgr. Note we recommend using Barman for managing
|
||||
# WAL archives (see: https://www.pgbarman.org )
|
||||
|
||||
#recovery_min_apply_delay= # If provided, "recovery_min_apply_delay" in recovery.conf
|
||||
# will be set to this value (PostgreSQL 9.4 and later).
|
||||
@@ -259,10 +264,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# are defaults.
|
||||
|
||||
#repmgrd_pid_file= # Path of PID file to use for repmgrd; if not set, a PID file will
|
||||
# be generated in a temporary directory specified by the environment
|
||||
# variable $TMPDIR, or if not set, in "/tmp". This value can be overridden
|
||||
# by the command line option "-p/--pid-file"; the command line option
|
||||
# "--no-pid-file" will force PID file creation to be skipped.
|
||||
# be generated in a temporary directory specified by the environment
|
||||
# variable $TMPDIR, or if not set, in "/tmp". This value can be overridden
|
||||
# by the command line option "-p/--pid-file"; the command line option
|
||||
# "--no-pid-file" will force PID file creation to be skipped.
|
||||
#failover=manual # one of 'automatic', 'manual'.
|
||||
# determines what action to take in the event of upstream failure
|
||||
#
|
||||
@@ -276,9 +281,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# a value of zero prevents the node being promoted to primary
|
||||
# (default: 100)
|
||||
|
||||
#reconnect_attempts=6 # Number attempts which will be made to reconnect to an unreachable
|
||||
#reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable
|
||||
# primary (or other upstream node)
|
||||
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
||||
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
||||
# primary (or other upstream node)
|
||||
#promote_command= # command repmgrd executes when promoting a new primary; use something like:
|
||||
#
|
||||
@@ -332,7 +337,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
#
|
||||
# Debian/Ubuntu users: use "sudo pg_ctlcluster" to execute service control commands.
|
||||
#
|
||||
# For more details, see: https://repmgr.org/docs/4.0/configuration-service-commands.html
|
||||
# For more details, see: https://repmgr.org/docs/4.1/configuration-service-commands.html
|
||||
|
||||
#service_start_command = ''
|
||||
#service_stop_command = ''
|
||||
@@ -376,7 +381,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
#bdr_local_monitoring_only=false # Only monitor the local node; no checks will be
|
||||
# performed on the other node
|
||||
# performed on the other node
|
||||
#bdr_recovery_timeout # If a BDR node was offline and has become available
|
||||
# maximum length of time in seconds to wait for the
|
||||
# node to reconnect to the cluster
|
||||
# maximum length of time in seconds to wait for the
|
||||
# node to reconnect to the cluster
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
#define REPMGR_VERSION_DATE ""
|
||||
#define REPMGR_VERSION "4.1dev"
|
||||
#define REPMGR_VERSION "4.1.2"
|
||||
|
||||
@@ -214,7 +214,8 @@ monitor_bdr(void)
|
||||
|
||||
log_warning(_("unable to connect to node %s (ID %i)"),
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
cell->node_info->conn = try_reconnect(cell->node_info);
|
||||
//cell->node_info->conn = try_reconnect(cell->node_info);
|
||||
try_reconnect(&cell->node_info->conn, cell->node_info);
|
||||
|
||||
/* node has recovered - log and continue */
|
||||
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||
@@ -293,7 +294,7 @@ loop:
|
||||
/*
|
||||
* if we can reload, then could need to change local_conn
|
||||
*/
|
||||
if (reload_config(&config_file_options))
|
||||
if (reload_config(&config_file_options, BDR))
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
@@ -303,11 +304,12 @@ loop:
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
|
||||
/* XXX this looks like it will never be called */
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
if (reload_config(&config_file_options, BDR))
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
@@ -60,6 +60,8 @@ static int primary_node_id = UNKNOWN_NODE_ID;
|
||||
static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
|
||||
static NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
|
||||
static instr_time last_monitoring_update;
|
||||
|
||||
|
||||
static ElectionResult do_election(void);
|
||||
static const char *_print_election_result(ElectionResult result);
|
||||
@@ -81,6 +83,8 @@ static bool do_witness_failover(void);
|
||||
|
||||
static void update_monitoring_history(void);
|
||||
|
||||
static void handle_sighup(PGconn **conn, t_server_type server_type);
|
||||
|
||||
static const char * format_failover_state(FailoverState failover_state);
|
||||
|
||||
|
||||
@@ -264,7 +268,12 @@ monitor_streaming_primary(void)
|
||||
* TODO: cache node list here, refresh at `node_list_refresh_interval`
|
||||
* also return reason for inavailability so we can log it
|
||||
*/
|
||||
if (is_server_available(local_node_info.conninfo) == false)
|
||||
|
||||
(void) connection_ping(local_conn);
|
||||
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
|
||||
/* local node is down, we were expecting it to be up */
|
||||
@@ -284,8 +293,6 @@ monitor_streaming_primary(void)
|
||||
|
||||
local_node_info.node_status = NODE_STATUS_UNKNOWN;
|
||||
|
||||
close_connection(&local_conn);
|
||||
|
||||
/*
|
||||
* as we're monitoring the primary, no point in trying to
|
||||
* write the event to the database
|
||||
@@ -301,11 +308,12 @@ monitor_streaming_primary(void)
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
local_conn = try_reconnect(&local_node_info);
|
||||
try_reconnect(&local_conn, &local_node_info);
|
||||
|
||||
if (local_node_info.node_status == NODE_STATUS_UP)
|
||||
{
|
||||
int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
@@ -322,6 +330,17 @@ monitor_streaming_primary(void)
|
||||
event_details.data);
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
/*
|
||||
* If the local node was restarted, we'll need to reinitialise values
|
||||
* stored in shared memory.
|
||||
*/
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
}
|
||||
|
||||
goto loop;
|
||||
}
|
||||
|
||||
@@ -545,26 +564,7 @@ loop:
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
handle_sighup(&local_conn, PRIMARY);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
|
||||
@@ -582,9 +582,11 @@ monitor_streaming_standby(void)
|
||||
instr_time log_status_interval_start;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
log_debug("monitor_streaming_standby()");
|
||||
|
||||
reset_node_voting_status();
|
||||
|
||||
log_debug("monitor_streaming_standby()");
|
||||
INSTR_TIME_SET_ZERO(last_monitoring_update);
|
||||
|
||||
/*
|
||||
* If no upstream node id is specified in the metadata, we'll try and
|
||||
@@ -733,10 +735,9 @@ monitor_streaming_standby(void)
|
||||
_("unable to connect to upstream node \"%s\" (node ID: %i)"),
|
||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||
|
||||
/* */
|
||||
/* XXX possible pre-action event */
|
||||
if (upstream_node_info.type == STANDBY)
|
||||
{
|
||||
/* XXX possible pre-action event */
|
||||
create_event_record(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
@@ -758,8 +759,6 @@ monitor_streaming_standby(void)
|
||||
log_warning("%s", event_details.data);
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
close_connection(&upstream_conn);
|
||||
|
||||
/*
|
||||
* if local node is unreachable, make a last-minute attempt to reconnect
|
||||
* before continuing with the failover process
|
||||
@@ -770,13 +769,18 @@ monitor_streaming_standby(void)
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
}
|
||||
|
||||
upstream_conn = try_reconnect(&upstream_node_info);
|
||||
try_reconnect(&upstream_conn, &upstream_node_info);
|
||||
|
||||
/* Node has recovered - log and continue */
|
||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||
{
|
||||
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
|
||||
|
||||
if (upstream_node_info.type == PRIMARY)
|
||||
{
|
||||
primary_conn = upstream_conn;
|
||||
}
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
@@ -784,7 +788,7 @@ monitor_streaming_standby(void)
|
||||
upstream_node_unreachable_elapsed);
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"repmgrd_upstream_reconnect",
|
||||
@@ -1004,6 +1008,13 @@ monitor_streaming_standby(void)
|
||||
continue;
|
||||
}
|
||||
|
||||
/* skip witness node - we can't possibly "follow" that */
|
||||
|
||||
if (cell->node_info->type == WITNESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
||||
|
||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||
@@ -1026,6 +1037,7 @@ monitor_streaming_standby(void)
|
||||
follow_new_primary(follow_node_id);
|
||||
}
|
||||
}
|
||||
|
||||
clear_node_info_list(&sibling_nodes);
|
||||
}
|
||||
}
|
||||
@@ -1054,8 +1066,7 @@ loop:
|
||||
|
||||
if (config_file_options.failover == FAILOVER_MANUAL)
|
||||
{
|
||||
appendPQExpBuffer(
|
||||
&monitoring_summary,
|
||||
appendPQExpBuffer(&monitoring_summary,
|
||||
_(" (automatic failover disabled)"));
|
||||
}
|
||||
|
||||
@@ -1065,6 +1076,18 @@ loop:
|
||||
{
|
||||
log_detail(_("waiting for upstream or another primary to reappear"));
|
||||
}
|
||||
else if (config_file_options.monitoring_history == true)
|
||||
{
|
||||
if (INSTR_TIME_IS_ZERO(last_monitoring_update))
|
||||
{
|
||||
log_detail(_("no monitoring statistics have been written yet"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_detail(_("last monitoring statistics update was %i seconds ago"),
|
||||
calculate_elapsed(last_monitoring_update));
|
||||
}
|
||||
}
|
||||
|
||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||
}
|
||||
@@ -1076,7 +1099,16 @@ loop:
|
||||
}
|
||||
else
|
||||
{
|
||||
connection_ping(local_conn);
|
||||
if (config_file_options.monitoring_history == true)
|
||||
{
|
||||
log_verbose(LOG_WARNING, _("monitoring_history requested but primary connection not available"));
|
||||
}
|
||||
|
||||
/*
|
||||
* if monitoring not in use, we'll need to ensure the local connection
|
||||
* handle isn't stale
|
||||
*/
|
||||
(void) connection_ping(local_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1129,8 +1161,11 @@ loop:
|
||||
}
|
||||
else
|
||||
{
|
||||
/* we've reconnected to the local node after an outage */
|
||||
if (local_node_info.active == false)
|
||||
{
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
|
||||
@@ -1146,45 +1181,36 @@ loop:
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
|
||||
log_warning("%s", event_details.data)
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_recovery",
|
||||
true,
|
||||
event_details.data);
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_recovery",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the local node was restarted, we'll need to reinitialise values
|
||||
* stored in shared memory.
|
||||
*/
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
handle_sighup(&local_conn, STANDBY);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
|
||||
@@ -1204,36 +1230,18 @@ monitor_streaming_witness(void)
|
||||
PQExpBufferData event_details;
|
||||
RecordStatus record_status;
|
||||
|
||||
int primary_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
reset_node_voting_status();
|
||||
|
||||
log_debug("monitor_streaming_witness()");
|
||||
|
||||
if (get_primary_node_record(local_conn, &upstream_node_info) == false)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to retrieve record for primary node"));
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
log_hint(_("execute \"repmgr witness register --force\" to update the witness node "));
|
||||
close_connection(&local_conn);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"repmgrd_shutdown",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||
/*
|
||||
* At this point we can't trust the local copy of "repmgr.nodes", as
|
||||
* it may not have been updated. We'll scan the cluster for the current
|
||||
['' * primary and refresh the copy from that before proceeding further.
|
||||
*/
|
||||
primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
|
||||
|
||||
/*
|
||||
* Primary node must be running at repmgrd startup.
|
||||
@@ -1258,7 +1266,7 @@ monitor_streaming_witness(void)
|
||||
* refresh upstream node record from primary, so it's as up-to-date
|
||||
* as possible
|
||||
*/
|
||||
record_status = get_node_record(primary_conn, upstream_node_info.node_id, &upstream_node_info);
|
||||
record_status = get_node_record(primary_conn, primary_node_id, &upstream_node_info);
|
||||
|
||||
/*
|
||||
* This is unlikely to happen; if it does emit a warning for diagnostic
|
||||
@@ -1330,8 +1338,7 @@ monitor_streaming_witness(void)
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
close_connection(&primary_conn);
|
||||
primary_conn = try_reconnect(&upstream_node_info);
|
||||
try_reconnect(&primary_conn, &upstream_node_info);
|
||||
|
||||
/* Node has recovered - log and continue */
|
||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||
@@ -1345,7 +1352,7 @@ monitor_streaming_witness(void)
|
||||
upstream_node_unreachable_elapsed);
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"repmgrd_upstream_reconnect",
|
||||
@@ -1468,6 +1475,105 @@ monitor_streaming_witness(void)
|
||||
}
|
||||
loop:
|
||||
|
||||
/*
|
||||
* handle local node failure
|
||||
*
|
||||
* currently we'll just check the connection, and try to reconnect
|
||||
*
|
||||
* TODO: add timeout, after which we run in degraded state
|
||||
*/
|
||||
|
||||
(void) connection_ping(local_conn);
|
||||
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
if (local_node_info.active == true)
|
||||
{
|
||||
bool success = true;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
local_node_info.active = false;
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to connect to local node \"%s\" (ID: %i), marking inactive"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, false) == false)
|
||||
{
|
||||
success = false;
|
||||
log_warning(_("unable to mark node \"%s\" (ID: %i) as inactive"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
}
|
||||
}
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_failure",
|
||||
success,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* we've reconnected to the local node after an outage */
|
||||
if (local_node_info.active == false)
|
||||
{
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
local_node_info.active = true;
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("reconnected to local node \"%s\" (ID: %i), marking active"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_recovery",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the local node was restarted, we'll need to reinitialise values
|
||||
* stored in shared memory.
|
||||
*/
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* refresh repmgr.nodes after "witness_sync_interval" seconds */
|
||||
|
||||
{
|
||||
@@ -1511,28 +1617,10 @@ loop:
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
handle_sighup(&local_conn, WITNESS);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
|
||||
@@ -1778,12 +1866,21 @@ update_monitoring_history(void)
|
||||
long long unsigned int replication_lag_bytes = 0;
|
||||
|
||||
/* both local and primary connections must be available */
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK || PQstatus(local_conn) != CONNECTION_OK)
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("primary connection is not available, unable to update monitoring history"));
|
||||
return;
|
||||
}
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("local connection is not available, unable to update monitoring history"));
|
||||
return;
|
||||
}
|
||||
|
||||
if (get_replication_info(local_conn, &replication_info) == false)
|
||||
{
|
||||
log_warning(_("unable to retrieve replication status information"));
|
||||
log_warning(_("unable to retrieve replication status information, unable to update monitoring history"));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1835,8 +1932,7 @@ update_monitoring_history(void)
|
||||
replication_lag_bytes = 0;
|
||||
}
|
||||
|
||||
add_monitoring_record(
|
||||
primary_conn,
|
||||
add_monitoring_record(primary_conn,
|
||||
local_conn,
|
||||
primary_node_id,
|
||||
local_node_info.node_id,
|
||||
@@ -1846,6 +1942,8 @@ update_monitoring_history(void)
|
||||
replication_info.last_xact_replay_timestamp,
|
||||
replication_lag_bytes,
|
||||
apply_lag_bytes);
|
||||
|
||||
INSTR_TIME_SET_CURRENT(last_monitoring_update);
|
||||
}
|
||||
|
||||
|
||||
@@ -1870,7 +1968,7 @@ do_upstream_standby_failover(void)
|
||||
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
RecoveryType primary_type = RECTYPE_UNKNOWN;
|
||||
int i, r;
|
||||
int i, standby_follow_result;
|
||||
char parsed_follow_command[MAXPGPATH] = "";
|
||||
|
||||
close_connection(&upstream_conn);
|
||||
@@ -1904,9 +2002,18 @@ do_upstream_standby_failover(void)
|
||||
|
||||
if (primary_type != RECTYPE_PRIMARY)
|
||||
{
|
||||
log_error(_("last known primary\"%s\" (ID: %i) is in recovery, not following"),
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
if (primary_type == RECTYPE_STANDBY)
|
||||
{
|
||||
log_error(_("last known primary \"%s\" (ID: %i) is in recovery, not following"),
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to determine status of last known primary \"%s\" (ID: %i), not following"),
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
}
|
||||
|
||||
close_connection(&primary_conn);
|
||||
monitoring_state = MS_DEGRADED;
|
||||
@@ -1917,8 +2024,6 @@ do_upstream_standby_failover(void)
|
||||
/* Close the connection to this server */
|
||||
close_connection(&local_conn);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
log_debug(_("standby follow command is:\n \"%s\""),
|
||||
config_file_options.follow_command);
|
||||
|
||||
@@ -1928,10 +2033,12 @@ do_upstream_standby_failover(void)
|
||||
*/
|
||||
parse_follow_command(parsed_follow_command, config_file_options.follow_command, primary_node_info.node_id);
|
||||
|
||||
r = system(parsed_follow_command);
|
||||
standby_follow_result = system(parsed_follow_command);
|
||||
|
||||
if (r != 0)
|
||||
if (standby_follow_result != 0)
|
||||
{
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to execute follow command:\n %s"),
|
||||
config_file_options.follow_command);
|
||||
@@ -1942,8 +2049,7 @@ do_upstream_standby_failover(void)
|
||||
* It may not possible to write to the event notification table but we
|
||||
* should be able to generate an external notification if required.
|
||||
*/
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1956,6 +2062,10 @@ do_upstream_standby_failover(void)
|
||||
/*
|
||||
* It's possible that the standby is still starting up after the "follow_command"
|
||||
* completes, so poll for a while until we get a connection.
|
||||
*
|
||||
* NOTE: we've previously closed the local connection, so even if the follow command
|
||||
* failed for whatever reason and the local node remained up, we can re-open
|
||||
* the local connection.
|
||||
*/
|
||||
|
||||
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
|
||||
@@ -1965,7 +2075,7 @@ do_upstream_standby_failover(void)
|
||||
if (PQstatus(local_conn) == CONNECTION_OK)
|
||||
break;
|
||||
|
||||
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
|
||||
log_debug("sleeping 1 second; %i of %i (\"repmgrd_standby_startup_timeout\") attempts to reconnect to local node",
|
||||
i + 1,
|
||||
config_file_options.repmgrd_standby_startup_timeout);
|
||||
sleep(1);
|
||||
@@ -1981,28 +2091,47 @@ do_upstream_standby_failover(void)
|
||||
/* refresh shared memory settings which will have been zapped by the restart */
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
|
||||
if (update_node_record_set_upstream(primary_conn,
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id) == false)
|
||||
/*
|
||||
*
|
||||
*/
|
||||
|
||||
if (standby_follow_result != 0)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to set node %i's new upstream ID to %i"),
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id);
|
||||
monitoring_state = MS_DEGRADED;
|
||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||
}
|
||||
|
||||
create_event_notification(
|
||||
NULL,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
/*
|
||||
* update upstream_node_id to primary node (but only if follow command
|
||||
* was successful)
|
||||
*/
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
{
|
||||
if (update_node_record_set_upstream(primary_conn,
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id) == false)
|
||||
{
|
||||
initPQExpBuffer(&event_details);
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to set node %i's new upstream ID to %i"),
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
log_error("%s", event_details.data);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
/* refresh own internal node record */
|
||||
@@ -2018,6 +2147,8 @@ do_upstream_standby_failover(void)
|
||||
local_node_info.upstream_node_id = primary_node_info.node_id;
|
||||
}
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node %i is now following primary node %i"),
|
||||
local_node_info.node_id,
|
||||
@@ -2025,8 +2156,7 @@ do_upstream_standby_failover(void)
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -2264,6 +2394,8 @@ follow_new_primary(int new_primary_id)
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
bool new_primary_ok = false;
|
||||
|
||||
log_verbose(LOG_DEBUG, "follow_new_primary(): new primary id is %i", new_primary_id);
|
||||
|
||||
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
@@ -2483,20 +2615,26 @@ witness_follow_new_primary(int new_primary_id)
|
||||
{
|
||||
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
|
||||
|
||||
if (primary_recovery_type == RECTYPE_PRIMARY)
|
||||
switch (primary_recovery_type)
|
||||
{
|
||||
new_primary_ok = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
new_primary_ok = false;
|
||||
log_warning(_("new primary is not in recovery"));
|
||||
close_connection(&upstream_conn);
|
||||
case RECTYPE_PRIMARY:
|
||||
new_primary_ok = true;
|
||||
break;
|
||||
case RECTYPE_STANDBY:
|
||||
new_primary_ok = false;
|
||||
log_warning(_("new primary is not in recovery"));
|
||||
break;
|
||||
case RECTYPE_UNKNOWN:
|
||||
new_primary_ok = false;
|
||||
log_warning(_("unable to determine status of new primary"));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (new_primary_ok == false)
|
||||
{
|
||||
close_connection(&upstream_conn);
|
||||
|
||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||
}
|
||||
|
||||
@@ -2936,9 +3074,18 @@ check_connection(t_node_info *node_info, PGconn **conn)
|
||||
}
|
||||
else
|
||||
{
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
log_info(_("reconnected to node \"%s\" (ID: %i)"),
|
||||
node_info->node_name,
|
||||
node_info->node_id);
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(*conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(*conn, config_file_options.node_id);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2982,3 +3129,30 @@ format_failover_state(FailoverState failover_state)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
handle_sighup(PGconn **conn, t_server_type server_type)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options, server_type))
|
||||
{
|
||||
PQfinish(*conn);
|
||||
*conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
}
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
log_debug("reopening %s", config_file_options.log_file);
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
|
||||
52
repmgrd.c
52
repmgrd.c
@@ -320,8 +320,6 @@ main(int argc, char **argv)
|
||||
strncpy(config_file_options.log_level, cli_log_level, MAXLEN);
|
||||
}
|
||||
|
||||
log_notice(_("repmgrd (repmgr %s) starting up"), REPMGR_VERSION);
|
||||
|
||||
/*
|
||||
* -m/--monitoring-history, if provided, will override repmgr.conf's
|
||||
* monitoring_history; this is for backwards compatibility as it's
|
||||
@@ -349,6 +347,8 @@ main(int argc, char **argv)
|
||||
|
||||
logger_init(&config_file_options, progname());
|
||||
|
||||
log_notice(_("repmgrd (%s %s) starting up"), progname(), REPMGR_VERSION);
|
||||
|
||||
if (verbose)
|
||||
logger_set_verbose();
|
||||
|
||||
@@ -770,10 +770,10 @@ show_help(void)
|
||||
}
|
||||
|
||||
|
||||
PGconn *
|
||||
try_reconnect(t_node_info *node_info)
|
||||
void
|
||||
try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||
{
|
||||
PGconn *conn;
|
||||
PGconn *our_conn;
|
||||
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||
|
||||
int i;
|
||||
@@ -782,7 +782,6 @@ try_reconnect(t_node_info *node_info)
|
||||
|
||||
initialize_conninfo_params(&conninfo_params, false);
|
||||
|
||||
|
||||
/* we assume by now the conninfo string is parseable */
|
||||
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
|
||||
|
||||
@@ -805,18 +804,47 @@ try_reconnect(t_node_info *node_info)
|
||||
* degraded monitoring? - make that configurable
|
||||
*/
|
||||
|
||||
conn = establish_db_connection_by_params(&conninfo_params, false);
|
||||
our_conn = establish_db_connection_by_params(&conninfo_params, false);
|
||||
|
||||
if (PQstatus(conn) == CONNECTION_OK)
|
||||
if (PQstatus(our_conn) == CONNECTION_OK)
|
||||
{
|
||||
free_conninfo_params(&conninfo_params);
|
||||
|
||||
log_info(_("connection to node %i succeeded"), node_info->node_id);
|
||||
|
||||
if (PQstatus(*conn) == CONNECTION_BAD)
|
||||
{
|
||||
log_verbose(LOG_INFO, "original connection handle returned CONNECTION_BAD, using new connection");
|
||||
close_connection(conn);
|
||||
*conn = our_conn;
|
||||
}
|
||||
else
|
||||
{
|
||||
ExecStatusType ping_result;
|
||||
|
||||
ping_result = connection_ping(*conn);
|
||||
|
||||
if (ping_result != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_info("original connnection no longer available, using new connection");
|
||||
close_connection(conn);
|
||||
*conn = our_conn;
|
||||
}
|
||||
else
|
||||
{
|
||||
log_info(_("original connection is still available"));
|
||||
|
||||
PQfinish(our_conn);
|
||||
}
|
||||
}
|
||||
|
||||
node_info->node_status = NODE_STATUS_UP;
|
||||
return conn;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
close_connection(&conn);
|
||||
log_notice(_("unable to reconnect to node"));
|
||||
close_connection(&our_conn);
|
||||
log_notice(_("unable to reconnect to node %i"), node_info->node_id);
|
||||
}
|
||||
|
||||
if (i + 1 < max_attempts)
|
||||
@@ -835,7 +863,7 @@ try_reconnect(t_node_info *node_info)
|
||||
|
||||
free_conninfo_params(&conninfo_params);
|
||||
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ extern t_node_info local_node_info;
|
||||
extern PGconn *local_conn;
|
||||
extern bool startup_event_logged;
|
||||
|
||||
PGconn *try_reconnect(t_node_info *node_info);
|
||||
void try_reconnect(PGconn **conn, t_node_info *node_info);
|
||||
|
||||
int calculate_elapsed(instr_time start_time);
|
||||
const char *print_monitoring_state(MonitoringState monitoring_state);
|
||||
|
||||
Reference in New Issue
Block a user