repmgrd: add parameter "failover_delay"

This parameter is not documented and is intended for use during testing. It should not be used in production.
Fix typo
2026-03-23 07:06:30 +00:00 · 2020-10-05 17:43:32 +09:00 · 2020-10-05 17:38:47 +09:00 · 2020-10-05 17:38:09 +09:00 · 2020-09-15 15:31:31 +09:00 · 2020-09-15 14:46:39 +09:00
26 changed files with 1086 additions and 250 deletions
--- a/4
+++ b/4
@@ -1,4 +1,4 @@
-5.1     2020-??-??
+5.1.0   2020-04-13
        repmgr: remove BDR 2.x support
        repmgr: don't query upstream's data directory (Ian)
        repmgr: rename --recovery-conf-only to --replication-conf-only (Ian)
@@ -14,6 +14,8 @@
        repmgr: improve "standby switchover" completion checks (Ian)
        repmgr: add replication configuration file ownership check to
          "standby switchover" (Ian)
+        repmgr: check the demotion candidate's registered repmgr.conf file can
+          be found (laixiong; GitHub 615)
        repmgr: consolidate replication connection code (Ian)
        repmgr: check permissions for "pg_promote()" and fall back to pg_ctl
          if necessary (Ian)
--- a/Makefile.in
+++ b/Makefile.in
@@ -11,6 +11,7 @@ EXTENSION = repmgr

 DATA = \
  repmgr--unpackaged--4.0.sql \
+  repmgr--unpackaged--5.1.sql \
  repmgr--4.0.sql \
  repmgr--4.0--4.1.sql \
  repmgr--4.1.sql \
--- a/configfile.h
+++ b/configfile.h
@@ -29,7 +29,7 @@
 #define TARGET_TIMELINE_LATEST 0

 /*
- * This is defined src/include/utils.h, however it's not practical
+ * This is defined in src/include/utils.h, however it's not practical
 * to include that from a frontend application.
 */
 #define PG_AUTOCONF_FILENAME "postgresql.auto.conf"
@@ -189,6 +189,7 @@ typedef struct

 	/* undocumented test settings */
 	int			promote_delay;
+	int			failover_delay;
 } t_configuration_options;

 /*
@@ -243,8 +244,9 @@ typedef struct
 		/* barman settings */ \
 		"", "", "",	 \
 		/* rsync/ssh settings */ \
-		 "", "", \
+		"", "", \
 		/* undocumented test settings */ \
+		0, \
 		0 \
 }

--- a/18
+++ b/18
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for repmgr 5.1.
+# Generated by GNU Autoconf 2.69 for repmgr 5.1.0.
 #
 # Report bugs to <repmgr@googlegroups.com>.
 #
@@ -582,8 +582,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='repmgr'
 PACKAGE_TARNAME='repmgr'
-PACKAGE_VERSION='5.1'
-PACKAGE_STRING='repmgr 5.1'
+PACKAGE_VERSION='5.1.0'
+PACKAGE_STRING='repmgr 5.1.0'
 PACKAGE_BUGREPORT='repmgr@googlegroups.com'
 PACKAGE_URL='https://repmgr.org/'

@@ -1181,7 +1181,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures repmgr 5.1 to adapt to many kinds of systems.
+\`configure' configures repmgr 5.1.0 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1242,7 +1242,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of repmgr 5.1:";;
+     short | recursive ) echo "Configuration of repmgr 5.1.0:";;
   esac
  cat <<\_ACEOF

@@ -1316,7 +1316,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-repmgr configure 5.1
+repmgr configure 5.1.0
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1335,7 +1335,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by repmgr $as_me 5.1, which was
+It was created by repmgr $as_me 5.1.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2487,7 +2487,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by repmgr $as_me 5.1, which was
+This file was extended by repmgr $as_me 5.1.0, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -2550,7 +2550,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-repmgr config.status 5.1
+repmgr config.status 5.1.0
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.in
+++ b/configure.in
@@ -1,4 +1,4 @@
-AC_INIT([repmgr], [5.1], [repmgr@googlegroups.com], [repmgr], [https://repmgr.org/])
+AC_INIT([repmgr], [5.1.0], [repmgr@googlegroups.com], [repmgr], [https://repmgr.org/])

 AC_COPYRIGHT([Copyright (c) 2010-2020, 2ndQuadrant Ltd.])

--- a/dbutils.c
+++ b/dbutils.c
@@ -2951,7 +2951,7 @@ get_child_nodes(PGconn *conn, int node_id, NodeInfoList *node_list)
 					  "     WHERE n.upstream_node_id = %i ",
 					  node_id);

-		log_verbose(LOG_DEBUG, "get_active_sibling_node_records():\n%s", query.data);
+	log_verbose(LOG_DEBUG, "get_child_nodes():\n%s", query.data);

 	res = PQexec(conn, query.data);

@@ -5720,16 +5720,16 @@ get_node_replication_stats(PGconn *conn, t_node_info *node_info)


 NodeAttached
-is_downstream_node_attached(PGconn *conn, char *node_name)
+is_downstream_node_attached(PGconn *conn, char *node_name, char **node_state)
 {
 	PQExpBufferData query;
 	PGresult   *res = NULL;
-	int			c = 0;
+	const char *state = NULL;

 	initPQExpBuffer(&query);

 	appendPQExpBuffer(&query,
-					  " SELECT pg_catalog.count(*) "
+					  " SELECT pid, state "
 					  "   FROM pg_catalog.pg_stat_replication "
 					  "  WHERE application_name = '%s'",
 					  node_name);
@@ -5748,31 +5748,53 @@ is_downstream_node_attached(PGconn *conn, char *node_name)
 		return NODE_ATTACHED_UNKNOWN;
 	}

-	if (PQntuples(res) != 1)
-	{
-		log_verbose(LOG_WARNING, _("unexpected number of tuples (%i) returned"), PQntuples(res));
+	termPQExpBuffer(&query);
+
+	/*
+	 * If there's more than one entry in pg_stat_replication, there's no
+	 * way we can reliably determine which one belongs to the node we're
+	 * checking, so there's nothing more we can do.
+	 */
+	if (PQntuples(res) > 1)
+	{
+		log_error(_("multiple entries with \"application_name\" set to  \"%s\" found in \"pg_stat_replication\""),
+				  node_name);
+		log_hint(_("verify that a unique node name is configured for each node"));

-		termPQExpBuffer(&query);
 		PQclear(res);

 		return NODE_ATTACHED_UNKNOWN;
 	}

-	c = atoi(PQgetvalue(res, 0, 0));
-
-	termPQExpBuffer(&query);
-	PQclear(res);
-
-	if (c == 0)
+	if (PQntuples(res) == 0)
 	{
-		log_verbose(LOG_WARNING, _("node \"%s\" not found in \"pg_stat_replication\""), node_name);
+		log_warning(_("node \"%s\" not found in \"pg_stat_replication\""), node_name);
+
+		PQclear(res);

 		return NODE_DETACHED;
 	}

-	if (c > 1)
-		log_verbose(LOG_WARNING, _("multiple entries with \"application_name\" set to  \"%s\" found in \"pg_stat_replication\""),
-					node_name);
+	state = PQgetvalue(res, 0, 1);
+
+	if (node_state != NULL)
+	{
+		*node_state = palloc0(strlen(state) + 1);
+		strncpy(*node_state, state, strlen(state));
+	}
+
+	if (strcmp(state, "streaming") != 0)
+	{
+		log_warning(_("node \"%s\" attached in state \"%s\""),
+					node_name,
+					state);
+
+		PQclear(res);
+
+		return NODE_NOT_ATTACHED;
+	}
+
+	PQclear(res);

 	return NODE_ATTACHED;
 }
--- a/dbutils.h
+++ b/dbutils.h
@@ -119,9 +119,14 @@ typedef enum

 typedef enum
 {
+	/* unable to query "pg_stat_replication" or other error */
 	NODE_ATTACHED_UNKNOWN = -1,
-	NODE_DETACHED,
-	NODE_ATTACHED
+	/* node has record in "pg_stat_replication" and state is "streaming" */
+	NODE_ATTACHED,
+	/* node has record in "pg_stat_replication" but state is not "streaming" */
+	NODE_NOT_ATTACHED,
+	/* node has no record in "pg_stat_replication" */
+	NODE_DETACHED
 } NodeAttached;

 typedef enum
@@ -589,7 +594,7 @@ bool		get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *repl
 int			get_replication_lag_seconds(PGconn *conn);
 TimeLineID	get_node_timeline(PGconn *conn, char *timeline_id_str);
 void		get_node_replication_stats(PGconn *conn, t_node_info *node_info);
-NodeAttached is_downstream_node_attached(PGconn *conn, char *node_name);
+NodeAttached is_downstream_node_attached(PGconn *conn, char *node_name, char **node_state);
 void		set_upstream_last_seen(PGconn *conn, int upstream_node_id);
 int			get_upstream_last_seen(PGconn *conn, t_server_type node_type);

--- a/doc/Makefile
+++ b/doc/Makefile
@@ -95,6 +95,7 @@ clean:
 	rm -f repmgr.html
 	rm -f repmgr-A4.pdf
 	rm -f repmgr-US.pdf
+	rm -f html/*

 maintainer-clean:
 	rm -rf html
--- a/doc/appendix-packages.xml
+++ b/doc/appendix-packages.xml
@@ -471,7 +471,7 @@ repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
      <title>Debian/Ubuntu</title>
      <para>
        An archive of old packages (<literal>3.3.2</literal> and later) for Debian/Ubuntu-based systems is available here:
-        <ulink url="http://atalia.postgresql.org/morgue/r/repmgr/">http://atalia.postgresql.org/morgue/r/repmgr/</ulink>
+        <ulink url="https://apt-archive.postgresql.org/">https://apt-archive.postgresql.org/</ulink>
      </para>
    </sect2>

--- a/doc/appendix-release-notes.xml
+++ b/doc/appendix-release-notes.xml
@@ -17,12 +17,12 @@

 <!-- remember to update the release date in ../repmgr_version.h.in -->

-  <sect1 id="release-5.1">
-    <title>Release 5.1</title>
-    <para><emphasis>?? ?? ??, 2020</emphasis></para>
+  <sect1 id="release-5.1.0">
+    <title id="release-current">Release 5.1.0</title>
+    <para><emphasis>Mon 13 April, 2020</emphasis></para>

    <para>
-      &repmgr; 5.1 is a major release.
+      &repmgr; 5.1.0 is a major release.
    </para>
    <para>
      For details on how to upgrade an existing &repmgr; installation, see
@@ -50,6 +50,23 @@
      <para>
        <itemizedlist>

+          <listitem>
+            <para>
+              The requirement that the &repmgr; user is a database superuser has been
+              removed as far as possible.
+            </para>
+            <para>
+              In theory, &repmgr; can be operated with a normal database user for managing
+              the &repmgr; database, and a separate replication user for managing replication
+              connections (and replication slots, if these are in use).
+            </para>
+            <para>
+              Some operations will still require superuser permissions, e.g. for issuing
+              a <command>CHECKPOINT</command> as part of a switchover operation; in this case
+              a valid superuser should be provided with the <option>-S</option>/<option>--superuser</option>
+              option.
+            </para>
+          </listitem>

          <listitem>
            <para>
@@ -73,6 +90,7 @@
              Improve logging and checking of potential failure situations.
            </para>
          </listitem>
+
          <listitem>
            <para>
              <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>:
@@ -82,12 +100,37 @@
              data directory.
            </para>
          </listitem>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>:
+              Provide additional information in <option>--dry-run</option> mode output.
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>:
+              Checks that the demotion candidate's registered repmgr.conf file can be found, to
+              prevent confusing references to an incorrectly configured data directory. GitHub 615.
+            </para>
+          </listitem>
+
          <listitem>
            <para>
              <link linkend="repmgr-node-check"><command>repmgr node check</command></link>:
              accept option <option>-S</option>/<option>--superuser</option>. GitHub #621.
            </para>
          </listitem>
+
+          <listitem>
+            <para>
+              <link linkend="repmgr-node-check"><command>repmgr node check</command></link>:
+              add <option>--upstream</option> option to check whether the node is attached
+              to the expected upstream node.
+            </para>
+          </listitem>
+
        </itemizedlist>
      </para>
    </sect2>
@@ -112,6 +155,15 @@
            </para>
          </listitem>

+
+          <listitem>
+            <para>
+              <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>:
+              in <option>--dry-run</option> mode, display promote command which will be executed.
+            </para>
+          </listitem>
+
+
          <listitem>
            <para>
              <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>
@@ -123,8 +175,17 @@

          <listitem>
            <para>
-              <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>:
-              in PostgreSQL 12 and later, use <varname>service_promote_command</varname> if set.
+              <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>:
+              check for demotion candidate reattachment as late as possible to avoid spurious failure
+              reports.
+            </para>
+          </listitem>
+
+
+          <listitem>
+            <para>
+              &repmgrd;: check for presence of <option>promote_command</option> and
+              <option>follow_command</option> on receipt of <literal>SIGHUP</literal>. GitHub 614.
            </para>
          </listitem>

@@ -150,7 +211,7 @@
  </sect1>

  <sect1 id="release-5.0">
-    <title id="release-current">Release 5.0</title>
+    <title>Release 5.0</title>
    <para><emphasis>Tue 15 October, 2019</emphasis></para>

    <para>
--- a/doc/configuration-file.xml
+++ b/doc/configuration-file.xml
@@ -182,6 +182,14 @@ conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'</programlistin
   </itemizedlist>
  </para>

+  <para>
+    In examples provided in this documentation, it is assumed the configuration file is located
+    at <filename>/etc/repmgr.conf</filename>. If &repmgr; is installed from a package, the
+    configuration file will probably be located at another location specified by the packager;
+    see appendix <xref linkend="appendix-packages"/> for configuration file locations in
+    different packaging systems.
+  </para>
+
  <para>
   Note that if a file is explicitly specified with <literal>-f/--config-file</literal>,
   an error will be raised if it is not found or not readable, and no attempt will be made to
@@ -202,6 +210,61 @@ conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'</programlistin
      <filename>/path/to/repmgr.conf</filename>).
    </para>
   </note>
+  </sect2>
+
+  <sect2 id="configuration-file-postgresql-major-upgrades" xreflabel="configuration file and PostgreSQL major version upgrades">
+    <title>Configuration file and PostgreSQL major version upgrades</title>
+
+    <indexterm>
+      <primary>repmgr.conf</primary>
+      <secondary>PostgreSQL major version upgrades</secondary>
+    </indexterm>
+
+    <para>
+      When upgrading the PostgreSQL cluster to a new major version, <filename>repmgr.conf</filename>
+      will probably need to be updated.
+    </para>
+    <para>
+      Usually <option>pg_bindir</option> and <option>data_directory</option> will need to be modified,
+      particularly if the default package locations are used, as these usually change.
+    </para>
+
+     <para>
+       It's also possible the location of <filename>repmgr.conf</filename> itself will change
+       (e.g. from <filename>/etc/repmgr/11/repmgr.conf</filename> to <filename>/etc/repmgr/12/repmgr.conf</filename>).
+       This is stored as part of the &repmgr; metadata and is used by &repmgr; to execute &repmgr; remotely
+       (e.g. during a <link linkend="performing-switchover">switchover operation</link>).
+     </para>
+     <para>
+       If the content and/or location of <filename>repmgr.conf</filename> has changed, the &repmgr; metadata
+       needs to be updated to reflect this. The &repmgr; metadata can be updated on each node with:
+       <itemizedlist spacing="compact" mark="bullet">
+         <listitem>
+           <simpara>
+             <link linkend="repmgr-primary-register">
+               <command>repmgr primary register --force -f /path/to/repmgr.conf</command>
+             </link>
+           </simpara>
+         </listitem>
+
+         <listitem>
+           <simpara>
+             <link linkend="repmgr-standby-register">
+               <command>repmgr standby register --force -f /path/to/repmgr.conf</command>
+             </link>
+           </simpara>
+         </listitem>
+
+
+         <listitem>
+           <simpara>
+             <link linkend="repmgr-witness-register">
+               <command>repmgr witness register --force -f /path/to/repmgr.conf -h primary_host</command>
+             </link>
+           </simpara>
+         </listitem>
+       </itemizedlist>
+     </para>

   </sect2>
 </sect1>
--- a/doc/configuration-password-management.xml
+++ b/doc/configuration-password-management.xml
@@ -127,8 +127,31 @@ node2:5432:repmgr:repmgr:foo
 node2:5432:replication:repluser:foo
 node3:5432:repmgr:repmgr:foo
 node3:5432:replication:repluser:foo</programlisting>
-
+      If you are planning to use the <option>-S</option>/<option>--superuser</option> option,
+      there must also be an entry enabling the superuser to connect to the &repmgr; database.
+      Assuming the superuser is <literal>postgres</literal>, the file would look like this:
+        <programlisting>
+node1:5432:repmgr:repmgr:foo
+node1:5432:repmgr:postgres:foo
+node1:5432:replication:repluser:foo
+node2:5432:repmgr:repmgr:foo
+node2:5432:repmgr:postgres:foo
+node2:5432:replication:repluser:foo
+node3:5432:repmgr:repmgr:foo
+node3:5432:repmgr:postgres:foo
+node3:5432:replication:repluser:foo</programlisting>
    </para>
+
+    <para>
+      The <filename>~/.pgpass</filename> file can be simplified with the use of wildcards if
+      there is no requirement to restrict provision of passwords to particular hosts, ports
+      or databases. The preceding file could then be formatted like this:
+        <programlisting>
+*:*:*:repmgr:foo
+*:*:*:postgres:foo
+</programlisting>
+    </para>
+
    <note>
      <para>
        It's possible to specify an alternative location for the <filename>~/.pgpass</filename> file, either via
@@ -140,6 +163,11 @@ node3:5432:replication:repluser:foo</programlisting>
        location on all nodes, as when connecting to a remote node, the file referenced is the one on the
        local node.
      </para>
+      <para>
+        Additionally, you <emphasis>must</emphasis> specify the passfile location in <filename>repmgr.conf</filename>
+        with the <option>passfile</option> option so &repmgr; can write the correct path when creating the
+        <option>primary_conninfo</option> parameter for replication configuration on standbys.
+      </para>
    </note>

  </sect2>
--- a/doc/repmgr-node-rejoin.xml
+++ b/doc/repmgr-node-rejoin.xml
@@ -43,7 +43,12 @@
      <programlisting>
      repmgr node rejoin -d '$conninfo'</programlisting>

-      where <literal>$conninfo</literal> is the conninfo string of any reachable node in the cluster.
+      where <literal>$conninfo</literal> is the PostgreSQL <literal>conninfo</literal> string of the
+      <emphasis>current</emphasis> primary node (or that of any reachable node in the cluster, but
+      <emphasis>not</emphasis> the local node). This is so that &repmgr; can fetch up-to-date information
+      about the current state of the cluster.
+    </para>
+    <para>
      <filename>repmgr.conf</filename> for the stopped node *must* be supplied explicitly if not
      otherwise available.
    </para>
@@ -283,7 +288,15 @@
      to execute <command>pg_rewind</command> to ensure the node can be rejoined successfully.
    </para>

-    <important>
+    <refsect2 id="repmgr-node-rejoin-pg-rewind-config-files" xreflabel="pg_rewind and configuration files">
+
+      <title><command>pg_rewind</command> and configuration file retention</title>
+
+      <indexterm>
+        <primary>pg_rewind</primary>
+        <secondary>configuration file retention</secondary>
+      </indexterm>
+
      <para>
        Be aware that if <command>pg_rewind</command> is executed and actually performs a
        rewind operation, any configuration files in the PostgreSQL data directory will be
@@ -291,17 +304,27 @@
      </para>
      <para>
        To prevent this happening, provide a comma-separated list of files to retain
-        using the <literal>--config-file</literal> command line option; the specified files
+        using the <option>--config-file</option> command line option; the specified files
        will be archived in a temporary directory (whose parent directory can be specified with
-        <literal>--config-archive-dir</literal>) and restored once the rewind operation is
-        complete.
+        <option>--config-archive-dir</option>, default: <filename>/tmp</filename>)
+        and restored once the rewind operation is complete.
      </para>
-    </important>
+    </refsect2>

-    <para>
-      Example, first using <literal>--dry-run</literal>, then actually executing the
-      <literal>node rejoin command</literal>.
-    <programlisting>
+    <refsect2 id="repmgr-node-rejoin-pg-rewind-example" xreflabel="example using repmgr node rejoin and pg_rewind">
+
+      <title>Example using <command>repmgr node rejoin</command> and <command>pg_rewind</command></title>
+
+      <indexterm>
+        <primary>pg_rewind</primary>
+        <secondary>configuration file retention</secondary>
+      </indexterm>
+
+
+      <para>
+        Example, first using <option>--dry-run</option>, then actually executing the
+        <literal>node rejoin command</literal>.
+        <programlisting>
    $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
        --config-files=postgresql.local.conf,postgresql.conf --verbose --force-rewind --dry-run
    INFO: replication connection to the rejoin target node was successful
@@ -317,17 +340,17 @@
      pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node3 dbname=repmgr user=repmgr'
    INFO: prerequisites for executing NODE REJOIN are met</programlisting>

-    <note>
-      <para>
-        If <option>--force-rewind</option> is used with the <option>--dry-run</option> option,
-        this checks the prerequisites for using <application>pg_rewind</application>, but is
-        not an absolute guarantee that actually executing <application>pg_rewind</application>
-        will succeed. See also section <xref linkend="repmgr-node-rejoin-caveats"/> below.
-      </para>
+        <note>
+          <para>
+            If <option>--force-rewind</option> is used with the <option>--dry-run</option> option,
+            this checks the prerequisites for using <application>pg_rewind</application>, but is
+            not an absolute guarantee that actually executing <application>pg_rewind</application>
+            will succeed. See also section <xref linkend="repmgr-node-rejoin-caveats"/> below.
+          </para>

-    </note>
+        </note>

-    <programlisting>
+        <programlisting>
    $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
        --config-files=postgresql.local.conf,postgresql.conf --verbose --force-rewind
    NOTICE: pg_rewind execution required for this node to attach to rejoin target node 3
@@ -339,8 +362,8 @@
    NOTICE: starting server using "pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' start"
    NOTICE: NODE REJOIN successful
    DETAIL: node 2 is now attached to node 3</programlisting>
-    </para>
-
+      </para>
+    </refsect2>
  </refsect1>

  <refsect1 id="repmgr-node-rejoin-caveats" xreflabel="Caveats">
@@ -378,6 +401,51 @@
     is running in <option>--dry-run</option> mode.
   </para>

+   <warning>
+     <para>
+       In all current PostgreSQL versions (as of September 2020), <application>pg_rewind</application>
+       contains a corner-case bug which affects standbys in a very specific situation.
+     </para>
+     <para>
+       This situation occurs when a standby was shut down <emphasis>before</emphasis> its
+       primary node, and an attempt is made to attach this standby to another primary
+       in the same cluster (following a &quot;split brain&quot; situation where the standby
+       was connected to the wrong primary). In this case, &repmgr; will correctly determine
+       that <application>pg_rewind</application> should be executed, however
+       <application>pg_rewind</application> incorrectly decides that no action is necessary.
+     </para>
+     <para>
+       In this situation, &repmgr; will report something like:
+<programlisting>
+    NOTICE: pg_rewind execution required for this node to attach to rejoin target node 1
+    DETAIL: rejoin target server's timeline 3 forked off current database system timeline 2 before current recovery point 0/7019C10</programlisting>
+       but when executed, <application>pg_rewind</application> will report:
+<programlisting>
+    pg_rewind: servers diverged at WAL location 0/7015540 on timeline 2
+    pg_rewind: no rewind required</programlisting>
+       and if an attempt is made to attach the standby to the new primary, PostgreSQL logs on the standby
+       will contain errors like:
+<programlisting>
+    [2020-09-07 15:01:41 UTC]    LOG:  00000: replication terminated by primary server
+    [2020-09-07 15:01:41 UTC]    DETAIL:  End of WAL reached on timeline 2 at 0/7015540.
+    [2020-09-07 15:01:41 UTC]    LOG:  00000: new timeline 3 forked off current database system timeline 2 before current recovery point 0/7019C10</programlisting>
+     </para>
+     <para>
+       Currently it is not possible to resolve this situation using <application>pg_rewind</application>.
+       A <ulink url="https://www.postgresql.org/message-id/flat/CABvVfJU-LDWvoz4-Yow3Ay5LZYTuPD7eSjjE4kGyNZpXC6FrVQ@mail.gmail.com">patch</ulink>
+       has been submitted and will hopefully be included in a forthcoming PostgreSQL minor release.
+     </para>
+     <para>
+       As a workaround, start the primary server the standby was previously attached to,
+       and ensure the standby can be attached to it. If <application>pg_rewind</application> was actually executed,
+       it will have copied in the <filename>.history</filename> file from the target primary server; this must
+       be removed. <command>repmgr node rejoin</command> can then be used to attach the standby to the original
+       primary. Ensure any changes pending on the primary have propagated to the standby. Then shut down the primary
+       server <emphasis>first</emphasis>, before shutting down the standby. It should then be possible to
+       use <command>repmgr node rejoin</command> to attach the standby to the new primary.
+     </para>
+   </warning>
+
  </refsect1>

  <refsect1>
--- a/doc/repmgr-standby-promote.xml
+++ b/doc/repmgr-standby-promote.xml
@@ -95,7 +95,6 @@
      NOTICE: promoting standby to primary
      DETAIL: promoting server "node2" (ID: 2) using "pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/postgres/data' promote"
      server promoting
-      DEBUG: setting node 2 as primary and marking existing primary as failed
      NOTICE: STANDBY PROMOTE successful
      DETAIL: server "node2" (ID: 2) was successfully promoted to primary</programlisting>
    </para>
@@ -170,6 +169,42 @@
        </listitem>
      </varlistentry>

+      <varlistentry>
+        <term><option>-F</option></term>
+        <term><option>--force</option></term>
+        <listitem>
+          <para>
+            Ignore warnings and continue anyway.
+          </para>
+          <para>
+            This option is relevant in the following situations if <option>--siblings-follow</option> was specified:
+            <itemizedlist spacing="compact" mark="bullet">
+              <listitem>
+                <simpara>
+                  If one or more sibling nodes was not reachable via SSH, the standby will be promoted anyway.
+                </simpara>
+              </listitem>
+              <listitem>
+                <simpara>
+                  If the promotion candidate has insufficient free walsenders to accommodate the standbys which will
+                  be attached to it, the standby will be promoted anyway.
+                </simpara>
+              </listitem>
+              <listitem>
+                <simpara>
+                  If replication slots are in use but the promotion candidate has insufficient free replication slots
+                  to accommodate the standbys which will be attached to it, the standby will be promoted anyway.
+                </simpara>
+              </listitem>
+            </itemizedlist>
+          </para>
+          <para>
+            Note that if the <option>-F</option>/<option>--force</option> option is used when any of the above
+            situations is encountered, the onus is on the user to manually resolve any resulting issues.
+          </para>
+        </listitem>
+      </varlistentry>
+
    </variablelist>
  </refsect1>

--- a/doc/repmgr-witness-register.xml
+++ b/doc/repmgr-witness-register.xml
@@ -63,6 +63,34 @@
  </refsect1>


+
+  <refsect1>
+
+    <title>Options</title>
+   <variablelist>
+
+      <varlistentry>
+        <term><option>--dry-run</option></term>
+        <listitem>
+          <para>
+            Check prerequisites but don't actually register the witness
+          </para>
+        </listitem>
+      </varlistentry>
+
+
+      <varlistentry>
+       <term><option>-F</option>/<option>--force</option></term>
+        <listitem>
+          <para>
+            Overwrite an existing node record
+          </para>
+        </listitem>
+      </varlistentry>
+
+   </variablelist>
+  </refsect1>
+
  <refsect1 id="repmgr-witness-register-events">
    <title>Event notifications</title>
    <para>
--- a/doc/repmgrd-configuration.xml
+++ b/doc/repmgrd-configuration.xml
@@ -15,9 +15,13 @@
  </para>
  <para>
    &repmgrd; can be configured to provide failover
-    capability in case the primary upstream node becomes unreachable, and/or
+    capability in case the primary or upstream node becomes unreachable, and/or
    provide monitoring data to the &repmgr; metadatabase.
  </para>
+  <para>
+    From &repmgr; 4.4, when running on the primary node, &repmgrd; can also monitor
+    standby disconnections/reconnections (see <xref linkend="repmgrd-primary-child-disconnection"/>).
+  </para>

  <sect1 id="repmgrd-basic-configuration">
    <title>repmgrd configuration</title>
@@ -583,7 +587,8 @@ repmgrd_service_stop_command='sudo systemctl repmgr12 stop'
        the option <option>monitor_interval_secs</option> (see above).
      </para>
      <para>
-        For more details on monitoring, see <xref linkend="repmgrd-monitoring"/>.
+        For more details on monitoring, see <xref linkend="repmgrd-monitoring"/>. For information on
+        monitoring standby disconnections, see <xref linkend="repmgrd-primary-child-disconnection"/>.
      </para>
    </sect2>

--- a/doc/upgrading-repmgr.xml
+++ b/doc/upgrading-repmgr.xml
@@ -201,9 +201,13 @@ ALTER EXTENSION repmgr UPDATE</programlisting>
 	</para>
 	<tip>
 	  <para>
-		If the &repmgr; upgrade requires a PostgreSQL restart, combine the &repmgr; upgrade
-		with a PostgreSQL minor version upgrade, which will require a restart in any case.
-		New PostgreSQL minor version are usually released every couple of months.
+        If the &repmgr; upgrade requires a PostgreSQL restart, combine the &repmgr; upgrade
+        with a PostgreSQL minor version upgrade, which will require a restart in any case.
+      </para>
+      <para>
+		New PostgreSQL minor versions are usually released every couple of months;
+        see the <ulink url="https://www.postgresql.org/developer/roadmap/">Roadmap</ulink>
+        for the current schedule.
 	  </para>
 	</tip>
  </sect2>
--- a/repmgr--unpackaged--5.1.sql
+++ b/repmgr--unpackaged--5.1.sql
@@ -0,0 +1,265 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
+
+-- extract the current schema name
+-- NOTE: this assumes there will be only one schema matching 'repmgr_%';
+-- user is responsible for ensuring this is the case
+
+CREATE TEMPORARY TABLE repmgr_old_schema (schema_name TEXT);
+INSERT INTO repmgr_old_schema (schema_name)
+SELECT nspname AS schema_name
+  FROM pg_catalog.pg_namespace
+ WHERE nspname LIKE 'repmgr_%'
+ LIMIT 1;
+
+-- move old objects into new schema
+DO $repmgr$
+DECLARE
+  old_schema TEXT;
+BEGIN
+  SELECT schema_name FROM repmgr_old_schema
+    INTO old_schema;
+  EXECUTE format('ALTER TABLE %I.repl_nodes SET SCHEMA repmgr', old_schema);
+  EXECUTE format('ALTER TABLE %I.repl_events SET SCHEMA repmgr', old_schema);
+  EXECUTE format('ALTER TABLE %I.repl_monitor SET SCHEMA repmgr', old_schema);
+  EXECUTE format('DROP VIEW IF EXISTS %I.repl_show_nodes', old_schema);
+  EXECUTE format('DROP VIEW IF EXISTS %I.repl_status', old_schema);
+END$repmgr$;
+
+-- convert "repmgr_$cluster.repl_nodes" to "repmgr.nodes"
+CREATE TABLE repmgr.nodes (
+  node_id          INTEGER     PRIMARY KEY,
+  upstream_node_id INTEGER     NULL REFERENCES repmgr.nodes (node_id) DEFERRABLE,
+  active           BOOLEAN     NOT NULL DEFAULT TRUE,
+  node_name        TEXT        NOT NULL,
+  type             TEXT        NOT NULL CHECK (type IN('primary','standby','witness','bdr')),
+  location         TEXT        NOT NULL DEFAULT 'default',
+  priority         INT         NOT NULL DEFAULT 100,
+  conninfo         TEXT        NOT NULL,
+  repluser         VARCHAR(63) NOT NULL,
+  slot_name        TEXT        NULL,
+  config_file      TEXT        NOT NULL
+);
+
+INSERT INTO repmgr.nodes
+  (node_id, upstream_node_id, active, node_name, type, location, priority, conninfo, repluser, slot_name, config_file)
+SELECT id, upstream_node_id, active, name,
+       CASE WHEN type = 'master' THEN 'primary' ELSE type END,
+       'default', priority, conninfo, 'unknown', slot_name, 'unknown'
+  FROM repmgr.repl_nodes
+ ORDER BY id;
+
+
+-- convert "repmgr_$cluster.repl_events" to "repmgr.events"
+
+ALTER TABLE repmgr.repl_events RENAME TO events;
+
+-- create new table "repmgr.voting_term"
+CREATE TABLE repmgr.voting_term (
+  term INT NOT NULL
+);
+
+CREATE UNIQUE INDEX voting_term_restrict
+ON repmgr.voting_term ((TRUE));
+
+CREATE RULE voting_term_delete AS
+   ON DELETE TO repmgr.voting_term
+   DO INSTEAD NOTHING;
+
+INSERT INTO repmgr.voting_term (term) VALUES (1);
+
+
+-- convert "repmgr_$cluster.repl_monitor" to "monitoring_history"
+
+
+DO $repmgr$
+DECLARE
+  DECLARE server_version_num INT;
+BEGIN
+  SELECT setting
+    FROM pg_catalog.pg_settings
+   WHERE name = 'server_version_num'
+    INTO server_version_num;
+  IF server_version_num >= 90400 THEN
+    EXECUTE $repmgr_func$
+CREATE TABLE repmgr.monitoring_history (
+  primary_node_id                INTEGER NOT NULL,
+  standby_node_id                INTEGER NOT NULL,
+  last_monitor_time              TIMESTAMP WITH TIME ZONE NOT NULL,
+  last_apply_time                TIMESTAMP WITH TIME ZONE,
+  last_wal_primary_location      PG_LSN NOT NULL,
+  last_wal_standby_location      PG_LSN,
+  replication_lag                BIGINT NOT NULL,
+  apply_lag                      BIGINT NOT NULL
+)
+    $repmgr_func$;
+    INSERT INTO repmgr.monitoring_history
+      (primary_node_id, standby_node_id, last_monitor_time,  last_apply_time, last_wal_primary_location, last_wal_standby_location, replication_lag, apply_lag)
+    SELECT primary_node, standby_node, last_monitor_time,  last_apply_time, last_wal_primary_location::pg_lsn, last_wal_standby_location::pg_lsn, replication_lag, apply_lag
+      FROM repmgr.repl_monitor;
+  ELSE
+    EXECUTE $repmgr_func$
+CREATE TABLE repmgr.monitoring_history (
+  primary_node_id                INTEGER NOT NULL,
+  standby_node_id                INTEGER NOT NULL,
+  last_monitor_time              TIMESTAMP WITH TIME ZONE NOT NULL,
+  last_apply_time                TIMESTAMP WITH TIME ZONE,
+  last_wal_primary_location      TEXT NOT NULL,
+  last_wal_standby_location      TEXT,
+  replication_lag                BIGINT NOT NULL,
+  apply_lag                      BIGINT NOT NULL
+)
+    $repmgr_func$;
+    INSERT INTO repmgr.monitoring_history
+      (primary_node_id, standby_node_id, last_monitor_time,  last_apply_time, last_wal_primary_location, last_wal_standby_location, replication_lag, apply_lag)
+    SELECT primary_node, standby_node, last_monitor_time,  last_apply_time, last_wal_primary_location, last_wal_standby_location, replication_lag, apply_lag
+      FROM repmgr.repl_monitor;
+
+  END IF;
+END$repmgr$;
+
+CREATE INDEX idx_monitoring_history_time
+          ON repmgr.monitoring_history (last_monitor_time, standby_node_id);
+
+CREATE VIEW repmgr.show_nodes AS
+   SELECT n.node_id,
+          n.node_name,
+          n.active,
+          n.upstream_node_id,
+          un.node_name AS upstream_node_name,
+          n.type,
+          n.priority,
+          n.conninfo
+     FROM repmgr.nodes n
+LEFT JOIN repmgr.nodes un
+       ON un.node_id = n.upstream_node_id;
+
+
+/* ================= */
+/* repmgrd functions */
+/* ================= */
+
+/* monitoring functions */
+
+CREATE FUNCTION set_local_node_id(INT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_local_node_id'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_local_node_id()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_local_node_id'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION standby_set_last_updated()
+  RETURNS TIMESTAMP WITH TIME ZONE
+  AS 'MODULE_PATHNAME', 'standby_set_last_updated'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION standby_get_last_updated()
+  RETURNS TIMESTAMP WITH TIME ZONE
+  AS 'MODULE_PATHNAME', 'standby_get_last_updated'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION set_upstream_last_seen(INT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_upstream_last_seen()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_upstream_node_id()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_upstream_node_id'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION set_upstream_node_id(INT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_upstream_node_id'
+  LANGUAGE C STRICT;
+
+/* failover functions */
+
+CREATE FUNCTION notify_follow_primary(INT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'notify_follow_primary'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_new_primary()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_new_primary'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION reset_voting_status()
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'reset_voting_status'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_repmgrd_pid()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pid'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_repmgrd_pidfile()
+  RETURNS TEXT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pidfile'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION set_repmgrd_pid(INT, TEXT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_repmgrd_pid'
+  LANGUAGE C CALLED ON NULL INPUT;
+
+CREATE FUNCTION repmgrd_is_running()
+  RETURNS BOOL
+  AS 'MODULE_PATHNAME', 'repmgrd_is_running'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_pause(BOOL)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'repmgrd_pause'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_is_paused()
+  RETURNS BOOL
+  AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_wal_receiver_pid()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
+  LANGUAGE C STRICT;
+
+
+/* views */
+
+CREATE VIEW repmgr.replication_status AS
+  SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
+ 	     n.type AS node_type, n.active, last_monitor_time,
+         CASE WHEN n.type='standby' THEN m.last_wal_primary_location ELSE NULL END AS last_wal_primary_location,
+         m.last_wal_standby_location,
+         CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.replication_lag) ELSE NULL END AS replication_lag,
+         CASE WHEN n.type='standby' THEN
+           CASE WHEN replication_lag > 0 THEN age(now(), m.last_apply_time) ELSE '0'::INTERVAL END
+           ELSE NULL
+         END AS replication_time_lag,
+         CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.apply_lag) ELSE NULL END AS apply_lag,
+         AGE(NOW(), CASE WHEN pg_catalog.pg_is_in_recovery() THEN repmgr.standby_get_last_updated() ELSE m.last_monitor_time END) AS communication_time_lag
+    FROM repmgr.monitoring_history m
+    JOIN repmgr.nodes n ON m.standby_node_id = n.node_id
+   WHERE (m.standby_node_id, m.last_monitor_time) IN (
+	          SELECT m1.standby_node_id, MAX(m1.last_monitor_time)
+			    FROM repmgr.monitoring_history m1 GROUP BY 1
+         );
+
+
+
+/* drop old tables */
+DROP TABLE repmgr.repl_nodes;
+DROP TABLE repmgr.repl_monitor;
+
+-- remove temporary table
+DROP TABLE repmgr_old_schema;
--- a/repmgr-action-cluster.c
+++ b/repmgr-action-cluster.c
@@ -55,10 +55,8 @@ typedef enum
 struct ColHeader headers_show[SHOW_HEADER_COUNT];
 struct ColHeader headers_event[EVENT_HEADER_COUNT];

-
-
-static int	build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, ItemList *warnings, int *error_code);
-static int	build_cluster_crosscheck(t_node_status_cube ***cube_dest, int *name_length, ItemList *warnings, int *error_code);
+static int	build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, ItemList *warnings, int *error_code);
+static int	build_cluster_crosscheck(t_node_status_cube ***cube_dest, ItemList *warnings, int *error_code);
 static void cube_set_node_status(t_node_status_cube **cube, int n, int node_id, int matrix_node_id, int connection_node_id, int connection_status);

 /*
@@ -538,9 +536,6 @@ do_cluster_crosscheck(void)
 {
 	int			i = 0,
 				n = 0;
-	char		c;
-	const char *node_header = "Name";
-	int			name_length = strlen(node_header);

 	t_node_status_cube **cube;

@@ -548,7 +543,7 @@ do_cluster_crosscheck(void)
 	int			error_code = SUCCESS;
 	ItemList	warnings = {NULL, NULL};

-	n = build_cluster_crosscheck(&cube, &name_length, &warnings, &error_code);
+	n = build_cluster_crosscheck(&cube, &warnings, &error_code);

 	if (runtime_options.output_mode == OM_CSV)
 	{
@@ -582,24 +577,56 @@ do_cluster_crosscheck(void)
 	}
 	else
 	{
-		printf("%*s | Id ", name_length, node_header);
-		for (i = 0; i < n; i++)
-			printf("| %2d ", cube[i]->node_id);
-		printf("\n");
+		/* output header contains node name, node ID and one column for each node in the cluster */
+		struct ColHeader *headers_crosscheck = NULL;
+
+		int header_count = n + 2;
+		int header_id = 2;
+
+		headers_crosscheck = palloc0(sizeof(ColHeader) * header_count);
+
+		/* Initialize column headers  */
+		strncpy(headers_crosscheck[0].title, _("Name"), MAXLEN);
+		strncpy(headers_crosscheck[1].title, _("ID"), MAXLEN);

-		for (i = 0; i < name_length; i++)
-			printf("-");
-		printf("-+----");
 		for (i = 0; i < n; i++)
-			printf("+----");
-		printf("\n");
+		{
+			maxlen_snprintf(headers_crosscheck[header_id].title, "%i", cube[i]->node_id);
+			header_id++;
+		}
+
+		/* Initialize column max values */
+		for (i = 0; i < header_count; i++)
+		{
+			headers_crosscheck[i].display = true;
+			headers_crosscheck[i].max_length = strlen(headers_crosscheck[i].title);
+			headers_crosscheck[i].cur_length = headers_crosscheck[i].max_length;
+
+			/* We can derive the maximum node ID length for the ID column from
+			 * the generated matrix node ID headers
+			 */
+			if (i >= 2 && headers_crosscheck[i].max_length > headers_crosscheck[1].max_length)
+				headers_crosscheck[1].max_length = headers_crosscheck[i].max_length;
+		}
+
+		for (i = 0; i < n; i++)
+		{
+			if (strlen(cube[i]->node_name) > headers_crosscheck[0].max_length)
+			{
+				headers_crosscheck[0].max_length = strlen(cube[i]->node_name);
+			}
+		}
+
+		print_status_header(header_count, headers_crosscheck);

 		for (i = 0; i < n; i++)
 		{
 			int			column_node_ix;

-			printf("%*s | %2d ", name_length,
+			printf(" %-*s | %-*i ",
+				   headers_crosscheck[0].max_length,
 				   cube[i]->node_name,
+				   headers_crosscheck[1].max_length,
 				   cube[i]->node_id);

 			for (column_node_ix = 0; column_node_ix < n; column_node_ix++)
@@ -607,6 +634,8 @@ do_cluster_crosscheck(void)
 				int			max_node_status = -2;
 				int			node_ix = 0;

+				char		c;
+
 				/*
 				 * The value of entry (i,j) is equal to the maximum value of all
 				 * the (i,j,k). Indeed:
@@ -646,7 +675,7 @@ do_cluster_crosscheck(void)
 						exit(ERR_INTERNAL);
 				}

-				printf("|  %c ", c);
+				printf("| %-*c ", headers_crosscheck[column_node_ix + 2].max_length, c);
 			}

 			printf("\n");
@@ -708,16 +737,13 @@ do_cluster_matrix()
 				j = 0,
 				n = 0;

-	const char *node_header = "Name";
-	int			name_length = strlen(node_header);
-
 	t_node_matrix_rec **matrix_rec_list;

 	bool		connection_error_found = false;
 	int			error_code = SUCCESS;
 	ItemList	warnings = {NULL, NULL};

-	n = build_cluster_matrix(&matrix_rec_list, &name_length, &warnings, &error_code);
+	n = build_cluster_matrix(&matrix_rec_list, &warnings, &error_code);

 	if (runtime_options.output_mode == OM_CSV)
 	{
@@ -740,27 +766,60 @@ do_cluster_matrix()
 	}
 	else
 	{
-		char		c;
+		/* output header contains node name, node ID and one column for each node in the cluster */
+		struct ColHeader *headers_matrix = NULL;

-		printf("%*s | Id ", name_length, node_header);
-		for (i = 0; i < n; i++)
-			printf("| %2d ", matrix_rec_list[i]->node_id);
-		printf("\n");
+		int header_count = n + 2;
+		int header_id = 2;

-		for (i = 0; i < name_length; i++)
-			printf("-");
-		printf("-+----");
-		for (i = 0; i < n; i++)
-			printf("+----");
-		printf("\n");
+		headers_matrix = palloc0(sizeof(ColHeader) * header_count);
+
+		/* Initialize column headers  */
+		strncpy(headers_matrix[0].title, _("Name"), MAXLEN);
+		strncpy(headers_matrix[1].title, _("ID"), MAXLEN);

 		for (i = 0; i < n; i++)
 		{
-			printf("%*s | %2d ", name_length,
+			maxlen_snprintf(headers_matrix[header_id].title, "%i", matrix_rec_list[i]->node_id);
+			header_id++;
+		}
+
+		/* Initialize column max values */
+		for (i = 0; i < header_count; i++)
+		{
+			headers_matrix[i].display = true;
+			headers_matrix[i].max_length = strlen(headers_matrix[i].title);
+			headers_matrix[i].cur_length = headers_matrix[i].max_length;
+
+			/* We can derive the maximum node ID length for the ID column from
+			 * the generated matrix node ID headers
+			 */
+			if (i >= 2 && headers_matrix[i].max_length > headers_matrix[1].max_length)
+				headers_matrix[1].max_length = headers_matrix[i].max_length;
+		}
+
+
+		for (i = 0; i < n; i++)
+		{
+			if (strlen(matrix_rec_list[i]->node_name) > headers_matrix[0].max_length)
+			{
+				headers_matrix[0].max_length = strlen(matrix_rec_list[i]->node_name);
+			}
+		}
+
+		print_status_header(header_count, headers_matrix);
+
+		for (i = 0; i < n; i++)
+		{
+			printf(" %-*s | %-*i ",
+				   headers_matrix[0].max_length,
 				   matrix_rec_list[i]->node_name,
+				   headers_matrix[1].max_length,
 				   matrix_rec_list[i]->node_id);
 			for (j = 0; j < n; j++)
 			{
+				char		c;
+
 				switch (matrix_rec_list[i]->node_status_list[j]->node_status)
 				{
 					case -2:
@@ -778,7 +837,7 @@ do_cluster_matrix()
 						exit(ERR_INTERNAL);
 				}

-				printf("|  %c ", c);
+				printf("| %-*c ", headers_matrix[j + 2].max_length, c);
 			}
 			printf("\n");
 		}
@@ -838,7 +897,7 @@ matrix_set_node_status(t_node_matrix_rec **matrix_rec_list, int n, int node_id,


 static int
-build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, ItemList *warnings, int *error_code)
+build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, ItemList *warnings, int *error_code)
 {
 	PGconn	   *conn = NULL;
 	int			i = 0,
@@ -896,7 +955,6 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
 	/* Initialise matrix structure for each node */
 	for (cell = nodes.head; cell; cell = cell->next)
 	{
-		int			name_length_cur;
 		NodeInfoListCell *cell_j;

 		matrix_rec_list[i] = (t_node_matrix_rec *) pg_malloc0(sizeof(t_node_matrix_rec));
@@ -906,13 +964,6 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
 				cell->node_info->node_name,
 				sizeof(matrix_rec_list[i]->node_name));

-		/*
-		 * Find the maximum length of a node name
-		 */
-		name_length_cur = strlen(matrix_rec_list[i]->node_name);
-		if (name_length_cur > *name_length)
-			*name_length = name_length_cur;
-
 		matrix_rec_list[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec) * nodes.node_count);

 		j = 0;
@@ -1077,7 +1128,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite


 static int
-build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, ItemList *warnings, int *error_code)
+build_cluster_crosscheck(t_node_status_cube ***dest_cube, ItemList *warnings, int *error_code)
 {
 	PGconn	   *conn = NULL;
 	int			h,
@@ -1126,20 +1177,12 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item

 	for (cell = nodes.head; cell; cell = cell->next)
 	{
-		int			name_length_cur = 0;
 		NodeInfoListCell *cell_i = NULL;

 		cube[h] = (t_node_status_cube *) pg_malloc(sizeof(t_node_status_cube));
 		cube[h]->node_id = cell->node_info->node_id;
 		strncpy(cube[h]->node_name, cell->node_info->node_name, sizeof(cube[h]->node_name));

-		/*
-		 * Find the maximum length of a node name
-		 */
-		name_length_cur = strlen(cube[h]->node_name);
-		if (name_length_cur > *name_length)
-			*name_length = name_length_cur;
-
 		cube[h]->matrix_list_rec = (t_node_matrix_rec **) pg_malloc(sizeof(t_node_matrix_rec) * nodes.node_count);

 		i = 0;
--- a/repmgr-action-node.c
+++ b/repmgr-action-node.c
@@ -43,7 +43,7 @@ static void _do_node_restore_config(void);

 static void do_node_check_replication_connection(void);
 static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
-static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
+static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
 static CheckStatus do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
 static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
 static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
@@ -204,7 +204,16 @@ do_node_status(void)

 		if (enabled == false && recovery_type == RECTYPE_STANDBY)
 		{
-			appendPQExpBufferStr(&archiving_status, " (on standbys \"archive_mode\" must be set to \"always\" to be effective)");
+			if (PQserverVersion(conn) >= 90500)
+			{
+				appendPQExpBufferStr(&archiving_status,
+									 " (on standbys \"archive_mode\" must be set to \"always\" to be effective)");
+			}
+			else
+			{
+				appendPQExpBufferStr(&archiving_status,
+									 " (\"archive_mode\" has no effect on standbys)");
+			}
 		}

 		key_value_list_set(&node_status,
@@ -294,7 +303,7 @@ do_node_status(void)
 				continue;
 			}

-			if (is_downstream_node_attached(conn, node_cell->node_info->node_name) != NODE_ATTACHED)
+			if (is_downstream_node_attached(conn, node_cell->node_info->node_name, NULL) != NODE_ATTACHED)
 			{
 				missing_nodes_count++;
 				item_list_append_format(&missing_nodes,
@@ -797,6 +806,7 @@ do_node_check(void)
 	{
 		return_code = do_node_check_downstream(conn,
 											   runtime_options.output_mode,
+											   &node_info,
 											   NULL);
 		PQfinish(conn);
 		exit(return_code);
@@ -888,7 +898,7 @@ do_node_check(void)
 	if (do_node_check_upstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
 		issue_detected = true;

-	if (do_node_check_downstream(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK)
+	if (do_node_check_downstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
 		issue_detected = true;

 	if (do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
@@ -1183,7 +1193,7 @@ do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list


 static CheckStatus
-do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
+do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
 {
 	NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
 	NodeInfoListCell *cell = NULL;
@@ -1217,7 +1227,7 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou
 			continue;
 		}

-		if (is_downstream_node_attached(conn, cell->node_info->node_name) != NODE_ATTACHED)
+		if (is_downstream_node_attached(conn, cell->node_info->node_name, NULL) != NODE_ATTACHED)
 		{
 			missing_nodes_count++;
 			item_list_append_format(&missing_nodes,
@@ -1234,7 +1244,13 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou
 		}
 	}

-	if (missing_nodes_count == 0)
+	if (node_info->type == WITNESS)
+	{
+		/* a witness has no downstream nodes, so this check does not apply */
+		appendPQExpBufferStr(&details,
+							 _("N/A - node is a witness"));
+	}
+	else if (missing_nodes_count == 0)
 	{
 		if (expected_nodes_count == 0)
 			appendPQExpBufferStr(&details,
@@ -1367,7 +1383,13 @@ do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, Ch

 	initPQExpBuffer(&details);

-	if (get_node_record(conn, node_info->upstream_node_id, &upstream_node_info) != RECORD_FOUND)
+	if (node_info->type == WITNESS)
+	{
+		/* witness is not connecting to any upstream */
+		appendPQExpBufferStr(&details,
+							 _("N/A - node is a witness"));
+	}
+	else if (get_node_record(conn, node_info->upstream_node_id, &upstream_node_info) != RECORD_FOUND)
 	{
 		if (get_recovery_type(conn) == RECTYPE_STANDBY)
 		{
@@ -1388,7 +1410,7 @@ do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, Ch
 		upstream_conn = establish_db_connection(upstream_node_info.conninfo, true);

 		/* check our node is connected */
-		if (is_downstream_node_attached(upstream_conn, config_file_options.node_name) != NODE_ATTACHED)
+		if (is_downstream_node_attached(upstream_conn, config_file_options.node_name, NULL) != NODE_ATTACHED)
 		{
 			appendPQExpBuffer(&details,
 							  _("node \"%s\" (ID: %i) is not attached to expected upstream node \"%s\" (ID: %i)"),
@@ -2458,7 +2480,7 @@ do_node_rejoin(void)
 		log_hint(_("check the local node is registered with the current primary \"%s\" (ID: %i)"),
 				 primary_node_record.node_name,
 				 primary_node_record.node_id);
-		PQfinish(upstream_conn);
+
 		PQfinish(primary_conn);
 		exit(ERR_BAD_CONFIG);
 	}
@@ -2777,7 +2799,7 @@ do_node_rejoin(void)
 						   config_file_options.node_rejoin_timeout);
 			}
 			else {
-				log_detail(_("no record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
+				log_detail(_("no active record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
 						   config_file_options.node_name,
 						   primary_node_record.node_name);
 			}
@@ -2789,7 +2811,7 @@ do_node_rejoin(void)
 	else
 	{
 		/* -W/--no-wait provided - check once */
-		NodeAttached node_attached = is_downstream_node_attached(primary_conn, config_file_options.node_name);
+		NodeAttached node_attached = is_downstream_node_attached(primary_conn, config_file_options.node_name, NULL);
 		if (node_attached == NODE_ATTACHED)
 			success = true;
 	}
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -986,14 +986,15 @@ check_barman_config(void)
 /*
 * _do_create_replication_conf()
 *
- * Create recovery.conf for a previously cloned instance.
+ * Create replication configuration for a previously cloned instance.
 *
 * Prerequisites:
 *
- * - data directory must be provided
+ * - data directory must be provided, either explicitly or via
+ *   repmgr.conf
 * - the instance should not be running
 * - an existing "recovery.conf" file can only be overwritten with
- *   -F/--force
+ *   -F/--force (Pg11 and earlier)
 * - connection parameters for an existing, running node must be provided
 * - --upstream-node-id, if provided, will be "primary_conninfo",
 *   otherwise primary node id; node must exist; unless -F/--force
@@ -1168,7 +1169,7 @@ _do_create_replication_conf(void)
 		}
 		else
 		{
-			log_hint(_("standby must be registered before a new recovery.conf file can be created"));
+			log_hint(_("standby must be registered before replication can be configured"));
 		}

 		exit(ERR_BAD_CONFIG);
@@ -1796,7 +1797,7 @@ do_standby_register(void)
 			else
 			{
 				/* check our standby is connected */
-				if (is_downstream_node_attached(upstream_conn, config_file_options.node_name) == NODE_ATTACHED)
+				if (is_downstream_node_attached(upstream_conn, config_file_options.node_name, NULL) == NODE_ATTACHED)
 				{
 					log_verbose(LOG_INFO, _("local node is attached to specified upstream node %i"), runtime_options.upstream_node_id);
 				}
@@ -1855,7 +1856,7 @@ do_standby_register(void)
 						primary_node_id);

 			/* check our standby is connected */
-			if (is_downstream_node_attached(primary_conn, config_file_options.node_name) == NODE_ATTACHED)
+			if (is_downstream_node_attached(primary_conn, config_file_options.node_name, NULL) == NODE_ATTACHED)
 			{
 				log_verbose(LOG_INFO, _("local node is attached to primary"));
 			}
@@ -2031,6 +2032,9 @@ do_standby_register(void)
 				if (node_record_on_standby.priority != node_record_on_primary.priority)
 					records_match = false;

+				if (strcmp(node_record_on_standby.location, node_record_on_primary.location) != 0)
+					records_match = false;
+
 				if (node_record_on_standby.active != node_record_on_primary.active)
 					records_match = false;

@@ -2388,7 +2392,7 @@ do_standby_promote(void)
 	 */
 	if (check_free_wal_senders(available_wal_senders, &sibling_nodes_stats, &dry_run_success) == false)
 	{
-		if (runtime_options.dry_run == false)
+		if (runtime_options.dry_run == false || runtime_options.force == false)
 		{
 			PQfinish(local_conn);
 			exit(ERR_BAD_CONFIG);
@@ -2402,7 +2406,7 @@ do_standby_promote(void)
 	 */
 	if (check_free_slots(&local_node_record, &sibling_nodes_stats, &dry_run_success) == false)
 	{
-		if (runtime_options.dry_run == false)
+		if (runtime_options.dry_run == false || runtime_options.force == false)
 		{
 			PQfinish(local_conn);
 			exit(ERR_BAD_CONFIG);
@@ -2503,7 +2507,7 @@ _do_standby_promote_internal(PGconn *conn)
 	/*
 	 * Promote standby to primary.
 	 *
-	 * `pg_ctl promote` returns immediately and (prior to 10.0) has no -w
+	 * "pg_ctl promote" returns immediately and (prior to 10.0) has no -w
 	 * option so we can't be sure when or if the promotion completes. For now
 	 * we'll poll the server until the default timeout (60 seconds)
 	 *
@@ -3069,7 +3073,9 @@ do_standby_follow(void)

 	for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++)
 	{
-		NodeAttached node_attached = is_downstream_node_attached(follow_target_conn, config_file_options.node_name);
+		NodeAttached node_attached = is_downstream_node_attached(follow_target_conn,
+																 config_file_options.node_name,
+																 NULL);

 		if (node_attached == NODE_ATTACHED)
 		{
@@ -3563,7 +3569,8 @@ do_standby_switchover(void)
 	{
 		if (runtime_options.dry_run == true)
 		{
-			log_info(_("validating database connection for superuser \"%s\""), runtime_options.superuser);
+			log_info(_("validating connection to local database for superuser \"%s\""),
+					 runtime_options.superuser);
 		}

 		superuser_conn = establish_db_connection_with_replacement_param(
@@ -3573,23 +3580,27 @@ do_standby_switchover(void)

 		if (PQstatus(superuser_conn) != CONNECTION_OK)
 		{
-			log_error(_("unable to connect as provided superuser \"%s\""),
+			log_error(_("unable to connect to local database \"%s\" as provided superuser \"%s\""),
+					  PQdb(superuser_conn),
 					  runtime_options.superuser);
 			exit(ERR_BAD_CONFIG);
 		}

 		if (is_superuser_connection(superuser_conn, NULL) == false)
 		{
-			log_error(_("database connection established for provided superuser \"%s\" is not a superuser connection"),
+			log_error(_("connection established to local database \"%s\" for provided superuser \"%s\" is not a superuser connection"),
+					  PQdb(superuser_conn),
 					  runtime_options.superuser);
 			exit(ERR_BAD_CONFIG);
 		}

 		if (runtime_options.dry_run == true)
 		{
-			log_info(_("successfully established database connection established for provided superuser \"%s\""),
+			log_info(_("successfully established connection to local database \"%s\" for provided superuser \"%s\""),
+					 PQdb(superuser_conn),
 					 runtime_options.superuser);
 		}
+
 	}

 	/*
@@ -3702,7 +3713,7 @@ do_standby_switchover(void)
 		exit(ERR_BAD_CONFIG);
 	}

-	if (is_downstream_node_attached(remote_conn, local_node_record.node_name) != NODE_ATTACHED)
+	if (is_downstream_node_attached(remote_conn, local_node_record.node_name, NULL) != NODE_ATTACHED)
 	{
 		log_error(_("local node \"%s\" (ID: %i) is not attached to demotion candidate \"%s\" (ID: %i)"),
 				  local_node_record.node_name,
@@ -4052,11 +4063,13 @@ do_standby_switchover(void)

 	if (parse_data_directory_config(command_output.data) == false)
 	{
-		log_error(_("\"data_directory\" parameter in repmgr.conf on \"%s\" is incorrectly configured"),
-				  remote_node_record.node_name);
+		log_error(_("\"data_directory\" parameter in \"repmgr.conf\" on \"%s\" (ID: %i) is incorrectly configured"),
+				  remote_node_record.node_name,
+				  remote_node_record.node_id);

-		log_hint(_("execute \"repmgr node check --data-directory-config\" on \"%s\" to diagnose the issue"),
-				 remote_node_record.node_name);
+		log_hint(_("execute \"repmgr node check --data-directory-config\" on \"%s\" (ID: %i) to diagnose the issue"),
+				 remote_node_record.node_name,
+				 remote_node_record.node_id);

 		PQfinish(remote_conn);
 		PQfinish(local_conn);
@@ -5184,7 +5197,8 @@ do_standby_switchover(void)
 		 */

 		 node_attached = is_downstream_node_attached(local_conn,
-													 remote_node_record.node_name);
+													 remote_node_record.node_name,
+													 NULL);
 		if (node_attached == NODE_ATTACHED)
 		{
 			switchover_success = true;
@@ -5473,6 +5487,7 @@ check_source_server()
 					{
 						uint64		test_system_identifier = system_identifier(cell->node_info->conn);
 						PQfinish(cell->node_info->conn);
+						cell->node_info->conn = NULL;

 						if (test_system_identifier != UNKNOWN_SYSTEM_IDENTIFIER)
 						{
@@ -5496,6 +5511,7 @@ check_source_server()
 					else
 					{
 						PQfinish(cell->node_info->conn);
+						cell->node_info->conn = NULL;
 					}
 				}
 				clear_node_info_list(&all_nodes);
@@ -6691,6 +6707,11 @@ run_basebackup(t_node_info *node_record)
 }


+/*
+ * Perform a filesystem backup using rsync.
+ *
+ * From repmgr 4 this is only used for Barman backups.
+ */
 static int
 run_file_backup(t_node_info *local_node_record)
 {
@@ -6721,10 +6742,11 @@ run_file_backup(t_node_info *local_node_record)
 		/*
 		 * Read the list of backup files into a local file. In the process:
 		 *
-		 * - determine the backup ID; - check, and remove, the prefix; -
-		 * detect tablespaces; - filter files in one list per tablespace;
+		 * - determine the backup ID
+		 * - check, and remove, the prefix
+		 * - detect tablespaces
+		 * - filter files in one list per tablespace
 		 */
-
 		{
 			FILE	   *fi;		/* input stream */
 			FILE	   *fd;		/* output for data.txt */
@@ -7007,11 +7029,13 @@ run_file_backup(t_node_info *local_node_record)

 		if (mode == barman)
 		{
-			create_pg_dir(cell_t->location, false);
+			create_pg_dir(tblspc_dir_dest, false);

 			if (cell_t->f != NULL)	/* cell_t->f == NULL iff the tablespace is
 									 * empty */
 			{
+				fclose(cell_t->f);
+
 				maxlen_snprintf(command,
 								"rsync --progress -a --files-from=%s/%s.txt %s:%s/%s/%s %s",
 								local_repmgr_tmp_directory,
@@ -7024,7 +7048,6 @@ run_file_backup(t_node_info *local_node_record)
 				(void) local_command(
 									 command,
 									 NULL);
-				fclose(cell_t->f);
 				maxlen_snprintf(filename,
 								"%s/%s.txt",
 								local_repmgr_tmp_directory,
@@ -7141,7 +7164,10 @@ stop_backup:

 	if (mode == barman)
 	{
-		/* In Barman mode, remove local_repmgr_directory */
+		/*
+		 * In Barman mode, remove local_repmgr_tmp_directory,
+		 * which contains various temporary files containing Barman metadata.
+		 */
 		rmtree(local_repmgr_tmp_directory, true);
 	}

@@ -8579,6 +8605,10 @@ do_standby_help(void)
 	puts("");
 	printf(_("  \"standby promote\" promotes a standby node to primary.\n"));
 	puts("");
+	printf(_("  --dry-run                           perform checks etc. but don't actually promote the node\n"));
+	printf(_("  -F, --force                         ignore warnings and continue anyway\n"));
+	printf(_("  --siblings-follow                   have other standbys follow new primary\n"));
+	puts("");

 	printf(_("STANDBY FOLLOW\n"));
 	puts("");
--- a/repmgr-client.c
+++ b/repmgr-client.c
@@ -1196,7 +1196,7 @@ main(int argc, char **argv)

 	/*
 	 * If --dry-run specified, ensure log_level is at least LOG_INFO, regardless
-	 * of what's in the configuration file or -L/--log-level paremeter, otherwise
+	 * of what's in the configuration file or -L/--log-level parameter, otherwise
 	 * some or output might not be displayed.
 	 */
 	if (runtime_options.dry_run == true)
@@ -2351,6 +2351,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBu
 			 * connected to the upstream
 			 */
 			NodeAttached attached_to_upstream = NODE_ATTACHED_UNKNOWN;
+			char *replication_state = NULL;
 			t_node_info upstream_node_rec = T_NODE_INFO_INITIALIZER;
 			RecordStatus upstream_node_rec_found = get_node_record(node_info->conn,
 																   node_info->upstream_node_id,
@@ -2378,7 +2379,7 @@ format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBu
 				}
 				else
 				{
-					attached_to_upstream = is_downstream_node_attached(upstream_conn, node_info->node_name);
+					attached_to_upstream = is_downstream_node_attached(upstream_conn, node_info->node_name, &replication_state);
 				}

 				PQfinish(upstream_conn);
@@ -2394,6 +2395,18 @@ format_node_status(t_node_info *node_info, PQExpBufferData *node_status, PQExpBu
 										upstream_node_rec.node_name,
 										upstream_node_rec.node_id);
 			}
+			if (attached_to_upstream == NODE_NOT_ATTACHED)
+			{
+				appendPQExpBufferStr(upstream, "? ");
+				item_list_append_format(warnings,
+										"node \"%s\" (ID: %i) attached to its upstream node \"%s\" (ID: %i) in state \"%s\"",
+										node_info->node_name,
+										node_info->node_id,
+										upstream_node_rec.node_name,
+										upstream_node_rec.node_id,
+										replication_state);
+			}
+
 			else if (attached_to_upstream == NODE_DETACHED)
 			{
 				appendPQExpBufferStr(upstream, "! ");
@@ -3986,8 +3999,10 @@ check_standby_join(PGconn *upstream_conn, t_node_info *upstream_node_record, t_n

 	 for (; i < config_file_options.node_rejoin_timeout; i++)
 	 {
+		 char *node_state = NULL;
 		 NodeAttached node_attached = is_downstream_node_attached(upstream_conn,
-																  standby_node_record->node_name);
+																  standby_node_record->node_name,
+																  &node_state);
 		 if (node_attached == NODE_ATTACHED)
 		 {
 			 log_verbose(LOG_INFO, _("node \"%s\" (ID: %i) has attached to its upstream node"),
@@ -4004,9 +4019,19 @@ check_standby_join(PGconn *upstream_conn, t_node_info *upstream_node_record, t_n
 					  i + 1,
 					  config_file_options.node_rejoin_timeout);

-			 log_detail(_("checking for record in node \"%s\"'s \"pg_stat_replication\" table where \"application_name\" is \"%s\""),
-						upstream_node_record->node_name,
-						standby_node_record->node_name);
+			 if (node_attached == NODE_NOT_ATTACHED)
+			 {
+				 log_detail(_("node \"%s\" (ID: %i) is currently attached to its upstream node in state \"%s\""),
+							upstream_node_record->node_name,
+							standby_node_record->node_id,
+							node_state);
+			 }
+			 else
+			 {
+				 log_detail(_("checking for record in node \"%s\"'s \"pg_stat_replication\" table where \"application_name\" is \"%s\""),
+							upstream_node_record->node_name,
+							standby_node_record->node_name);
+			 }
 		 }
 		 else
 		 {
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -297,6 +297,7 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 #connection_check_type=ping		# How to check availability of the upstream node; valid options:
 					#  'ping': use PQping() to check if the node is accepting connections
 					#  'connection': execute a throwaway query on the current connection
+					#  'query': execute an SQL statement on the node via the existing connection
 #reconnect_attempts=6			# Number of attempts which will be made to reconnect to an unreachable
 					# primary (or other upstream node)
 #reconnect_interval=10			# Interval between attempts to reconnect to an unreachable
--- a/repmgr_version.h.in
+++ b/repmgr_version.h.in
@@ -1,5 +1,5 @@
 #define REPMGR_VERSION_DATE ""
-#define REPMGR_VERSION "5.1dev"
+#define REPMGR_VERSION "5.1.0"
 #define REPMGR_VERSION_NUM 50100
-#define REPMGR_RELEASE_DATE "2020-XX-XX"
+#define REPMGR_RELEASE_DATE "2020-04-13"
 #define PG_ACTUAL_VERSION_NUM 
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -1328,6 +1328,7 @@ monitor_streaming_standby(void)
 	 */
 	if (PQstatus(upstream_conn) != CONNECTION_OK)
 	{
+		close_connection(&upstream_conn);
 		log_error(_("unable connect to upstream node (ID: %i), terminating"),
 				  local_node_info.upstream_node_id);
 		log_hint(_("upstream node must be running before repmgrd can start"));
@@ -1339,8 +1340,8 @@ monitor_streaming_standby(void)

 	if (upstream_node_info.node_id == local_node_info.node_id)
 	{
-		PQfinish(upstream_conn);
-		upstream_conn = NULL;
+		close_connection(&upstream_conn);
+
 		return;
 	}

@@ -1364,6 +1365,8 @@ monitor_streaming_standby(void)

 		if (PQstatus(primary_conn) != CONNECTION_OK)
 		{
+			close_connection(&primary_conn);
+
 			log_error(_("unable to connect to primary node"));
 			log_hint(_("ensure the primary node is reachable from this node"));

@@ -1440,6 +1443,7 @@ monitor_streaming_standby(void)
 	while (true)
 	{
 		log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
+
 		if (check_upstream_connection(&upstream_conn, upstream_node_info.conninfo) == true)
 		{
 			set_upstream_last_seen(local_conn, upstream_node_info.node_id);
@@ -1557,8 +1561,9 @@ monitor_streaming_standby(void)

 							log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
 									   upstream_node_info.node_name, upstream_node_info.node_id);
-							PQfinish(upstream_conn);
-							upstream_conn = NULL;
+
+							close_connection(&upstream_conn);
+
 							local_node_info.upstream_node_id = UNKNOWN_NODE_ID;

 							/* check local connection */
@@ -1568,7 +1573,7 @@ monitor_streaming_standby(void)
 							{
 								int i;

-								PQfinish(local_conn);
+								close_connection(&local_conn);

 								for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
 								{
@@ -1577,6 +1582,8 @@ monitor_streaming_standby(void)
 									if (PQstatus(local_conn) == CONNECTION_OK)
 										break;

+									close_connection(&local_conn);
+
 									log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
 											  i + 1,
 											  config_file_options.repmgrd_standby_startup_timeout);
@@ -1706,7 +1713,12 @@ monitor_streaming_standby(void)
 					}
 					else
 					{
-						if (primary_conn == NULL || PQstatus(primary_conn) != CONNECTION_OK)
+						if (primary_conn != NULL && PQstatus(primary_conn) != CONNECTION_OK)
+						{
+							close_connection(&primary_conn);
+						}
+
+						if (primary_conn == NULL)
 						{
 							primary_conn = establish_primary_db_connection(upstream_conn, false);
 						}
@@ -1715,7 +1727,8 @@ monitor_streaming_standby(void)
 					initPQExpBuffer(&event_details);

 					appendPQExpBuffer(&event_details,
-									  _("reconnected to upstream node %i after %i seconds, resuming monitoring"),
+									  _("reconnected to upstream node \"%s\" (ID: %i) after %i seconds, resuming monitoring"),
+									  upstream_node_info.node_name,
 									  upstream_node_info.node_id,
 									  degraded_monitoring_elapsed);

@@ -1851,7 +1864,9 @@ monitor_streaming_standby(void)

 							if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
 							{
+								close_connection(&cell->node_info->conn);
 								log_debug("unable to connect to %i ... ", cell->node_info->node_id);
+								close_connection(&cell->node_info->conn);
 								continue;
 							}

@@ -2036,7 +2051,8 @@ loop:

 				if (last_known_upstream_node_id != local_node_info.upstream_node_id)
 				{
-					log_notice(_("local node %i upstream appears to have changed, restarting monitoring"),
+					log_notice(_("upstream for local node \"%s\" (ID: %i) appears to have changed, restarting monitoring"),
+							   local_node_info.node_name,
 							   local_node_info.node_id);
 					log_detail(_("currently monitoring upstream %i; new upstream is %i"),
 							   last_known_upstream_node_id,
@@ -2050,7 +2066,8 @@ loop:
 				 */
 				if (local_node_info.type != STANDBY)
 				{
-					log_notice(_("local node %i is no longer a standby, restarting monitoring"),
+					log_notice(_("local node \"%s\" (ID: %i) is no longer a standby, restarting monitoring"),
+							   local_node_info.node_name,
 							   local_node_info.node_id);
 					close_connection(&upstream_conn);
 					return;
@@ -2075,8 +2092,8 @@ loop:
 				{
 					log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
 							   upstream_node_info.node_name, upstream_node_info.node_id);
-					PQfinish(primary_conn);
-					primary_conn = NULL;
+
+					close_connection(&primary_conn);

 					local_node_info.upstream_node_id = UNKNOWN_NODE_ID;
 					return;
@@ -2144,7 +2161,8 @@ loop:

 			if (last_known_upstream_node_id != local_node_info.upstream_node_id)
 			{
-				log_notice(_("local node %i's upstream appears to have changed, restarting monitoring"),
+				log_notice(_("local node \"%s\" (ID: %i)'s upstream appears to have changed, restarting monitoring"),
+						   local_node_info.node_name,
 						   local_node_info.node_id);
 				log_detail(_("currently monitoring upstream %i; new upstream is %i"),
 						   last_known_upstream_node_id,
@@ -2335,8 +2353,9 @@ monitor_streaming_witness(void)
 					{
 						log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
 								   upstream_node_info.node_name, upstream_node_info.node_id);
-						PQfinish(primary_conn);
-						primary_conn = NULL;
+
+						close_connection(&primary_conn);
+
 						termPQExpBuffer(&event_details);
 						return;
 					}
@@ -2397,7 +2416,8 @@ monitor_streaming_witness(void)
 					initPQExpBuffer(&event_details);

 					appendPQExpBuffer(&event_details,
-									  _("reconnected to upstream node %i after %i seconds, resuming monitoring"),
+									  _("reconnected to upstream node \"%s\" (ID: %i) after %i seconds, resuming monitoring"),
+									  upstream_node_info.node_name,
 									  upstream_node_info.node_id,
 									  degraded_monitoring_elapsed);

@@ -2407,9 +2427,11 @@ monitor_streaming_witness(void)
 					if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
 					{
 						log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
-								   upstream_node_info.node_name, upstream_node_info.node_id);
-						PQfinish(primary_conn);
-						primary_conn = NULL;
+								   upstream_node_info.node_name,
+								   upstream_node_info.node_id);
+
+						close_connection(&primary_conn);
+
 						termPQExpBuffer(&event_details);
 						return;
 					}
@@ -2463,7 +2485,9 @@ monitor_streaming_witness(void)

 						if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
 						{
+							close_connection(&cell->node_info->conn);
 							log_debug("unable to connect to %i ... ", cell->node_info->node_id);
+							close_connection(&cell->node_info->conn);
 							continue;
 						}

@@ -2605,8 +2629,9 @@ loop:
 				{
 					log_notice(_("current upstream node \"%s\" (ID: %i) is not primary, restarting monitoring"),
 							   upstream_node_info.node_name, upstream_node_info.node_id);
-					PQfinish(primary_conn);
-					primary_conn = NULL;
+
+					close_connection(&primary_conn);
+
 					return;
 				}

@@ -2716,24 +2741,32 @@ do_primary_failover(void)
 			{
 				for (cell = check_sibling_nodes.head; cell; cell = cell->next)
 				{
-					pid_t sibling_wal_receiver_pid;
-
 					if (cell->node_info->conn == NULL)
 						cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);

-					sibling_wal_receiver_pid = (pid_t)get_wal_receiver_pid(cell->node_info->conn);
-
-					if (sibling_wal_receiver_pid == UNKNOWN_PID)
+					if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
 					{
-						log_warning(_("unable to query WAL receiver PID on node %i"),
+						log_warning(_("unable to query WAL receiver PID on node \"%s\" (ID: %i)"),
+									cell->node_info->node_name,
 									cell->node_info->node_id);
+						close_connection(&cell->node_info->conn);
 					}
-					else if (sibling_wal_receiver_pid > 0)
+					else
 					{
-						log_info(_("WAL receiver PID on node %i is %i"),
-								 cell->node_info->node_id,
-								 sibling_wal_receiver_pid);
-						sibling_node_wal_receiver_connected = true;
+						pid_t sibling_wal_receiver_pid = (pid_t)get_wal_receiver_pid(cell->node_info->conn);
+
+						if (sibling_wal_receiver_pid == UNKNOWN_PID)
+						{
+							log_warning(_("unable to query WAL receiver PID on node %i"),
+										cell->node_info->node_id);
+						}
+						else if (sibling_wal_receiver_pid > 0)
+						{
+							log_info(_("WAL receiver PID on node %i is %i"),
+									 cell->node_info->node_id,
+									 sibling_wal_receiver_pid);
+							sibling_node_wal_receiver_connected = true;
+						}
 					}
 				}

@@ -2895,7 +2928,8 @@ do_primary_failover(void)

 					initPQExpBuffer(&event_details);
 					appendPQExpBuffer(&event_details,
-									  _("node %i is in manual failover mode and is now disconnected from streaming replication"),
+									  _("node \"%s\" (ID: %i) is in manual failover mode and is now disconnected from streaming replication"),
+									  local_node_info.node_name,
 									  local_node_info.node_id);

 					new_primary_conn = establish_db_connection(new_primary.conninfo, false);
@@ -3055,8 +3089,6 @@ do_primary_failover(void)
 }


-
-
 static void
 update_monitoring_history(void)
 {
@@ -3291,6 +3323,8 @@ do_upstream_standby_failover(void)
 		if (PQstatus(local_conn) == CONNECTION_OK)
 			break;

+		close_connection(&local_conn);
+
 		log_debug("sleeping 1 second; %i of %i (\"repmgrd_standby_startup_timeout\") attempts to reconnect to local node",
 				  i + 1,
 				  config_file_options.repmgrd_standby_startup_timeout);
@@ -3299,7 +3333,8 @@ do_upstream_standby_failover(void)

 	if (PQstatus(local_conn) != CONNECTION_OK)
 	{
-		log_error(_("unable to reconnect to local node %i"),
+		log_error(_("unable to reconnect to local node \"%s\" (ID: %i)"),
+				  local_node_info.node_name,
 				  local_node_info.node_id);
 		return FAILOVER_STATE_FOLLOW_FAIL;
 	}
@@ -3334,7 +3369,8 @@ do_upstream_standby_failover(void)

 			initPQExpBuffer(&event_details);
 			appendPQExpBuffer(&event_details,
-							  _("unable to set node %i's new upstream ID to %i"),
+							  _("unable to set node \"%s\" (ID: %i)'s new upstream ID to %i"),
+							  local_node_info.node_name,
 							  local_node_info.node_id,
 							  primary_node_info.node_id);

@@ -3372,8 +3408,10 @@ do_upstream_standby_failover(void)
 		initPQExpBuffer(&event_details);

 		appendPQExpBuffer(&event_details,
-						  _("node %i is now following primary node %i"),
+						  _("node \"%s\" (ID: %i) is now following primary node \"%s\" (ID: %i)"),
+						  local_node_info.node_name,
 						  local_node_info.node_id,
+						  primary_node_info.node_name,
 						  primary_node_info.node_id);

 		log_notice("%s", event_details.data);
@@ -3438,12 +3476,16 @@ promote_self(void)

 	r = system(promote_command);

+	log_debug("result of promote_command: %i", WEXITSTATUS(r));
+
 	/* connection should stay up, but check just in case */
 	if (PQstatus(local_conn) != CONNECTION_OK)
 	{
 		log_warning(_("local database connection not available"));
 		log_detail("\n%s", PQerrorMessage(local_conn));

+		close_connection(&local_conn);
+
 		local_conn = establish_db_connection(local_node_info.conninfo, true);

 		/* assume node failed */
@@ -3451,24 +3493,37 @@ promote_self(void)
 		{
 			log_error(_("unable to reconnect to local node"));
 			log_detail("\n%s", PQerrorMessage(local_conn));
+
+			close_connection(&local_conn);
+
 			/* XXX handle this */
 			return FAILOVER_STATE_LOCAL_NODE_FAILURE;
 		}
 	}

-	if (r != 0)
+	if (WIFEXITED(r) && WEXITSTATUS(r))
 	{
-		int			primary_node_id;
+		int			primary_node_id = UNKNOWN_NODE_ID;
+
+		log_error(_("promote command failed"));
+		log_detail(_("promote command exited with error code %i"), WEXITSTATUS(r));
+
+		log_info(_("checking if original primary node has reappeared"));

 		upstream_conn = get_primary_connection(local_conn,
 											   &primary_node_id,
 											   NULL);

-		if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
+		if (PQstatus(upstream_conn) != CONNECTION_OK)
+		{
+			close_connection(&upstream_conn);
+		}
+		else if (primary_node_id == failed_primary.node_id)
 		{
 			PQExpBufferData event_details;

-			log_notice(_("original primary (ID: %i) reappeared before this standby was promoted - no action taken"),
+			log_notice(_("original primary \"%s\" (ID: %i) reappeared before this standby was promoted - no action taken"),
+					   failed_primary.node_name,
 					   failed_primary.node_id);

 			initPQExpBuffer(&event_details);
@@ -3493,9 +3548,6 @@ promote_self(void)
 			return FAILOVER_STATE_PRIMARY_REAPPEARED;
 		}

-
-		log_error(_("promote command failed"));
-
 		create_event_notification(NULL,
 								  &config_file_options,
 								  local_node_info.node_id,
@@ -3506,6 +3558,11 @@ promote_self(void)
 		return FAILOVER_STATE_PROMOTION_FAILED;
 	}

+	/*
+	 * Promotion has succeeded - verify local connection is still available
+	 */
+	try_reconnect(&local_conn, &local_node_info);
+
 	/* bump the electoral term */
 	increment_current_term(local_conn);

@@ -3521,8 +3578,10 @@ promote_self(void)
 		initPQExpBuffer(&event_details);

 		appendPQExpBuffer(&event_details,
-						  _("node %i promoted to primary; old primary %i marked as failed"),
+						  _("node \"%s\" (ID: %i) promoted to primary; old primary \"%s\" (ID: %i) marked as failed"),
+						  local_node_info.node_name,
 						  local_node_info.node_id,
+						  failed_primary.node_name,
 						  failed_primary.node_id);

 		/* local_conn is now the primary connection */
@@ -3566,6 +3625,8 @@ notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
 					 cell->node_info->node_name,
 					 cell->node_info->node_id);

+			close_connection(&cell->node_info->conn);
+
 			cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
 		}

@@ -3576,6 +3637,7 @@ notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
 						cell->node_info->node_id);
 			log_detail("\n%s", PQerrorMessage(cell->node_info->conn));

+			close_connection(&cell->node_info->conn);
 			continue;
 		}

@@ -3800,15 +3862,18 @@ follow_new_primary(int new_primary_id)
 		if (PQstatus(local_conn) == CONNECTION_OK)
 			break;

+		close_connection(&local_conn);
+
 		log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
 				  i + 1,
 				  config_file_options.repmgrd_standby_startup_timeout);
 		sleep(1);
 	}

-	if (PQstatus(local_conn) != CONNECTION_OK)
+	if (local_conn == NULL || PQstatus(local_conn) != CONNECTION_OK)
 	{
-		log_error(_("unable to reconnect to local node %i"),
+		log_error(_("unable to reconnect to local node \"%s\" (ID: %i)"),
+				  local_node_info.node_name,
 				  local_node_info.node_id);
 		return FAILOVER_STATE_FOLLOW_FAIL;
 	}
@@ -3822,8 +3887,10 @@ follow_new_primary(int new_primary_id)

 		initPQExpBuffer(&event_details);
 		appendPQExpBuffer(&event_details,
-						  _("node %i now following new upstream node %i"),
+						  _("node \"%s\" (ID: %i) now following new upstream node \"%s\" (ID: %i)"),
+						  local_node_info.node_name,
 						  local_node_info.node_id,
+						  upstream_node_info.node_name,
 						  upstream_node_info.node_id);

 		log_notice("%s", event_details.data);
@@ -3918,7 +3985,7 @@ witness_follow_new_primary(int new_primary_id)
 	record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
 	if (record_status != RECORD_FOUND)
 	{
-		log_error(_("unable to retrieve metadata record found for node %i"),
+		log_error(_("unable to retrieve metadata record for node %i"),
 				  local_node_info.node_id);
 		return FAILOVER_STATE_FOLLOW_FAIL;
 	}
@@ -3928,8 +3995,10 @@ witness_follow_new_primary(int new_primary_id)

 		initPQExpBuffer(&event_details);
 		appendPQExpBuffer(&event_details,
-						  _("witness node %i now following new primary node %i"),
+						  _("witness node \"%s\" (ID: %i) now following new primary node \"%s\" (ID: %i)"),
+						  local_node_info.node_name,
 						  local_node_info.node_id,
+						  upstream_node_info.node_name,
 						  upstream_node_info.node_id);

 		log_notice("%s", event_details.data);
@@ -4031,6 +4100,12 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id)

 		return ELECTION_NOT_CANDIDATE;
 	}
+	if (config_file_options.failover_delay > 0)
+	{
+		log_debug("sleeping %i seconds (\"failover_delay\") before initiating failover",
+				  config_file_options.failover_delay);
+		sleep(config_file_options.failover_delay);
+	}

 	/* node priority is set to zero - don't become a candidate, and lose by default */
 	if (local_node_info.priority <= 0)
@@ -4165,6 +4240,8 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id)

 		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
 		{
+			close_connection(&cell->node_info->conn);
+
 			continue;
 		}

@@ -4295,7 +4372,8 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id)
 		{
 			if (sibling_replication_info.upstream_node_id != upstream_node_info.node_id)
 			{
-				log_warning(_("assumed sibling node %i monitoring different upstream node %i"),
+				log_warning(_("assumed sibling node \"%s\" (ID: %i) monitoring different upstream node %i"),
+							cell->node_info->node_name,
 							cell->node_info->node_id,
 							sibling_replication_info.upstream_node_id);

@@ -4335,7 +4413,8 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id)
 		/* don't check 0-priority nodes */
 		if (cell->node_info->priority <= 0)
 		{
-			log_info(_("node %i has priority of %i, skipping"),
+			log_info(_("node \"%s\" (ID: %i) has priority of %i, skipping"),
+					   cell->node_info->node_name,
 					   cell->node_info->node_id,
 					   cell->node_info->priority);
 			continue;
@@ -4596,8 +4675,8 @@ check_connection(t_node_info *node_info, PGconn **conn)
 					node_info->node_name,
 					node_info->node_id);
 		log_detail("\n%s", PQerrorMessage(*conn));
-		PQfinish(*conn);
-		*conn = NULL;
+
+		close_connection(conn);
 	}

 	if (PQstatus(*conn) != CONNECTION_OK)
@@ -4606,13 +4685,14 @@ check_connection(t_node_info *node_info, PGconn **conn)
 				 node_info->node_name,
 				 node_info->node_id);

-		PQfinish(*conn);
+		close_connection(conn);
+
 		*conn = establish_db_connection(node_info->conninfo, false);

 		if (PQstatus(*conn) != CONNECTION_OK)
 		{
-			PQfinish(*conn);
-			*conn = NULL;
+			close_connection(conn);
+
 			log_warning(_("reconnection to node \"%s\" (ID: %i) failed"),
 						node_info->node_name,
 						node_info->node_id);
@@ -4686,7 +4766,8 @@ handle_sighup(PGconn **conn, t_server_type server_type)

 	if (reload_config(&config_file_options, server_type))
 	{
-		PQfinish(*conn);
+		close_connection(conn);
+
 		*conn = establish_db_connection(config_file_options.conninfo, true);
 	}

@@ -4847,6 +4928,8 @@ check_node_can_follow(PGconn *local_conn, XLogRecPtr local_xlogpos, PGconn *foll
 	if (PQstatus(follow_target_repl_conn) != CONNECTION_OK)
 	{
 		log_error(_("unable to establish a replication connection to the follow target node"));
+
+		PQfinish(follow_target_repl_conn);
 		return false;
 	}

@@ -4978,7 +5061,6 @@ check_node_can_follow(PGconn *local_conn, XLogRecPtr local_xlogpos, PGconn *foll
 	if (follow_target_history)
 		pfree(follow_target_history);

-
 	return can_follow;
 }

@@ -5152,11 +5234,8 @@ parse_child_nodes_disconnect_command(char *parsed_command, char *template, int r
 int
 try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info)
 {
-	PGconn	   *our_conn;
 	t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
-
 	int			i;
-
 	int			max_attempts = config_file_options.reconnect_attempts;

 	initialize_conninfo_params(&conninfo_params, false);
@@ -5170,11 +5249,18 @@ try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info)

 	for (i = 0; i < max_attempts; i++)
 	{
-		log_info(_("checking state of node %i, %i of %i attempts"),
-				 node_info->node_id, i + 1, max_attempts);
+		log_info(_("checking state of node \"%s\" (ID: %i), %i of %i attempts"),
+				 node_info->node_name,
+				 node_info->node_id,
+				 i + 1, max_attempts);
+
 		if (is_server_available_params(&conninfo_params) == true)
 		{
-			log_notice(_("node %i has recovered, reconnecting"), node_info->node_id);
+			PGconn	   *our_conn;
+
+			log_notice(_("node \"%s\" (ID: %i) has recovered, reconnecting"),
+					   node_info->node_name,
+					   node_info->node_id);

 			/*
 			 * Note: we could also handle the case where node is pingable but
@@ -5187,7 +5273,9 @@ try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info)
 			{
 				free_conninfo_params(&conninfo_params);

-				log_info(_("connection to node %i succeeded"), node_info->node_id);
+				log_info(_("connection to node \"%s\" (ID: %i) succeeded"),
+						 node_info->node_name,
+						 node_info->node_id);

 				if (PQstatus(*conn) == CONNECTION_BAD)
 				{
@@ -5244,6 +5332,8 @@ try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info)
 					{
 						log_notice(_("received notification that new primary is node %i"), new_primary_node_id);
 					}
+
+					free_conninfo_params(&conninfo_params);
 					return new_primary_node_id;
 				}
 				sleep(1);
@@ -5251,7 +5341,8 @@ try_primary_reconnect(PGconn **conn, PGconn *local_conn, t_node_info *node_info)
 		}
 	}

-	log_warning(_("unable to reconnect to node %i after %i attempts"),
+	log_warning(_("unable to reconnect to node \"%s\" (ID: %i) after %i attempts"),
+				node_info->node_name,
 				node_info->node_id,
 				max_attempts);

--- a/repmgrd.c
+++ b/repmgrd.c
@@ -817,32 +817,65 @@ check_upstream_connection(PGconn **conn, const char *conninfo)
 	/* Check the connection status twice in case it changes after reset */
 	bool		twice = false;

-	if (config_file_options.connection_check_type == CHECK_PING)
-		return is_server_available(conninfo);

-	if (config_file_options.connection_check_type == CHECK_CONNECTION)
+	log_debug("connection check type is \"%s\"",
+			  print_connection_check_type(config_file_options.connection_check_type));
+	/*
+	 * For the check types which do not involve using the existing database
+	 * connection, we'll perform the actual check, then as an additional
+	 * safeguard verify that the connection is still valid (as it might have
+	 * gone away during a brief outage between checks).
+	 */
+	if (config_file_options.connection_check_type != CHECK_QUERY)
 	{
 		bool success = true;
-		PGconn *test_conn = PQconnectdb(conninfo);

-		log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo);
-
-		if (PQstatus(test_conn) != CONNECTION_OK)
+		if (config_file_options.connection_check_type == CHECK_PING)
 		{
-			log_warning(_("unable to connect to \"%s\""), conninfo);
-			log_detail("\n%s", PQerrorMessage(test_conn));
-			success = false;
+			success = is_server_available(conninfo);
 		}
-		PQfinish(test_conn);
+		else if (config_file_options.connection_check_type == CHECK_CONNECTION)
+		{
+			PGconn *test_conn = PQconnectdb(conninfo);

-		return success;
+			log_debug("check_upstream_connection(): attempting to connect to \"%s\"", conninfo);
+
+			if (PQstatus(test_conn) != CONNECTION_OK)
+			{
+				log_warning(_("unable to connect to \"%s\""), conninfo);
+				log_detail("\n%s", PQerrorMessage(test_conn));
+				success = false;
+			}
+			PQfinish(test_conn);
+		}
+
+		if (success == false)
+			return false;
+
+		if (PQstatus(*conn) == CONNECTION_OK)
+			return true;
+
+		/*
+		 * Checks have succeeded, but the open connection to the primary has gone away,
+		 * possibly due to a brief outage between monitoring intervals - attempt to
+		 * reset it.
+		 */
+		log_notice(_("upstream is available but upstream connection has gone away, resetting"));
+
+		PQfinish(*conn);
+		*conn = PQconnectdb(conninfo);
+
+		if (PQstatus(*conn) == CONNECTION_OK)
+			return true;
+
+		return false;
 	}

 	for (;;)
 	{
 		if (PQstatus(*conn) != CONNECTION_OK)
 		{
-			log_debug("check_upstream_connection(): connection not OK");
+			log_debug("check_upstream_connection(): upstream connection has gone away, resetting");
 			if (twice)
 				return false;
 			/* reconnect */
@@ -877,6 +910,7 @@ check_upstream_connection(PGconn **conn, const char *conninfo)
 				return false;

 			/* reconnect */
+			log_debug("check_upstream_connection(): upstream connection not available, resetting");
 			PQfinish(*conn);
 			*conn = PQconnectdb(conninfo);
 			twice = true;
Author	SHA1	Message	Date
Ian Barwick	d43270008c	repmgrd: add parameter "failover_delay" This parameter is not documented and intended for use during testing. It should not be used in production.	2020-10-05 17:43:32 +09:00
Ian Barwick	a99768fdd8	Fix typo	2020-10-05 17:38:47 +09:00
Ian Barwick	2a4e81ef1c	repmgrd: check local connection after promoting local node In theory the local connection should not be affected by the node's promotion. However we're handing over control to an external command which is usually just "repmgr standby promote", but could potentially be a user-defined script with unknowable side effects. So it's better to be safe than sorry.	2020-10-05 17:38:09 +09:00
Ian Barwick	0ad6aceceb	Improve replication connection check Previously the check verifying that a node has connected to its upstream merely assumed the presence of a record in pg_stat_replication indicates a successful replication connection. However the record may contain a state other than "streaming", typically "startup" (which will occur when a node has diverged from its upstream and will therefore never transition to "streaming"), which needs to be taken into account when considering the state of the replication connection to avoid false positives.	2020-09-15 15:31:31 +09:00
Ian Barwick	53c9eacbc4	doc: note existing pg_rewind corner-case bug	2020-09-15 14:46:39 +09:00
Ian Barwick	e93f1c0439	doc: rearrange "repmgr node rejoin" reference for clarity The <important> section looked like an actual subsection, so convert that and the following example section into <refsect2> sections.	2020-09-15 14:46:36 +09:00
Ian Barwick	7332d0251c	doc: fix "release-current" tag	2020-09-04 15:13:01 +09:00
Ian Barwick	7006a6d9c3	doc: note use of wildcards in .pgpass file	2020-08-19 10:33:09 +09:00
Ian Barwick	b8677a0fa2	docs: link to PostgreSQL roadmap	2020-08-06 10:01:40 +09:00
Ian Barwick	992d2e0e49	doc: update "repmgr witness register" description Add missing "Options" section.	2020-08-06 10:01:37 +09:00
Ian Barwick	5c71809261	docs: reformat additional config file upgrade notes into a new section It's easier to link to the information that way.	2020-08-06 10:01:32 +09:00
Martín Marqués	22f2ee3050	doc: add two notes on section related to configuration files Add notes to the documentation mentioning that after postgres or repmgr upgrades (postgres major upgrades), there are some changes that need to be taken care of. Signed-off-by: Martín Marqués <martin.marques@2ndquadrant.com>	2020-08-06 10:01:29 +09:00
Ian Barwick	2371f30f8a	cluster matrix/crosscheck: improve text mode output formatting Previously these actions were hard-wired to assume node IDs would only ever have two digits at most. Refactor to use the same table generation code as other actions, which properly handles variable column sizes.	2020-07-06 14:26:20 +09:00
Ian Barwick	eddf06b60b	node status: clarify "archive_mode" message on standbys "archive_mode = 'always'" available from PostgreSQL 9.5.	2020-07-06 10:22:41 +09:00
Ian Barwick	b9874cd751	doc: clarify "node rejoin" usage Emphasize that conninfo must be provided for a running node.	2020-07-06 09:57:48 +09:00
Ian Barwick	8313944535	node rejoin: remove unneeded PQfinish()	2020-06-10 11:58:23 +09:00
Ian Barwick	3bfaa8e722	doc: note downstream node (dis)connection monitoring in more places	2020-06-09 16:30:53 +09:00
Ian Barwick	ca42dd563b	standby clone: fixes for Barman tablespace handling. repmgr creates a file with a list of tablespace files to fetch from Barman, however the file may not actually have been flushed to disk at the point the rsync operation was executed, so may be incomplete or empty. Also fix handling of tablespace remapping. Addresses GitHub #650.	2020-06-09 10:52:28 +09:00
Ian Barwick	a2f73a5086	run_file_backup(): fix comments Explicitly document use-case for this function, and fix a comment which probably got munged by pg_indent.	2020-06-09 10:52:26 +09:00
Ian Barwick	af144d39cb	standby register: ensure location field is compared during record check	2020-05-21 14:36:20 +09:00
Ian Barwick	68ad58f5fc	repmgrd: additional check for the upstream connection It's possible the upstream server was intermittently unavailable in the interval between checks, invalidating the upstream connection. With check types "ping" and "connection", the connection would not be restored, so if the availability check was successful, additionally verify the upstream connection and restore if necessary. Addresses GitHub #633.	2020-05-14 10:27:45 +09:00
Ian Barwick	c76fee98ef	doc: update repmgr.conf.sample Was missing "query" option for "connection_check_type".	2020-05-12 17:06:10 +09:00
Ian Barwick	de634eb593	repmgrd: include node name in log output Missed in commit `fd52df0`.	2020-05-12 15:49:07 +09:00
Ian Barwick	03c2c8cebd	repmgrd: minor refactoring of try_primary_reconnect()	2020-05-12 15:05:54 +09:00
Ian Barwick	da7db96e76	repmgrd: consolidate connection closing code PQfinish() should only be called on local PGconn pointers which will not be reused.	2020-05-12 15:05:50 +09:00
Ian Barwick	b4c9064903	repmgrd: ensure "close_connection()" always called after connection failure	2020-05-12 15:05:46 +09:00
Ian Barwick	5abef8e4ed	repmgrd: remove redundant log message	2020-05-12 15:05:39 +09:00
Ian Barwick	0813a31c08	repmgrd: include node name in log output in more places Still a few places where only the node ID was reported, but it's always useful to have the node name as well.	2020-05-12 15:05:35 +09:00
Ian Barwick	4c7be798e0	repmgrd: ensure PQfinish() always executed on failed connections in NodeInfoLists clear_node_info_list() will clean up any remaining active connections, but we need to ensure all failed connections are cleaned up at the point of failure to prevent leaks. Per report in GitHub #643.	2020-05-12 14:25:56 +09:00
Ian Barwick	0a7c7ae7ab	standby clone: explicitly set closed connection pointers to NULL We omitted to do this with the connections used when checking the system identifier, which means libpq calls by the teardown function using the pointer risk using unallocated memory. Addresses issue reported in GitHub #644.	2020-05-11 14:01:09 +09:00
Ian Barwick	9ce1c2e640	doc: clarify usage of "-f /etc/repmgr.conf" in examples	2020-05-08 10:24:24 +09:00
Ian Barwick	5a2399cb25	standby check: ignore upstream/downstream connections if node is witness Per report in GitHub #641.	2020-05-08 09:45:52 +09:00
Ian Barwick	54d33dd50b	Fix typo in comment	2020-05-01 12:13:38 +09:00
Ian Barwick	2b0218456a	doc: clarify usage of the "passfile" parameter.	2020-04-23 15:04:40 +09:00
Ian Barwick	2b773480e7	Update references to "recovery.conf" in _do_create_replication_conf()	2020-04-23 11:43:12 +09:00
Ian Barwick	a934f19e80	repmgrd: improve logging of promote_command failure - log failure before we check if the primary has reappeared - log the error code	2020-04-21 15:03:25 +09:00
Ian Barwick	9040d53e55	standby switchover: note database name for superuser connections It's useful to have a confirmation of which database repmgr is trying to connect to when the -S/--superuser connection is provided. It will always be the database defined in the repmgr.conf "conninfo" parameter, but having the name available is useful when e.g. troubleshooting issues with .pgpass configuration.	2020-04-20 16:49:56 +09:00
Ian Barwick	cb19311b35	doc: have Makefile clean up generated html files	2020-04-20 15:41:29 +09:00
Ian Barwick	9ce7cb7012	doc: clarify .pgpass usage with -S/--superuser option	2020-04-20 15:41:26 +09:00
Ian Barwick	687ed68ce2	doc: remove DEBUG output from example	2020-04-20 12:17:48 +09:00
Ian Barwick	8472d99277	doc: clarify usage of -F/--force with "standby promote" Per GitHub #632.	2020-04-20 12:17:45 +09:00
Ian Barwick	177b84345d	Fix debug logging Per GitHub #630.	2020-04-20 11:08:35 +09:00
Ian Barwick	555351f8c1	standby switchover: standardize log message	2020-04-15 10:25:48 +09:00
Ian Barwick	1e90d5e018	doc: update link to Debian package archive See also https://www.df7cb.de/blog/2020/apt-archive.postgresql.org.html	2020-04-14 12:34:06 +09:00
Ian Barwick	d45e64ca9e	doc: fix typo in 5.1.0 release section ID	2020-04-13 10:40:25 +09:00
Ian Barwick	374f19675d	Bump version to 5.1.0	2020-04-10 16:55:58 +09:00
Ian Barwick	ce88f3ec43	doc: finalize release notes	2020-04-10 16:40:23 +09:00
Ian Barwick	5acdd69add	doc: update release notes	2020-04-10 13:39:03 +09:00
Ian Barwick	71e23107e7	Add upgrade route for repmgr 3.x to repmgr 5.1 The removal of some extensions functions means it's not possible to follow the conventional incremental upgrade path; instead we'll create a script for direct upgrades to 5.1.	2020-04-09 16:08:35 +09:00