README: clarify that the repmgr metadatabase must be part of the replication cluster

Update HISTORY
Merge branch 'master' of github.com:2ndQuadrant/repmgr into REL3_2_STABLE
2026-03-23 15:16:29 +00:00 · 2016-10-26 20:28:22 +09:00 · 2016-10-24 09:03:46 +09:00 · 2016-10-24 08:14:04 +09:00 · 2016-10-20 10:56:16 +09:00 · 2016-10-19 20:26:55 +09:00
30 changed files with 5734 additions and 1609 deletions
--- a/FAQ.md
+++ b/FAQ.md
@@ -38,7 +38,7 @@ General

  No. Hash indexes and replication do not mix well and their use is
  explicitly discouraged; see:
-    http://www.postgresql.org/docs/current/interactive/sql-createindex.html#AEN74175
+    https://www.postgresql.org/docs/current/interactive/sql-createindex.html#AEN74175

 `repmgr`
 --------
@@ -137,6 +137,7 @@ General
  of events which includes servers removed from the replication cluster
  which no longer have an entry in the `repl_nodes` table.

+
 `repmgrd`
 ---------

@@ -151,6 +152,9 @@ General

  In `repmgr.conf`, set its priority to a value of 0 or less.

+  Additionally, if `failover` is set to `manual`, the node will never
+  be considered as a promotion candidate.
+
 - Does `repmgrd` support delayed standbys?

  `repmgrd` can monitor delayed standbys - those set up with
@@ -169,3 +173,11 @@ General

  Configure your system's `logrotate` service to do this; see example
  in README.md
+
+- I've recloned a failed master as a standby, but `repmgrd` refuses to start?
+
+  Check you registered the standby after recloning. If unregistered, the standby
+  cannot be considered as a promotion candidate even if `failover` is set to
+  `automatic`, which is probably not what you want. `repmgrd` will start if
+  `failover` is set to `manual` so the node's replication status can still
+  be monitored, if desired.
--- a/60
+++ b/60
@@ -1,3 +1,63 @@
+3.2.1   2016-10-24
+        repmgr: require a valid repmgr cluster name unless -F/--force
+          supplied (Ian)
+        repmgr: check master server is registered with repmgr before
+          cloning (Ian)
+        repmgr: ensure data directory defaults to that of the source node (Ian)
+        repmgr: various fixes to Barman cloning mode (Gianni, Ian)
+        repmgr: fix `repmgr cluster crosscheck` output (Ian)
+
+3.2     2016-10-05
+        repmgr: add support for cloning from a Barman backup (Gianni)
+        repmgr: add commands `cluster matrix` and `cluster crosscheck` (Gianni)
+        repmgr: suppress connection error display in `repmgr cluster show`
+          unless `--verbose` supplied (Ian)
+        repmgr: add commands `witness register` and `witness unregister` (Ian)
+        repmgr: enable `standby unregister` / `witness unregister` to be
+          executed for a node which is not running (Ian)
+        repmgr: remove deprecated command line options --initdb-no-pwprompt and
+           -l/--local-port (Ian)
+        repmgr: before cloning with pg_basebackup, check that sufficient free
+           walsenders are available (Ian)
+        repmgr: add option `--wait-sync` for `standby register` which causes
+           repmgr to wait for the registered node record to synchronise to
+           the standby (Ian)
+        repmgr: add option `--copy-external-config-files` for files outside
+           of the data directory (Ian)
+        repmgr: only require `wal_keep_segments` to be set in certain corner
+           cases (Ian)
+        repmgr: better support cloning from a node other than the one to
+           stream from (Ian)
+        repmgrd: add configuration options to override the default pg_ctl
+           commands (Jarkko Oranen, Ian)
+        repmgrd: don't start if node is inactive and failover=automatic (Ian)
+        packaging: improve "repmgr-auto" Debian package (Gianni)
+
+
+3.1.5   2016-08-15
+        repmgrd: in a failover situation, prevent endless looping when
+          attempting to establish the status of a node with
+          `failover=manual` (Ian)
+        repmgrd: improve handling of failover events on standbys with
+          `failover=manual`, and create a new event notification
+          for this, `standby_disconnect_manual` (Ian)
+        repmgr: add further event notifications (Gianni)
+        repmgr: when executing `standby switchover`, don't collect remote
+          command output unless required (Gianni, Ian)
+        repmgrd: improve standby monitoring query (Ian, based on suggestion
+          from Álvaro)
+        repmgr: various command line handling improvements (Ian)
+
+3.1.4   2016-07-12
+        repmgr: new configuration option for setting "restore_command"
+          in the recovery.conf file generated by repmgr (Martín)
+        repmgr: add --csv option to "repmgr cluster show" (Gianni)
+        repmgr: enable provision of a conninfo string as the -d/--dbname
+          parameter, similar to other PostgreSQL utilities (Ian)
+        repmgr: during switchover operations improve detection of
+          demotion candidate shutdown (Ian)
+        various bugfixes and documentation updates (Ian, Martín)
+
 3.1.3   2016-05-17
        repmgrd: enable monitoring when a standby is catching up by
          replaying archived WAL (Ian)
--- a/8
+++ b/8
@@ -5,7 +5,7 @@
 HEADERS = $(wildcard *.h)

 repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o
-repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o
+repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o dirmod.o

 DATA = repmgr.sql uninstall_repmgr.sql

@@ -87,10 +87,12 @@ PG_VERSION = $(shell pg_config --version | cut -d ' ' -f 2 | cut -d '.' -f 1,2)
 REPMGR_VERSION = $(shell grep REPMGR_VERSION version.h | cut -d ' ' -f 3 | cut -d '"' -f 2)
 PKGLIBDIR = $(shell pg_config --pkglibdir)
 SHAREDIR = $(shell pg_config --sharedir)
+PGBINDIR = /usr/lib/postgresql/$(PG_VERSION)/bin

 deb: repmgrd repmgr
-	mkdir -p ./debian/usr/bin
-	cp repmgrd repmgr ./debian/usr/bin/
+	mkdir -p ./debian/usr/bin ./debian$(PGBINDIR)
+	cp repmgrd repmgr ./debian$(PGBINDIR)
+	ln -s ../..$(PGBINDIR)/repmgr ./debian/usr/bin/repmgr
 	mkdir -p ./debian$(SHAREDIR)/contrib/
 	cp sql/repmgr_funcs.sql ./debian$(SHAREDIR)/contrib/
 	cp sql/uninstall_repmgr_funcs.sql ./debian$(SHAREDIR)/contrib/
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@ replication capabilities with utilities to set up standby servers, monitor
 replication, and perform administrative tasks such as failover or switchover
 operations.

+The current `repmgr` version, 3.2, supports all PostgreSQL versions from
+9.3 to 9.6.

 Overview
 --------
@@ -48,7 +50,7 @@ This guide assumes that you are familiar with PostgreSQL administration and
 streaming replication concepts. For further details on streaming
 replication, see this link:

-  http://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION
+  https://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION

 The following terms are used throughout the `repmgr` documentation.

@@ -119,7 +121,8 @@ views:
    status for each node

 The `repmgr` metadata schema can be stored in an existing database or in its own
-dedicated database.
+dedicated database. Note that the `repmgr` metadata schema cannot reside on a database
+server which is not part of the replication cluster managed by `repmgr`.

 A dedicated database superuser is required to own the meta-database as well as carry
 out administrative actions.
@@ -143,10 +146,27 @@ The `repmgr` tools must be installed on each server in the replication cluster.

 A dedicated system user for `repmgr` is *not* required; as many `repmgr` and
 `repmgrd` actions require direct access to the PostgreSQL data directory,
-it should be executed by the `postgres` user.
+these commands should be executed by the `postgres` user.

-Additionally, we recommend installing `rsync` and enabling passwordless
-`ssh` connectivity between all servers in the replication cluster.
+Passwordless `ssh` connectivity between all servers in the replication cluster
+is not required, but is necessary in the following cases:
+
+* if you need `repmgr` to copy configuration files from outside the PostgreSQL
+  data directory
+* when using `rsync` to clone a standby
+* to perform switchover operations
+* when executing `repmgr cluster matrix` and `repmgr cluster crosscheck`
+
+In these cases `rsync` is required on all servers too.
+
+* * *
+
+> *TIP*: We recommend using a session multiplexer utility such as `screen` or
+> `tmux` when performing long-running actions (such as cloning a database)
+> on a remote server - this will ensure the `repmgr` action won't be prematurely
+> terminated if your `ssh` session to the server is interrupted or closed.
+
+* * *

 ### Packages

@@ -155,9 +175,15 @@ system.

 - RedHat/CentOS: RPM packages for `repmgr` are available via Yum through
  the PostgreSQL Global Development Group RPM repository ( http://yum.postgresql.org/ ).
-  You need to follow the instructions for your distribution (RedHat, CentOS,
+  Follow the instructions for your distribution (RedHat, CentOS,
  Fedora, etc.) and architecture as detailed at yum.postgresql.org.

+  2ndQuadrant also provides its own RPM packages which are made available
+  at the same time as each `repmgr` release, as it can take some days for
+  them to become available via the main PGDG repository. See here for details:
+
+     http://repmgr.org/yum-repository.html
+
 - Debian/Ubuntu: the most recent `repmgr` packages are available from the
  PostgreSQL Community APT repository ( http://apt.postgresql.org/ ).
  Instructions can be found in the APT section of the PostgreSQL Wiki
@@ -215,6 +241,34 @@ command line options:
 - `-b/--pg_bindir`


+### Command line options and environment variables
+
+For some commands, e.g. `repmgr standby clone`, database connection parameters
+need to be provided. Like other PostgreSQL utilities, the following standard
+parameters can be used:
+
+- `-d/--dbname=DBNAME`
+- `-h/--host=HOSTNAME`
+- `-p/--port=PORT`
+- `-U/--username=USERNAME`
+
+If `-d/--dbname` contains an `=` sign or starts with a valid URI prefix (`postgresql://`
+or `postgres://`), it is treated as a conninfo string. See the PostgreSQL
+documentation for further details:
+
+  https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING
+
+Note that if a `conninfo` string is provided, values set in this will override any
+provided as individual parameters. For example, with `-d 'host=foo' --host bar`, `foo`
+will be chosen over `bar`.
+
+Like other PostgreSQL utilities, `repmgr` will default to any values set in environment
+variables if explicit command line parameters are not provided. See the PostgreSQL
+documentation for further details:
+
+  https://www.postgresql.org/docs/current/static/libpq-envars.html
+
+
 Setting up a simple replication cluster with repmgr
 ---------------------------------------------------

@@ -235,30 +289,43 @@ both servers.
 ### PostgreSQL configuration

 On the master server, a PostgreSQL instance must be initialised and running.
-The following replication settings must be included in `postgresql.conf`:
+The following replication settings may need to be adjusted:
+
+
+    # Enable replication connections; set this figure to at least one more
+    # than the number of standbys which will connect to this server
+    # (note that repmgr will execute `pg_basebackup` in WAL streaming mode,
+    # which requires two free WAL senders)
+
+    max_wal_senders = 10

    # Ensure WAL files contain enough information to enable read-only queries
    # on the standby

    wal_level = 'hot_standby'

-    # Enable up to 10 replication connections
-
-    max_wal_senders = 10
-
-    # How much WAL to retain on the master to allow a temporarily
-    # disconnected standby to catch up again. The larger this is, the
-    # longer the standby can be disconnected. This is needed only in
-    # 9.3; from 9.4, replication slots can be used instead (see below).
-
-    wal_keep_segments = 5000
-
    # Enable read-only queries on a standby
    # (Note: this will be ignored on a master but we recommend including
    # it anyway)

    hot_standby = on

+    # Enable WAL file archiving
+    archive_mode = on
+
+    # Set archive command to a script or application that will safely store
+    # your WALs in a secure place. /bin/true is an example of a command that
+    # ignores archiving. Use something more sensible.
+    archive_command = '/bin/true'
+
+    # If cloning using rsync, or you have configured `pg_basebackup_options`
+    # in `repmgr.conf` to include the setting `--xlog-method=fetch`, *and*
+    # you have not set `restore_command` in `repmgr.conf` to fetch WAL files
+    # from another source such as Barman, you'll need to set `wal_keep_segments`
+    # to a high enough value to ensure that all WAL files generated while
+    # the standby is being cloned are retained until the standby starts up.
+
+    # wal_keep_segments = 5000

 * * *

@@ -329,6 +396,16 @@ to include this schema name, e.g.

    ALTER USER repmgr SET search_path TO repmgr_test, "$user", public;

+* * *
+
+> *TIP*: for Debian-based distributions we recommend explicitly setting
+> `pg_bindir` to the directory where `pg_ctl` and other binaries not in
+> the standard path are located. For PostgreSQL 9.5 this would be
+> `/usr/lib/postgresql/9.5/bin/`.
+
+* * *
+
+
 ### Initialise the master server

 To enable `repmgr` to support a replication cluster, the master node must
@@ -371,14 +448,59 @@ Clone the standby with:
    [2016-01-07 17:21:28] [NOTICE] you can now start your PostgreSQL server
    [2016-01-07 17:21:28] [HINT] for example : pg_ctl -D /path/to/node2/data/ start

-This will clone the PostgreSQL data directory files from the master at repmgr_node1
-using PostgreSQL's pg_basebackup utility. A `recovery.conf` file containing the
+This will clone the PostgreSQL data directory files from the master at `repmgr_node1`
+using PostgreSQL's `pg_basebackup` utility. A `recovery.conf` file containing the
 correct parameters to start streaming from this master server will be created
-automatically, and unless otherwise the `postgresql.conf` and `pg_hba.conf`
-files will be copied from the master.
+automatically.

-Make any adjustments to the PostgreSQL configuration files now, then start the
-standby server.
+Note that by default, any configuration files in the master's data directory will be
+copied to the standby. Typically these will be `postgresql.conf`, `postgresql.auto.conf`,
+`pg_hba.conf` and `pg_ident.conf`. These may require modification before the standby
+is started so it functions as desired.
+
+In some cases (e.g. on Debian or Ubuntu Linux installations), PostgreSQL's
+configuration files are located outside of the data directory and will
+not be copied by default. `repmgr` can copy these files, either to the same
+location on the standby server (provided appropriate directory and file permissions
+are available), or into the standby's data directory. This requires passwordless
+SSH access to the master server. Add the option `--copy-external-config-files`
+to the `repmgr standby clone` command; by default files will be copied to
+the same path as on the upstream server. To have them placed in the standby's
+data directory, specify `--copy-external-config-files=pgdata`, but note that
+any include directives in the copied files may need to be updated.
+
+*Caveat*: when copying external configuration files, `repmgr` will only be able
+to detect files which contain active settings. If a file is referenced by
+an include directive but is empty, only contains comments or contains
+settings which have not been activated, the file will not be copied.
+
+* * *
+
+> *TIP*: for reliable configuration file management we recommend using a
+> configuration management tool such as Ansible, Chef, Puppet or Salt.
+
+* * *
+
+Be aware that when initially cloning a standby, you will need to ensure
+that all required WAL files remain available while the cloning is taking
+place. To ensure this happens when using the default `pg_basebackup` method,
+`repmgr` will set `pg_basebackup`'s `--xlog-method` parameter to `stream`,
+which will ensure all WAL files generated during the cloning process are
+streamed in parallel with the main backup. Note that this requires two
+replication connections to be available.
+
+To override this behaviour, in `repmgr.conf` set `pg_basebackup`'s
+`--xlog-method` parameter to `fetch`:
+
+    pg_basebackup_options='--xlog-method=fetch'
+
+and ensure that `wal_keep_segments` is set to an appropriately high value.
+See the `pg_basebackup` documentation for details:
+
+    https://www.postgresql.org/docs/current/static/app-pgbasebackup.html
+
+Make any adjustments to the standby's PostgreSQL configuration files now,
+then start the server.

 * * *

@@ -439,13 +561,102 @@ standby's upstream server is the replication cluster master. While of limited
 use in a simple master/standby replication cluster, this information is required
 to effectively manage cascading replication (see below).

+* * *
+
+> *TIP*: depending on your environment and workload, it may take some time for
+> the standby's node record to propagate from the master to the standby. Some
+> actions (such as starting `repmgrd`) require that the standby's node record
+> is present and up-to-date to function correctly - by providing the option
+> `--wait-sync` to the `repmgr standby register` command, `repmgr` will wait
+> until the record is synchronised before exiting. An optional timeout (in
+> seconds) can be added to this option (e.g. `--wait-sync=60`).
+
+* * *
+
+### Using Barman to clone a standby
+
+`repmgr standby clone` also supports Barman, the Backup and
+Replication manager (http://www.pgbarman.org/), as a provider of both
+base backups and WAL files.
+
+Barman support provides the following advantages:
+
+- the master node does not need to perform a new backup every time a
+  new standby is cloned;
+- a standby node can be disconnected for longer periods without losing
+  the ability to catch up, and without causing accumulation of WAL
+  files on the master node;
+- therefore, `repmgr` does not need to use replication slots, and the
+  master node does not need to set `wal_keep_segments`.
+
+> *NOTE*: In view of the above, Barman support is incompatible with
+> the `use_replication_slots` setting in `repmgr.conf`.
+
+In order to enable Barman support for `repmgr standby clone`, you must
+ensure that:
+
+- the name of the server configured in Barman is equal to the
+  `cluster_name` setting in `repmgr.conf`;
+- the `barman_server` setting in `repmgr.conf` is set to the SSH
+  hostname of the Barman server;
+- the `restore_command` setting in `repmgr.conf` is configured to
+  use a copy of the `barman-wal-restore` script shipped with the
+  `barman-cli` package (see below);
+- the Barman catalogue includes at least one valid backup for this
+  server.
+
+> *NOTE*: Barman support is automatically enabled if `barman_server`
+> is set. Normally it is good practice to use Barman, for instance
+> when fetching a base backup while cloning a standby; in any case,
+> Barman mode can be disabled using the `--without-barman` command
+> line option.
+
+> *NOTE*: if you have a non-default SSH configuration on the Barman
+> server, e.g. using a port other than 22, then you can set those
+> parameters in a dedicated Host section in `~/.ssh/config`
+> corresponding to the value of `barman_server` in `repmgr.conf`. See
+> the "Host" section in `man 5 ssh_config` for more details.
+
+`barman-wal-restore` is a Python script provided by the Barman
+development team as part of the `barman-cli` package (Barman 2.0
+and later; for Barman 1.x the script is provided separately as
+`barman-wal-restore.py`).
+
+`restore_command` must then be set in `repmgr.conf` as follows:
+
+    <script> <Barman hostname> <cluster_name> %f %p
+
+For instance, suppose that we have installed Barman on the `barmansrv`
+host, and that `barman-wal-restore` is located as an executable at
+`/usr/bin/barman-wal-restore`;  `repmgr.conf` should include the following
+lines:
+
+    barman_server=barmansrv
+    restore_command=/usr/bin/barman-wal-restore barmansrv test %f %p
+
+NOTE: to use a non-default Barman configuration file on the Barman server,
+specify this in `repmgr.conf` with `barman_config`:
+
+    barman_config=/path/to/barman.conf
+
+Now we can clone a standby using the Barman server:
+
+    $ repmgr -h node1 -D 9.5/main -f /etc/repmgr.conf standby clone
+    [2016-06-12 20:08:35] [NOTICE] destination directory '9.5/main' provided
+    [2016-06-12 20:08:35] [NOTICE] getting backup from Barman...
+    [2016-06-12 20:08:36] [NOTICE] standby clone (from Barman) complete
+    [2016-06-12 20:08:36] [NOTICE] you can now start your PostgreSQL server
+    [2016-06-12 20:08:36] [HINT] for example : pg_ctl -D 9.5/main start
+    [2016-06-12 20:08:36] [HINT] After starting the server, you need to register this standby with "repmgr standby register"
+
+

 Advanced options for cloning a standby
 --------------------------------------

-The above section demonstrates the simplest possible way to cloneb a standby
-server. Depending on your circumstances, finer-grained controlover the cloning
-process may be necessary.
+The above section demonstrates the simplest possible way to clone a standby
+server. Depending on your circumstances, finer-grained control over the
+cloning process may be necessary.

 ### pg_basebackup options when cloning a standby

@@ -458,7 +669,7 @@ so should be used with care.
 Further options can be passed to the `pg_basebackup` utility via
 the setting `pg_basebackup_options` in `repmgr.conf`. See the PostgreSQL
 documentation for more details of available options:
-  http://www.postgresql.org/docs/current/static/app-pgbasebackup.html
+  https://www.postgresql.org/docs/current/static/app-pgbasebackup.html

 ### Using rsync to clone a standby

@@ -476,20 +687,32 @@ and destination server as the contents of files existing on both servers need
 to be compared, meaning this method is not necessarily faster than making a
 fresh clone with `pg_basebackup`.

+> *NOTE*: `barman-wal-restore` supports command line switches to
+> control parallelism (`--parallel=N`) and compression (`--bzip2`,
+> `--gzip`).

-### Dealing with PostgreSQL configuration files
+### Controlling `primary_conninfo` in `recovery.conf`

-By default, `repmgr` will attempt to copy the standard configuration files
-(`postgresql.conf`, `pg_hba.conf` and `pg_ident.conf`) even if they are located
-outside of the data directory (though currently they will be copied
-into the standby's data directory). To prevent this happening, when executing
-`repmgr standby clone` provide the `--ignore-external-config-files` option.
+The `primary_conninfo` setting in `recovery.conf` generated by `repmgr`
+is generated from the following sources, in order of highest to lowest priority:

-If using `rsync` to clone a standby, additional control over which files
-not to transfer is possible by configuring `rsync_options` in `repmgr.conf`,
-which enables any valid `rsync` options to be passed to that command, e.g.:
+- the upstream node's `conninfo` setting (as defined in the `repl_nodes` table)
+- the connection parameters provided to `repmgr standby clone`
+- PostgreSQL's standard connection defaults, including any environment variables
+  set on the local node.

-    rsync_options='--exclude=postgresql.local.conf'
+To include specific connection parameters other than the standard host, port,
+username and database values (e.g. `sslmode`), include these in a `conninfo`-style
+string passed to `repmgr` with `-d/--dbname` (see above for details), and/or set
+appropriate environment variables.
+
+Note that PostgreSQL will always set explicit defaults for `sslmode` and
+`sslcompression`.
+
+If `application_name` is set in the standby's `conninfo` parameter in
+`repmgr.conf`, this value will be appended to `primary_conninfo`, otherwise
+`repmgr` will set `application_name` to the same value as the `node_name`
+parameter.


 Setting up cascading replication with repmgr
@@ -564,6 +787,10 @@ To enable `repmgr` to use replication slots, set the boolean parameter
 Note that `repmgr` will fail with an error if this option is specified when
 working with PostgreSQL 9.3.

+Replication slots must be enabled in `postgresql.conf` by setting the parameter
+`max_replication_slots` to at least the number of expected standbys (changes
+to this parameter require a server restart).
+
 When cloning a standby, `repmgr` will automatically generate an appropriate
 slot name, which is stored in the `repl_nodes` table, and create the slot
 on the master:
@@ -586,21 +813,9 @@ Note that a slot name will be created by default for the master but not
 actually used unless the master is converted to a standby using e.g.
 `repmgr standby switchover`.

-Be aware that when initially cloning a standby, you will need to ensure
-that all required WAL files remain available while the cloning is taking
-place. If using the default `pg_basebackup` method, we recommend setting
-`pg_basebackup`'s `--xlog-method` parameter to `stream` like this:
-
-    pg_basebackup_options='--xlog-method=stream'
-
-See the `pg_basebackup` documentation for details:
-    http://www.postgresql.org/docs/current/static/app-pgbasebackup.html
-
-Otherwise it's necessary to set `wal_keep_segments` to an appropriately high
-value.

 Further information on replication slots in the PostgreSQL documentation:
-    http://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION-SLOTS
+    https://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION-SLOTS


 Promoting a standby server with repmgr
@@ -699,8 +914,9 @@ updated to reflect this:


 Note that with cascading replication, `repmgr standby follow` can also be
-used to detach a standby from its current upstream server and follow another
-upstream server, including the master.
+used to detach a standby from its current upstream server and follow the
+master. However it's currently not possible to have it follow another standby;
+we hope to improve this in a future release.


 Performing a switchover with repmgr
@@ -727,7 +943,7 @@ both passwordless SSH access and the path of `repmgr.conf` on that server.
 > careful preparation and with adequate attention. In particular you should
 > be confident that your network environment is stable and reliable.
 >
-> We recommend running `repmgr standby switchover`  at the most verbose
+> We recommend running `repmgr standby switchover` at the most verbose
 > logging level (`--log-level DEBUG --verbose`) and capturing all output
 > to assist troubleshooting any problems.
 >
@@ -793,20 +1009,25 @@ should have been updated to reflect this:

 ### Caveats

- the functionality provided `repmgr standby switchover` is primarily aimed
+- The functionality provided by `repmgr standby switchover` is primarily aimed
  at a two-server master/standby replication cluster and currently does
  not support additional standbys.
 - `repmgr standby switchover` is designed to use the `pg_rewind` utility,
-  standard in 9.5 and later and available for seperately in 9.3 and 9.4
+  standard in 9.5 and later and available separately in 9.3 and 9.4
  (see note below)
 - `pg_rewind` *requires* that either `wal_log_hints` is enabled, or that
   data checksums were enabled when the cluster was initialized. See the
  `pg_rewind` documentation for details:
-     http://www.postgresql.org/docs/current/static/app-pgrewind.html
+     https://www.postgresql.org/docs/current/static/app-pgrewind.html
 - `repmgrd` should not be running when a switchover is carried out, otherwise
  the `repmgrd` may try and promote a standby by itself.
 - Any other standbys attached to the old master will need to be manually
  instructed to point to the new master (e.g. with `repmgr standby follow`).
+- You must ensure that following a server start using `pg_ctl`, log output
+  is not sent to STDERR (the default behaviour). If logging is not configured,
+  we recommend setting `logging_collector=on` in `postgresql.conf` and
+  providing an explicit `-l/--log` setting in `repmgr.conf`'s `pg_ctl_options`
+  parameter.

 We hope to remove some of these restrictions in future versions of `repmgr`.

@@ -820,7 +1041,7 @@ will have diverged slightly following the shutdown of the old master.

 The utility `pg_rewind` provides an efficient way of doing this, however
 is not included in the core PostgreSQL distribution for versions 9.3 and 9.4.
-Hoever, `pg_rewind` is available separately for these versions and we
+However, `pg_rewind` is available separately for these versions and we
 strongly recommend its installation. To use it with versions 9.3 and 9.4,
 provide the command line option `--pg_rewind`, optionally with the
 path to the `pg_rewind` binary location if not installed in the PostgreSQL
@@ -829,6 +1050,10 @@ path to the `pg_rewind` binary location if not installed in the PostgreSQL
 `pg_rewind` for versions 9.3 and 9.4 can be obtained from:
  https://github.com/vmware/pg_rewind

+Note that building this version of `pg_rewind` requires the PostgreSQL source
+code. Also, PostgreSQL 9.3 does not provide `wal_log_hints`, meaning data
+checksums must have been enabled when the database was initialized.
+
 If `pg_rewind` is not available, as a fallback `repmgr` will use `repmgr
 standby clone` to resynchronise the old master's data directory using
 `rsync`. However, in order to ensure all files are synchronised, the
@@ -848,20 +1073,21 @@ This will remove the standby record from `repmgr`'s internal metadata
 table (`repl_nodes`). A `standby_unregister` event notification will be
 recorded in the `repl_events` table.

-Note that this command will not stop the server itself or remove
-it from the replication cluster.
+Note that this command will not stop the server itself or remove it from
+the replication cluster. Also, if the standby was using a replication
+slot, this will not be removed.

-If the standby is not running, the standby record must be manually
-removed from the `repl_nodes` table with e.g.:
+If the standby is not running, the command can be executed on another
+node by providing the id of the node to be unregistered using
+the command line parameter `--node`, e.g. executing the following
+command on the master server will unregister the standby with
+id 3:

-    DELETE FROM repmgr_test.repl_nodes WHERE id = 3;
-
-Adjust schema and node ID accordingly. A future `repmgr` release
-will make it possible to unregister failed standbys.
+    repmgr standby unregister -f /etc/repmgr.conf --node=3


-Automatic failover with repmgrd
-------------------------------
+Automatic failover with `repmgrd`
+---------------------------------

 `repmgrd` is a management and monitoring daemon which runs on standby nodes
 and which can automate actions such as failover and updating standbys to
@@ -871,8 +1097,8 @@ To use `repmgrd` for automatic failover, the following `repmgrd` options must
 be set in `repmgr.conf`:

    failover=automatic
-    promote_command='repmgr standby promote -f /etc/repmgr/repmgr.conf'
-    follow_command='repmgr standby follow -f /etc/repmgr/repmgr.conf'
+    promote_command='repmgr standby promote -f /etc/repmgr.conf'
+    follow_command='repmgr standby follow -f /etc/repmgr.conf'

 (See `repmgr.conf.sample` for further `repmgrd`-specific settings).

@@ -889,7 +1115,7 @@ actions happening, but we strongly recommend executing `repmgr` directly.

 `repmgrd` can be started simply with e.g.:

-    repmgrd -f /etc/repmgr.conf --verbose > $HOME/repmgr/repmgr.log 2>&1
+    repmgrd -f /etc/repmgr.conf --verbose >> $HOME/repmgr/repmgr.log 2>&1

 For permanent operation, we recommend using the options `-d/--daemonize` to
 detach the `repmgrd` process, and `-p/--pid-file` to write the process PID
@@ -911,7 +1137,7 @@ table looks like this:


 Start `repmgrd` on each standby and verify that it's running by examining
-the log output, which at default log level will look like this:
+the log output, which at log level INFO will look like this:

    [2016-01-05 13:15:40] [INFO] checking cluster configuration with schema 'repmgr_test'
    [2016-01-05 13:15:40] [INFO] checking node 2 in cluster 'test'
@@ -981,8 +1207,8 @@ during the failover:
    (3 rows)


-repmgrd log rotation
--------------------
+`repmgrd` log rotation
+----------------------

 Note that currently `repmgrd` does not provide logfile rotation. To ensure
 the current logfile does not grow indefinitely, configure your system's `logrotate`
@@ -998,8 +1224,29 @@ for up to 52 weeks and rotation forced if a file grows beyond 100Mb:
        create 0600 postgres postgres
    }

-Monitoring
----------
+
+`repmgrd` and PostgreSQL connection settings
+--------------------------------------------
+
+In addition to the `repmgr` configuration settings, parameters in the
+`conninfo` string influence how `repmgr` makes a network connection to
+PostgreSQL. In particular, if another server in the replication cluster
+is unreachable at network level, system network settings will influence
+the length of time it takes to determine that the connection is not possible.
+
+In particular explicitly setting a parameter for `connect_timeout` should
+be considered; the effective minimum value of `2` (seconds) will ensure
+that a connection failure at network level is reported as soon as possible,
+otherwise depending on the system settings (e.g. `tcp_syn_retries` in Linux)
+a delay of a minute or more is possible.
+
+For further details on `conninfo` network connection parameters, see:
+
+  https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS
+
+
+Monitoring with `repmgrd`
+-------------------------

 When `repmgrd` is running with the option `-m/--monitoring-history`, it will
 constantly write standby node status information to the `repl_monitor` table,
@@ -1031,9 +1278,16 @@ table , it's advisable to regularly purge historical data with
 `repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
 many day's worth of data should be retained.

+It's possible to use `repmgrd` to provide monitoring only for some or all
+nodes by setting `failover = manual` in the node's `repmgr.conf`. In the
+event of the node's upstream failing, no failover action will be taken
+and the node will require manual intervention to be reattached to replication.
+If this occurs, a `standby_disconnect_manual` event notification will be
+created.
+
 Note that when a standby node is not streaming directly from its upstream
-node, i.e. recovering WAL from an archive, `apply_lag` will always
-appear as `0 bytes`.
+node, e.g. recovering WAL from an archive, `apply_lag` will always appear as
+`0 bytes`.


 Using a witness server with repmgrd
@@ -1130,8 +1384,10 @@ The following event types are available:
  * `standby_promote`
  * `standby_follow`
  * `standby_switchover`
+  * `standby_disconnect_manual`
  * `witness_create`
-  * `witness_create`
+  * `witness_register`
+  * `witness_unregister`
  * `repmgrd_start`
  * `repmgrd_shutdown`
  * `repmgrd_failover_promote`
@@ -1153,6 +1409,42 @@ In general `repmgr` can be upgraded as-is without any further action required,
 however feature releases may require the `repmgr` database to be upgraded.
 An SQL script will be provided - please check the release notes for details.

+
+Distribution-specific configuration
+-----------------------------------
+
+`repmgr` is largely OS-agnostic and can be run on any UNIX-like environment
+including various Linux distributions, Solaris, macOS and the various BSDs.
+
+However, often OS-specific configuration is required, particularly when
+dealing with system service management (e.g. stopping and starting the
+PostgreSQL server), file paths and configuration file locations.
+
+### PostgreSQL server control
+
+By default, `repmgr` will use PostgreSQL's standard `pg_ctl` utility to control
+a running PostgreSQL server. However it may be better to use the operating
+system's service management system, e.g. `systemd`. To specify which service
+control commands are used, the following `repmgr.conf` configuration settings
+are available:
+
+    service_start_command
+    service_stop_command
+    service_restart_command
+    service_reload_command
+    service_promote_command
+
+See `repmgr.conf.sample` for further details.
+
+### Binary directory
+
+Some PostgreSQL system packages, such as those provided for Debian/Ubuntu, like
+to hide some PostgreSQL utility programs outside of the default path. To ensure
+`repmgr` finds all required executables, explicitly set `pg_bindir` to the
+appropriate location, e.g. for PostgreSQL 9.6 on Debian/Ubuntu this would be
+`/usr/lib/postgresql/9.6/bin/`.
+
+
 Reference
 ---------

@@ -1276,32 +1568,156 @@ which contains connection details for the local database.

    This command also requires the location of the witness server's data
    directory to be provided (`-D/--datadir`) as well as valid connection
-    parameters for the master server.
+    parameters for the master server. If not explicitly provided,
+    database and user names will be extracted from the `conninfo` string in
+    `repmgr.conf`.

    By default this command will create a superuser and a repmgr user.
    The `repmgr` user name will be extracted from the `conninfo` string
    in `repmgr.conf`.

+* `witness register`
+
+    This will set up the witness server configuration, including the witness
+    server's copy of the `repmgr` meta database, on a running PostgreSQL
+    instance and register the witness server with the master. It requires
+    the same command line options as `witness create`.
+
+* `witness unregister`
+
+    Removes the entry for a witness server from the `repl_nodes` table. This
+    command will not shut down the witness server or remove its data directory.
+
 * `cluster show`

    Displays information about each active node in the replication cluster. This
-    command polls each registered server and shows its role (master / standby /
-    witness) or `FAILED` if the node doesn't respond. It polls each server
+    command polls each registered server and shows its role (`master` / `standby` /
+    `witness`) or `FAILED` if the node doesn't respond. It polls each server
    directly and can be run on any node in the cluster; this is also useful
    when analyzing connectivity from a particular node.

    This command requires a valid `repmgr.conf` file to be provided; no
-    additional arguments are required.
+    additional arguments are needed.

    Example:

        $ repmgr -f /etc/repmgr.conf cluster show

        Role      | Name  | Upstream | Connection String
-        ----------+-------|----------|--------------------------------------------
-        * master  | node1 |          | host=repmgr_node1 dbname=repmgr user=repmgr
-          standby | node2 | node1    | host=repmgr_node1 dbname=repmgr user=repmgr
-          standby | node3 | node2    | host=repmgr_node1 dbname=repmgr user=repmgr
+        ----------+-------|----------|----------------------------------------
+        * master  | node1 |          | host=db_node1 dbname=repmgr user=repmgr
+          standby | node2 | node1    | host=db_node2 dbname=repmgr user=repmgr
+          standby | node3 | node2    | host=db_node3 dbname=repmgr user=repmgr
+
+    To show database connection errors when polling nodes, run the command in
+    `--verbose` mode.
+
+    The `cluster show` command now accepts the optional parameter `--csv`, which
+    outputs the replication cluster's status in a simple CSV format, suitable for
+    parsing by scripts:
+
+        $ repmgr -f /etc/repmgr.conf cluster show --csv
+        1,-1
+        2,0
+        3,1
+
+    The first column is the node's ID, and the second column represents the
+    node's status (0 = available, -1 = failed).
+
+* `cluster matrix` and `cluster crosscheck`
+
+    These commands display connection information for each pair of
+    nodes in the replication cluster.
+
+    - `cluster matrix` runs a `cluster show` on each node and arranges
+      the results in a matrix, recording success or failure;
+
+    - `cluster crosscheck` runs a `cluster matrix` on each node and
+      combines the results in a single matrix, providing a full
+      overview of connections between all databases in the cluster.
+
+    These commands require a valid `repmgr.conf` file on each node.
+    Additionally password-less `ssh` connections are required between
+    all nodes.
+
+    Example 1 (all nodes up):
+
+        $ repmgr -f /etc/repmgr.conf cluster matrix
+
+        Name   | Id |  1 |  2 |  3
+        -------+----+----+----+----
+         node1 |  1 |  * |  * |  *
+         node2 |  2 |  * |  * |  *
+         node3 |  3 |  * |  * |  *
+
+    Here `cluster matrix` is sufficient to establish the state of each
+    possible connection.
+
+
+    Example 2 (`node1` and `node2` up, `node3` down):
+
+        $ repmgr -f /etc/repmgr.conf cluster matrix
+
+        Name   | Id |  1 |  2 |  3
+        -------+----+----+----+----
+         node1 |  1 |  * |  * |  x
+         node2 |  2 |  * |  * |  x
+         node3 |  3 |  ? |  ? |  ?
+
+    Each row corresponds to one server, and indicates the result of
+    testing an outbound connection from that server.
+
+    Since `node3` is down, all the entries in its row are filled with
+    "?", meaning that we cannot test outbound connections from it.
+
+    The other two nodes are up; the corresponding rows have "x" in the
+    column corresponding to `node3`, meaning that inbound connections to
+    that node have failed, and "*" in the columns corresponding to
+    `node1` and `node2`, meaning that inbound connections to these nodes
+    have succeeded.
+
+    In this case, `cluster crosscheck` gives the same result as `cluster
+    matrix`, because from any functioning node we can observe the same
+    state: `node1` and `node2` are up, `node3` is down.
+
+    Example 3 (all nodes up, firewall dropping packets originating
+               from `node1` and directed to port 5432 on `node3`):
+
+    Running `cluster matrix` from `node1` gives the following output:
+
+        $ repmgr -f /etc/repmgr.conf cluster matrix
+
+        Name   | Id |  1 |  2 |  3
+        -------+----+----+----+----
+         node1 |  1 |  * |  * |  x
+         node2 |  2 |  * |  * |  *
+         node3 |  3 |  ? |  ? |  ?
+
+    (Note this may take some time depending on the `connect_timeout`
+    setting in the registered node `conninfo` strings; default is 1
+    minute which means without modification the above command would
+    take around 2 minutes to run; see the section "`repmgrd` and
+    PostgreSQL connection settings" about setting `connect_timeout`)
+
+    The matrix tells us that we cannot connect from `node1` to `node3`,
+    and that (therefore) we don't know the state of any outbound
+    connection from `node3`.
+
+    In this case, the `cluster crosscheck` command is more informative:
+
+        $ repmgr -f /etc/repmgr.conf cluster crosscheck
+
+        Name   | Id |  1 |  2 |  3
+        -------+----+----+----+----
+         node1 |  1 |  * |  * |  x
+         node2 |  2 |  * |  * |  *
+         node3 |  3 |  * |  * |  *
+
+    What happened is that `cluster crosscheck` merged its own `cluster
+    matrix` with the `cluster matrix` output from `node2`; the latter is
+    able to connect to `node3` and therefore determine the state of
+    outbound connections from that node.
+

 * `cluster cleanup`

@@ -1315,25 +1731,45 @@ which contains connection details for the local database.
    the current working directory; no additional arguments are required.


+### Further documentation
+
+As well as this README, the `repmgr` source contains the following additional
+documentation files:
+
+* FAQ.md - frequently asked questions
+* CONTRIBUTING.md - how to contribute to `repmgr`
+* PACKAGES.md - details on building packages
+* SSH-RSYNC.md - how to set up passwordless SSH between nodes
+* docs/repmgrd-failover-mechanism.md - how repmgrd picks which node to promote
+* docs/repmgrd-node-fencing.md - how to "fence" a failed master node
+
+
+
+
 ### Error codes

 `repmgr` or `repmgrd` will return one of the following error codes on program
 exit:

-* SUCCESS (0)              Program ran successfully.
-* ERR_BAD_CONFIG (1)       Configuration file could not be parsed or was invalid
-* ERR_BAD_RSYNC (2)        An rsync call made by the program returned an error
-* ERR_NO_RESTART (4)       An attempt to restart a PostgreSQL instance failed
-* ERR_DB_CON (6)           Error when trying to connect to a database
-* ERR_DB_QUERY (7)         Error while executing a database query
-* ERR_PROMOTED (8)         Exiting program because the node has been promoted to master
-* ERR_BAD_PASSWORD (9)     Password used to connect to a database was rejected
-* ERR_STR_OVERFLOW (10)    String overflow error
-* ERR_FAILOVER_FAIL (11)   Error encountered during failover (repmgrd only)
-* ERR_BAD_SSH (12)         Error when connecting to remote host via SSH
-* ERR_SYS_FAILURE (13)     Error when forking (repmgrd only)
-* ERR_BAD_BASEBACKUP (14)  Error when executing pg_basebackup
-* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only)
+* SUCCESS (0)                Program ran successfully.
+* ERR_BAD_CONFIG (1)         Configuration file could not be parsed or was invalid
+* ERR_BAD_RSYNC (2)          An rsync call made by the program returned an error
+                               (repmgr only)
+* ERR_NO_RESTART (4)         An attempt to restart a PostgreSQL instance failed
+* ERR_DB_CON (6)             Error when trying to connect to a database
+* ERR_DB_QUERY (7)           Error while executing a database query
+* ERR_PROMOTED (8)           Exiting program because the node has been promoted to master
+* ERR_STR_OVERFLOW (10)      String overflow error
+* ERR_FAILOVER_FAIL (11)     Error encountered during failover (repmgrd only)
+* ERR_BAD_SSH (12)           Error when connecting to remote host via SSH (repmgr only)
+* ERR_SYS_FAILURE (13)       Error when forking (repmgrd only)
+* ERR_BAD_BASEBACKUP (14)    Error when executing pg_basebackup (repmgr only)
+* ERR_MONITORING_FAIL (16)   Unrecoverable error encountered during monitoring (repmgrd only)
+* ERR_BAD_BACKUP_LABEL (17)  Corrupt or unreadable backup label encountered (repmgr only)
+* ERR_SWITCHOVER_FAIL (18)   Error encountered during switchover (repmgr only)
+* ERR_BARMAN (19)            Unrecoverable error while accessing the barman server (repmgr only)
+* ERR_REGISTRATION_SYNC (20) After registering a standby, local node record was not
+                                synchronised (repmgr only, with --wait option)

 Support and Assistance
 ----------------------
@@ -1379,5 +1815,6 @@ Thanks from the repmgr core team.
 Further reading
 ---------------

+* http://blog.2ndquadrant.com/improvements-in-repmgr-3-1-4/
 * http://blog.2ndquadrant.com/managing-useful-clusters-repmgr/
 * http://blog.2ndquadrant.com/easier_postgresql_90_clusters/
--- a/RHEL/repmgr3-93.spec
+++ b/RHEL/repmgr3-93.spec
@@ -1,61 +0,0 @@
-Summary: repmgr
-Name: repmgr
-Version: 3.0
-Release: 1
-License: GPLv3
-Group: System Environment/Daemons
-URL: http://repmgr.org
-Packager: Ian Barwick <ian@2ndquadrant.com>
-Vendor: 2ndQuadrant Limited
-Distribution: centos
-Source0: %{name}-%{version}.tar.gz
-BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
-
-%description
-repmgr is a utility suite which greatly simplifies
-the process of setting up and managing replication
-using streaming replication within a cluster of
-PostgreSQL servers.
-
-%prep
-%setup
-
-%build
-export PATH=$PATH:/usr/pgsql-9.3/bin/
-%{__make} USE_PGXS=1
-
-%install
-[ "%{buildroot}" != "/" ] && %{__rm} -rf %{buildroot}
-
-export PATH=$PATH:/usr/pgsql-9.3/bin/
-%{__make} USE_PGXS=1 install DESTDIR=%{buildroot} INSTALL="install -p"
-%{__make} USE_PGXS=1 install_prog DESTDIR=%{buildroot} INSTALL="install -p"
-%{__make} USE_PGXS=1 install_rhel DESTDIR=%{buildroot} INSTALL="install -p"
-
-
-%clean
-[ "%{buildroot}" != "/" ] && %{__rm} -rf %{buildroot}
-
-
-%files
-%defattr(-,root,root)
-/usr/bin/repmgr
-/usr/bin/repmgrd
-/usr/pgsql-9.3/bin/repmgr
-/usr/pgsql-9.3/bin/repmgrd
-/usr/pgsql-9.3/lib/repmgr_funcs.so
-/usr/pgsql-9.3/share/contrib/repmgr.sql
-/usr/pgsql-9.3/share/contrib/repmgr_funcs.sql
-/usr/pgsql-9.3/share/contrib/uninstall_repmgr.sql
-/usr/pgsql-9.3/share/contrib/uninstall_repmgr_funcs.sql
-%attr(0755,root,root)/etc/init.d/repmgrd
-%attr(0644,root,root)/etc/sysconfig/repmgrd
-%attr(0644,root,root)/etc/repmgr/repmgr.conf.sample
-
-%changelog
-* Tue Mar 10 2015 Ian Barwick ian@2ndquadrant.com>
- build for repmgr 3.0
-* Thu Jun 05 2014 Nathan Van Overloop <nathan.van.overloop@nexperteam.be> 2.0.2
- fix witness creation to create db and user if needed
-* Fri Apr 04 2014 Nathan Van Overloop <nathan.van.overloop@nexperteam.be> 2.0.1
- initial build for RHEL6
--- a/RHEL/repmgrd.init
+++ b/RHEL/repmgrd.init
@@ -1,133 +0,0 @@
-#!/bin/sh
-#
-# chkconfig: - 75 16
-# description: Enable repmgrd replication management and monitoring daemon for PostgreSQL
-# processname: repmgrd
-# pidfile="/var/run/${NAME}.pid"
-
-# Source function library.
-INITD=/etc/rc.d/init.d
-. $INITD/functions
-
-# Get function listing for cross-distribution logic.
-TYPESET=`typeset -f|grep "declare"`
-
-# Get network config.
-. /etc/sysconfig/network
-
-DESC="PostgreSQL replication management and monitoring daemon"
-NAME=repmgrd
-
-REPMGRD_ENABLED=no
-REPMGRD_OPTS=
-REPMGRD_USER=postgres
-REPMGRD_BIN=/usr/pgsql-9.3/bin/repmgrd
-REPMGRD_PIDFILE=/var/run/repmgrd.pid
-REPMGRD_LOCK=/var/lock/subsys/${NAME}
-REPMGRD_LOG=/var/lib/pgsql/9.3/data/pg_log/repmgrd.log
-
-# Read configuration variable file if it is present
-[ -r /etc/sysconfig/$NAME ] && . /etc/sysconfig/$NAME
-
-# For SELinux we need to use 'runuser' not 'su'
-if [ -x /sbin/runuser ]
-then
-    SU=runuser
-else
-    SU=su
-fi
-
-test -x $REPMGRD_BIN || exit 0
-
-case "$REPMGRD_ENABLED" in
-    [Yy]*)
-	break
-	;;
-    *)
-	exit 0
-	;;
-esac
-
-
-if [ -z "${REPMGRD_OPTS}" ]
-then
-    echo "Not starting ${NAME}, REPMGRD_OPTS not set in /etc/sysconfig/${NAME}"
-    exit 0
-fi
-
-start()
-{
-    REPMGRD_START=$"Starting ${NAME} service: "
-
-    # Make sure startup-time log file is valid
-    if [ ! -e "${REPMGRD_LOG}" -a ! -h "${REPMGRD_LOG}" ]
-    then
-        touch "${REPMGRD_LOG}" || exit 1
-        chown ${REPMGRD_USER}:postgres "${REPMGRD_LOG}"
-        chmod go-rwx "${REPMGRD_LOG}"
-        [ -x /sbin/restorecon ] && /sbin/restorecon "${REPMGRD_LOG}"
-    fi
-
-    echo -n "${REPMGRD_START}"
-    $SU -l $REPMGRD_USER -c "${REPMGRD_BIN} ${REPMGRD_OPTS} -p ${REPMGRD_PIDFILE} &" >> "${REPMGRD_LOG}" 2>&1 < /dev/null
-    sleep 2
-    pid=`head -n 1 "${REPMGRD_PIDFILE}" 2>/dev/null`
-    if [ "x${pid}" != "x" ]
-    then
-        success "${REPMGRD_START}"
-        touch "${REPMGRD_LOCK}"
-        echo $pid > "${REPMGRD_PIDFILE}"
-        echo
-    else
-        failure "${REPMGRD_START}"
-        echo
-        script_result=1
-    fi
-}
-
-stop()
-{
-    echo -n $"Stopping ${NAME} service: "
-    if [ -e "${REPMGRD_LOCK}" ]
-    then
-        killproc ${NAME}
-        ret=$? 
-        if [ $ret -eq 0 ]
-        then
-            echo_success
-            rm -f "${REPMGRD_PIDFILE}"
-            rm -f "${REPMGRD_LOCK}"
-        else
-            echo_failure
-            script_result=1
-        fi
-    else
-        # not running; per LSB standards this is "ok"   
-        echo_success
-    fi
-    echo
-}
-
-
-# See how we were called.
-case "$1" in
-  start)
-        start
-        ;;
-  stop)
-        stop
-        ;;
-  status)
-        status -p $REPMGRD_PIDFILE $NAME
-        script_result=$?
-        ;;
-  restart)
-        stop
-	start
-        ;;
-  *)
-        echo $"Usage: $0 {start|stop|status|restart}"
-        exit 2
-esac
-
-exit $script_result
--- a/RHEL/repmgrd.sysconfig
+++ b/RHEL/repmgrd.sysconfig
@@ -1,21 +0,0 @@
-# default settings for repmgrd. This file is source by /bin/sh from
-# /etc/init.d/repmgrd
-
-# disable repmgrd by default so it won't get started upon installation
-# valid values: yes/no
-REPMGRD_ENABLED=no
-
-# Options for repmgrd (required)
-#REPMGRD_OPTS="--verbose -d -f /var/lib/pgsql/repmgr/repmgr.conf"
-
-# User to run repmgrd as
-#REPMGRD_USER=postgres
-
-# repmgrd binary
-#REPMGRD_BIN=/usr/bin/repmgrd
-
-# pid file
-#REPMGRD_PIDFILE=/var/lib/pgsql/repmgr/repmgrd.pid
-
-# log file
-#REPMGRD_LOG=/var/lib/pgsql/repmgr/repmgrd.log
--- a/5
+++ b/5
@@ -53,8 +53,9 @@ Planned feature improvements
  requested, activate the replication slot using pg_receivexlog to negate the
  need to set `wal_keep_segments` just for the initial clone (9.4 and 9.5).

-* Take into account the fact that a standby can obtain WAL from an archive,
-  so even if direct streaming replication is interrupted, it may be up-to-date
+* repmgr: enable "standby follow" to point a standby at another standby, not
+  just the replication cluster master (see GitHub #130)
+

 Usability improvements
 ======================
--- a/config.c
+++ b/config.c
@@ -1,5 +1,6 @@
 /*
 * config.c - Functions to parse the config file
+ *
 * Copyright (C) 2ndQuadrant, 2010-2016
 *
 * This program is free software: you can redistribute it and/or modify
@@ -26,9 +27,9 @@

 static void parse_event_notifications_list(t_configuration_options *options, const char *arg);
 static void tablespace_list_append(t_configuration_options *options, const char *arg);
-static void exit_with_errors(ErrorList *config_errors);
+static void exit_with_errors(ItemList *config_errors);

-const static char *_progname = '\0';
+const static char *_progname = NULL;
 static char config_file_path[MAXPGPATH];
 static bool config_file_provided = false;
 bool config_file_found = false;
@@ -201,7 +202,7 @@ parse_config(t_configuration_options *options)
 	char	   *conninfo_errmsg = NULL;

 	/* Collate configuration file errors here for friendlier reporting */
-	static ErrorList config_errors = { NULL, NULL };
+	static ItemList config_errors = { NULL, NULL };

 	bool		node_found = false;

@@ -214,16 +215,24 @@ parse_config(t_configuration_options *options)
 	options->upstream_node = NO_UPSTREAM_NODE;
 	options->use_replication_slots = 0;
 	memset(options->conninfo, 0, sizeof(options->conninfo));
+	memset(options->barman_server, 0, sizeof(options->barman_server));
+	memset(options->barman_config, 0, sizeof(options->barman_config));
 	options->failover = MANUAL_FAILOVER;
 	options->priority = DEFAULT_PRIORITY;
 	memset(options->node_name, 0, sizeof(options->node_name));
 	memset(options->promote_command, 0, sizeof(options->promote_command));
 	memset(options->follow_command, 0, sizeof(options->follow_command));
+	memset(options->service_stop_command, 0, sizeof(options->service_stop_command));
+	memset(options->service_start_command, 0, sizeof(options->service_start_command));
+	memset(options->service_restart_command, 0, sizeof(options->service_restart_command));
+	memset(options->service_reload_command, 0, sizeof(options->service_reload_command));
+	memset(options->service_promote_command, 0, sizeof(options->service_promote_command));
 	memset(options->rsync_options, 0, sizeof(options->rsync_options));
 	memset(options->ssh_options, 0, sizeof(options->ssh_options));
 	memset(options->pg_bindir, 0, sizeof(options->pg_bindir));
 	memset(options->pg_ctl_options, 0, sizeof(options->pg_ctl_options));
 	memset(options->pg_basebackup_options, 0, sizeof(options->pg_basebackup_options));
+	memset(options->restore_command, 0, sizeof(options->restore_command));

 	/* default master_response_timeout is 60 seconds */
 	options->master_response_timeout = 60;
@@ -239,6 +248,8 @@ parse_config(t_configuration_options *options)
 	options->witness_repl_nodes_sync_interval_secs = 30;

 	memset(options->event_notification_command, 0, sizeof(options->event_notification_command));
+	options->event_notifications.head = NULL;
+	options->event_notifications.tail = NULL;

 	options->tablespace_mapping.head = NULL;
 	options->tablespace_mapping.tail = NULL;
@@ -303,6 +314,10 @@ parse_config(t_configuration_options *options)
 			options->upstream_node = repmgr_atoi(value, "upstream_node", &config_errors, false);
 		else if (strcmp(name, "conninfo") == 0)
 			strncpy(options->conninfo, value, MAXLEN);
+		else if (strcmp(name, "barman_server") == 0)
+			strncpy(options->barman_server, value, MAXLEN);
+		else if (strcmp(name, "barman_config") == 0)
+			strncpy(options->barman_config, value, MAXLEN);
 		else if (strcmp(name, "rsync_options") == 0)
 			strncpy(options->rsync_options, value, QUERY_STR_LEN);
 		else if (strcmp(name, "ssh_options") == 0)
@@ -327,7 +342,7 @@ parse_config(t_configuration_options *options)
 			}
 			else
 			{
-				error_list_append(&config_errors,_("value for 'failover' must be 'automatic' or 'manual'\n"));
+				item_list_append(&config_errors,_("value for 'failover' must be 'automatic' or 'manual'\n"));
 			}
 		}
 		else if (strcmp(name, "priority") == 0)
@@ -338,9 +353,20 @@ parse_config(t_configuration_options *options)
 			strncpy(options->promote_command, value, MAXLEN);
 		else if (strcmp(name, "follow_command") == 0)
 			strncpy(options->follow_command, value, MAXLEN);
+		else if (strcmp(name, "service_stop_command") == 0)
+			strncpy(options->service_stop_command, value, MAXLEN);
+		else if (strcmp(name, "service_start_command") == 0)
+			strncpy(options->service_start_command, value, MAXLEN);
+		else if (strcmp(name, "service_restart_command") == 0)
+			strncpy(options->service_restart_command, value, MAXLEN);
+		else if (strcmp(name, "service_reload_command") == 0)
+			strncpy(options->service_reload_command, value, MAXLEN);
+		else if (strcmp(name, "service_promote_command") == 0)
+			strncpy(options->service_promote_command, value, MAXLEN);
 		else if (strcmp(name, "master_response_timeout") == 0)
 			options->master_response_timeout = repmgr_atoi(value, "master_response_timeout", &config_errors, false);
-		/* 'primary_response_timeout' as synonym for 'master_response_timeout' -
+		/*
+		 * 'primary_response_timeout' as synonym for 'master_response_timeout' -
 		 * we'll switch terminology in a future release (3.1?)
 		 */
 		else if (strcmp(name, "primary_response_timeout") == 0)
@@ -372,6 +398,8 @@ parse_config(t_configuration_options *options)
 			parse_event_notifications_list(options, value);
 		else if (strcmp(name, "tablespace_mapping") == 0)
 			tablespace_list_append(options, value);
+		else if (strcmp(name, "restore_command") == 0)
+			strncpy(options->restore_command, value, MAXLEN);
 		else
 		{
 			known_parameter = false;
@@ -391,7 +419,7 @@ parse_config(t_configuration_options *options)
 					 _("no value provided for parameter \"%s\""),
 					 name);

-			error_list_append(&config_errors, error_message_buf);
+			item_list_append(&config_errors, error_message_buf);
 		}
 	}

@@ -400,11 +428,11 @@ parse_config(t_configuration_options *options)

 	if (node_found == false)
 	{
-		error_list_append(&config_errors, _("\"node\": parameter was not found"));
+		item_list_append(&config_errors, _("\"node\": parameter was not found"));
 	}
 	else if (options->node == 0)
 	{
-		error_list_append(&config_errors, _("\"node\": must be greater than zero"));
+		item_list_append(&config_errors, _("\"node\": must be greater than zero"));
 	}

 	if (strlen(options->conninfo))
@@ -424,7 +452,7 @@ parse_config(t_configuration_options *options)
 					 _("\"conninfo\": %s"),
 					 conninfo_errmsg);

-			error_list_append(&config_errors, error_message_buf);
+			item_list_append(&config_errors, error_message_buf);
 		}

 		PQconninfoFree(conninfo_options);
@@ -619,6 +647,13 @@ reload_config(t_configuration_options *orig_options)
 		config_changed = true;
 	}

+	/* barman_server */
+	if (strcmp(orig_options->barman_server, new_options.barman_server) != 0)
+	{
+		strcpy(orig_options->barman_server, new_options.barman_server);
+		config_changed = true;
+	}
+
 	/* node */
 	if (orig_options->node != new_options.node)
 	{
@@ -755,11 +790,11 @@ reload_config(t_configuration_options *orig_options)


 void
-error_list_append(ErrorList *error_list, char *error_message)
+item_list_append(ItemList *item_list, char *error_message)
 {
-	ErrorListCell *cell;
+	ItemListCell *cell;

-	cell = (ErrorListCell *) pg_malloc0(sizeof(ErrorListCell));
+	cell = (ItemListCell *) pg_malloc0(sizeof(ItemListCell));

 	if (cell == NULL)
 	{
@@ -767,19 +802,19 @@ error_list_append(ErrorList *error_list, char *error_message)
 		exit(ERR_BAD_CONFIG);
 	}

-	cell->error_message = pg_malloc0(MAXLEN);
-	strncpy(cell->error_message, error_message, MAXLEN);
+	cell->string = pg_malloc0(MAXLEN);
+	strncpy(cell->string, error_message, MAXLEN);

-	if (error_list->tail)
+	if (item_list->tail)
 	{
-		error_list->tail->next = cell;
+		item_list->tail->next = cell;
 	}
 	else
 	{
-		error_list->head = cell;
+		item_list->head = cell;
 	}

-	error_list->tail = cell;
+	item_list->tail = cell;
 }


@@ -789,7 +824,7 @@ error_list_append(ErrorList *error_list, char *error_message)
 * otherwise exit
 */
 int
-repmgr_atoi(const char *value, const char *config_item, ErrorList *error_list, bool allow_negative)
+repmgr_atoi(const char *value, const char *config_item, ItemList *error_list, bool allow_negative)
 {
 	char	  *endptr;
 	long	   longval = 0;
@@ -838,7 +873,7 @@ repmgr_atoi(const char *value, const char *config_item, ErrorList *error_list, b
 			exit(ERR_BAD_CONFIG);
 		}

-		error_list_append(error_list, error_message_buf);
+		item_list_append(error_list, error_message_buf);
 	}

 	return (int32) longval;
@@ -980,15 +1015,15 @@ parse_event_notifications_list(t_configuration_options *options, const char *arg


 static void
-exit_with_errors(ErrorList *config_errors)
+exit_with_errors(ItemList *config_errors)
 {
-	ErrorListCell *cell;
+	ItemListCell *cell;

 	log_err(_("%s: following errors were found in the configuration file.\n"), progname());

 	for (cell = config_errors->head; cell; cell = cell->next)
 	{
-		log_err("%s\n", cell->error_message);
+		log_err("%s\n", cell->string);
 	}

 	exit(ERR_BAD_CONFIG);
--- a/config.h
+++ b/config.h
@@ -1,5 +1,6 @@
 /*
 * config.h
+ *
 * Copyright (c) 2ndQuadrant, 2010-2016
 *
 * This program is free software: you can redistribute it and/or modify
@@ -57,11 +58,20 @@ typedef struct
 	int			node;
 	int         upstream_node;
 	char		conninfo[MAXLEN];
+	char		barman_server[MAXLEN];
+	char		barman_config[MAXLEN];
 	int			failover;
 	int			priority;
 	char		node_name[MAXLEN];
+	/* commands executed by repmgrd */
 	char		promote_command[MAXLEN];
 	char		follow_command[MAXLEN];
+	/* Overrides for pg_ctl commands */
+	char		service_stop_command[MAXLEN];
+	char		service_start_command[MAXLEN];
+	char		service_restart_command[MAXLEN];
+	char		service_reload_command[MAXLEN];
+	char		service_promote_command[MAXLEN];
 	char		loglevel[MAXLEN];
 	char		logfacility[MAXLEN];
 	char		rsync_options[QUERY_STR_LEN];
@@ -72,6 +82,7 @@ typedef struct
 	char		pg_bindir[MAXLEN];
 	char		pg_ctl_options[MAXLEN];
 	char		pg_basebackup_options[MAXLEN];
+	char		restore_command[MAXLEN];
 	char		logfile[MAXLEN];
 	int			monitor_interval_secs;
 	int			retry_promote_interval_secs;
@@ -82,19 +93,39 @@ typedef struct
 	TablespaceList tablespace_mapping;
 }	t_configuration_options;

-#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} }
+/*
+ * The following will initialize the structure with a minimal set of options;
+ * actual defaults are set in parse_config() before parsing the configuration file
+ */
+#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", "", "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, { NULL, NULL } }

-typedef struct ErrorListCell
+typedef struct ItemListCell
 {
-	struct ErrorListCell *next;
-	char			     *error_message;
-} ErrorListCell;
+	struct ItemListCell *next;
+	char			    *string;
+} ItemListCell;

-typedef struct ErrorList
+typedef struct ItemList
 {
-	ErrorListCell *head;
-	ErrorListCell *tail;
-} ErrorList;
+	ItemListCell *head;
+	ItemListCell *tail;
+} ItemList;
+
+typedef struct TablespaceDataListCell	/* one node of a singly linked TablespaceDataList */
+{
+	struct TablespaceDataListCell *next;	/* following cell in the list */
+	char	   *name;	/* presumably the tablespace name - TODO confirm */
+	char	   *oid;	/* presumably the tablespace OID, held as text - TODO confirm */
+	char	   *location;	/* presumably the tablespace directory path - TODO confirm */
+	/* optional payload: a stdio stream attached to this cell; how it is used is not visible here */
+	FILE       *f;
+} TablespaceDataListCell;
+
+typedef struct TablespaceDataList	/* list header for TablespaceDataListCell entries */
+{
+	TablespaceDataListCell *head;	/* first cell; presumably NULL when the list is empty */
+	TablespaceDataListCell *tail;	/* last cell; presumably kept so appends need no traversal */
+} TablespaceDataList;

 void set_progname(const char *argv0);
 const char * progname(void);
@@ -104,10 +135,10 @@ bool		reload_config(t_configuration_options *orig_options);
 bool		parse_config(t_configuration_options *options);
 void		parse_line(char *buff, char *name, char *value);
 char	   *trim(char *s);
-void		error_list_append(ErrorList *error_list, char *error_message);
+void		item_list_append(ItemList *item_list, char *error_message);
 int			repmgr_atoi(const char *s,
 						const char *config_item,
-						ErrorList *error_list,
+						ItemList *error_list,
 						bool allow_negative);
-
+extern bool		config_file_found;
 #endif
--- a/dbutils.c
+++ b/dbutils.c
@@ -1,5 +1,6 @@
 /*
 * dbutils.c - Database connection/management functions
+ *
 * Copyright (C) 2ndQuadrant, 2010-2016
 *
 * This program is free software: you can redistribute it and/or modify
@@ -31,9 +32,10 @@
 char repmgr_schema[MAXLEN] = "";
 char repmgr_schema_quoted[MAXLEN] = "";

+static int _get_node_record(PGconn *conn, char *cluster, char *sqlquery, t_node_info *node_info);

 PGconn *
-_establish_db_connection(const char *conninfo, const bool exit_on_error, const bool log_notice)
+_establish_db_connection(const char *conninfo, const bool exit_on_error, const bool log_notice, const bool verbose_only)
 {
 	/* Make a connection to the database */
 	PGconn	   *conn = NULL;
@@ -49,15 +51,23 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
 	/* Check to see that the backend connection was successfully made */
 	if ((PQstatus(conn) != CONNECTION_OK))
 	{
-		if (log_notice)
+		bool emit_log = true;
+
+		if (verbose_only == true && verbose_logging == false)
+			emit_log = false;
+
+		if (emit_log)
 		{
-			log_notice(_("connection to database failed: %s\n"),
-					PQerrorMessage(conn));
-		}
-		else
-		{
-			log_err(_("connection to database failed: %s\n"),
-					PQerrorMessage(conn));
+			if (log_notice)
+			{
+				log_notice(_("connection to database failed: %s\n"),
+						   PQerrorMessage(conn));
+			}
+			else
+			{
+				log_err(_("connection to database failed: %s\n"),
+						PQerrorMessage(conn));
+			}
 		}

 		if (exit_on_error)
@@ -70,16 +80,35 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
 	return conn;
 }

+
+/*
+ * Establish a database connection, optionally exit on error
+ */
 PGconn *
 establish_db_connection(const char *conninfo, const bool exit_on_error)
 {
-	return _establish_db_connection(conninfo, exit_on_error, false);
+	return _establish_db_connection(conninfo, exit_on_error, false, false);
 }

+/*
+ * Attempt to establish a database connection, never exit on error, only
+ * output error messages if --verbose option used
+ */
 PGconn *
-test_db_connection(const char *conninfo, const bool exit_on_error)
+establish_db_connection_quiet(const char *conninfo)
 {
-	return _establish_db_connection(conninfo, exit_on_error, true);
+	return _establish_db_connection(conninfo, false, false, true);
+}
+
+/*
+ * Attempt to establish a database connection without ever exiting on
+ * failure (exit_on_error = false). A failed connection is reported with
+ * log_notice() rather than log_err() (log_notice = true), which is useful
+ * when connection failure is expected; verbose_only is false, so the
+ * message is emitted whether or not --verbose was supplied.
+ */
+PGconn *
+test_db_connection(const char *conninfo)
+{
+	return _establish_db_connection(conninfo, false, true, false);
+}


@@ -185,7 +214,7 @@ check_cluster_schema(PGconn *conn)
 	char		sqlquery[QUERY_STR_LEN];

 	sqlquery_snprintf(sqlquery,
-					  "SELECT 1 FROM pg_namespace WHERE nspname = '%s'",
+					  "SELECT 1 FROM pg_catalog.pg_namespace WHERE nspname = '%s'",
 					  get_repmgr_schema());

 	log_verbose(LOG_DEBUG, "check_cluster_schema(): %s\n", sqlquery);
@@ -380,7 +409,7 @@ guc_set(PGconn *conn, const char *parameter, const char *op,
 	int			retval = 1;

 	sqlquery_snprintf(sqlquery,
-					  "SELECT true FROM pg_settings "
+					  "SELECT true FROM pg_catalog.pg_settings "
 					  " WHERE name = '%s' AND setting %s '%s'",
 					  parameter, op, value);

@@ -416,7 +445,7 @@ guc_set_typed(PGconn *conn, const char *parameter, const char *op,
 	int			retval = 1;

 	sqlquery_snprintf(sqlquery,
-					  "SELECT true FROM pg_settings "
+					  "SELECT true FROM pg_catalog.pg_settings "
 					  " WHERE name = '%s' AND setting::%s %s '%s'::%s",
 					  parameter, datatype, op, value, datatype);

@@ -448,7 +477,7 @@ get_cluster_size(PGconn *conn, char *size)

 	sqlquery_snprintf(sqlquery,
 					  "SELECT pg_catalog.pg_size_pretty(SUM(pg_catalog.pg_database_size(oid))::bigint) "
-					  "	 FROM pg_database ");
+					  "	 FROM pg_catalog.pg_database ");

 	log_verbose(LOG_DEBUG, "get_cluster_size():\n%s\n", sqlquery);

@@ -475,11 +504,11 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
 	char		sqlquery[QUERY_STR_LEN];
 	PGresult   *res;
 	int			i;
-	bool        success = true;
+	bool        success = false;

 	sqlquery_snprintf(sqlquery,
 					  "SELECT name, setting "
-					  " FROM pg_settings WHERE name = '%s'",
+					  "  FROM pg_catalog.pg_settings WHERE name = '%s'",
 					  setting);

 	log_verbose(LOG_DEBUG, "get_pg_setting(): %s\n", sqlquery);
@@ -538,7 +567,7 @@ get_conninfo_value(const char *conninfo, const char *keyword, char *output)

 	conninfo_options = PQconninfoParse(conninfo, NULL);

-	if (conninfo_options == false)
+	if (conninfo_options == NULL)
 	{
 		log_err(_("Unable to parse provided conninfo string \"%s\""), conninfo);
 		return false;
@@ -916,7 +945,7 @@ get_repmgr_schema_quoted(PGconn *conn)


 bool
-create_replication_slot(PGconn *conn, char *slot_name, int server_version_num)
+create_replication_slot(PGconn *conn, char *slot_name, int server_version_num, PQExpBufferData *error_msg)
 {
 	char				sqlquery[QUERY_STR_LEN];
 	int					query_res;
@@ -935,8 +964,9 @@ create_replication_slot(PGconn *conn, char *slot_name, int server_version_num)
 	{
 		if (strcmp(slot_info.slot_type, "physical") != 0)
 		{
-			log_err(_("Slot '%s' exists and is not a physical slot\n"),
-					slot_name);
+			appendPQExpBuffer(error_msg,
+							  _("Slot '%s' exists and is not a physical slot\n"),
+							  slot_name);
 			return false;
 		}

@@ -948,8 +978,9 @@ create_replication_slot(PGconn *conn, char *slot_name, int server_version_num)
 			return true;
 		}

-		log_err(_("Slot '%s' already exists as an active slot\n"),
-				slot_name);
+		appendPQExpBuffer(error_msg,
+						  _("Slot '%s' already exists as an active slot\n"),
+						  slot_name);
 		return false;
 	}

@@ -957,25 +988,26 @@ create_replication_slot(PGconn *conn, char *slot_name, int server_version_num)
 	if (server_version_num >= 90600)
 	{
 		sqlquery_snprintf(sqlquery,
-						  "SELECT * FROM pg_create_physical_replication_slot('%s', TRUE)",
+						  "SELECT * FROM pg_catalog.pg_create_physical_replication_slot('%s', TRUE)",
 						  slot_name);
 	}
 	else
 	{
 		sqlquery_snprintf(sqlquery,
-						  "SELECT * FROM pg_create_physical_replication_slot('%s')",
+						  "SELECT * FROM pg_catalog.pg_create_physical_replication_slot('%s')",
 						  slot_name);
 	}

-	log_debug(_("create_replication_slot(): Creating slot '%s' on primary\n"), slot_name);
+	log_debug(_("create_replication_slot(): Creating slot '%s' on master\n"), slot_name);
 	log_verbose(LOG_DEBUG, "create_replication_slot():\n%s\n", sqlquery);

 	res = PQexec(conn, sqlquery);
 	if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
-		log_err(_("unable to create slot '%s' on the primary node: %s\n"),
-				slot_name,
-				PQerrorMessage(conn));
+		appendPQExpBuffer(error_msg,
+						  _("unable to create slot '%s' on the master node: %s\n"),
+						  slot_name,
+						  PQerrorMessage(conn));
 		PQclear(res);
 		return false;
 	}
@@ -993,7 +1025,7 @@ get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record)

 	sqlquery_snprintf(sqlquery,
 					  "SELECT slot_name, slot_type, active "
-                      "  FROM pg_replication_slots "
+                      "  FROM pg_catalog.pg_replication_slots "
 					  " WHERE slot_name = '%s' ",
 					  slot_name);

@@ -1195,7 +1227,8 @@ witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster

 	/* Get current records from primary */
 	sqlquery_snprintf(sqlquery,
-					  "SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name, active FROM %s.repl_nodes",
+					  "SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name, active "
+					  "  FROM %s.repl_nodes",
 					  get_repmgr_schema_quoted(masterconn));

 	log_verbose(LOG_DEBUG, "witness_copy_node_records():\n%s\n", sqlquery);
@@ -1309,7 +1342,8 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
 	sqlquery_snprintf(sqlquery,
 					  "INSERT INTO %s.repl_nodes "
 					  "       (id, type, upstream_node_id, cluster, "
-					  "        name, conninfo, slot_name, priority, active) "
+					  "        name, conninfo, slot_name, "
+					  "        priority, active) "
 					  "VALUES (%i, '%s', %s, '%s', '%s', '%s', %s, %i, %s) ",
 					  get_repmgr_schema_quoted(conn),
 					  node,
@@ -1455,7 +1489,6 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
 						PQerrorMessage(conn));

 			success = false;
-
 		}
 		else
 		{
@@ -1681,12 +1714,12 @@ int
 get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info)
 {
 	char		sqlquery[QUERY_STR_LEN];
-	PGresult   *res;
-	int         ntuples;
+	int		    result;

 	sqlquery_snprintf(
 		sqlquery,
-		"SELECT id, type, upstream_node_id, name, conninfo, slot_name, priority, active"
+		"SELECT id, type, upstream_node_id, name, conninfo, "
+		"       slot_name, priority, active"
 		"  FROM %s.repl_nodes "
 		" WHERE cluster = '%s' "
 		"   AND id = %i",
@@ -1696,6 +1729,51 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info

 	log_verbose(LOG_DEBUG, "get_node_record():\n%s\n", sqlquery);

+	result = _get_node_record(conn, cluster, sqlquery, node_info);
+
+	if (result == 0)
+	{
+		log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %i\n", node_id);
+	}
+
+	return result;
+}
+
+/*
+ * get_node_record_by_name()
+ *
+ * Build the query selecting the repl_nodes row whose name is `node_name`
+ * within `cluster`, then pass it, together with `node_info`, to
+ * _get_node_record().
+ *
+ * Returns whatever _get_node_record() returns; a result of 0 means no
+ * matching record was found.
+ *
+ * NOTE(review): `cluster` and `node_name` are interpolated into the query
+ * text unescaped, so callers are presumably expected to pass trusted
+ * values - confirm.
+ */
+int
+get_node_record_by_name(PGconn *conn, char *cluster, const char *node_name, t_node_info *node_info)
+{
+	char		sqlquery[QUERY_STR_LEN];
+	int			result;
+
+	sqlquery_snprintf(
+		sqlquery,
+		"SELECT id, type, upstream_node_id, name, conninfo, slot_name, priority, active"
+		"  FROM %s.repl_nodes "
+		" WHERE cluster = '%s' "
+		"   AND name = '%s'",
+		get_repmgr_schema_quoted(conn),
+		cluster,
+		node_name);
+
+	log_verbose(LOG_DEBUG, "get_node_record_by_name():\n%s\n", sqlquery);
+
+	result = _get_node_record(conn, cluster, sqlquery, node_info);
+
+	if (result == 0)
+	{
+		/* name this function (not get_node_record()) so the log is traceable */
+		log_verbose(LOG_DEBUG, "get_node_record_by_name(): no record found for node %s\n", node_name);
+	}
+
+	return result;
+}
+
+
+static int
+_get_node_record(PGconn *conn, char *cluster, char *sqlquery, t_node_info *node_info)
+{
+	int         ntuples;
+	PGresult   *res;
+
 	res = PQexec(conn, sqlquery);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
@@ -1706,7 +1784,6 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info

 	if (ntuples == 0)
 	{
-		log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %i\n", node_id);
 		return 0;
 	}

@@ -1727,6 +1804,9 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info
 }


+
+
+
 int
 get_node_replication_state(PGconn *conn, char *node_name, char *output)
 {
--- a/dbutils.h
+++ b/dbutils.h
@@ -1,5 +1,6 @@
 /*
 * dbutils.h
+ *
 * Copyright (c) 2ndQuadrant, 2010-2016
 *
 * This program is free software: you can redistribute it and/or modify
@@ -21,6 +22,7 @@
 #define _REPMGR_DBUTILS_H_

 #include "access/xlogdefs.h"
+#include "pqexpbuffer.h"

 #include "config.h"
 #include "strutil.h"
@@ -52,18 +54,6 @@ typedef struct s_node_info
 }	t_node_info;


-/*
- * Struct to store replication slot information
- */
-
-typedef struct s_replication_slot
-{
-	char slot_name[MAXLEN];
-    char slot_type[MAXLEN];
-	bool active;
-}   t_replication_slot;
-
-
 #define T_NODE_INFO_INITIALIZER { \
  NODE_NOT_FOUND, \
  NO_UPSTREAM_NODE, \
@@ -78,13 +68,27 @@ typedef struct s_replication_slot
  InvalidXLogRecPtr \
 }

+/*
+ * Struct to store replication slot information; the field names match the
+ * columns get_slot_record() selects from pg_catalog.pg_replication_slots
+ */
+
+typedef struct s_replication_slot
+{
+	char slot_name[MAXLEN];	/* name of the slot */
+    char slot_type[MAXLEN];	/* create_replication_slot() compares this with "physical" */
+	bool active;	/* presumably true while the slot is in use - TODO confirm */
+}   t_replication_slot;
+
+extern char		repmgr_schema[MAXLEN];
+
 PGconn *_establish_db_connection(const char *conninfo,
 								 const bool exit_on_error,
-								 const bool log_notice);
+								 const bool log_notice,
+								 const bool verbose_only);
 PGconn *establish_db_connection(const char *conninfo,
 								const bool exit_on_error);
-PGconn *test_db_connection(const char *conninfo,
-						   const bool exit_on_error);
+PGconn *establish_db_connection_quiet(const char *conninfo);
+PGconn *test_db_connection(const char *conninfo);
 PGconn *establish_db_connection_by_params(const char *keywords[],
 								  const char *values[],
 								  const bool exit_on_error);
@@ -115,7 +119,7 @@ int			wait_connection_availability(PGconn *conn, long long timeout);
 bool		cancel_query(PGconn *conn, int timeout);
 char       *get_repmgr_schema(void);
 char       *get_repmgr_schema_quoted(PGconn *conn);
-bool		create_replication_slot(PGconn *conn, char *slot_name, int server_version_num);
+bool		create_replication_slot(PGconn *conn, char *slot_name, int server_version_num, PQExpBufferData *error_msg);
 int			get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record);
 bool		drop_replication_slot(PGconn *conn, char *slot_name);
 bool		start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint);
@@ -125,6 +129,7 @@ bool		witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *c
 bool		create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active);
 bool		delete_node_record(PGconn *conn, int node, char *action);
 int			get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info);
+int			get_node_record_by_name(PGconn *conn, char *cluster, const char *node_name, t_node_info *node_info);
 bool        update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active);
 bool        update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id);
 bool        create_event_record(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details);
--- a/debian/DEBIAN/control
+++ b/debian/DEBIAN/control
@@ -1,9 +1,9 @@
 Package: repmgr-auto
-Version: 3.0.1
+Version: 3.2dev
 Section: database
 Priority: optional
 Architecture: all
-Depends: rsync, postgresql-9.3 | postgresql-9.4
+Depends: rsync, postgresql-9.3 | postgresql-9.4 | postgresql-9.5
 Maintainer: Self built package <user@localhost>
 Description: PostgreSQL replication setup, magament and monitoring
 has two main executables
--- a/dirmod.c
+++ b/dirmod.c
@@ -0,0 +1,194 @@
+/*
+ *
+ * dirmod.c
+ *	  directory handling functions
+ *
+ * Copyright (C) 2ndQuadrant, 2010-2016
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "postgres_fe.h"
+
+/* Don't modify declarations in system headers */
+
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/stat.h>
+
+/*
+ * pgfnames
+ *
+ * Collect the names of all entries in the directory "path", leaving out
+ * "." and "..". The result is a NULL-terminated array; the caller must
+ * hand it to pgfnames_cleanup later to free the memory allocated here.
+ * Returns NULL if the directory cannot be opened.
+ */
+char	  **
+pgfnames(const char *path)
+{
+	DIR		   *dirhandle = opendir(path);
+	struct dirent *entry;
+	char	  **names;
+	int			count = 0;
+	int			capacity = 200; /* enough for many small dbs */
+
+	if (dirhandle == NULL)
+		return NULL;
+
+	names = (char **) palloc(capacity * sizeof(char *));
+
+	for (;;)
+	{
+		/* readdir() signals failure only through errno, so clear it first */
+		errno = 0;
+		entry = readdir(dirhandle);
+		if (entry == NULL)
+			break;
+
+		if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+			continue;
+
+		/* always keep one slot spare for the terminating NULL */
+		if (count + 1 >= capacity)
+		{
+			capacity *= 2;
+			names = (char **) repalloc(names, capacity * sizeof(char *));
+		}
+
+		names[count] = pstrdup(entry->d_name);
+		count++;
+	}
+
+	/* a non-zero errno here was left by the final readdir() call */
+	if (errno)
+	{
+		fprintf(stderr, _("could not read directory \"%s\": %s\n"),
+				path, strerror(errno));
+	}
+
+	names[count] = NULL;
+
+	if (closedir(dirhandle))
+	{
+		fprintf(stderr, _("could not close directory \"%s\": %s\n"),
+				path, strerror(errno));
+	}
+
+	return names;
+}
+
+
+/*
+ *	pgfnames_cleanup
+ *
+ *	Free every name in a NULL-terminated array returned by pgfnames,
+ *	then free the array itself.
+ */
+void
+pgfnames_cleanup(char **filenames)
+{
+	int			i = 0;
+
+	/* walk up to the NULL sentinel, releasing each string on the way */
+	while (filenames[i] != NULL)
+	{
+		pfree(filenames[i]);
+		i++;
+	}
+
+	pfree(filenames);
+}
+
+
+/*
+ *	rmtree
+ *
+ *	Recursively remove everything beneath the directory "path"; when
+ *	rmtopdir is true, "path" itself is removed afterwards.
+ *	"path" is expected to name an existing directory.
+ *	Returns true if nothing went wrong, false otherwise (including when
+ *	"path" cannot be listed). Removal carries on past individual failures,
+ *	and this function does not print details of them, so the caller
+ *	should report the problem itself.
+ */
+bool
+rmtree(const char *path, bool rmtopdir)
+{
+	bool		ok = true;
+	char		entrypath[MAXPGPATH];
+	char	  **names;
+	int			i;
+	struct stat st;
+
+	/*
+	 * Take a snapshot of the directory listing up front, so entries are
+	 * not being removed while the directory is still being read.
+	 */
+	names = pgfnames(path);
+	if (names == NULL)
+		return false;
+
+	for (i = 0; names[i] != NULL; i++)
+	{
+		snprintf(entrypath, MAXPGPATH, "%s/%s", path, names[i]);
+
+		if (lstat(entrypath, &st) != 0)
+		{
+			/*
+			 * An entry which has vanished in the meantime is fine; we
+			 * were about to delete it anyway.
+			 */
+			if (errno != ENOENT)
+				ok = false;
+			continue;
+		}
+
+		if (S_ISDIR(st.st_mode))
+		{
+			/* descend; the subdirectory itself is always removed too */
+			if (!rmtree(entrypath, true))
+				ok = false;
+		}
+		else if (unlink(entrypath) != 0 && errno != ENOENT)
+		{
+			ok = false;
+		}
+	}
+
+	if (rmtopdir && rmdir(path) != 0)
+		ok = false;
+
+	pgfnames_cleanup(names);
+
+	return ok;
+}
+
--- a/dirmod.h
+++ b/dirmod.h
@@ -0,0 +1,23 @@
+/*
+ * dirmod.h
+ * Copyright (c) 2ndQuadrant, 2010-2016
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _DIRMOD_H_
+#define _DIRMOD_H_
+
+#endif
--- a/docs/repmgrd-failover-mechanism.md
+++ b/docs/repmgrd-failover-mechanism.md
@@ -0,0 +1,75 @@
+repmgrd's failover algorithm
+============================
+
+When implementing automatic failover, there are two factors which are critical in
+ensuring the desired result is achieved:
+
+  - has the master node genuinely failed?
+  - which is the best node to promote to the new master?
+
+This document outlines repmgrd's decision-making process during automatic failover
+for standbys directly connected to the master node.
+
+
+Master node failure detection
+-----------------------------
+
+If a `repmgrd` instance running on a PostgreSQL standby node is unable to connect to
+the master node, this doesn't necessarily mean that the master is down and a
+failover is required. Factors such as network connectivity issues could mean that
+even though the standby node is isolated, the replication cluster as a whole
+is functioning correctly, and promoting the standby without further verification
+could result in a "split-brain" situation.
+
+In the event that `repmgrd` is unable to connect to the master node, it will attempt
+to reconnect to the master server several times (as defined by the `reconnect_attempts`
+parameter in `repmgr.conf`), with reconnection attempts occurring at the interval
+specified by `reconnect_interval`. This happens to verify that the master is definitively
+not accessible (e.g. that connection was not lost due to a brief network glitch).
+
+Appropriate values for these settings will depend very much on the replication
+cluster environment. There will necessarily be a trade-off between the time it
+takes to assume the master is not reachable, and the reliability of that conclusion.
+A standby in a different physical location to the master will probably need a longer
+check interval to rule out possible network issues, whereas one located in the same
+rack with a direct connection between servers could perform the check very quickly.
+
+Note that it's possible the master comes back online after this point is reached,
+but before a new master has been selected; in this case it will be noticed
+during the selection of a new master and no actual failover will take place.
+
+Promotion candidate selection
+-----------------------------
+
+Once `repmgrd` has decided the master is definitively unreachable, the following checks
+will be carried out:
+
+* attempts to connect to all other nodes in the cluster (including the witness
+  node, if defined) to establish the state of the cluster, including their
+  current LSN
+
+* If less than half of the nodes are visible (from the viewpoint
+  of this node), `repmgrd` will not take any further action. This is to ensure that
+  e.g. if a replication cluster is spread over multiple data centres, a split-brain
+  situation does not occur if there is a network failure between datacentres. Note
+  that if nodes are split evenly between data centres, a witness server can be
+  used to establish the "majority" data centre.
+
+* `repmgrd` polls all visible servers and waits for each node to return a valid LSN;
+  it updates the LSN previously stored for this node if it has increased since
+  the initial check
+
+* once all LSNs have been retrieved, `repmgrd` will check for the highest LSN; if
+  its own node has the highest LSN, it will attempt to promote itself (using the
+  command defined in `promote_command` in `repmgr.conf`). Note that if using
+  `repmgr standby promote` as the promotion command, and the original master becomes available
+  before the promotion takes effect, `repmgr` will return an error and no promotion
+  will take place, and `repmgrd` will resume monitoring as usual.
+
+* if the node is not the promotion candidate, `repmgrd` will execute the
+  `follow_command` defined in `repmgr.conf`. If using `repmgr standby follow` here,
+  `repmgr` will attempt to detect the new master node and attach to that.
+
+
+
+
--- a/docs/repmgrd-node-fencing.md
+++ b/docs/repmgrd-node-fencing.md
@@ -0,0 +1,150 @@
+Fencing a failed master node with repmgrd and pgbouncer
+=======================================================
+
+With automatic failover, it's essential to ensure that a failed master
+remains inaccessible to your application, even if it comes back online
+again, to avoid a split-brain situation.
+
+By using `pgbouncer` together with `repmgrd`, it's possible to combine
+automatic failover with a process to isolate the failed master from
+your application and ensure that all connections which should go to
+the master are directed there smoothly without having to reconfigure
+your application. (Note that as a connection pooler, `pgbouncer` can
+benefit your application in other ways, but those are beyond the scope
+of this document).
+
+* * *
+
+> *WARNING*: automatic failover is tricky to get right. This document
+> demonstrates one possible implementation method, however you should
+> carefully configure and test any setup to suit the needs of your own
+> replication cluster/application.
+
+* * *
+
+In a failover situation, `repmgrd` promotes a standby to master by
+executing the command defined in `promote_command`. Normally this
+would be something like:
+
+    repmgr standby promote -f /etc/repmgr.conf
+
+By wrapping this in a custom script which adjusts the `pgbouncer`
+configuration on all nodes, it's possible to fence the failed master
+and redirect write connections to the new master.
+
+The script consists of three sections:
+
+* commands to pause `pgbouncer` on all nodes
+* the promotion command itself
+* commands to reconfigure and restart `pgbouncer` on all nodes
+
+Note that it requires password-less SSH access between all nodes to be
+able to update the `pgbouncer` configuration files.
+
+For the purposes of this demonstration, we'll assume there are 3 nodes
+(master and two standbys), with `pgbouncer` listening on port 6432
+handling connections to a database called `appdb`. The `postgres`
+system user must have write access to the `pgbouncer` configuration
+file on all nodes, assumed to be at `/etc/pgbouncer.ini`.
+
+The script also requires a template file containing global `pgbouncer`
+configuration, which should look something like this (adjust
+settings appropriately for your environment):
+
+`/var/lib/postgres/repmgr/pgbouncer.ini.template`
+
+    [pgbouncer]
+
+    logfile = /var/log/pgbouncer/pgbouncer.log
+    pidfile = /var/run/pgbouncer/pgbouncer.pid
+
+    listen_addr = *
+    listen_port = 6432
+    unix_socket_dir = /tmp
+
+    auth_type = trust
+    auth_file = /etc/pgbouncer.auth
+
+    admin_users = postgres
+    stats_users = postgres
+
+    pool_mode = transaction
+
+    max_client_conn = 100
+    default_pool_size = 20
+    min_pool_size = 5
+    reserve_pool_size = 5
+    reserve_pool_timeout = 3
+
+    log_connections = 1
+    log_disconnections = 1
+    log_pooler_errors = 1
+
+The actual script is as follows; adjust the configurable items as appropriate:
+
+`/var/lib/postgres/repmgr/promote.sh`
+
+
+    #!/usr/bin/env bash
+    set -u
+    set -e
+
+    # Configurable items
+    PGBOUNCER_HOSTS="node1 node2 node3"
+    REPMGR_DB="repmgr"
+    REPMGR_USER="repmgr"
+    REPMGR_SCHEMA="repmgr_test"
+    PGBOUNCER_CONFIG="/etc/pgbouncer.ini"
+    PGBOUNCER_INI_TEMPLATE="/var/lib/postgres/repmgr/pgbouncer.ini.template"
+    PGBOUNCER_DATABASE="appdb"
+
+    # 1. Pause running pgbouncer instances
+    for HOST in $PGBOUNCER_HOSTS
+    do
+        psql -t -c "pause" -h $HOST -p ${PGBOUNCER_PORT:-6432} -U postgres pgbouncer
+    done
+
+
+    # 2. Promote this node from standby to master
+
+    repmgr standby promote -f /etc/repmgr.conf
+
+
+    # 3. Reconfigure pgbouncer instances
+
+    PGBOUNCER_INI_NEW="/tmp/pgbouncer.ini.new"
+
+    for HOST in $PGBOUNCER_HOSTS
+    do
+        # Recreate the pgbouncer config file
+        echo -e "[databases]\n" > $PGBOUNCER_INI_NEW
+
+        psql -d $REPMGR_DB -U $REPMGR_USER -t -A \
+          -c "SELECT '$PGBOUNCER_DATABASE= ' || conninfo || ' application_name=pgbouncer_$HOST' \
+              FROM $REPMGR_SCHEMA.repl_nodes \
+              WHERE active = TRUE AND type='master'" >> $PGBOUNCER_INI_NEW
+
+        cat $PGBOUNCER_INI_TEMPLATE >> $PGBOUNCER_INI_NEW
+
+        rsync $PGBOUNCER_INI_NEW $HOST:$PGBOUNCER_CONFIG
+
+        psql -tc "reload" -h $HOST -p ${PGBOUNCER_PORT:-6432} -U postgres pgbouncer
+        psql -tc "resume" -h $HOST -p ${PGBOUNCER_PORT:-6432} -U postgres pgbouncer
+
+    done
+
+    # Clean up generated file
+    rm $PGBOUNCER_INI_NEW
+
+    echo "Reconfiguration of pgbouncer complete"
+
+Script and template file should be installed on each node where
+`repmgrd` is running.
+
+Finally, set `promote_command` in `repmgr.conf` on each node to
+point to the custom promote script:
+
+    promote_command=/var/lib/postgres/repmgr/promote.sh
+
+and reload/restart any running `repmgrd` instances for the changes to take
+effect.
--- a/errcode.h
+++ b/errcode.h
@@ -29,7 +29,6 @@
 #define ERR_DB_CON 6
 #define ERR_DB_QUERY 7
 #define ERR_PROMOTED 8
-#define ERR_BAD_PASSWORD 9
 #define ERR_STR_OVERFLOW 10
 #define ERR_FAILOVER_FAIL 11
 #define ERR_BAD_SSH 12
@@ -38,5 +37,9 @@
 #define ERR_INTERNAL 15
 #define ERR_MONITORING_FAIL 16
 #define ERR_BAD_BACKUP_LABEL 17
+#define ERR_SWITCHOVER_FAIL 18
+#define ERR_BARMAN 19
+#define ERR_REGISTRATION_SYNC 20
+

 #endif   /* _ERRCODE_H_ */
--- a/log.c
+++ b/log.c
@@ -142,7 +142,7 @@ log_verbose(int level, const char *fmt, ...)


 bool
-logger_init(t_configuration_options * opts, const char *ident)
+logger_init(t_configuration_options *opts, const char *ident)
 {
 	char	   *level = opts->loglevel;
 	char	   *facility = opts->logfacility;
--- a/log.h
+++ b/log.h
@@ -130,5 +130,7 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));

 extern int	log_type;
 extern int	log_level;
+extern int	verbose_logging;
+extern int	terse_logging;

-#endif
+#endif /* _REPMGR_LOG_H_ */
--- a/repmgr.c
+++ b/repmgr.c
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -19,7 +19,7 @@

 # Node ID and name
 # (Note: we recommend to avoid naming nodes after their initial
-#  replication funcion, as this will cause confusion when e.g.
+#  replication function, as this will cause confusion when e.g.
 #  "standby2" is promoted to primary)
 #node=2           # a unique integer
 #node_name=node2  # an arbitrary (but unique) string; we recommend using
@@ -28,8 +28,16 @@

 # Database connection information as a conninfo string
 # This must be accessible to all servers in the cluster; for details see:
-#   http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING
-#conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
+#
+#   https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING
+#
+#conninfo='host=192.168.204.104 dbname=repmgr user=repmgr'
+#
+# If repmgrd is in use, consider explicitly setting `connect_timeout` in the
+# conninfo string to determine the length of time which elapses before
+# a network connection attempt is abandoned; for details see:
+#
+#   https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT

 # Optional configuration items
 # ============================
@@ -92,6 +100,34 @@
 # path to PostgreSQL binary directory (location of pg_ctl, pg_basebackup etc.)
 # (if not provided, defaults to system $PATH)
 #pg_bindir=/usr/bin/
+#
+# Debian/Ubuntu users: you will probably need to set this to the directory
+# where `pg_ctl` is located, e.g. /usr/lib/postgresql/9.5/bin/
+
+# service control commands
+#
+# repmgr provides options to override the default pg_ctl commands
+# used to stop, start, restart, reload and promote the PostgreSQL cluster
+#
+# NOTE: These commands must be runnable on remote nodes as well for switchover
+# to function correctly.
+#
+# If you use sudo, the user repmgr runs as (usually 'postgres')  must have
+# passwordless sudo access to execute the command
+#
+# For example, to use systemd, you may use the following configuration:
+#
+#    # this is required when running sudo over ssh without -t:
+#    Defaults:postgres !requiretty
+#    postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-9.5, \
+#       /usr/bin/systemctl start postgresql-9.5, \
+#       /usr/bin/systemctl restart postgresql-9.5
+#
+# service_start_command = systemctl start postgresql-9.5
+# service_stop_command = systemctl stop postgresql-9.5
+# service_restart_command = systemctl restart postgresql-9.5
+# service_reload_command = pg_ctlcluster 9.5 main reload
+# service_promote_command = pg_ctlcluster 9.5 main promote

 # external command options

@@ -113,6 +149,10 @@
 #
 # tablespace_mapping=/path/to/original/tablespace=/path/to/new/tablespace

+# You can specify a restore_command to be used in the recovery.conf that
+# will be placed in the cloned standby
+#
+# restore_command = cp /path/to/archived/wals/%f %p

 # Failover settings (repmgrd)
 # ---------------------------
@@ -120,6 +160,9 @@
 # These settings are only applied when repmgrd is running. Values shown
 # are defaults.

+# monitoring interval in seconds; default is 2
+#monitor_interval_secs=2
+
 # Number of seconds to wait for a response from the primary server before
 # deciding it has failed.

@@ -132,16 +175,21 @@
 #reconnect_interval=10

 # Autofailover options
-#failover=manual    # one of 'automatic', 'manual'
-                    # (default: manual)
-#priority=100       # a value of zero or less prevents the node being promoted to primary
+#failover=manual    # one of 'automatic', 'manual' (default: manual)
+                    # defines the action to take in the event of upstream failure
+                    #
+                    # 'automatic': repmgrd will automatically attempt to promote the
+                    #    node or follow the new upstream node
+                    # 'manual': repmgrd will take no action and the node will require
+                    #    manual attention to reattach it to replication
+
+#priority=100       # indicate a preferred priority for promoting nodes
+                    # a value of zero or less prevents the node being promoted to primary
                    # (default: 100)
+
 #promote_command='repmgr standby promote -f /path/to/repmgr.conf'
 #follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'

-# monitoring interval in seconds; default is 2
-#monitor_interval_secs=2
-
 # change wait time for primary; before we bail out and exit when the primary
 # disappears, we wait 'reconnect_attempts' * 'retry_promote_interval_secs'
 # seconds; by default this would be half an hour, as 'retry_promote_interval_secs'
--- a/repmgr.h
+++ b/repmgr.h
@@ -23,18 +23,20 @@
 #include <libpq-fe.h>
 #include <postgres_fe.h>
 #include <getopt_long.h>
+#include "pqexpbuffer.h"

 #include "strutil.h"
 #include "dbutils.h"
 #include "errcode.h"
 #include "config.h"
+#include "dirmod.h"

 #define MIN_SUPPORTED_VERSION		"9.3"
 #define MIN_SUPPORTED_VERSION_NUM	90300

 #define ERRBUFF_SIZE	512

-#define DEFAULT_WAL_KEEP_SEGMENTS	"5000"
+#define DEFAULT_WAL_KEEP_SEGMENTS	"0"
 #define DEFAULT_DEST_DIR		"."
 #define DEFAULT_REPMGR_SCHEMA_PREFIX	"repmgr_"
 #define DEFAULT_PRIORITY		100
@@ -46,55 +48,121 @@
 #define NO_UPSTREAM_NODE	-1
 #define UNKNOWN_NODE_ID     -1

+/* command line options without short versions */
+#define OPT_HELP                         1
+#define OPT_CHECK_UPSTREAM_CONFIG        2
+#define OPT_RECOVERY_MIN_APPLY_DELAY     3
+#define OPT_COPY_EXTERNAL_CONFIG_FILES   4
+#define OPT_CONFIG_ARCHIVE_DIR           5
+#define OPT_PG_REWIND                    6
+#define OPT_PWPROMPT                     7
+#define OPT_CSV                          8
+#define OPT_NODE                         9
+#define OPT_WITHOUT_BARMAN               10
+#define OPT_NO_UPSTREAM_CONNECTION       11
+#define OPT_REGISTER_WAIT                12
+#define OPT_CLUSTER                      13
+
+/* deprecated command line options */
+#define OPT_INITDB_NO_PWPROMPT           998
+#define OPT_IGNORE_EXTERNAL_CONFIG_FILES 999
+
+/* values for --copy-external-config-files */
+#define CONFIG_FILE_SAMEPATH 1
+#define CONFIG_FILE_PGDATA 2


 /* Run time options type */
 typedef struct
 {
+	/* general repmgr options */
+	char		config_file[MAXPGPATH];
+	bool		verbose;
+	bool		terse;
+	bool		force;

+	/* options which override setting in repmgr.conf */
+	char		loglevel[MAXLEN];
+	char		pg_bindir[MAXLEN];
+
+	/* connection parameters */
 	char		dbname[MAXLEN];
 	char		host[MAXLEN];
 	char		username[MAXLEN];
 	char		dest_dir[MAXPGPATH];
-	char		config_file[MAXPGPATH];
 	char		remote_user[MAXLEN];
 	char		superuser[MAXLEN];
+	char		masterport[MAXLEN];
+	bool		conninfo_provided;
+	bool		connection_param_provided;
+	bool		host_param_provided;
+
+	/* standby clone parameters */
+	bool		wal_keep_segments_used;
 	char		wal_keep_segments[MAXLEN];
-	bool		verbose;
-	bool		terse;
-	bool		force;
-	bool		wait_for_master;
 	bool		ignore_rsync_warn;
-	bool		witness_pwprompt;
 	bool		rsync_only;
 	bool		fast_checkpoint;
-	bool		ignore_external_config_files;
-	char		pg_ctl_mode[MAXLEN];
-	char		masterport[MAXLEN];
-	/*
-	 * configuration file parameters which can be overridden on the
-	 * command line
-	 */
-	char		loglevel[MAXLEN];
-
-	/* parameter used by STANDBY SWITCHOVER */
-	char		remote_config_file[MAXLEN];
-	char		pg_rewind[MAXPGPATH];
-	/* parameter used by STANDBY {ARCHIVE_CONFIG | RESTORE_CONFIG} */
-	char		config_archive_dir[MAXLEN];
-	/* parameter used by CLUSTER CLEANUP */
-	int			keep_history;
-
-	char		pg_bindir[MAXLEN];
+	bool		without_barman;
+	bool		no_upstream_connection;
+	bool		copy_external_config_files;
+	int			copy_external_config_files_destination;
+	bool		wait_register_sync;
+	int			wait_register_sync_seconds;

 	char		recovery_min_apply_delay[MAXLEN];

-	/* deprecated command line options */
-	char		localport[MAXLEN];
-	bool		initdb_no_pwprompt;
+	/* witness create parameters */
+	bool		witness_pwprompt;
+
+	/* standby follow parameters */
+	bool		wait_for_master;
+
+	/* cluster {show|matrix|crosscheck} parameters */
+	bool		csv_mode;
+
+	/* cluster cleanup parameters */
+	int			keep_history;
+
+	/* standby switchover parameters */
+	char		remote_config_file[MAXLEN];
+	bool		pg_rewind_supplied;
+	char		pg_rewind[MAXPGPATH];
+	char		pg_ctl_mode[MAXLEN];
+
+	/* standby {archive_config | restore_config} parameters  */
+	char		config_archive_dir[MAXLEN];
+
+	/* {standby|witness} unregister parameters */
+	int			node;
+
 }	t_runtime_options;

-#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, "smart", "", "", "", "", "", 0, "", "", "", false }
+#define T_RUNTIME_OPTIONS_INITIALIZER { \
+		/* general repmgr options */	\
+		"", false, false, false,		\
+		/* options which override setting in repmgr.conf */ \
+		"", "",                         \
+		/* connection parameters */		\
+		"", "", "", "", "", "", "", 	\
+		false, false, false,		    \
+		/* standby clone parameters */  \
+		false, DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, \
+		CONFIG_FILE_SAMEPATH, false, 0, "", \
+		/* witness create parameters */ \
+		false,                          \
+		/* standby follow parameters */ \
+		false,                          \
+		/* cluster {show|matrix|crosscheck} parameters */ \
+		false,                          \
+		/* cluster cleanup parameters */ \
+		0,                              \
+		/* standby switchover parameters */ \
+		"", false, "", "fast",          \
+		/* standby {archive_config | restore_config} parameters  */ \
+		"",                             \
+		/* {standby|witness} unregister parameters */ \
+		UNKNOWN_NODE_ID }

 struct BackupLabel
 {
@@ -108,7 +176,60 @@ struct BackupLabel
 	XLogRecPtr min_failover_slot_lsn;
 };

-extern char		repmgr_schema[MAXLEN];
-extern bool		config_file_found;
+
+typedef struct
+{
+	char		slot[MAXLEN];
+	char		xlog_method[MAXLEN];
+} t_basebackup_options;
+
+#define T_BASEBACKUP_OPTIONS_INITIALIZER { "", "" }
+
+typedef struct
+{
+	int    size;
+	char **keywords;
+	char **values;
+} t_conninfo_param_list;
+
+typedef struct
+{
+	char filepath[MAXPGPATH];
+	char filename[MAXPGPATH];
+	bool in_data_directory;
+} t_configfile_info;
+
+
+typedef struct
+{
+	int    size;
+	int    entries;
+	t_configfile_info **files;
+} t_configfile_list;
+
+#define T_CONFIGFILE_LIST_INITIALIZER { 0, 0, NULL }
+
+
+typedef struct
+{
+	int node_id;
+	int node_status;
+} t_node_status_rec;
+
+typedef struct
+{
+	int node_id;
+	char node_name[MAXLEN];
+	t_node_status_rec **node_status_list;
+} t_node_matrix_rec;
+
+typedef struct
+{
+	int node_id;
+	char node_name[MAXLEN];
+	t_node_matrix_rec **matrix_list_rec;
+} t_node_status_cube;
+
+

 #endif
--- a/repmgr.sql
+++ b/repmgr.sql
@@ -64,7 +64,7 @@ CREATE INDEX idx_repl_status_sort ON repl_monitor(last_monitor_time, standby_nod
 * This view shows the list of nodes with the information of which one is the upstream
 * in each case (when appliable)
 */
-CREATE VIEW repl_show_nodes AS 
+CREATE VIEW repl_show_nodes AS
 SELECT rn.id, rn.conninfo, rn.type, rn.name, rn.cluster,
 	rn.priority, rn.active, sq.name AS upstream_node_name
 FROM repl_nodes as rn LEFT JOIN repl_nodes AS sq ON sq.id=rn.upstream_node_id;
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -1,5 +1,6 @@
 /*
 * repmgrd.c - Replication manager daemon
+ *
 * Copyright (C) 2ndQuadrant, 2010-2016
 *
 * This module connects to the nodes of a replication cluster and monitors
@@ -41,14 +42,17 @@
 #include "access/xlogdefs.h"
 #include "pqexpbuffer.h"

+/* Message strings passed in repmgrSharedState->location */

+#define PASSIVE_NODE "PASSIVE_NODE"
+#define LSN_QUERY_ERROR "LSN_QUERY_ERROR"

 /* Local info */
-t_configuration_options local_options;
+t_configuration_options local_options = T_CONFIGURATION_OPTIONS_INITIALIZER;
 PGconn	   *my_local_conn = NULL;

 /* Master info */
-t_configuration_options master_options;
+t_configuration_options master_options = T_CONFIGURATION_OPTIONS_INITIALIZER;

 PGconn	   *master_conn = NULL;

@@ -59,9 +63,14 @@ t_node_info node_info;

 bool		failover_done = false;

-char	   *pid_file = NULL;
+/*
+ * when `failover=manual`, and the upstream server has gone away,
+ * this flag is set to indicate we should connect to whatever the
+ * current master is to update monitoring information
+ */
+bool		manual_mode_upstream_disconnected = false;

-t_configuration_options config = T_CONFIGURATION_OPTIONS_INITIALIZER;
+char	   *pid_file = NULL;

 static void help(void);
 static void usage(void);
@@ -102,17 +111,15 @@ static void check_and_create_pid_file(const char *pid_file);
 static void
 close_connections()
 {
-	if (master_conn != NULL && PQisBusy(master_conn) == 1)
+	if (PQstatus(master_conn) == CONNECTION_OK && PQisBusy(master_conn) == 1)
 		cancel_query(master_conn, local_options.master_response_timeout);

-	if (my_local_conn != NULL)
+
+	if (PQstatus(my_local_conn) == CONNECTION_OK)
 		PQfinish(my_local_conn);

-	if (master_conn != NULL && master_conn != my_local_conn)
+	if (PQstatus(master_conn) == CONNECTION_OK)
 		PQfinish(master_conn);
-
-	master_conn = NULL;
-	my_local_conn = NULL;
 }


@@ -126,7 +133,7 @@ main(int argc, char **argv)
 		{"monitoring-history", no_argument, NULL, 'm'},
 		{"daemonize", no_argument, NULL, 'd'},
 		{"pid-file", required_argument, NULL, 'p'},
-		{"help", no_argument, NULL, '?'},
+		{"help", no_argument, NULL, OPT_HELP},
 		{"version", no_argument, NULL, 'V'},
 		{NULL, 0, NULL, 0}
 	};
@@ -160,6 +167,23 @@ main(int argc, char **argv)
 	{
 		switch (c)
 		{
+			case '?':
+				/* Actual help option given */
+				if (strcmp(argv[optind - 1], "-?") == 0)
+				{
+					help();
+					exit(SUCCESS);
+				}
+				/* unknown option reported by getopt */
+				else
+					goto unknown_option;
+				break;
+			case OPT_HELP:
+				help();
+				exit(SUCCESS);
+			case 'V':
+				printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
+				exit(SUCCESS);
 			case 'f':
 				config_file = optarg;
 				break;
@@ -175,13 +199,9 @@ main(int argc, char **argv)
 			case 'p':
 				pid_file = optarg;
 				break;
-			case '?':
-				help();
-				exit(SUCCESS);
-			case 'V':
-				printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
-				exit(SUCCESS);
+
 			default:
+		unknown_option:
 				usage();
 				exit(ERR_BAD_CONFIG);
 		}
@@ -291,10 +311,46 @@ main(int argc, char **argv)

 	log_debug("node id is %i, upstream is %i\n", node_info.node_id, node_info.upstream_node_id);

+    /*
+     * Check if node record is active - if not, and `failover=automatic`, the node
+     * won't be considered as a promotion candidate; this often happens when
+     * a failed primary is recloned and the node was not re-registered, giving
+     * the impression failover capability is there when it's not. In this case
+     * abort with an error and a hint about registering.
+     *
+     * If `failover=manual`, repmgrd can continue to passively monitor the node, but
+     * we should nevertheless issue a warning and the same hint.
+     */
+
+    if (node_info.active == false)
+    {
+        char *hint = "Check that 'repmgr (master|standby) register' was executed for this node";
+
+        switch (local_options.failover)
+        {
+            case AUTOMATIC_FAILOVER:
+                log_err(_("This node is marked as inactive and cannot be used for failover\n"));
+                log_hint(_("%s\n"), hint);
+                terminate(ERR_BAD_CONFIG);
+
+            case MANUAL_FAILOVER:
+                log_warning(_("This node is marked as inactive and will be passively monitored only\n"));
+                log_hint(_("%s\n"), hint);
+                break;
+
+            default:
+                /* This should never happen */
+                log_err(_("Unknown failover mode %i\n"), local_options.failover);
+                terminate(ERR_BAD_CONFIG);
+        }
+
+    }
+
 	/*
 	 * MAIN LOOP This loops cycles at startup and once per failover and
-	 * Requisites: - my_local_conn needs to be already setted with an active
-	 * connection - no master connection
+	 * Requisites:
+	 *  - my_local_conn must have an active connection to the monitored node
+	 *  - master_conn must not be open
 	 */
 	do
 	{
@@ -399,14 +455,14 @@ main(int argc, char **argv)
 			case STANDBY:

 				/* We need the node id of the master server as well as a connection to it */
-				log_info(_("connecting to master node '%s'\n"),
+				log_info(_("connecting to master node of cluster '%s'\n"),
 						 local_options.cluster_name);

 				master_conn = get_master_connection(my_local_conn,
 													local_options.cluster_name,
 													&master_options.node, NULL);

-				if (master_conn == NULL)
+				if (PQstatus(master_conn) != CONNECTION_OK)
 				{
 					PQExpBufferData errmsg;
 					initPQExpBuffer(&errmsg);
@@ -435,6 +491,7 @@ main(int argc, char **argv)
 					my_local_conn = establish_db_connection(local_options.conninfo, true);
 					update_registration();
 				}
+
 				/* Log startup event */
 				if (startup_event_logged == false)
 				{
@@ -462,16 +519,16 @@ main(int argc, char **argv)

 				do
 				{
-					log_verbose(LOG_DEBUG, "standby check loop...\n");
-
-					if (node_info.type == WITNESS)
-					{
-						witness_monitor();
-					}
-					else if (node_info.type == STANDBY)
+					if (node_info.type == STANDBY)
 					{
+						log_verbose(LOG_DEBUG, "standby check loop...\n");
 						standby_monitor();
 					}
+					else if (node_info.type == WITNESS)
+					{
+						log_verbose(LOG_DEBUG, "witness check loop...\n");
+						witness_monitor();
+					}

 					sleep(local_options.monitor_interval_secs);

@@ -590,7 +647,7 @@ witness_monitor(void)
 			}
 			else
 			{
-				log_debug(_("new master found with node ID: %i\n"), master_options.node);
+				log_info(_("new master found with node ID: %i\n"), master_options.node);
 				connection_ok = true;

 				/*
@@ -641,7 +698,7 @@ witness_monitor(void)
 								 local_options.master_response_timeout) != 1)
 		return;

-	/* Get local xlog info */
+	/* Get timestamp for monitoring update */
 	sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP");

 	res = PQexec(my_local_conn, sqlquery);
@@ -667,7 +724,7 @@ witness_monitor(void)
 					  "            replication_lag, apply_lag )"
 					  "      VALUES(%d, %d, "
 					  "             '%s'::TIMESTAMP WITH TIME ZONE, NULL, "
-					  "             pg_current_xlog_location(), NULL, "
+					  "             pg_catalog.pg_current_xlog_location(), NULL, "
 					  "             0, 0) ",
 					  get_repmgr_schema_quoted(my_local_conn),
 					  master_options.node,
@@ -694,18 +751,22 @@ static void
 standby_monitor(void)
 {
 	PGresult   *res;
+	char		sqlquery[QUERY_STR_LEN];
+
 	char		monitor_standby_timestamp[MAXLEN];
-	char		last_wal_master_location[MAXLEN];
+	char		last_wal_primary_location[MAXLEN];
 	char		last_xlog_receive_location[MAXLEN];
 	char		last_xlog_replay_location[MAXLEN];
 	char		last_xact_replay_timestamp[MAXLEN];
-	bool		last_xlog_receive_location_gte_replayed;
-	char		sqlquery[QUERY_STR_LEN];
+	bool		receiving_streamed_wal = true;

 	XLogRecPtr	lsn_master_current_xlog_location;
 	XLogRecPtr	lsn_last_xlog_receive_location;
 	XLogRecPtr	lsn_last_xlog_replay_location;

+	long long unsigned int replication_lag;
+	long long unsigned int apply_lag;
+
 	int			connection_retries,
 				ret;
 	bool		did_retry = false;
@@ -718,7 +779,8 @@ standby_monitor(void)
 	int			active_master_id;
 	const char *upstream_node_type = NULL;

-	bool		receiving_streamed_wal = true;
+
+
 	/*
 	 * Verify that the local node is still available - if not there's
 	 * no point in doing much else anyway
@@ -740,53 +802,91 @@ standby_monitor(void)
 		goto continue_monitoring_standby;
 	}

-	upstream_conn = get_upstream_connection(my_local_conn,
-											local_options.cluster_name,
-											local_options.node,
-											&upstream_node_id,
-											upstream_conninfo);
-
-	upstream_node_type = (upstream_node_id == master_options.node)
-		? "master"
-		: "upstream";
-
-	// ZZZ "5 minutes"?
 	/*
-	 * Check if the upstream node is still available, if after 5 minutes of retries
-	 * we cannot reconnect, try to get a new upstream node.
+	 * Standby has `failover` set to manual and is disconnected from
+	 * replication following a prior upstream node failure - we'll
+	 * find the master to be able to write monitoring information, if
+	 * required
 	 */
+	if (manual_mode_upstream_disconnected == true)
+	{
+		upstream_conn = get_master_connection(my_local_conn,
+												local_options.cluster_name,
+												&upstream_node_id,
+												upstream_conninfo);
+		upstream_node_type = "master";
+	}
+	else
+	{
+		upstream_conn = get_upstream_connection(my_local_conn,
+												local_options.cluster_name,
+												local_options.node,
+												&upstream_node_id,
+												upstream_conninfo);
+
+		upstream_node_type = (upstream_node_id == master_options.node)
+			? "master"
+			: "upstream";
+	}

-	check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
 	/*
+	 * Check that the upstream node is still available
+	 * If not, initiate failover process
+	 *
 	 * This takes up to local_options.reconnect_attempts *
 	 * local_options.reconnect_interval seconds
 	 */

+	check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
+
 	if (PQstatus(upstream_conn) != CONNECTION_OK)
 	{
+		int previous_master_node_id = master_options.node;
+
 		PQfinish(upstream_conn);
 		upstream_conn = NULL;

+		/*
+		 * When `failover=manual`, no actual failover will be performed, instead
+		 * the following happens:
+		 *  - find the new master
+		 *  - create an event notification `standby_disconnect_manual`
+		 *  - set a flag to indicate we're disconnected from replication
+		 */
 		if (local_options.failover == MANUAL_FAILOVER)
 		{
 			log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), upstream_node_type);

+			/*
+			 * Set the location string in shared memory to indicate to other
+			 * repmgrd instances that we're *not* a promotion candidate and
+			 * that other repmgrd instances should not expect location updates
+			 * from us
+			 */
+
+			update_shared_memory(PASSIVE_NODE);
+
 			for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
 			{
 				master_conn = get_master_connection(my_local_conn,
 					local_options.cluster_name, &master_options.node, NULL);
+
 				if (PQstatus(master_conn) == CONNECTION_OK)
 				{
 					/*
 					 * Connected, we can continue the process so break the
 					 * loop
 					 */
-					log_err(_("connected to node %d, continuing monitoring.\n"),
+					log_notice(_("connected to node %d, continuing monitoring.\n"),
 							master_options.node);
 					break;
 				}
 				else
 				{
+					/*
+					 * XXX this is the only place where `retry_promote_interval_secs`
+					 * is used - this parameter should be renamed or possibly be replaced
+					 */
 					log_err(
 					    _("no new master found, waiting %i seconds before retry...\n"),
 					    local_options.retry_promote_interval_secs
@@ -816,30 +916,58 @@ standby_monitor(void)

 				terminate(ERR_DB_CON);
 			}
+
+			/*
+			 * connected to a master - is it the same as the former upstream?
+			 * if not:
+			 *  - create event "standby_disconnect_manual"
+			 *  - set global flag "manual_mode_upstream_disconnected"
+			 */
+
+			if (previous_master_node_id != master_options.node)
+			{
+				PQExpBufferData errmsg;
+				initPQExpBuffer(&errmsg);
+
+				appendPQExpBuffer(&errmsg,
+								  _("node %i is in manual failover mode and is now disconnected from replication"),
+								  local_options.node);
+
+				log_verbose(LOG_DEBUG, "old master: %i; current: %i\n", previous_master_node_id, master_options.node);
+
+				manual_mode_upstream_disconnected = true;
+
+				create_event_record(master_conn,
+									&local_options,
+									local_options.node,
+									"standby_disconnect_manual",
+									/* here "true" indicates the action has occurred as expected */
+									true,
+									errmsg.data);
+
+			}
 		}
 		else if (local_options.failover == AUTOMATIC_FAILOVER)
 		{
 			/*
-			 * When we returns from this function we will have a new master
+			 * When we return from this function we will have a new master
 			 * and a new master_conn
-			 */
-
-			/*
+			 *
 			 * Failover handling is handled differently depending on whether
 			 * the failed node is the master or a cascading standby
 			 */
 			upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);

-            if (upstream_node.type == MASTER)
-            {
-                log_debug(_("failure detected on master node (%i); attempting to promote a standby\n"),
-                          node_info.upstream_node_id);
-                do_master_failover();
-            }
-            else
-            {
-                log_debug(_("failure detected on upstream node %i; attempting to reconnect to new upstream node\n"),
-                          node_info.upstream_node_id);
+			if (upstream_node.type == MASTER)
+			{
+				log_debug(_("failure detected on master node (%i); attempting to promote a standby\n"),
+						  node_info.upstream_node_id);
+				do_master_failover();
+			}
+			else
+			{
+				log_debug(_("failure detected on upstream node %i; attempting to reconnect to new upstream node\n"),
+						  node_info.upstream_node_id);

 				if (!do_upstream_standby_failover(upstream_node))
 				{
@@ -847,20 +975,20 @@ standby_monitor(void)
 					initPQExpBuffer(&errmsg);

 					appendPQExpBuffer(&errmsg,
-									  _("unable to reconnect to new upstream node, terminating..."));
+							  _("unable to reconnect to new upstream node, terminating..."));

 					log_err("%s\n", errmsg.data);

 					create_event_record(master_conn,
-										&local_options,
-										local_options.node,
-										"repmgrd_shutdown",
-										false,
-										errmsg.data);
+							    &local_options,
+							    local_options.node,
+							    "repmgrd_shutdown",
+							    false,
+							    errmsg.data);

 					terminate(ERR_DB_CON);
 				}
-            }
+			}
 			return;
 		}
 	}
@@ -918,8 +1046,8 @@ standby_monitor(void)
 		 * the stream. If we set the local standby node as failed and it's now running
 		 * and receiving replication data, we should activate it again.
 		 */
-	        set_local_node_status();
-	        log_info(_("standby connection recovered!\n"));
+		set_local_node_status();
+		log_info(_("standby connection recovered!\n"));
 	}

 	/* Fast path for the case where no history is requested */
@@ -931,6 +1059,7 @@ standby_monitor(void)
 	 * from the upstream node to write monitoring information
 	 */

+	/* XXX not used? */
 	upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);

 	sprintf(sqlquery,
@@ -963,7 +1092,7 @@ standby_monitor(void)

 	if (active_master_id != master_options.node)
 	{
-		log_notice(_("connecting to active master (node %i)...\n"), active_master_id); \
+		log_notice(_("connecting to active master (node %i)...\n"), active_master_id);
 		if (master_conn != NULL)
 		{
 			PQfinish(master_conn);
@@ -984,11 +1113,28 @@ standby_monitor(void)
 	if (wait_connection_availability(master_conn, local_options.master_response_timeout) != 1)
 		return;

-	/* Get local xlog info */
+	/* Get local xlog info
+	 *
+	 * If receive_location is NULL, we're in archive recovery and not streaming WAL
+	 * If receive_location is less than replay location, we were streaming WAL but are
+	 *   somehow disconnected and evidently in archive recovery
+	 */
 	sqlquery_snprintf(sqlquery,
-					  "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
-					  "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), "
-					  "pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()");
+					  " SELECT ts, "
+					  "        CASE WHEN (receive_location IS NULL OR receive_location < replay_location) "
+					  "          THEN replay_location "
+					  "          ELSE receive_location"
+					  "        END AS receive_location,"
+					  "        replay_location, "
+					  "        replay_timestamp, "
+					  "        COALESCE(receive_location, '0/0') >= replay_location AS receiving_streamed_wal "
+					  "   FROM (SELECT CURRENT_TIMESTAMP AS ts, "
+					  "         pg_catalog.pg_last_xlog_receive_location() AS receive_location, "
+					  "         pg_catalog.pg_last_xlog_replay_location()  AS replay_location, "
+					  "         pg_catalog.pg_last_xact_replay_timestamp() AS replay_timestamp "
+					  "        ) q ");
+
+

 	res = PQexec(my_local_conn, sqlquery);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -1004,41 +1150,23 @@ standby_monitor(void)
 	strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
 	strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);

-	last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
+	receiving_streamed_wal = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
 		? true
 		: false;

-	/*
-	 * If pg_last_xlog_receive_location is NULL, this means we're in archive
-	 * recovery and will need to calculate lag based on pg_last_xlog_replay_location
-	 */
-
-	/*
-	 * Replayed WAL is greater than received streamed WAL
-	 */
-	if (PQgetisnull(res, 0, 1))
+	if (receiving_streamed_wal == false)
 	{
-		receiving_streamed_wal = false;
+		log_verbose(LOG_DEBUG, _("standby %i not connected to streaming replication"), local_options.node);
 	}

 	PQclear(res);

 	/*
-	 * In the unusual event of a standby becoming disconnected from the primary,
-	 * while this repmgrd remains connected to the primary,  subtracting
-	 * "last_xlog_replay_location" from "lsn_last_xlog_receive_location" and coercing to
-	 * (long long unsigned int) will result in a meaningless, very large
-	 * value which will overflow a BIGINT column and spew error messages into the
-	 * PostgreSQL log. In the absence of a better strategy, skip attempting
-	 * to insert a monitoring record.
+	 * Get master xlog position
+	 *
+	 * TODO: investigate whether pg_current_xlog_insert_location() would be a better
+	 * choice; see: https://github.com/2ndQuadrant/repmgr/issues/189
 	 */
-	if (receiving_streamed_wal == true && last_xlog_receive_location_gte_replayed == false)
-	{
-		log_verbose(LOG_WARNING,
-					"Replayed WAL newer than received WAL - is this standby connected to its upstream?\n");
-	}
-
-	/* Get master xlog info */
 	sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()");

 	res = PQexec(master_conn, sqlquery);
@@ -1049,21 +1177,27 @@ standby_monitor(void)
 		return;
 	}

-	strncpy(last_wal_master_location, PQgetvalue(res, 0, 0), MAXLEN);
+	strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
 	PQclear(res);

-	/* Calculate the lag */
-	lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_master_location, NULL);
-
+	lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_primary_location, NULL);
 	lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
+	lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);

-	if (last_xlog_receive_location_gte_replayed == false)
+	apply_lag = (long long unsigned int)lsn_last_xlog_receive_location - lsn_last_xlog_replay_location;
+
+	/* Calculate replication lag */
+	if (lsn_master_current_xlog_location >= lsn_last_xlog_receive_location)
 	{
-		lsn_last_xlog_receive_location = lsn_last_xlog_replay_location;
+		replication_lag = (long long unsigned int)(lsn_master_current_xlog_location - lsn_last_xlog_receive_location);
 	}
 	else
 	{
-		lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
+		/* This should never happen, but in case it does set lag to zero */
+		log_warning("Master xlog (%s) location appears less than standby receive location (%s)\n",
+					last_wal_primary_location,
+					last_xlog_receive_location);
+		replication_lag = 0;
 	}

 	/*
@@ -1092,10 +1226,10 @@ standby_monitor(void)
 					  local_options.node,
 					  monitor_standby_timestamp,
 					  last_xact_replay_timestamp,
-					  last_wal_master_location,
+					  last_wal_primary_location,
 					  last_xlog_receive_location,
-					  (long long unsigned int)(lsn_master_current_xlog_location - lsn_last_xlog_receive_location),
-					  (long long unsigned int)(lsn_last_xlog_receive_location - lsn_last_xlog_replay_location));
+					  replication_lag,
+					  apply_lag);

 	/*
 	 * Execute the query asynchronously, but don't check for a result. We will
@@ -1121,7 +1255,7 @@ do_master_failover(void)
 	PGresult   *res;
 	char		sqlquery[QUERY_STR_LEN];

-	int			total_nodes = 0;
+	int			total_active_nodes = 0;
 	int			visible_nodes = 0;
 	int			ready_nodes = 0;

@@ -1133,8 +1267,6 @@ do_master_failover(void)
 	XLogRecPtr	xlog_recptr;
 	bool		lsn_format_ok;

-	char		last_xlog_replay_location[MAXLEN];
-
 	PGconn	   *node_conn = NULL;

 	/*
@@ -1143,8 +1275,8 @@ do_master_failover(void)
 	 */
 	t_node_info nodes[FAILOVER_NODES_MAX_CHECK];

-    /* Store details of the failed node here */
-    t_node_info failed_master = T_NODE_INFO_INITIALIZER;
+	/* Store details of the failed node here */
+	t_node_info failed_master = T_NODE_INFO_INITIALIZER;

 	/* Store details of the best candidate for promotion to master here */
 	t_node_info best_candidate = T_NODE_INFO_INITIALIZER;
@@ -1154,7 +1286,7 @@ do_master_failover(void)
 			"SELECT id, conninfo, type, upstream_node_id "
 			"  FROM %s.repl_nodes "
 			" WHERE cluster = '%s' "
-            "   AND active IS TRUE "
+			"   AND active IS TRUE "
 			"   AND priority > 0 "
 			" ORDER BY priority DESC, id "
 			" LIMIT %i ",
@@ -1167,36 +1299,28 @@ do_master_failover(void)
 	{
 		log_err(_("unable to retrieve node records: %s\n"), PQerrorMessage(my_local_conn));
 		PQclear(res);
-		PQfinish(my_local_conn);
 		terminate(ERR_DB_QUERY);
 	}

-	/*
-	 * total nodes that are registered
-	 */
-	total_nodes = PQntuples(res);
-	log_debug(_("%d active nodes registered\n"), total_nodes);
+	total_active_nodes = PQntuples(res);
+	log_debug(_("%d active nodes registered\n"), total_active_nodes);

 	/*
 	 * Build an array with the nodes and indicate which ones are visible and
 	 * ready
 	 */
-	for (i = 0; i < total_nodes; i++)
+	for (i = 0; i < total_active_nodes; i++)
 	{
+		char node_type[MAXLEN];
+
+		nodes[i] = (t_node_info) T_NODE_INFO_INITIALIZER;
+
 		nodes[i].node_id = atoi(PQgetvalue(res, i, 0));

 		strncpy(nodes[i].conninfo_str, PQgetvalue(res, i, 1), MAXCONNINFO);
+		strncpy(node_type, PQgetvalue(res, i, 2), MAXLEN);

-		nodes[i].type = parse_node_type(PQgetvalue(res, i, 2));
-
-		/* Copy details of the failed node */
-		/* XXX only node_id is actually used later */
-		if (nodes[i].type == MASTER)
-		{
-			failed_master.node_id = nodes[i].node_id;
-			failed_master.xlog_location = nodes[i].xlog_location;
-			failed_master.is_ready = nodes[i].is_ready;
-		}
+		nodes[i].type = parse_node_type(node_type);

 		nodes[i].upstream_node_id = atoi(PQgetvalue(res, i, 3));

@@ -1207,11 +1331,42 @@ do_master_failover(void)
 		nodes[i].is_visible = false;
 		nodes[i].is_ready = false;

-		nodes[i].xlog_location = InvalidXLogRecPtr;
+		log_debug(_("node=%i conninfo=\"%s\" type=%s\n"),
+				  nodes[i].node_id,
+				  nodes[i].conninfo_str,
+				  node_type);

-		log_debug(_("node=%d conninfo=\"%s\" type=%s\n"),
-				  nodes[i].node_id, nodes[i].conninfo_str,
-				  PQgetvalue(res, i, 2));
+		/* Copy details of the failed master node */
+		if (nodes[i].type == MASTER)
+		{
+			/* XXX only node_id is currently used */
+			failed_master.node_id = nodes[i].node_id;
+
+			/*
+			 * XXX experimental
+			 *
+			 * Previously an attempt was made to connect to the master here,
+			 * which is very likely to be a waste of time at this point, as we'll
+			 * have spent the last however many seconds trying to do just that
+			 * in check_connection() before deciding it's gone away.
+			 *
+			 * If the master did come back at this point, the voting algorithm should decide
+			 * it's the "best candidate" anyway and no standby will promote itself or
+			 * attempt to follow another server.
+			 *
+			 * If we don't try and connect to the master here (and the code generally
+			 * assumes it's failed anyway) but it does come back any time from here
+			 * onwards, promotion will fail and the promotion candidate will
+			 * notice the reappearance.
+			 *
+			 * TL;DR: by skipping the master connection attempt (the chances of the
+			 * master reappearing between the last attempt in check_connection()
+			 * and now are minimal) we can remove useless cycles during the failover process;
+			 * if the master does reappear, it will be caught later anyway.
+			 */
+
+			continue;
+		}

 		node_conn = establish_db_connection(nodes[i].conninfo_str, false);

@@ -1232,13 +1387,13 @@ do_master_failover(void)
 	PQclear(res);

 	log_debug(_("total nodes counted: registered=%d, visible=%d\n"),
-			  total_nodes, visible_nodes);
+			  total_active_nodes, visible_nodes);

 	/*
 	 * Am I on the group that should keep alive? If I see less than half of
-	 * total_nodes then I should do nothing
+	 * total_active_nodes then I should do nothing
 	 */
-	if (visible_nodes < (total_nodes / 2.0))
+	if (visible_nodes < (total_active_nodes / 2.0))
 	{
 		log_err(_("Unable to reach most of the nodes.\n"
 				  "Let the other standby servers decide which one will be the master.\n"
@@ -1247,7 +1402,7 @@ do_master_failover(void)
 	}

 	/* Query all available nodes to determine readiness and LSN */
-	for (i = 0; i < total_nodes; i++)
+	for (i = 0; i < total_active_nodes; i++)
 	{
 		log_debug("checking node %i...\n", nodes[i].node_id);

@@ -1316,8 +1471,8 @@ do_master_failover(void)
 				  " considered as new master and exit.\n"),
 				PQerrorMessage(my_local_conn));
 		PQclear(res);
-		sprintf(last_xlog_replay_location, "'%X/%X'", 0, 0);
-		update_shared_memory(last_xlog_replay_location);
+
+		update_shared_memory(LSN_QUERY_ERROR);
 		terminate(ERR_DB_QUERY);
 	}
 	/* write last location in shared memory */
@@ -1325,7 +1480,7 @@ do_master_failover(void)
 	PQclear(res);

 	/* Wait for each node to come up and report a valid LSN */
-	for (i = 0; i < total_nodes; i++)
+	for (i = 0; i < total_active_nodes; i++)
 	{
 		/*
 		 * ensure witness server is marked as ready, and skip
@@ -1367,6 +1522,7 @@ do_master_failover(void)

 		while (!nodes[i].is_ready)
 		{
+			char location_value[MAXLEN];

 			sqlquery_snprintf(sqlquery,
 							  "SELECT %s.repmgr_get_last_standby_location()",
@@ -1382,7 +1538,11 @@ do_master_failover(void)
 				terminate(ERR_DB_QUERY);
 			}

-			xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);
+			/* Copy the returned value as we'll need to reference it a few times */
+			strncpy(location_value, PQgetvalue(res, 0, 0), MAXLEN);
+			PQclear(res);
+
+			xlog_recptr = lsn_to_xlogrecptr(location_value, &lsn_format_ok);

 			/* If position reported as "invalid", check for format error or
 			 * empty string; otherwise position is 0/0 and we need to continue
@@ -1390,10 +1550,36 @@ do_master_failover(void)
 			 */
 			if (xlog_recptr == InvalidXLogRecPtr)
 			{
+				bool continue_loop = true;
+
 				if (lsn_format_ok == false)
 				{
+
+					/*
+					 * The node is indicating it is not a promotion candidate -
+					 * in this case we can store its invalid LSN to ensure it
+					 * can't be a promotion candidate when comparing locations
+					 */
+					if (strcmp(location_value, PASSIVE_NODE) == 0)
+					{
+						log_debug("node %i is passive mode\n", nodes[i].node_id);
+						log_info(_("node %i will not be considered for promotion\n"), nodes[i].node_id);
+						nodes[i].xlog_location = InvalidXLogRecPtr;
+						continue_loop = false;
+					}
+					/*
+					 * This should probably never happen but if it does, rule the
+					 * node out as a promotion candidate
+					 */
+					else if (strcmp(location_value, LSN_QUERY_ERROR) == 0)
+					{
+						log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"), nodes[i].node_id);
+						nodes[i].xlog_location = InvalidXLogRecPtr;
+						continue_loop = false;
+					}
+
 					/* Unable to parse value returned by `repmgr_get_last_standby_location()` */
-					if (*PQgetvalue(res, 0, 0) == '\0')
+					else if (*location_value == '\0')
 					{
 						log_crit(
 							_("unable to obtain LSN from node %i"), nodes[i].node_id
@@ -1402,8 +1588,8 @@ do_master_failover(void)
 							_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n")
 							);

-						PQclear(res);
 						PQfinish(node_conn);
+						/* XXX shouldn't we just ignore this node? */
 						exit(ERR_BAD_CONFIG);
 					}

@@ -1411,25 +1597,29 @@ do_master_failover(void)
 					 * Very unlikely to happen; in the absence of any better
 					 * strategy keep checking
 					 */
-					log_warning(_("unable to parse LSN \"%s\"\n"),
-								PQgetvalue(res, 0, 0));
+					else {
+						log_warning(_("unable to parse LSN \"%s\"\n"),
+									location_value);
+					}
 				}
 				else
 				{
 					log_debug(
 						_("invalid LSN returned from node %i: '%s'\n"),
 						nodes[i].node_id,
-						PQgetvalue(res, 0, 0)
-						);
+						location_value);
 				}

-				PQclear(res);
-
-				/* If position is 0/0, keep checking */
-				/* XXX we should add a timeout here to prevent infinite looping
+				/*
+				 * If the node is still reporting an InvalidXLogRecPtr, it means
+				 * its repmgrd hasn't yet had time to update it (either with a valid
+				 * XLogRecPtr or a message) so we continue looping.
+				 *
+				 * XXX we should add a timeout here to prevent infinite looping
 				 * if the other node's repmgrd is not up
 				 */
-				continue;
+				if (continue_loop == true)
+					continue;
 			}

 			if (nodes[i].xlog_location < xlog_recptr)
@@ -1437,8 +1627,7 @@ do_master_failover(void)
 				nodes[i].xlog_location = xlog_recptr;
 			}

-			log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));
-			PQclear(res);
+			log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, location_value);

 			ready_nodes++;
 			nodes[i].is_ready = true;
@@ -1451,7 +1640,7 @@ do_master_failover(void)
 	/*
 	 * determine which one is the best candidate to promote to master
 	 */
-	for (i = 0; i < total_nodes; i++)
+	for (i = 0; i < total_active_nodes; i++)
 	{
 		/* witness server can never be a candidate */
 		if (nodes[i].type == WITNESS)
@@ -1540,13 +1729,15 @@ do_master_failover(void)
 				{
 					log_notice(_("Original master reappeared before this standby was promoted - no action taken\n"));

+					/* XXX log an event here?  */
+
 					PQfinish(master_conn);
+					master_conn = NULL;
+
 					/* no failover occurred but we'll want to restart connections */
 					failover_done = true;
 					return;
 				}
-
-				PQfinish(my_local_conn);
 			}

 			log_err(_("promote command failed. You could check and try it manually.\n"));
@@ -1676,8 +1867,10 @@ do_master_failover(void)
 		termPQExpBuffer(&event_details);
 	}

-	/* to force it to re-calculate mode and master node */
-	// ^ ZZZ check that behaviour ^
+	/*
+	 * setting "failover_done" to true will cause the node's monitoring loop
+	 * to restart in the appropriate mode for the node's (possibly new) role
+	 */
 	failover_done = true;
 }

@@ -1853,7 +2046,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
 		{
 			if (conninfo == NULL)
 			{
-				log_err("INTERNAL ERROR: *conn == NULL && conninfo == NULL");
+				log_err("INTERNAL ERROR: *conn == NULL && conninfo == NULL\n");
 				terminate(ERR_INTERNAL);
 			}
 			*conn = establish_db_connection(conninfo, false);
@@ -1901,7 +2094,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
 static bool
 set_local_node_status(void)
 {
-        PGresult       *res;
+	PGresult       *res;
 	char		sqlquery[QUERY_STR_LEN];
 	int		active_master_node_id = NODE_NOT_FOUND;
 	char		master_conninfo[MAXLEN];
@@ -1994,10 +2187,12 @@ check_cluster_configuration(PGconn *conn)
 	log_info(_("checking cluster configuration with schema '%s'\n"), get_repmgr_schema());

 	sqlquery_snprintf(sqlquery,
-					  "SELECT oid FROM pg_class "
+					  "SELECT oid FROM pg_catalog.pg_class "
 					  " WHERE oid = '%s.repl_nodes'::regclass ",
-			                  get_repmgr_schema_quoted(master_conn));
+					  get_repmgr_schema_quoted(master_conn));
+
 	res = PQexec(conn, sqlquery);
+
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
 		log_err(_("PQexec failed: %s\n"), PQerrorMessage(conn));
@@ -2112,7 +2307,7 @@ lsn_to_xlogrecptr(char *lsn, bool *format_ok)
 	{
 		if (format_ok != NULL)
 			*format_ok = false;
-		log_err(_("incorrect log location format: %s\n"), lsn);
+		log_warning(_("incorrect log location format: %s\n"), lsn);
 		return 0;
 	}

@@ -2416,6 +2611,8 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
 							errmsg.data);

 		PQfinish(conn);
+		conn = NULL;
+
 		terminate(ERR_DB_QUERY);
 	}

--- a/sql/repmgr2_repmgr3.sql
+++ b/sql/repmgr2_repmgr3.sql
@@ -63,6 +63,15 @@ UPDATE repl_nodes SET type = 'master' WHERE id = $master_id;

 -- UPDATE repl_nodes SET active = FALSE WHERE id IN (...);

+/* There's also an event table which we need to create */
+CREATE TABLE repl_events (
+  node_id          INTEGER NOT NULL,  -- NOTE(review): presumably a repl_nodes.id value; no foreign key is declared here
+  event            TEXT NOT NULL,
+  successful       BOOLEAN NOT NULL DEFAULT TRUE,  -- TRUE unless the inserter says otherwise
+  event_timestamp  TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,  -- defaults to the inserting transaction's start time
+  details          TEXT NULL  -- optional; the only nullable column
+);
+
 /* When you're sure of your changes, commit them */

 -- COMMIT;
--- a/sql/repmgr3.1.1_repmgr3.1.2.sql
+++ b/sql/repmgr3.1.1_repmgr3.1.2.sql
@@ -27,5 +27,6 @@

 BEGIN;

-ALTER TABLE repl_nodes ALTER CONSTRAINT repl_nodes_upstream_node_id_fkey DEFERRABLE;
+ALTER TABLE repl_nodes DROP CONSTRAINT repl_nodes_upstream_node_id_fkey,
+      ADD CONSTRAINT repl_nodes_upstream_node_id_fkey FOREIGN KEY (upstream_node_id) REFERENCES repl_nodes(id) DEFERRABLE;
 COMMIT;
--- a/strutil.c
+++ b/strutil.c
@@ -87,3 +87,34 @@ maxlen_snprintf(char *str, const char *format,...)

 	return retval;
 }
+
+
+/*
+ * Append "str" to "buf" as one single-quoted shell command argument; on a
+ * newline or carriage return, print an error to stderr and exit(ERR_BAD_CONFIG).
+ * Adapted from src/fe_utils/string_utils.c (not public before PostgreSQL 9.6).
+ */
+void
+appendShellString(PQExpBuffer buf, const char *str)
+{
+	const char *p;
+
+	appendPQExpBufferChar(buf, '\'');	/* opening quote */
+	for (p = str; *p; p++)
+	{
+		if (*p == '\n' || *p == '\r')	/* rejected outright rather than escaped */
+		{
+			fprintf(stderr,
+					_("shell command argument contains a newline or carriage return: \"%s\"\n"),
+					str);
+			exit(ERR_BAD_CONFIG);
+		}
+
+		if (*p == '\'')
+			appendPQExpBufferStr(buf, "'\"'\"'");	/* close the quote, add a double-quoted apostrophe, reopen */
+		else
+			appendPQExpBufferChar(buf, *p);
+	}
+
+	appendPQExpBufferChar(buf, '\'');	/* closing quote */
+}
--- a/strutil.h
+++ b/strutil.h
@@ -22,6 +22,7 @@
 #define _STRUTIL_H_

 #include <stdlib.h>
+#include "pqexpbuffer.h"
 #include "errcode.h"


@@ -48,4 +49,6 @@ extern int
 maxlen_snprintf(char *str, const char *format,...)
 __attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));

+extern void
+appendShellString(PQExpBuffer buf, const char *str);
 #endif   /* _STRUTIL_H_ */
--- a/version.h
+++ b/version.h
@@ -1,6 +1,6 @@
 #ifndef _VERSION_H_
 #define _VERSION_H_

-#define REPMGR_VERSION "3.1.3"
+#define REPMGR_VERSION "3.2.1"

 #endif