From 2491b8ae5255bb87a92fb1f87266c1de1c751035 Mon Sep 17 00:00:00 2001
From: Ian Barwick <barwick@gmail.com>
Date: Thu, 27 Sep 2018 16:42:10 +0900
Subject: [PATCH] Add functionality to "pause" repmgrd

In some circumstances, e.g. while performing a switchover, it is essential
that repmgrd does not take any kind of failover action, as this will put
the cluster into an incorrect state.

Previously it was necessary to stop repmgrd on all nodes (or at least
those nodes which repmgrd would consider as promotion candidates), however
this is a cumbersome and potentially risk-prone operation, particularly if the
replication cluster contains more than a couple of servers.

To prevent this issue from occurring, this patch introduces the ability
to "pause" repmgrd on all nodes with a single command ("repmgr daemon pause")
which notifies repmgrd not to take any failover action until the node
is "unpaused" ("repmgr daemon unpause").

"repmgr daemon status" provides an overview of each node and whether repmgrd
is running, and if so whether it is paused.

"repmgr standby switchover" has been modified to automatically pause repmgrd
while carrying out the switchover.

See documentation for further details.
---
 Makefile.in                        |   4 +-
 dbutils.c                          | 130 ++++++++-
 dbutils.h                          |  20 ++
 doc/filelist.sgml                  |   4 +
 doc/repmgr-cluster-show.sgml       |   4 +-
 doc/repmgr-daemon-pause.sgml       | 109 ++++++++
 doc/repmgr-daemon-status.sgml      | 165 ++++++++++++
 doc/repmgr-daemon-unpause.sgml     | 103 +++++++
 doc/repmgr-standby-switchover.sgml |  33 ++-
 doc/repmgr.sgml                    |   4 +
 doc/repmgrd-pausing.sgml           | 169 ++++++++++++
 doc/switchover.sgml                |  49 ++--
 errcode.h                          |   1 +
 repmgr--4.1.sql                    |  15 ++
 repmgr--4.2.sql                    |  30 +++
 repmgr-action-cluster.c            |  41 +--
 repmgr-action-daemon.c             | 420 +++++++++++++++++++++++++++++
 repmgr-action-daemon.h             |  28 ++
 repmgr-action-standby.c            | 256 +++++++++++++++++-
 repmgr-client-global.h             |  14 +-
 repmgr-client.c                    |  95 ++++++-
 repmgr-client.h                    |   5 +
 repmgr.c                           | 260 +++++++++++++++++-
 repmgr.h                           |   1 +
 repmgrd-physical.c                 |  95 ++++---
 repmgrd.c                          |   8 +-
 repmgrd.h                          |   1 +
 27 files changed, 1943 insertions(+), 121 deletions(-)
 create mode 100644 doc/repmgr-daemon-pause.sgml
 create mode 100644 doc/repmgr-daemon-status.sgml
 create mode 100644 doc/repmgr-daemon-unpause.sgml
 create mode 100644 doc/repmgrd-pausing.sgml
 create mode 100644 repmgr-action-daemon.c
 create mode 100644 repmgr-action-daemon.h

diff --git a/Makefile.in b/Makefile.in
index 001605c3..36040829 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -17,7 +17,6 @@ DATA = \
   repmgr--4.1--4.2.sql \
   repmgr--4.2.sql
 
-
 REGRESS = repmgr_extension
 
 # Hacky workaround to install the binaries
@@ -43,7 +42,7 @@ $(info Building against PostgreSQL $(MAJORVERSION))
 
 REPMGR_CLIENT_OBJS = repmgr-client.o \
 	repmgr-action-primary.o repmgr-action-standby.o repmgr-action-witness.o \
-	repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o \
+	repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o repmgr-action-daemon.o \
 	configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o
 REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o
 DATE=$(shell date "+%Y-%m-%d")
@@ -87,6 +86,7 @@ additional-clean:
 	rm -f repmgr-action-bdr.o
 	rm -f repmgr-action-node.o
 	rm -f repmgr-action-cluster.o
+	rm -f repmgr-action-daemon.o
 	rm -f repmgrd.o
 	rm -f repmgrd-physical.o
 	rm -f repmgrd-bdr.o
diff --git a/dbutils.c b/dbutils.c
index bc9ba3d2..23a0bf57 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -1627,7 +1627,6 @@ repmgrd_set_local_node_id(PGconn *conn, int local_node_id)
 }
 
 
-
 int
 repmgrd_get_local_node_id(PGconn *conn)
 {
@@ -1686,6 +1685,135 @@ server_in_exclusive_backup_mode(PGconn *conn)
 }
 
 
+void
+repmgrd_set_pid(PGconn *conn, pid_t repmgrd_pid, const char *pidfile)
+{
+	PQExpBufferData query;
+	PGresult   *res = NULL;
+
+	log_verbose(LOG_DEBUG, "repmgrd_set_pid(): pid is %i", (int) repmgrd_pid);
+
+	initPQExpBuffer(&query);
+
+	appendPQExpBuffer(&query,
+					  "SELECT repmgr.set_repmgrd_pid(%i, '%s')",
+					  (int) repmgrd_pid, pidfile);
+
+	res = PQexec(conn, query.data);
+	termPQExpBuffer(&query);
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute \"SELECT repmgr.set_repmgrd_pid()\""));
+		log_detail("%s", PQerrorMessage(conn));
+	}
+
+	PQclear(res);
+
+	return;
+}
+
+
+pid_t
+repmgrd_get_pid(PGconn *conn)
+{
+	PGresult   *res = NULL;
+	pid_t		repmgrd_pid = UNKNOWN_PID;
+
+	res = PQexec(conn, "SELECT repmgr.get_repmgrd_pid()");
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute \"SELECT repmgr.get_repmgrd_pid()\""));
+		log_detail("%s", PQerrorMessage(conn));
+	}
+	else if (!PQgetisnull(res, 0, 0))
+	{
+		repmgrd_pid = atoi(PQgetvalue(res, 0, 0));
+	}
+
+	PQclear(res);
+
+	return repmgrd_pid;
+}
+
+
+bool
+repmgrd_is_running(PGconn *conn)
+{
+	PGresult   *res = NULL;
+	bool		is_running = false;
+
+	res = PQexec(conn, "SELECT repmgr.repmgrd_is_running()");
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute \"SELECT repmgr.repmgrd_is_running()\""));
+		log_detail("%s", PQerrorMessage(conn));
+	}
+	else if (!PQgetisnull(res, 0, 0))
+	{
+		is_running = atobool(PQgetvalue(res, 0, 0));
+	}
+
+	PQclear(res);
+
+	return is_running;
+}
+
+
+bool
+repmgrd_is_paused(PGconn *conn)
+{
+	PGresult   *res = NULL;
+	bool		is_paused = false;
+
+	res = PQexec(conn, "SELECT repmgr.repmgrd_is_paused()");
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute \"SELECT repmgr.repmgrd_is_paused()\""));
+		log_detail("%s", PQerrorMessage(conn));
+	}
+	else if (!PQgetisnull(res, 0, 0))
+	{
+		is_paused = atobool(PQgetvalue(res, 0, 0));
+	}
+
+	PQclear(res);
+
+	return is_paused;
+}
+
+
+bool
+repmgrd_pause(PGconn *conn, bool pause)
+{
+	PQExpBufferData query;
+	PGresult   *res = NULL;
+	bool		success = true;
+
+	initPQExpBuffer(&query);
+
+	appendPQExpBuffer(&query,
+					  "SELECT repmgr.repmgrd_pause(%s)",
+					  pause == true ? "TRUE" : "FALSE");
+	res = PQexec(conn, query.data);
+	termPQExpBuffer(&query);
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute \"SELECT repmgr.repmgrd_pause()\""));
+		log_detail("%s", PQerrorMessage(conn));
+
+		success = false;
+	}
+
+	PQclear(res);
+
+	return success;
+}
+
 /* ================ */
 /* result functions */
 /* ================ */
diff --git a/dbutils.h b/dbutils.h
index da25d677..875fa42d 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -327,6 +327,21 @@ typedef struct
     UNKNOWN_TIMELINE_ID, \
 	InvalidXLogRecPtr \
 }
+
+
+typedef struct RepmgrdInfo {
+	int node_id;
+	int pid;
+	char pid_text[MAXLEN];
+	char pid_file[MAXLEN];
+	bool pg_running;
+	char pg_running_text[MAXLEN];
+	bool running;
+	char repmgrd_running[MAXLEN];
+	bool paused;
+} RepmgrdInfo;
+
+
 /* global variables */
 
 extern int	server_version_num;
@@ -399,6 +414,11 @@ bool		identify_system(PGconn *repl_conn, t_system_identification *identification
 bool		repmgrd_set_local_node_id(PGconn *conn, int local_node_id);
 int			repmgrd_get_local_node_id(PGconn *conn);
 BackupState	server_in_exclusive_backup_mode(PGconn *conn);
+void		repmgrd_set_pid(PGconn *conn, pid_t repmgrd_pid, const char *pidfile);
+pid_t		repmgrd_get_pid(PGconn *conn);
+bool		repmgrd_is_running(PGconn *conn);
+bool		repmgrd_is_paused(PGconn *conn);
+bool		repmgrd_pause(PGconn *conn, bool pause);
 
 /* extension functions */
 ExtensionStatus get_repmgr_extension_status(PGconn *conn);
diff --git a/doc/filelist.sgml b/doc/filelist.sgml
index 7a1faa71..1bb2e7f9 100644
--- a/doc/filelist.sgml
+++ b/doc/filelist.sgml
@@ -58,6 +58,7 @@
 <!ENTITY repmgrd-cascading-replication SYSTEM "repmgrd-cascading-replication.sgml">
 <!ENTITY repmgrd-network-split SYSTEM "repmgrd-network-split.sgml">
 <!ENTITY repmgrd-witness-server SYSTEM "repmgrd-witness-server.sgml">
+<!ENTITY repmgrd-pausing SYSTEM "repmgrd-pausing.sgml">
 <!ENTITY repmgrd-bdr SYSTEM "repmgrd-bdr.sgml">
 
 <!ENTITY repmgr-primary-register SYSTEM "repmgr-primary-register.sgml">
@@ -78,6 +79,9 @@
 <!ENTITY repmgr-cluster-crosscheck SYSTEM "repmgr-cluster-crosscheck.sgml">
 <!ENTITY repmgr-cluster-event SYSTEM "repmgr-cluster-event.sgml">
 <!ENTITY repmgr-cluster-cleanup SYSTEM "repmgr-cluster-cleanup.sgml">
+<!ENTITY repmgr-daemon-status SYSTEM "repmgr-daemon-status.sgml">
+<!ENTITY repmgr-daemon-pause SYSTEM "repmgr-daemon-pause.sgml">
+<!ENTITY repmgr-daemon-unpause SYSTEM "repmgr-daemon-unpause.sgml">
 
 <!ENTITY appendix-release-notes  SYSTEM "appendix-release-notes.sgml">
 <!ENTITY appendix-faq      SYSTEM "appendix-faq.sgml">
diff --git a/doc/repmgr-cluster-show.sgml b/doc/repmgr-cluster-show.sgml
index a096ff12..944d866c 100644
--- a/doc/repmgr-cluster-show.sgml
+++ b/doc/repmgr-cluster-show.sgml
@@ -90,7 +90,7 @@
 		  <para>
 			<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
 			outputs the replication cluster's status in a simple CSV format, suitable for
-			parsing by scripts:
+			parsing by scripts, e.g.:
 			<programlisting>
     $ repmgr -f /etc/repmgr.conf cluster show --csv
     1,-1,-1
@@ -165,7 +165,7 @@
   <refsect1>
     <title>See also</title>
     <para>
-     <xref linkend="repmgr-node-status">, <xref linkend="repmgr-node-check">
+     <xref linkend="repmgr-node-status">, <xref linkend="repmgr-node-check">, <xref linkend="repmgr-daemon-status">
     </para>
   </refsect1>
 
diff --git a/doc/repmgr-daemon-pause.sgml b/doc/repmgr-daemon-pause.sgml
new file mode 100644
index 00000000..c2845611
--- /dev/null
+++ b/doc/repmgr-daemon-pause.sgml
@@ -0,0 +1,109 @@
+<refentry id="repmgr-daemon-pause">
+  <indexterm>
+    <primary>repmgr daemon pause</primary>
+  </indexterm>
+
+  <refmeta>
+    <refentrytitle>repmgr daemon pause</refentrytitle>
+  </refmeta>
+
+  <refnamediv>
+    <refname>repmgr daemon pause</refname>
+    <refpurpose>Instruct all <application>repmgrd</application> instances in the replication cluster to pause failover operations</refpurpose>
+  </refnamediv>
+
+  <refsect1>
+    <title>Description</title>
+    <para>
+      This command can be run on any active node in the replication cluster to instruct all
+      running <application>repmgrd</application> instances to &quot;pause&quot; themselves, i.e. take no
+      action (such as promoting themselves or following a new primary) if a failover event is detected.
+    </para>
+    <para>
+      This functionality is useful for performing maintenance operations, such as switchovers
+      or upgrades, which might otherwise trigger a failover if <application>repmgrd</application>
+      is running normally.
+    </para>
+    <note>
+      <para>
+        It's important to wait a few seconds after restarting PostgreSQL on any node before running
+        <command>repmgr daemon pause</command>, as the <application>repmgrd</application> instance
+        on the restarted node will take a second or two before it has updated its status.
+      </para>
+    </note>
+    <para>
+      <xref linkend="repmgr-daemon-unpause"> will instruct all previously paused <application>repmgrd</application>
+      instances to resume normal failover operation.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>Execution</title>
+    <para>
+      <command>repmgr daemon pause</command> can be executed on any active node in the
+      replication cluster. A valid <filename>repmgr.conf</filename> file is required.
+      It will have no effect on previously paused nodes.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>Example</title>
+    <para>
+    <programlisting>
+$ repmgr -f /etc/repmgr.conf daemon pause
+NOTICE: node 1 (node1) paused
+NOTICE: node 2 (node2) paused
+NOTICE: node 3 (node3) paused</programlisting>
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>Options</title>
+    <variablelist>
+      <varlistentry>
+        <term><option>--dry-run</option></term>
+        <listitem>
+          <para>
+            Check if nodes are reachable but don't pause <application>repmgrd</application>.
+          </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      The following exit codes can be emitted by <command>repmgr daemon pause</command>:
+    </para>
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            <application>repmgrd</application> could be paused on all nodes.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_REPMGRD_PAUSE (26)</option></term>
+        <listitem>
+          <para>
+           <application>repmgrd</application> could not be paused on one or more nodes.
+          </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>See also</title>
+    <para>
+      <xref linkend="repmgr-daemon-unpause">, <xref linkend="repmgr-daemon-status">
+    </para>
+  </refsect1>
+</refentry>
+
diff --git a/doc/repmgr-daemon-status.sgml b/doc/repmgr-daemon-status.sgml
new file mode 100644
index 00000000..1d2dc765
--- /dev/null
+++ b/doc/repmgr-daemon-status.sgml
@@ -0,0 +1,165 @@
+<refentry id="repmgr-daemon-status">
+  <indexterm>
+    <primary>repmgr daemon status</primary>
+  </indexterm>
+
+  <refmeta>
+    <refentrytitle>repmgr daemon status</refentrytitle>
+  </refmeta>
+
+  <refnamediv>
+    <refname>repmgr daemon status</refname>
+    <refpurpose>display information about the status of <application>repmgrd</application> on each node in the cluster</refpurpose>
+  </refnamediv>
+
+  <refsect1>
+    <title>Description</title>
+    <para>
+      This command provides an overview of all active nodes in the cluster and the state
+      of each node's <application>repmgrd</application> instance. It can be used to check
+      the result of <xref linkend="repmgr-daemon-pause"> and <xref linkend="repmgr-daemon-unpause">
+      operations.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>Execution</title>
+    <para>
+      <command>repmgr daemon status</command> can be executed on any active node in the
+      replication cluster. A valid <filename>repmgr.conf</filename> file is required.
+    </para>
+
+    <note>
+      <para>
+        After restarting PostgreSQL on any node, the <application>repmgrd</application> instance
+        will take a second or two before it is able to update its status. Until then,
+        <application>repmgrd</application> will be shown as not running.
+      </para>
+    </note>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Examples</title>
+    <para>
+      <application>repmgrd</application> running normally on all nodes:
+    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
+ ID | Name  | Role    | Status  | repmgrd | PID  | Paused?
+----+-------+---------+---------+---------+------+---------
+ 1  | node1 | primary | running | running | 7851 | no
+ 2  | node2 | standby | running | running | 7889 | no
+ 3  | node3 | standby | running | running | 7918 | no</programlisting>
+    </para>
+
+    <para>
+      <application>repmgrd</application> paused on all nodes (using <xref linkend="repmgr-daemon-pause">):
+    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
+ ID | Name  | Role    | Status  | repmgrd | PID  | Paused?
+----+-------+---------+---------+---------+------+---------
+ 1  | node1 | primary | running | running | 7851 | yes
+ 2  | node2 | standby | running | running | 7889 | yes
+ 3  | node3 | standby | running | running | 7918 | yes</programlisting>
+    </para>
+
+    <para>
+      <application>repmgrd</application> not running on one node:
+    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
+ ID | Name  | Role    | Status  | repmgrd     | PID  | Paused?
+----+-------+---------+---------+-------------+------+---------
+ 1  | node1 | primary | running | running     | 7851 | yes
+ 2  | node2 | standby | running | not running | n/a  | n/a
+ 3  | node3 | standby | running | running     | 7918 | yes</programlisting>
+    </para>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Options</title>
+
+    <variablelist>
+
+      <varlistentry>
+        <term><option>--csv</option></term>
+        <listitem>
+		  <para>
+			<command>repmgr daemon status</command> accepts an optional parameter <literal>--csv</literal>, which
+			outputs the replication cluster's status in a simple CSV format, suitable for
+			parsing by scripts, e.g.:
+			<programlisting>
+    $ repmgr -f /etc/repmgr.conf daemon status --csv
+    1,node1,primary,1,1,10204,1
+    2,node2,standby,1,0,-1,1
+    3,node3,standby,1,1,10225,1</programlisting>
+		  </para>
+		  <para>
+			The columns have the following meanings:
+			<itemizedlist spacing="compact" mark="bullet">
+			  <listitem>
+				<simpara>
+				  node ID
+				</simpara>
+			  </listitem>
+
+			  <listitem>
+				<simpara>
+                  node name
+				</simpara>
+			  </listitem>
+
+			  <listitem>
+				<simpara>
+                  node type (primary or standby)
+				</simpara>
+			  </listitem>
+
+			  <listitem>
+				<simpara>
+                  PostgreSQL server running
+				</simpara>
+			  </listitem>
+
+			  <listitem>
+				<simpara>
+                  <application>repmgrd</application> running (1 = running, 0 = not running)
+				</simpara>
+			  </listitem>
+
+			  <listitem>
+				<simpara>
+                  <application>repmgrd</application> PID (-1 if not running)
+				</simpara>
+			  </listitem>
+
+			  <listitem>
+				<simpara>
+                  <application>repmgrd</application> paused (1 = paused, 0 = not paused)
+				</simpara>
+			  </listitem>
+
+			</itemizedlist>
+		  </para>
+		</listitem>
+	  </varlistentry>
+
+      <varlistentry>
+        <term><option>--verbose</option></term>
+        <listitem>
+          <para>
+			Display the full text of any database connection error messages
+          </para>
+        </listitem>
+      </varlistentry>
+
+	</variablelist>
+
+  </refsect1>
+
+
+
+  <refsect1>
+    <title>See also</title>
+    <para>
+      <xref linkend="repmgr-daemon-pause">, <xref linkend="repmgr-daemon-unpause">, <xref linkend="repmgr-cluster-show">
+    </para>
+  </refsect1>
+</refentry>
diff --git a/doc/repmgr-daemon-unpause.sgml b/doc/repmgr-daemon-unpause.sgml
new file mode 100644
index 00000000..9e640313
--- /dev/null
+++ b/doc/repmgr-daemon-unpause.sgml
@@ -0,0 +1,103 @@
+<refentry id="repmgr-daemon-unpause">
+  <indexterm>
+    <primary>repmgr daemon unpause</primary>
+  </indexterm>
+
+  <refmeta>
+    <refentrytitle>repmgr daemon unpause</refentrytitle>
+  </refmeta>
+
+  <refnamediv>
+    <refname>repmgr daemon unpause</refname>
+    <refpurpose>Instruct all <application>repmgrd</application> instances in the replication cluster to resume failover operations</refpurpose>
+  </refnamediv>
+
+  <refsect1>
+    <title>Description</title>
+    <para>
+      This command can be run on any active node in the replication cluster to instruct all
+      running <application>repmgrd</application> instances to &quot;unpause&quot;
+      (following a previous execution of <xref linkend="repmgr-daemon-pause">)
+      and resume normal failover/monitoring operation.
+    </para>
+
+    <note>
+      <para>
+        It's important to wait a few seconds after restarting PostgreSQL on any node before running
+        <command>repmgr daemon unpause</command>, as the <application>repmgrd</application> instance
+        on the restarted node will take a second or two before it has updated its status.
+      </para>
+    </note>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Execution</title>
+    <para>
+     <command>repmgr daemon unpause</command> can be executed on any active node in the
+      replication cluster. A valid <filename>repmgr.conf</filename> file is required.
+      It will have no effect on nodes which are not already paused.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>Example</title>
+    <para>
+    <programlisting>
+$ repmgr -f /etc/repmgr.conf daemon unpause
+NOTICE: node 1 (node1) unpaused
+NOTICE: node 2 (node2) unpaused
+NOTICE: node 3 (node3) unpaused</programlisting>
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>Options</title>
+    <variablelist>
+      <varlistentry>
+        <term><option>--dry-run</option></term>
+        <listitem>
+          <para>
+            Check if nodes are reachable but don't unpause <application>repmgrd</application>.
+          </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      The following exit codes can be emitted by <command>repmgr daemon unpause</command>:
+    </para>
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            <application>repmgrd</application> could be unpaused on all nodes.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_REPMGRD_PAUSE (26)</option></term>
+        <listitem>
+          <para>
+           <application>repmgrd</application> could not be unpaused on one or more nodes.
+          </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>See also</title>
+    <para>
+      <xref linkend="repmgr-daemon-pause">, <xref linkend="repmgr-daemon-status">
+    </para>
+  </refsect1>
+</refentry>
+
diff --git a/doc/repmgr-standby-switchover.sgml b/doc/repmgr-standby-switchover.sgml
index cbd5d7a1..d8cf6d70 100644
--- a/doc/repmgr-standby-switchover.sgml
+++ b/doc/repmgr-standby-switchover.sgml
@@ -35,6 +35,10 @@
         &repmgr; will attempt to check for potential issues but cannot guarantee
         a successful switchover.
       </para>
+      <para>
+        &repmgr; will refuse to perform the switchover if an exclusive backup is running on
+        the current primary.
+      </para>
     </note>
     <para>
       For more details on performing a switchover, including preparation and configuration,
@@ -43,11 +47,14 @@
 
     <note>
       <para>
-        <application>repmgrd</application> should not be active on any nodes while a switchover is being
-        executed. This restriction may be lifted in a later version.
+        From <link linkend="release-4.2">repmgr 4.2</link>, &repmgr; will instruct any running
+        <application>repmgrd</application> instances to pause operations while the switchover
+        is being carried out, to prevent <application>repmgrd</application> from
+        unintentionally promoting a node. For more details, see <xref linkend="repmgrd-pausing">.
       </para>
       <para>
-        &repmgr; will not perform the switchover if an exclusive backup is running on the current primary.
+        Users of &repmgr; versions prior to 4.2 should ensure that <application>repmgrd</application>
+        is not running on any nodes while a switchover is being executed.
       </para>
     </note>
 
@@ -61,8 +68,9 @@
         <term><option>--always-promote</option></term>
         <listitem>
           <para>
-            Promote standby to primary, even if it is behind original primary
-            (original primary will be shut down in any case).
+            Promote standby to primary, even if it is behind or has diverged
+            from the original primary. The original primary will be shut down in any case,
+            and will need to be manually reintegrated into the replication cluster.
           </para>
         </listitem>
       </varlistentry>
@@ -122,6 +130,21 @@
         </listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><option>--repmgrd-no-pause</option></term>
+        <listitem>
+          <para>
+            Don't pause <application>repmgrd</application> while executing a switchover.
+          </para>
+          <para>
+            This option should not be used unless you take steps by other means
+            to ensure <application>repmgrd</application> is paused or not
+            running on all nodes.
+          </para>
+        </listitem>
+      </varlistentry>
+
+
      <varlistentry>
         <term><option>--siblings-follow</option></term>
         <listitem>
diff --git a/doc/repmgr.sgml b/doc/repmgr.sgml
index 90ef1bc4..68903d2c 100644
--- a/doc/repmgr.sgml
+++ b/doc/repmgr.sgml
@@ -86,6 +86,7 @@
   &repmgrd-cascading-replication;
   &repmgrd-network-split;
   &repmgrd-witness-server;
+  &repmgrd-pausing;
   &repmgrd-degraded-monitoring;
   &repmgrd-monitoring;
   &repmgrd-bdr;
@@ -112,6 +113,9 @@
   &repmgr-cluster-crosscheck;
   &repmgr-cluster-event;
   &repmgr-cluster-cleanup;
+  &repmgr-daemon-status;
+  &repmgr-daemon-pause;
+  &repmgr-daemon-unpause;
  </part>
 
  &appendix-release-notes;
diff --git a/doc/repmgrd-pausing.sgml b/doc/repmgrd-pausing.sgml
new file mode 100644
index 00000000..ccef2b61
--- /dev/null
+++ b/doc/repmgrd-pausing.sgml
@@ -0,0 +1,169 @@
+<chapter id="repmgrd-pausing" xreflabel="Pausing repmgrd">
+
+  <indexterm>
+    <primary>repmgrd</primary>
+    <secondary>pausing</secondary>
+  </indexterm>
+
+  <indexterm>
+    <primary>pausing repmgrd</primary>
+  </indexterm>
+
+  <title>Pausing repmgrd</title>
+
+  <para>
+    In normal operation, <application>repmgrd</application> monitors the state of the
+    PostgreSQL node it is running on, and will take appropriate action if problems
+    are detected, e.g. (if so configured) promote the node to primary, if the existing
+    primary has been determined as failed.
+  </para>
+
+  <para>
+    However, <application>repmgrd</application> is unable to distinguish between
+    planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
+    or upgrading a server), and an actual server outage. In versions prior to &repmgr; 4.2
+    it was necessary to stop <application>repmgrd</application> on all nodes (or at least
+    on all nodes where <application>repmgrd</application> is
+    <link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
+    to prevent <application>repmgrd</application> from making changes to the
+    replication cluster.
+  </para>
+  <para>
+    From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
+    can now be &quot;paused&quot;, i.e. instructed not to take any action such as performing a failover.
+    This can be done from any node in the cluster, removing the need to stop/restart
+    each <application>repmgrd</application> individually.
+  </para>
+
+  <sect1 id="repmgrd-pausing-prerequisites">
+    <title>Prerequisites for pausing <application>repmgrd</application></title>
+    <para>
+      In order to be able to pause/unpause <application>repmgrd</application>, the following
+      prerequisites must be met:
+      <itemizedlist spacing="compact" mark="bullet">
+
+        <listitem>
+          <simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
+        </listitem>
+
+        <listitem>
+          <simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
+        </listitem>
+
+        <listitem>
+          <simpara>
+            PostgreSQL on all nodes must be accessible from the node where the
+            <literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
+            <varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
+          </simpara>
+        </listitem>
+      </itemizedlist>
+    </para>
+    <note>
+      <para>
+        These conditions are required for normal &repmgr; operation in any case.
+      </para>
+    </note>
+
+  </sect1>
+
+  <sect1 id="repmgrd-pausing-execution">
+    <title>Pausing/unpausing <application>repmgrd</application></title>
+    <para>
+      To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
+   <programlisting>
+$ repmgr -f /etc/repmgr.conf daemon pause
+NOTICE: node 1 (node1) paused
+NOTICE: node 2 (node2) paused
+NOTICE: node 3 (node3) paused</programlisting>
+    </para>
+    <para>
+      The state of <application>repmgrd</application> on each node can be checked with
+      <link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
+    <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
+ ID | Name  | Role    | Status  | repmgrd | PID  | Paused?
+----+-------+---------+---------+---------+------+---------
+ 1  | node1 | primary | running | running | 7851 | yes
+ 2  | node2 | standby | running | running | 7889 | yes
+ 3  | node3 | standby | running | running | 7918 | yes</programlisting>
+    </para>
+
+    <note>
+      <para>
+        If executing a switchover with  <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
+		&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
+      </para>
+    </note>
+
+    <para>
+      If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
+      running on one of the standbys (here: <literal>node2</literal>) will react like this:
+      <programlisting>
+[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
+[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
+[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
+...
+[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
+[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
+[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
+[2018-09-20 12:22:25] [NOTICE] node is paused
+[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
+[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
+[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
+    </para>
+    <para>
+      If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
+      will automatically reconnect, e.g.:
+      <programlisting>
+[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
+    </para>
+
+    <para>
+      To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
+   <programlisting>
+$ repmgr -f /etc/repmgr.conf daemon unpause
+NOTICE: node 1 (node1) unpaused
+NOTICE: node 2 (node2) unpaused
+NOTICE: node 3 (node3) unpaused</programlisting>
+    </para>
+
+    <note>
+      <para>
+        If the previous primary is no longer accessible when <application>repmgrd</application>
+        is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
+        <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
+		and any standbys attached to the new primary with
+		<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
+      </para>
+      <para>
+        This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
+        resulting in the automatic promotion of a new primary, which may be a problem particularly
+        in larger clusters, where <application>repmgrd</application> could select a different promotion
+        candidate to the one intended by the administrator.
+      </para>
+    </note>
+
+  <sect2 id="repmgrd-pausing-details">
+    <title>Details on the <application>repmgrd</application> pausing mechanism</title>
+
+    <para>
+      The pause state of each node will be preserved across a PostgreSQL restart.
+    </para>
+
+	<para>
+	  <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
+	  <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
+	  executed even if <application>repmgrd</application> is not running; in this case,
+	  <application>repmgrd</application> will start up in whichever pause state has been set.
+	</para>
+    <note>
+      <para>
+		<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
+		<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
+		<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
+      </para>
+    </note>
+  </sect2>
+  </sect1>
+</chapter>
+
diff --git a/doc/switchover.sgml b/doc/switchover.sgml
index e3999112..56683c93 100644
--- a/doc/switchover.sgml
+++ b/doc/switchover.sgml
@@ -19,9 +19,10 @@
  </para>
  <para>
   <command>repmgr standby switchover</command> differs from other &repmgr;
-  actions in that it also performs actions on another server (the demotion
-  candidate), which means passwordless SSH access is required to that server
-  from the one where <command>repmgr standby switchover</command> is executed.
+  actions in that it also performs actions on other servers (the demotion
+  candidate, and optionally any other servers which are to follow the new primary),
+  which means passwordless SSH access is required to those servers from the one where
+  <command>repmgr standby switchover</command> is executed.
  </para>
  <note>
   <simpara>
@@ -153,12 +154,18 @@
     manually with <command>repmgr node check --archive-ready</command>.
    </para>
 
-   <note>
-     <para>
-       Ensure that <application>repmgrd</application> is *not* running anywhere to prevent it unintentionally
-       promoting a node. This restriction will be removed in a future &repmgr; version.
-     </para>
-   </note>
+    <note>
+      <para>
+        From <link linkend="release-4.2">repmgr 4.2</link>, &repmgr; will instruct any running
+        <application>repmgrd</application> instances to pause operations while the switchover
+        is being carried out, to prevent <application>repmgrd</application> from
+        unintentionally promoting a node. For more details, see <xref linkend="repmgrd-pausing">.
+      </para>
+      <para>
+        Users of &repmgr; versions prior to 4.2 should ensure that <application>repmgrd</application>
+        is not running on any nodes while a switchover is being executed.
+      </para>
+    </note>
 
 
    <para>
@@ -303,7 +310,21 @@
      2  | node2 | primary | * running |          | default  | host=node2 dbname=repmgr user=repmgr
    </programlisting>
   </para>
+  <para>
+    If <application>repmgrd</application> is in use, it's worth double-checking that
+    all nodes are unpaused by executing <link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>.
+  </para>
+
+   <note>
+     <para>
+       Users of &repmgr; versions prior to 4.2 will need to manually restart <application>repmgrd</application>
+       on all nodes after the switchover is completed.
+     </para>
+    </note>
+
  </sect1>
+
+
  <sect1 id="switchover-caveats" xreflabel="Caveats">
   <indexterm>
    <primary>switchover</primary>
@@ -329,18 +350,8 @@
       for details.
      </simpara>
     </listitem>
-    <listitem>
-     <simpara>
-      <application>repmgrd</application> should not be running with setting <varname>failover=automatic</varname>
-      in <filename>repmgr.conf</filename> when a switchover is carried out, otherwise the
-      <application>repmgrd</application> daemon may try and promote a standby by itself.
-     </simpara>
-    </listitem>
    </itemizedlist>
   </para>
-  <para>
-   We hope to remove some of these restrictions in future versions of &repmgr;.
-  </para>
  </sect1>
 
  <sect1 id="switchover-troubleshooting" xreflabel="Troubleshooting">
diff --git a/errcode.h b/errcode.h
index b7d4c688..a7a4f770 100644
--- a/errcode.h
+++ b/errcode.h
@@ -47,5 +47,6 @@
 #define ERR_FOLLOW_FAIL 23
 #define ERR_REJOIN_FAIL 24
 #define ERR_NODE_STATUS 25
+#define ERR_REPMGRD_PAUSE 26
 
 #endif							/* _ERRCODE_H_ */
diff --git a/repmgr--4.1.sql b/repmgr--4.1.sql
index f012853f..d73d988b 100644
--- a/repmgr--4.1.sql
+++ b/repmgr--4.1.sql
@@ -145,6 +145,21 @@ CREATE FUNCTION unset_bdr_failover_handler()
   AS 'MODULE_PATHNAME', 'unset_bdr_failover_handler'
   LANGUAGE C STRICT;
 
+CREATE FUNCTION get_repmgrd_pid()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pid'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_repmgrd_pidfile()
+  RETURNS TEXT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pidfile'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION set_repmgrd_pid(INT, TEXT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_repmgrd_pid'
+  LANGUAGE C STRICT;
+
 
 CREATE VIEW repmgr.replication_status AS
   SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
diff --git a/repmgr--4.2.sql b/repmgr--4.2.sql
index f012853f..c0567ca3 100644
--- a/repmgr--4.2.sql
+++ b/repmgr--4.2.sql
@@ -145,6 +145,36 @@ CREATE FUNCTION unset_bdr_failover_handler()
   AS 'MODULE_PATHNAME', 'unset_bdr_failover_handler'
   LANGUAGE C STRICT;
 
+CREATE FUNCTION get_repmgrd_pid()
+  RETURNS INT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pid'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION get_repmgrd_pidfile()
+  RETURNS TEXT
+  AS 'MODULE_PATHNAME', 'get_repmgrd_pidfile'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION set_repmgrd_pid(INT, TEXT)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'set_repmgrd_pid'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_is_running()
+  RETURNS BOOL
+  AS 'MODULE_PATHNAME', 'repmgrd_is_running'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_pause(BOOL)
+  RETURNS VOID
+  AS 'MODULE_PATHNAME', 'repmgrd_pause'
+  LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgrd_is_paused()
+  RETURNS BOOL
+  AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
+  LANGUAGE C STRICT;
+
 
 CREATE VIEW repmgr.replication_status AS
   SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c
index 54a771d1..b41229f0 100644
--- a/repmgr-action-cluster.c
+++ b/repmgr-action-cluster.c
@@ -26,7 +26,6 @@
 
 #define SHOW_HEADER_COUNT 7
 
-
 typedef enum
 {
 	SHOW_ID = 0,
@@ -51,14 +50,6 @@ typedef enum
 }			EventHeader;
 
 
-
-struct ColHeader
-{
-	char		title[MAXLEN];
-	int			max_length;
-	int			cur_length;
-};
-
 struct ColHeader headers_show[SHOW_HEADER_COUNT];
 struct ColHeader headers_event[EVENT_HEADER_COUNT];
 
@@ -159,7 +150,7 @@ do_cluster_show(void)
 			else
 			{
 				item_list_append_format(&warnings,
-										"unable to  connect to node \"%s\" (ID: %i)",
+										"unable to connect to node \"%s\" (ID: %i)",
 										cell->node_info->node_name, cell->node_info->node_id);
 			}
 		}
@@ -364,36 +355,10 @@ do_cluster_show(void)
 
 	}
 
+	/* Print column header row (text mode only) */
 	if (runtime_options.output_mode == OM_TEXT)
 	{
-		for (i = 0; i < SHOW_HEADER_COUNT; i++)
-		{
-			if (i == 0)
-				printf(" ");
-			else
-				printf(" | ");
-
-			printf("%-*s",
-				   headers_show[i].max_length,
-				   headers_show[i].title);
-		}
-		printf("\n");
-		printf("-");
-
-		for (i = 0; i < SHOW_HEADER_COUNT; i++)
-		{
-			int			j;
-
-			for (j = 0; j < headers_show[i].max_length; j++)
-				printf("-");
-
-			if (i < (SHOW_HEADER_COUNT - 1))
-				printf("-+-");
-			else
-				printf("-");
-		}
-
-		printf("\n");
+		print_status_header(SHOW_HEADER_COUNT, headers_show);
 	}
 
 	for (cell = nodes.head; cell; cell = cell->next)
diff --git a/repmgr-action-daemon.c b/repmgr-action-daemon.c
new file mode 100644
index 00000000..a6351df0
--- /dev/null
+++ b/repmgr-action-daemon.c
@@ -0,0 +1,420 @@
+/*
+ * repmgr-action-daemon.c
+ *
+ * Implements repmgrd actions for the repmgr command line utility
+ * Copyright (c) 2ndQuadrant, 2010-2018
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "repmgr.h"
+
+#include "repmgr-client-global.h"
+#include "repmgr-action-daemon.h"
+
+
+
+/*
+ * Possibly also show:
+ *  - repmgrd start time?
+ *  - repmgrd mode
+ *  - priority
+ *  - whether promotion candidate (due to zero priority/different location)
+ */
+
+typedef enum
+{
+	STATUS_ID = 0,
+	STATUS_NAME,
+	STATUS_ROLE,
+	STATUS_PG,
+	STATUS_RUNNING,
+	STATUS_PID,
+	STATUS_PAUSED
+} StatusHeader;
+
+#define STATUS_HEADER_COUNT 7
+
+struct ColHeader headers_status[STATUS_HEADER_COUNT];
+
+static void fetch_node_records(PGconn *conn, NodeInfoList *node_list);
+static void _do_repmgr_pause(bool pause);
+
+
+/*
+ * Show, for each node in the cluster, whether PostgreSQL and repmgrd are
+ * running, the repmgrd PID and whether repmgrd is paused (table or CSV).
+ */
+void
+do_daemon_status(void)
+{
+	PGconn	   *conn = NULL;
+	NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
+	NodeInfoListCell *cell = NULL;
+	int i;
+	RepmgrdInfo **repmgrd_info;
+	ItemList	warnings = {NULL, NULL};
+
+	/* Connect to local database to obtain cluster connection data */
+	log_verbose(LOG_INFO, _("connecting to database"));
+
+	if (strlen(config_file_options.conninfo))
+		conn = establish_db_connection(config_file_options.conninfo, true);
+	else
+		conn = establish_db_connection_by_params(&source_conninfo, true);
+
+	fetch_node_records(conn, &nodes);
+
+	repmgrd_info = (RepmgrdInfo **) pg_malloc0(sizeof(RepmgrdInfo *) * nodes.node_count);
+
+	if (repmgrd_info == NULL)
+	{
+		log_error(_("unable to allocate memory"));
+		exit(ERR_OUT_OF_MEMORY);
+	}
+
+	strncpy(headers_status[STATUS_ID].title, _("ID"), MAXLEN);
+	strncpy(headers_status[STATUS_NAME].title, _("Name"), MAXLEN);
+	strncpy(headers_status[STATUS_ROLE].title, _("Role"), MAXLEN);
+	strncpy(headers_status[STATUS_PG].title, _("Status"), MAXLEN);
+	strncpy(headers_status[STATUS_RUNNING].title, _("repmgrd"), MAXLEN);
+	strncpy(headers_status[STATUS_PID].title, _("PID"), MAXLEN);
+	strncpy(headers_status[STATUS_PAUSED].title, _("Paused?"), MAXLEN);
+
+	for (i = 0; i < STATUS_HEADER_COUNT; i++)
+	{
+		headers_status[i].max_length = strlen(headers_status[i].title);
+	}
+
+	i = 0;
+
+	for (cell = nodes.head; cell; cell = cell->next)
+	{
+		int j;
+
+		repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
+		repmgrd_info[i]->node_id = cell->node_info->node_id;
+		repmgrd_info[i]->pid = UNKNOWN_PID;
+		repmgrd_info[i]->paused = false;
+		repmgrd_info[i]->running = false;
+		repmgrd_info[i]->pg_running = true;
+
+		cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
+
+		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
+		{
+			if (runtime_options.verbose)
+			{
+				char		error[MAXLEN];
+
+				maxlen_snprintf(error, "%s", PQerrorMessage(cell->node_info->conn));
+				item_list_append_format(&warnings,
+										"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
+										cell->node_info->node_name, cell->node_info->node_id, trim(error));
+			}
+			else
+			{
+				item_list_append_format(&warnings,
+										"unable to connect to node \"%s\" (ID: %i)",
+										cell->node_info->node_name, cell->node_info->node_id);
+			}
+
+			repmgrd_info[i]->pg_running = false;
+			maxlen_snprintf(repmgrd_info[i]->pg_running_text, "%s", _("not running"));
+			maxlen_snprintf(repmgrd_info[i]->repmgrd_running, "%s", _("n/a"));
+			maxlen_snprintf(repmgrd_info[i]->pid_text, "%s", _("n/a"));
+		}
+		else
+		{
+			maxlen_snprintf(repmgrd_info[i]->pg_running_text, "%s", _("running"));
+
+			repmgrd_info[i]->pid = repmgrd_get_pid(cell->node_info->conn);
+			repmgrd_info[i]->running = repmgrd_is_running(cell->node_info->conn);
+
+			if (repmgrd_info[i]->running == true)
+			{
+				maxlen_snprintf(repmgrd_info[i]->repmgrd_running, "%s", _("running"));
+			}
+			else
+			{
+				maxlen_snprintf(repmgrd_info[i]->repmgrd_running, "%s", _("not running"));
+			}
+
+			if (repmgrd_info[i]->pid == UNKNOWN_PID)
+			{
+				maxlen_snprintf(repmgrd_info[i]->pid_text, "%s", _("n/a"));
+			}
+			else
+			{
+				maxlen_snprintf(repmgrd_info[i]->pid_text, "%i", repmgrd_info[i]->pid);
+			}
+
+			repmgrd_info[i]->paused = repmgrd_is_paused(cell->node_info->conn);
+		}
+
+		/* a PGconn must be released even if the connection attempt failed */
+		PQfinish(cell->node_info->conn);
+
+		headers_status[STATUS_NAME].cur_length = strlen(cell->node_info->node_name);
+		headers_status[STATUS_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type));
+		headers_status[STATUS_PID].cur_length = strlen(repmgrd_info[i]->pid_text);
+		headers_status[STATUS_RUNNING].cur_length = strlen(repmgrd_info[i]->repmgrd_running);
+		headers_status[STATUS_PG].cur_length = strlen(repmgrd_info[i]->pg_running_text);
+
+		for (j = 0; j < STATUS_HEADER_COUNT; j++)
+		{
+			if (headers_status[j].cur_length > headers_status[j].max_length)
+			{
+				headers_status[j].max_length = headers_status[j].cur_length;
+			}
+		}
+
+		i++;
+	}
+
+	/* Print column header row (text mode only) */
+	if (runtime_options.output_mode == OM_TEXT)
+	{
+		print_status_header(STATUS_HEADER_COUNT, headers_status);
+	}
+
+	i = 0;
+
+	for (cell = nodes.head; cell; cell = cell->next)
+	{
+		if (runtime_options.output_mode == OM_CSV)
+		{
+			printf("%i,%s,%s,%i,%i,%i,%i\n",
+				   cell->node_info->node_id,
+				   cell->node_info->node_name,
+				   get_node_type_string(cell->node_info->type),
+				   repmgrd_info[i]->pg_running ? 1 : 0,
+				   repmgrd_info[i]->running ? 1 : 0,
+				   repmgrd_info[i]->pid,
+				   repmgrd_info[i]->paused ? 1 : 0);
+		}
+		else
+		{
+			printf(" %-*i ",  headers_status[STATUS_ID].max_length, cell->node_info->node_id);
+			printf("| %-*s ", headers_status[STATUS_NAME].max_length, cell->node_info->node_name);
+			printf("| %-*s ", headers_status[STATUS_ROLE].max_length, get_node_type_string(cell->node_info->type));
+			printf("| %-*s ", headers_status[STATUS_PG].max_length, repmgrd_info[i]->pg_running_text);
+			printf("| %-*s ", headers_status[STATUS_RUNNING].max_length, repmgrd_info[i]->repmgrd_running);
+			printf("| %-*s ", headers_status[STATUS_PID].max_length, repmgrd_info[i]->pid_text);
+			if (repmgrd_info[i]->pid == UNKNOWN_PID)
+				printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, "n/a");
+			else
+				printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, repmgrd_info[i]->paused ? "yes" : "no");
+
+			printf("\n");
+		}
+
+		free(repmgrd_info[i]);
+		i++;
+	}
+
+	free(repmgrd_info);
+	PQfinish(conn);
+
+	/* emit any warnings */
+	if (warnings.head != NULL && runtime_options.terse == false && runtime_options.output_mode != OM_CSV)
+	{
+		ItemListCell *warning = NULL;
+
+		printf(_("\nWARNING: following issues were detected\n"));
+		for (warning = warnings.head; warning; warning = warning->next)
+		{
+			printf(_("  - %s\n"), warning->string);
+		}
+
+		if (runtime_options.verbose == false)
+		{
+			log_hint(_("execute with --verbose option to see connection error messages"));
+		}
+	}
+}
+
+/* Entry points for "repmgr daemon pause" / "repmgr daemon unpause" */
+void
+do_daemon_pause(void)
+{
+	_do_repmgr_pause(true);
+}
+
+void
+do_daemon_unpause(void)
+{
+	_do_repmgr_pause(false);
+}
+
+/*
+ * Pause (or unpause) repmgrd on all nodes in the cluster.
+ *
+ * Node records are fetched via the configured conninfo (or, if that is
+ * empty, the parameters in "source_conninfo"); repmgrd_pause() is then
+ * called on each node. With --dry-run only node reachability is checked.
+ *
+ * Does not return: exits with SUCCESS, or with ERR_REPMGRD_PAUSE if one or
+ * more nodes could not be reached or updated.
+ */
+static void
+_do_repmgr_pause(bool pause)
+{
+	PGconn	   *conn = NULL;
+	NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
+	NodeInfoListCell *cell = NULL;
+	int error_nodes = 0;
+
+	/* Connect to local database to obtain cluster connection data */
+	log_verbose(LOG_INFO, _("connecting to database"));
+
+	if (strlen(config_file_options.conninfo))
+		conn = establish_db_connection(config_file_options.conninfo, true);
+	else
+		conn = establish_db_connection_by_params(&source_conninfo, true);
+
+	fetch_node_records(conn, &nodes);
+
+	for (cell = nodes.head; cell; cell = cell->next)
+	{
+		log_verbose(LOG_DEBUG, "%s node %i (%s)",
+					pause == true ? "pausing" : "unpausing",
+					cell->node_info->node_id,
+					cell->node_info->node_name);
+		cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
+
+		if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
+		{
+			log_warning(_("unable to connect to node %i"),
+						cell->node_info->node_id);
+			error_nodes++;
+		}
+		else
+		{
+			if (runtime_options.dry_run == true)
+			{
+				if (pause == true)
+				{
+					log_info(_("would pause node %i (%s) "),
+							 cell->node_info->node_id,
+							 cell->node_info->node_name);
+				}
+				else
+				{
+					log_info(_("would unpause node %i (%s) "),
+							 cell->node_info->node_id,
+							 cell->node_info->node_name);
+				}
+			}
+			else
+			{
+				bool success = repmgrd_pause(cell->node_info->conn, pause);
+
+				if (success == false)
+					error_nodes++;
+
+				log_notice(_("node %i (%s) %s"),
+						   cell->node_info->node_id,
+						   cell->node_info->node_name,
+						   success == true
+								? pause == true ? "paused" : "unpaused"
+								: pause == true ? "not paused" : "not unpaused");
+			}
+		}
+
+		/* a PGconn must be released even if the connection attempt failed */
+		PQfinish(cell->node_info->conn);
+		cell->node_info->conn = NULL;
+	}
+
+	PQfinish(conn);
+
+	if (error_nodes > 0)
+	{
+		if (pause == true)
+		{
+			log_error(_("unable to pause %i node(s)"), error_nodes);
+		}
+		else
+		{
+			log_error(_("unable to unpause %i node(s)"), error_nodes);
+		}
+
+		log_hint(_("execute \"repmgr daemon status\" to view current status"));
+
+		exit(ERR_REPMGRD_PAUSE);
+	}
+
+	exit(SUCCESS);
+}
+
+/*
+ * Populate "node_list" with all node records; closes "conn" and exits with
+ * ERR_BAD_CONFIG if the records cannot be retrieved or none are found.
+ */
+static void
+fetch_node_records(PGconn *conn, NodeInfoList *node_list)
+{
+	/* get_all_node_records() will display any error message */
+	if (get_all_node_records(conn, node_list) == false)
+	{
+		PQfinish(conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	if (node_list->node_count == 0)
+	{
+		log_error(_("no node records were found"));
+		log_hint(_("ensure at least one node is registered"));
+		PQfinish(conn);
+		exit(ERR_BAD_CONFIG);
+	}
+}
+
+/* Print usage information for the "repmgr daemon" subcommands */
+void
+do_daemon_help(void)
+{
+	print_help_header();
+
+	printf(_("Usage:\n"));
+	printf(_("    %s [OPTIONS] daemon status\n"),  progname());
+	printf(_("    %s [OPTIONS] daemon pause\n"),   progname());
+	printf(_("    %s [OPTIONS] daemon unpause\n"), progname());
+	puts("");
+
+	printf(_("DAEMON STATUS\n"));
+	puts("");
+	printf(_("  \"daemon status\" shows the status of repmgrd on each node in the cluster\n"));
+	puts("");
+	printf(_("    --csv                     emit output as CSV\n"));
+	printf(_("    --verbose                 show text of database connection error messages\n"));
+	puts("");
+
+	printf(_("DAEMON PAUSE\n"));
+	puts("");
+	printf(_("  \"daemon pause\" instructs repmgrd on each node to pause failover detection\n"));
+	puts("");
+	printf(_("    --dry-run               check if nodes are reachable but don't pause repmgrd\n"));
+	puts("");
+
+	printf(_("DAEMON UNPAUSE\n"));
+	puts("");
+	printf(_("  \"daemon unpause\"  instructs repmgrd on each node to resume failover detection\n"));
+	puts("");
+	printf(_("    --dry-run               check if nodes are reachable but don't unpause repmgrd\n"));
+	puts("");
+
+	puts("");
+}
diff --git a/repmgr-action-daemon.h b/repmgr-action-daemon.h
new file mode 100644
index 00000000..026feac0
--- /dev/null
+++ b/repmgr-action-daemon.h
@@ -0,0 +1,28 @@
+/*
+ * repmgr-action-daemon.h
+ * Copyright (c) 2ndQuadrant, 2010-2018
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _REPMGR_ACTION_DAEMON_H_
+#define _REPMGR_ACTION_DAEMON_H_
+
+
+extern void do_daemon_status(void);
+extern void do_daemon_pause(void);
+extern void do_daemon_unpause(void);
+
+extern void do_daemon_help(void);
+#endif
diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c
index 167be896..47c69f43 100644
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -2788,15 +2788,13 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
 
 /*
  * Perform a switchover by:
+ *
  *  - stopping current primary node
  *  - promoting this standby node to primary
- *  - forcing previous primary node to follow this node
+ *  - forcing the previous primary node to follow this node
  *
- * Caveat:
- *  - repmgrd must not be running, otherwise it may
- *    attempt a failover
- *    (TODO: find some way of notifying repmgrd of planned
- *     activity like this)
+ * Where running and not already paused, repmgrd will be paused (and
+ * subsequently unpaused), unless --repmgrd-no-pause provided.
  *
  * TODO:
  *  - make connection test timeouts/intervals configurable (see below)
@@ -2854,6 +2852,11 @@ do_standby_switchover(void)
 
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
 
+	/* used for handling repmgrd pause/unpause */
+	NodeInfoList all_nodes = T_NODE_INFO_LIST_INITIALIZER;
+	RepmgrdInfo **repmgrd_info = NULL;
+	int			repmgrd_running_count = 0;
+
 	/*
 	 * SANITY CHECKS
 	 *
@@ -2924,7 +2927,7 @@ do_standby_switchover(void)
 
 	if (record_status != RECORD_FOUND)
 	{
-		log_error(_("unable to retrieve node record for node %i"),
+		log_error(_("unable to retrieve node record for current primary (node %i)"),
 				  remote_node_id);
 
 		PQfinish(local_conn);
@@ -2980,6 +2983,7 @@ do_standby_switchover(void)
 	{
 		min_required_free_slots++;
 	}
+
 	/*
 	 * If --force-rewind specified, check pg_rewind can be used, and
 	 * pre-emptively fetch the list of configuration files which should be
@@ -3544,8 +3548,8 @@ do_standby_switchover(void)
 
 		log_debug("minimum of %i free slots (%i for siblings) required; %i available",
 				  min_required_free_slots,
-				  reachable_sibling_nodes_with_slot_count
-				  , available_slots);
+				  reachable_sibling_nodes_with_slot_count,
+				  available_slots);
 
 		if (available_slots < min_required_free_slots)
 		{
@@ -3575,6 +3579,147 @@ do_standby_switchover(void)
 		}
 	}
 
+	/*
+	 * Attempt to pause all repmgrd instances, unless user explicitly
+	 * specifies not to.
+	 */
+	if (runtime_options.repmgrd_no_pause == false)
+	{
+		NodeInfoListCell *cell = NULL;
+		ItemList repmgrd_connection_errors = {NULL, NULL};
+		int i = 0;
+		int unreachable_node_count = 0;
+
+		get_all_node_records(local_conn, &all_nodes);
+
+		repmgrd_info = (RepmgrdInfo **) pg_malloc0(sizeof(RepmgrdInfo *) * all_nodes.node_count);
+
+		for (cell = all_nodes.head; cell; cell = cell->next)
+		{
+			cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
+
+			repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
+			repmgrd_info[i]->node_id = cell->node_info->node_id;
+			repmgrd_info[i]->pid = UNKNOWN_PID;
+			repmgrd_info[i]->paused = false;
+			repmgrd_info[i]->running = false;
+
+			if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
+			{
+				/*
+				 * Unable to connect; treat this as an error. "i" must still be
+				 * advanced so "repmgrd_info" stays aligned with the node list.
+				 */
+				repmgrd_info[i]->pg_running = false;
+
+				item_list_append_format(&repmgrd_connection_errors,
+										_("unable to connect to node \"%s\" (ID %i)"),
+										cell->node_info->node_name,
+										cell->node_info->node_id);
+				unreachable_node_count++;
+				i++;
+				continue;
+			}
+
+			repmgrd_info[i]->running = repmgrd_is_running(cell->node_info->conn);
+			repmgrd_info[i]->pid = repmgrd_get_pid(cell->node_info->conn);
+			repmgrd_info[i]->paused = repmgrd_is_paused(cell->node_info->conn);
+
+			if (repmgrd_info[i]->running == true)
+				repmgrd_running_count++;
+
+			i++;
+		}
+
+		if (unreachable_node_count > 0)
+		{
+			PQExpBufferData msg;
+			PQExpBufferData detail;
+			ItemListCell *cell;
+
+			initPQExpBuffer(&msg);
+			appendPQExpBuffer(&msg,
+							  _("unable to connect to %i node(s), unable to pause all repmgrd instances"),
+							  unreachable_node_count);
+
+			initPQExpBuffer(&detail);
+
+			for (cell = repmgrd_connection_errors.head; cell; cell = cell->next)
+			{
+				appendPQExpBuffer(&detail,
+								  "  %s\n",
+								  cell->string);
+			}
+
+
+			if (runtime_options.force == false)
+			{
+				log_error("%s", msg.data);
+			}
+			else
+			{
+				log_warning("%s", msg.data);
+			}
+
+			log_detail(_("following node(s) unreachable:\n%s"), detail.data);
+
+			termPQExpBuffer(&msg);
+			termPQExpBuffer(&detail);
+
+			/* tell user about footgun */
+			if (runtime_options.force == false)
+			{
+				log_hint(_("use -F/--force to continue anyway"));
+
+				clear_node_info_list(&sibling_nodes);
+				clear_node_info_list(&all_nodes);
+
+				exit(ERR_SWITCHOVER_FAIL);
+			}
+
+		}
+
+		if (repmgrd_running_count > 0)
+		{
+			i = 0;
+			for (cell = all_nodes.head; cell; cell = cell->next)
+			{
+				/*
+				 * Skip if node is already paused. Note we won't unpause these, to
+				 * leave the repmgrd instances in the cluster in the same state they
+				 * were before the switchover.
+				 */
+				if (repmgrd_info[i]->paused == true)
+				{
+					PQfinish(cell->node_info->conn);
+					cell->node_info->conn = NULL;
+					i++;
+					continue;
+				}
+
+				if (runtime_options.dry_run == true)
+				{
+					log_info(_("would pause repmgrd on node %s (ID %i)"),
+							 cell->node_info->node_name,
+							 cell->node_info->node_id);
+				}
+				else
+				{
+					/* XXX check result  */
+					log_debug("pausing repmgrd on node %s (ID %i)",
+							 cell->node_info->node_name,
+							 cell->node_info->node_id);
+
+					(void) repmgrd_pause(cell->node_info->conn, true);
+				}
+
+				PQfinish(cell->node_info->conn);
+				cell->node_info->conn = NULL;
+				i++;
+			}
+		}
+
+	}
 
 	/*
 	 * Sanity checks completed - prepare for the switchover
@@ -3656,6 +3801,7 @@ do_standby_switchover(void)
 				 shutdown_command);
 
 		clear_node_info_list(&sibling_nodes);
+		clear_node_info_list(&all_nodes);
 		key_value_list_free(&remote_config_files);
 
 		return;
@@ -3793,7 +3939,7 @@ do_standby_switchover(void)
 
 
 	/*
-	 * if pg_rewind is requested, issue a checkpoint immediately after promoting
+	 * If pg_rewind is requested, issue a checkpoint immediately after promoting
 	 * the local node, as pg_rewind compares timelines on the basis of the value
 	 * in pg_control, which is written at the first checkpoint, which might not
 	 * occur immediately.
@@ -3805,7 +3951,7 @@ do_standby_switchover(void)
 	}
 
 	/*
-	 * Execute `repmgr node rejoin` to create recovery.conf and start the
+	 * Execute "repmgr node rejoin" to create recovery.conf and start the
 	 * remote server. Additionally execute "pg_rewind", if required and
 	 * requested.
 	 */
@@ -3819,6 +3965,7 @@ do_standby_switchover(void)
 		{
 			log_error(_("new primary diverges from former primary and --force-rewind not provided"));
 			log_hint(_("the former primary will need to be restored manually, or use \"repmgr node rejoin\""));
+
 			termPQExpBuffer(&node_rejoin_options);
 			PQfinish(local_conn);
 			exit(ERR_SWITCHOVER_FAIL);
@@ -3875,7 +4022,7 @@ do_standby_switchover(void)
 
 	if (command_success == false)
 	{
-		log_error(_("rejoin failed %i"), r);
+		log_error(_("rejoin failed with error code %i"), r);
 
 		create_event_notification_extended(local_conn,
 										   &config_file_options,
@@ -3997,11 +4144,13 @@ do_standby_switchover(void)
 
 	clear_node_info_list(&sibling_nodes);
 
+
+
 	PQfinish(local_conn);
 
 	/*
-	 * Clean up remote node. It's possible that the standby is still starting up,
-	 * so poll for a while until we get a connection.
+	 * Clean up remote node (primary demoted to standby). It's possible that the node is
+	 * still starting up, so poll for a while until we get a connection.
 	 */
 
 	for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
@@ -4053,6 +4202,84 @@ do_standby_switchover(void)
 
 	PQfinish(remote_conn);
 
+	/*
+	 * Attempt to unpause all paused repmgrd instances, unless user explicitly
+	 * specifies not to.
+	 */
+	if (runtime_options.repmgrd_no_pause == false)
+	{
+		if (repmgrd_running_count > 0)
+		{
+			ItemList repmgrd_unpause_errors = {NULL, NULL};
+			NodeInfoListCell *cell = NULL;
+			int i = 0;
+			int error_node_count = 0;
+
+			for (cell = all_nodes.head; cell; cell = cell->next)
+			{
+
+				if (repmgrd_info[i]->paused == true)
+				{
+					log_debug("repmgrd on node %s (ID %i) paused before switchover, not unpausing",
+							  cell->node_info->node_name,
+							  cell->node_info->node_id);
+
+					i++;
+					continue;
+				}
+
+				log_debug("unpausing repmgrd on node %s (ID %i)",
+						  cell->node_info->node_name,
+						  cell->node_info->node_id);
+
+				cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
+
+				if (PQstatus(cell->node_info->conn) == CONNECTION_OK)
+				{
+					if (repmgrd_pause(cell->node_info->conn, false) == false)
+					{
+						item_list_append_format(&repmgrd_unpause_errors,
+												_("unable to unpause node \"%s\" (ID %i)"),
+												cell->node_info->node_name,
+												cell->node_info->node_id);
+						error_node_count++;
+					}
+				}
+				else
+				{
+					item_list_append_format(&repmgrd_unpause_errors,
+											_("unable to connect to node \"%s\" (ID %i)"),
+											cell->node_info->node_name,
+											cell->node_info->node_id);
+					error_node_count++;
+				}
+
+				i++;
+			}
+
+			if (error_node_count > 0)
+			{
+				PQExpBufferData detail;
+				ItemListCell *cell;
+
+				/* the buffer must be initialised before anything is appended */
+				initPQExpBuffer(&detail);
+				for (cell = repmgrd_unpause_errors.head; cell; cell = cell->next)
+				{
+					appendPQExpBuffer(&detail,
+									  "  %s\n",
+									  cell->string);
+				}
+				log_warning(_("unable to unpause repmgrd on %i node(s)"),
+							error_node_count);
+				log_detail(_("errors encountered for following node(s):\n%s"), detail.data);
+				log_hint(_("check node connection and status; unpause manually with \"repmgr daemon unpause\""));
+				termPQExpBuffer(&detail);
+			}
+		}
+
+		clear_node_info_list(&all_nodes);
+	}
 
 	if (switchover_success == true)
 	{
@@ -6602,6 +6829,7 @@ do_standby_help(void)
 	printf(_("                                        (9.3 and 9.4 - provide \"pg_rewind\" path)\n"));
 
 	printf(_("  -R, --remote-user=USERNAME          database server username for SSH operations (default: \"%s\")\n"), runtime_options.username);
+	printf(_("  --repmgrd-no-pause                  don't pause repmgrd\n"));
 	printf(_("  --siblings-follow                   have other standbys follow new primary\n"));
 
 	puts("");
diff --git a/repmgr-client-global.h b/repmgr-client-global.h
index 55256f56..d2a4aa65 100644
--- a/repmgr-client-global.h
+++ b/repmgr-client-global.h
@@ -97,6 +97,7 @@ typedef struct
 	bool		force_rewind_used;
 	char		force_rewind_path[MAXPGPATH];
 	bool		siblings_follow;
+	bool		repmgrd_no_pause;
 
 	/* "node status" options */
 	bool		is_shutdown_cleanly;
@@ -156,7 +157,7 @@ typedef struct
 		/* "standby register" options */ \
 		false, -1, DEFAULT_WAIT_START,   \
 		/* "standby switchover" options */ \
-		false, false, "", false,		   \
+		false, false, "", false, false,	\
 		/* "node status" options */ \
 		false, \
 		/* "node check" options */ \
@@ -193,6 +194,14 @@ typedef enum
 } t_server_action;
 
 
+typedef struct ColHeader
+{
+	char		title[MAXLEN];
+	int			max_length;
+	int			cur_length;
+} ColHeader;
+
+
 
 /* global configuration structures */
 extern t_runtime_options runtime_options;
@@ -228,7 +237,10 @@ extern void get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGc
 extern bool remote_command(const char *host, const char *user, const char *command, PQExpBufferData *outputbuf);
 
 extern void make_remote_repmgr_path(PQExpBufferData *outputbuf, t_node_info *remote_node_record);
+
+/* display functions */
 extern void print_help_header(void);
+extern void print_status_header(int cols, ColHeader *headers);
 
 /* server control functions */
 extern void get_server_action(t_server_action action, char *script, char *data_dir);
diff --git a/repmgr-client.c b/repmgr-client.c
index daed411a..0395bc9d 100644
--- a/repmgr-client.c
+++ b/repmgr-client.c
@@ -30,10 +30,15 @@
  * NODE STATUS
  * NODE CHECK
  *
+ * DAEMON STATUS
+ * DAEMON PAUSE
+ * DAEMON UNPAUSE
+ *
  * For internal use:
  * NODE REJOIN
  * NODE SERVICE
  *
+ *
  * This program is free software: you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation, either version 3 of the License, or
@@ -62,6 +67,7 @@
 #include "repmgr-action-bdr.h"
 #include "repmgr-action-node.h"
 #include "repmgr-action-cluster.h"
+#include "repmgr-action-daemon.h"
 
 #include <storage/fd.h>			/* for PG_TEMP_FILE_PREFIX */
 
@@ -438,6 +444,10 @@ main(int argc, char **argv)
 				runtime_options.siblings_follow = true;
 				break;
 
+			case OPT_REPMGRD_NO_PAUSE:
+				runtime_options.repmgrd_no_pause = true;
+				break;
+
 				/*----------------------
 				 * "node status" options
 				 *----------------------
@@ -900,6 +910,21 @@ main(int argc, char **argv)
 			else if (strcasecmp(repmgr_action, "CLEANUP") == 0)
 				action = CLUSTER_CLEANUP;
 		}
+		else if (strcasecmp(repmgr_command, "DAEMON") == 0)
+		{
+			if (help_option == true)
+			{
+				do_daemon_help();
+				exit(SUCCESS);
+			}
+
+			if (strcasecmp(repmgr_action, "STATUS") == 0)
+				action = DAEMON_STATUS;
+			else if (strcasecmp(repmgr_action, "PAUSE") == 0)
+				action = DAEMON_PAUSE;
+			else if (strcasecmp(repmgr_action, "UNPAUSE") == 0)
+				action = DAEMON_UNPAUSE;
+		}
 		else
 		{
 			valid_repmgr_command_found = false;
@@ -1298,6 +1323,17 @@ main(int argc, char **argv)
 			do_cluster_cleanup();
 			break;
 
+			/* DAEMON */
+		case DAEMON_STATUS:
+			do_daemon_status();
+			break;
+		case DAEMON_PAUSE:
+			do_daemon_pause();
+			break;
+		case DAEMON_UNPAUSE:
+			do_daemon_unpause();
+			break;
+
 		default:
 			/* An action will have been determined by this point  */
 			break;
@@ -1744,6 +1780,18 @@ check_cli_parameters(const int action)
 		}
 	}
 
+	if (runtime_options.repmgrd_no_pause == true)
+	{
+		switch (action)
+		{
+			case STANDBY_SWITCHOVER:
+				break;
+			default:
+				item_list_append_format(&cli_warnings,
+										_("--repmgrd-no-pause will be ignored when executing %s"),
+										action_name(action));
+		}
+	}
 
 	if (runtime_options.config_files[0] != '\0')
 	{
@@ -1772,6 +1820,8 @@ check_cli_parameters(const int action)
 			case WITNESS_UNREGISTER:
 			case NODE_REJOIN:
 			case NODE_SERVICE:
+			case DAEMON_PAUSE:
+			case DAEMON_UNPAUSE:
 				break;
 			default:
 				item_list_append_format(&cli_warnings,
@@ -1851,6 +1901,14 @@ action_name(const int action)
 			return "CLUSTER MATRIX";
 		case CLUSTER_CROSSCHECK:
 			return "CLUSTER CROSSCHECK";
+
+		case DAEMON_STATUS:
+			return "DAEMON STATUS";
+		case DAEMON_PAUSE:
+			return "DAEMON PAUSE";
+		case DAEMON_UNPAUSE:
+			return "DAEMON UNPAUSE";
+
 	}
 
 	return "UNKNOWN ACTION";
@@ -1878,6 +1936,42 @@ print_error_list(ItemList *error_list, int log_level)
 }
 
 
+/*
+ * Print a row of column titles followed by a dashed separator row.
+ */
+void
+print_status_header(int cols, ColHeader *headers)
+{
+	int			col;
+
+	/* title row: every title is left-justified and padded to max_length */
+	for (col = 0; col < cols; col++)
+	{
+		const char *lead = (col == 0) ? " " : " | ";
+
+		printf("%s%-*s", lead, headers[col].max_length, headers[col].title);
+	}
+	printf("\n");
+
+	/* separator row: starts with "-", columns are joined with "-+-" */
+	printf("-");
+
+	for (col = 0; col < cols; col++)
+	{
+		const char *tail = (col < (cols - 1)) ? "-+-" : "-";
+		int			width = headers[col].max_length;
+
+		/* one dash per character of column width */
+		while (width-- > 0)
+			printf("-");
+
+		printf("%s", tail);
+	}
+
+	printf("\n");
+}
+
+
 void
 print_help_header(void)
 {
@@ -3021,4 +3115,3 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
 		}
 	}
 }
-
diff --git a/repmgr-client.h b/repmgr-client.h
index 714a560c..c80fb673 100644
--- a/repmgr-client.h
+++ b/repmgr-client.h
@@ -45,6 +45,9 @@
 #define CLUSTER_MATRIX		   19
 #define CLUSTER_CROSSCHECK	   20
 #define CLUSTER_EVENT		   21
+#define DAEMON_STATUS		   22
+#define DAEMON_PAUSE		   23
+#define DAEMON_UNPAUSE		   24
 
 /* command line options without short versions */
 #define OPT_HELP						   1001
@@ -88,6 +91,7 @@
 #define OPT_RECOVERY_CONF_ONLY             1039
 #define OPT_NO_WAIT                        1040
 #define OPT_MISSING_SLOTS                  1041
+#define OPT_REPMGRD_NO_PAUSE               1042
 
 /* deprecated since 3.3 */
 #define OPT_DATA_DIR						999
@@ -156,6 +160,7 @@ static struct option long_options[] =
  */
 	{"always-promote", no_argument, NULL, OPT_ALWAYS_PROMOTE},
 	{"siblings-follow", no_argument, NULL, OPT_SIBLINGS_FOLLOW},
+	{"repmgrd-no-pause", no_argument, NULL, OPT_REPMGRD_NO_PAUSE},
 
 /* "node status" options */
 	{"is-shutdown-cleanly", no_argument, NULL, OPT_IS_SHUTDOWN_CLEANLY},
diff --git a/repmgr.c b/repmgr.c
index 80fb0fe7..4461a354 100644
--- a/repmgr.c
+++ b/repmgr.c
@@ -26,6 +26,7 @@
 #include "access/xlog.h"
 #include "miscadmin.h"
 #include "replication/walreceiver.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/procarray.h"
@@ -43,14 +44,21 @@
 #include "lib/stringinfo.h"
 #include "access/xact.h"
 #include "utils/snapmgr.h"
-#include "pgstat.h"
 
+#if (PG_VERSION_NUM >= 90400)
+#include "pgstat.h"
+#else
+#define PGSTAT_STAT_PERMANENT_DIRECTORY             "pg_stat"
+#endif
 
 #include "voting.h"
 
 #define UNKNOWN_NODE_ID		-1
+#define UNKNOWN_PID			-1
 
 #define TRANCHE_NAME "repmgrd"
+#define REPMGRD_STATE_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/repmgrd_state.txt"
+
 
 PG_MODULE_MAGIC;
 
@@ -66,6 +74,9 @@ typedef struct repmgrdSharedState
 	LWLockId	lock;			/* protects search/modification */
 	TimestampTz last_updated;
 	int			local_node_id;
+	int			repmgrd_pid;
+	char		repmgrd_pidfile[MAXPGPATH];
+	bool		repmgrd_paused;
 	/* streaming failover */
 	NodeVotingStatus voting_status;
 	int			current_electoral_term;
@@ -112,6 +123,25 @@ PG_FUNCTION_INFO_V1(am_bdr_failover_handler);
 Datum		unset_bdr_failover_handler(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(unset_bdr_failover_handler);
 
+Datum		set_repmgrd_pid(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(set_repmgrd_pid);
+
+Datum		get_repmgrd_pid(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(get_repmgrd_pid);
+
+Datum		get_repmgrd_pidfile(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(get_repmgrd_pidfile);
+
+Datum		repmgrd_is_running(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(repmgrd_is_running);
+
+Datum		repmgrd_pause(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(repmgrd_pause);
+
+Datum		repmgrd_is_paused(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(repmgrd_is_paused);
+
+
 
 /*
  * Module load callback
@@ -185,6 +215,9 @@ repmgr_shmem_startup(void)
 #endif
 
 		shared_state->local_node_id = UNKNOWN_NODE_ID;
+		shared_state->repmgrd_pid = UNKNOWN_PID;
+		memset(shared_state->repmgrd_pidfile, 0, MAXPGPATH);
+		shared_state->repmgrd_paused = false;
 		shared_state->current_electoral_term = 0;
 		shared_state->voting_status = VS_NO_VOTE;
 		shared_state->candidate_node_id = UNKNOWN_NODE_ID;
@@ -204,6 +237,8 @@ Datum
 set_local_node_id(PG_FUNCTION_ARGS)
 {
 	int			local_node_id = UNKNOWN_NODE_ID;
+	int			stored_node_id = UNKNOWN_NODE_ID;
+	int			paused = -1;
 
 	if (!shared_state)
 		PG_RETURN_NULL();
@@ -213,6 +248,34 @@ set_local_node_id(PG_FUNCTION_ARGS)
 
 	local_node_id = PG_GETARG_INT32(0);
 
+	/* read state file and if exists/valid, update "repmgrd_paused" */
+	{
+		FILE	   *file = NULL;
+
+		file = AllocateFile(REPMGRD_STATE_FILE, PG_BINARY_R);
+
+		if (file != NULL)
+		{
+			int			buffer_size = 128;
+			char		buffer[buffer_size];
+
+			if (fgets(buffer, buffer_size, file) != NULL)
+			{
+				if (sscanf(buffer, "%i:%i", &stored_node_id, &paused) != 2)
+				{
+					elog(WARNING, "unable to parse repmgrd state file");
+				}
+				else
+				{
+					elog(DEBUG1, "node_id: %i; paused: %i", stored_node_id, paused);
+				}
+			}
+
+			FreeFile(file);
+		}
+
+	}
+
 	LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
 
 	/* only set local_node_id once, as it should never change */
@@ -221,6 +284,19 @@ set_local_node_id(PG_FUNCTION_ARGS)
 		shared_state->local_node_id = local_node_id;
 	}
 
+	/* only update if state file valid */
+	if (stored_node_id == shared_state->local_node_id)
+	{
+		if (paused == 0)
+		{
+			shared_state->repmgrd_paused = false;
+		}
+		else if (paused == 1)
+		{
+			shared_state->repmgrd_paused = true;
+		}
+	}
+
 	LWLockRelease(shared_state->lock);
 
 	PG_RETURN_VOID();
@@ -422,3 +498,185 @@ unset_bdr_failover_handler(PG_FUNCTION_ARGS)
 
 	PG_RETURN_VOID();
 }
+
+
+/*
+ * Returns the repmgrd PID stored in shared memory (UNKNOWN_PID if none is set,
+ * NULL if shared memory is unavailable); does not check the process is alive.
+ */
+Datum
+get_repmgrd_pid(PG_FUNCTION_ARGS)
+{
+	int			stored_pid;
+
+	if (shared_state == NULL)
+		PG_RETURN_NULL();
+
+	LWLockAcquire(shared_state->lock, LW_SHARED);
+	stored_pid = shared_state->repmgrd_pid;
+	LWLockRelease(shared_state->lock);
+
+	PG_RETURN_INT32(stored_pid);
+}
+
+
+/*
+ * Returns the repmgrd pidfile path held in shared memory; NULL if none is set
+ */
+Datum
+get_repmgrd_pidfile(PG_FUNCTION_ARGS)
+{
+	char		repmgrd_pidfile[MAXPGPATH];
+
+	if (!shared_state)
+		PG_RETURN_NULL();
+
+	/* zero-fill, then copy at most MAXPGPATH - 1 bytes: always NUL-terminated */
+	memset(repmgrd_pidfile, 0, MAXPGPATH);
+	LWLockAcquire(shared_state->lock, LW_SHARED);
+	strncpy(repmgrd_pidfile, shared_state->repmgrd_pidfile, MAXPGPATH - 1);
+	LWLockRelease(shared_state->lock);
+
+	if (repmgrd_pidfile[0] == '\0')
+		PG_RETURN_NULL();
+
+	PG_RETURN_TEXT_P(cstring_to_text(repmgrd_pidfile));
+}
+
+/*
+ * Registers the repmgrd PID (first argument) and pidfile path (second
+ * argument) in shared memory. A NULL PID is stored as UNKNOWN_PID; the
+ * pidfile is only stored when a PID is provided, otherwise it is cleared.
+ */
+Datum
+set_repmgrd_pid(PG_FUNCTION_ARGS)
+{
+	int			repmgrd_pid = UNKNOWN_PID;
+	char	   *repmgrd_pidfile = NULL;
+
+	if (!shared_state)
+		PG_RETURN_VOID();
+
+	if (!PG_ARGISNULL(0))
+		repmgrd_pid = PG_GETARG_INT32(0);
+
+	elog(DEBUG3, "set_repmgrd_pid(): provided pid is %i", repmgrd_pid);
+
+	if (repmgrd_pid != UNKNOWN_PID && !PG_ARGISNULL(1))
+	{
+		repmgrd_pidfile = text_to_cstring(PG_GETARG_TEXT_PP(1));
+		/* DEBUG3, not INFO: INFO is always sent to the calling client */
+		elog(DEBUG3, "set_repmgrd_pid(): provided pidfile is %s", repmgrd_pidfile);
+	}
+
+	LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+
+	shared_state->repmgrd_pid = repmgrd_pid;
+	memset(shared_state->repmgrd_pidfile, 0, MAXPGPATH);
+
+	/* copy at most MAXPGPATH - 1 bytes so the buffer stays NUL-terminated */
+	if (repmgrd_pidfile != NULL)
+		strncpy(shared_state->repmgrd_pidfile, repmgrd_pidfile, MAXPGPATH - 1);
+
+	LWLockRelease(shared_state->lock);
+
+	PG_RETURN_VOID();
+}
+
+
+/*
+ * Returns true if a process with the registered repmgrd PID can be signalled
+ */
+Datum
+repmgrd_is_running(PG_FUNCTION_ARGS)
+{
+	int			repmgrd_pid = UNKNOWN_PID;
+
+	if (!shared_state)
+		PG_RETURN_NULL();
+
+	LWLockAcquire(shared_state->lock, LW_SHARED);
+	repmgrd_pid = shared_state->repmgrd_pid;
+	LWLockRelease(shared_state->lock);
+
+	/*
+	 * No usable PID registered - assume not running. kill() would treat zero
+	 * or a negative value as a process group target, so reject those too.
+	 */
+	if (repmgrd_pid <= 0)
+		PG_RETURN_BOOL(false);
+
+	/* signal 0 performs the existence/permission check without signalling */
+	if (kill(repmgrd_pid, 0) == 0)
+		PG_RETURN_BOOL(true);
+
+	PG_RETURN_BOOL(false);
+}
+
+
+/*
+ * Sets the "paused" flag in shared memory and records it, together with the
+ * local node ID, in REPMGRD_STATE_FILE. A failure to write the state file is
+ * reported as a WARNING; the in-memory flag is still updated.
+ */
+Datum
+repmgrd_pause(PG_FUNCTION_ARGS)
+{
+	bool		pause;
+	int			local_node_id;
+	FILE	   *file = NULL;
+	StringInfoData buf;
+
+	if (!shared_state)
+		PG_RETURN_NULL();
+
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	pause = PG_GETARG_BOOL(0);
+
+	/* update the flag and fetch the node ID under a single lock acquisition */
+	LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+	shared_state->repmgrd_paused = pause;
+	local_node_id = shared_state->local_node_id;
+	LWLockRelease(shared_state->lock);
+
+	/* write state to file */
+	file = AllocateFile(REPMGRD_STATE_FILE, PG_BINARY_W);
+	if (file == NULL)
+	{
+		elog(WARNING, "unable to open \"%s\": %m", REPMGRD_STATE_FILE);
+		PG_RETURN_VOID();
+	}
+
+	initStringInfo(&buf);
+	appendStringInfo(&buf, "%i:%i", local_node_id, pause ? 1 : 0);
+
+	/* write the text only; the terminating NUL does not belong in the file */
+	if (fwrite(buf.data, 1, buf.len, file) != (size_t) buf.len)
+		elog(WARNING, "unable to write to \"%s\": %m", REPMGRD_STATE_FILE);
+
+	pfree(buf.data);
+
+	/* buffered data is flushed on close, so a failure here matters too */
+	if (FreeFile(file) != 0)
+		elog(WARNING, "unable to close \"%s\": %m", REPMGRD_STATE_FILE);
+
+	PG_RETURN_VOID();
+}
+
+
+/* Returns the "paused" flag from shared memory (NULL if unavailable) */
+Datum
+repmgrd_is_paused(PG_FUNCTION_ARGS)
+{
+	bool		paused_flag = false;
+
+	if (shared_state == NULL)
+		PG_RETURN_NULL();
+
+	LWLockAcquire(shared_state->lock, LW_SHARED);
+	paused_flag = shared_state->repmgrd_paused;
+	LWLockRelease(shared_state->lock);
+	PG_RETURN_BOOL(paused_flag);
+}
diff --git a/repmgr.h b/repmgr.h
index 8bf4ec4f..21a1e067 100644
--- a/repmgr.h
+++ b/repmgr.h
@@ -53,6 +53,7 @@
 
 #define UNKNOWN_TIMELINE_ID -1
 #define UNKNOWN_SYSTEM_IDENTIFIER 0
+#define UNKNOWN_PID			-1
 
 #define NODE_NOT_FOUND		-1
 #define NO_UPSTREAM_NODE	-1
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index ec1e0682..a05cc614 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -106,12 +106,13 @@ handle_sigint_physical(SIGNAL_ARGS)
 	else
 		writeable_conn = primary_conn;
 
-	create_event_notification(writeable_conn,
-							  &config_file_options,
-							  config_file_options.node_id,
-							  "repmgrd_shutdown",
-							  true,
-							  event_details.data);
+	if (PQstatus(writeable_conn) == CONNECTION_OK)
+		create_event_notification(writeable_conn,
+								  &config_file_options,
+								  config_file_options.node_id,
+								  "repmgrd_shutdown",
+								  true,
+								  event_details.data);
 
 	termPQExpBuffer(&event_details);
 
@@ -145,7 +146,6 @@ do_physical_node_check(void)
 			case FAILOVER_AUTOMATIC:
 				log_error(_("this node is marked as inactive and cannot be used as a failover target"));
 				log_hint(_("%s"), hint);
-				close_connection(&local_conn);
 
 				create_event_notification(NULL,
 										  &config_file_options,
@@ -206,8 +206,7 @@ do_physical_node_check(void)
 		if (required_param_missing == true)
 		{
 			log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
-			close_connection(&local_conn);
-			exit(ERR_BAD_CONFIG);
+			terminate(ERR_BAD_CONFIG);
 		}
 	}
 }
@@ -339,6 +338,7 @@ monitor_streaming_primary(void)
 					if (stored_local_node_id == UNKNOWN_NODE_ID)
 					{
 						repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
+						repmgrd_set_pid(local_conn, getpid(), pid_file);
 					}
 
 					goto loop;
@@ -606,8 +606,7 @@ monitor_streaming_standby(void)
 		if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
 		{
 			log_error(_("unable to determine an active primary for this cluster, terminating"));
-			close_connection(&local_conn);
-			exit(ERR_BAD_CONFIG);
+			terminate(ERR_BAD_CONFIG);
 		}
 	}
 
@@ -623,15 +622,15 @@ monitor_streaming_standby(void)
 		log_error(_("no record found for upstream node (ID: %i), terminating"),
 				  local_node_info.upstream_node_id);
 		log_hint(_("ensure the upstream node is registered correctly"));
-		close_connection(&local_conn);
-		exit(ERR_DB_CONN);
+
+		terminate(ERR_DB_CONN);
 	}
 	else if (record_status == RECORD_ERROR)
 	{
 		log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
 				  local_node_info.upstream_node_id);
-		close_connection(&local_conn);
-		exit(ERR_DB_CONN);
+
+		terminate(ERR_DB_CONN);
 	}
 
 	log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);
@@ -650,8 +649,7 @@ monitor_streaming_standby(void)
 				  local_node_info.upstream_node_id);
 		log_hint(_("upstream node must be running before repmgrd can start"));
 
-		close_connection(&local_conn);
-		exit(ERR_DB_CONN);
+		terminate(ERR_DB_CONN);
 	}
 
 	/*
@@ -673,7 +671,8 @@ monitor_streaming_standby(void)
 		{
 			log_error(_("unable to connect to primary node"));
 			log_hint(_("ensure the primary node is reachable from this node"));
-			exit(ERR_DB_CONN);
+
+			terminate(ERR_DB_CONN);
 		}
 
 		log_verbose(LOG_DEBUG, "connected to primary");
@@ -799,28 +798,40 @@ monitor_streaming_standby(void)
 					goto loop;
 				}
 
-				/* still down after reconnect attempt(s) */
+
+				/* upstream is still down after reconnect attempt(s) */
 				if (upstream_node_info.node_status == NODE_STATUS_DOWN)
 				{
 					bool		failover_done = false;
 
-					if (upstream_node_info.type == PRIMARY)
+					if (PQstatus(local_conn) == CONNECTION_OK && repmgrd_is_paused(local_conn))
 					{
-						failover_done = do_primary_failover();
+						log_notice(_("repmgrd on this node is paused"));
+						log_detail(_("no failover will be carried out"));
+						log_hint(_("execute \"repmgr daemon unpause\" to resume normal failover mode"));
+						monitoring_state = MS_DEGRADED;
+						INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
 					}
-					else if (upstream_node_info.type == STANDBY)
+					else
 					{
-						failover_done = do_upstream_standby_failover();
-					}
+						if (upstream_node_info.type == PRIMARY)
+						{
+							failover_done = do_primary_failover();
+						}
+						else if (upstream_node_info.type == STANDBY)
+						{
+							failover_done = do_upstream_standby_failover();
+						}
 
-					/*
-					 * XXX it's possible it will make sense to return in all
-					 * cases to restart monitoring
-					 */
-					if (failover_done == true)
-					{
-						primary_node_id = get_primary_node_id(local_conn);
-						return;
+						/*
+						 * XXX it's possible it will make sense to return in all
+						 * cases to restart monitoring
+						 */
+						if (failover_done == true)
+						{
+							primary_node_id = get_primary_node_id(local_conn);
+							return;
+						}
 					}
 				}
 			}
@@ -990,7 +1001,7 @@ monitor_streaming_standby(void)
 				}
 
 
-				if (config_file_options.failover == FAILOVER_AUTOMATIC)
+				if (config_file_options.failover == FAILOVER_AUTOMATIC && repmgrd_is_paused(local_conn) == false)
 				{
 					get_active_sibling_node_records(local_conn,
 													local_node_info.node_id,
@@ -1066,7 +1077,15 @@ loop:
 				termPQExpBuffer(&monitoring_summary);
 				if (monitoring_state == MS_DEGRADED && config_file_options.failover == FAILOVER_AUTOMATIC)
 				{
-					log_detail(_("waiting for upstream or another primary to reappear"));
+					if (PQstatus(local_conn) == CONNECTION_OK && repmgrd_is_paused(local_conn))
+					{
+						log_detail(_("repmgrd paused by administrator"));
+						log_hint(_("execute \"repmgr daemon unpause\" to resume normal failover mode"));
+					}
+					else
+					{
+						log_detail(_("waiting for upstream or another primary to reappear"));
+					}
 				}
 				else if (config_file_options.monitoring_history == true)
 				{
@@ -1195,6 +1214,7 @@ loop:
 				if (stored_local_node_id == UNKNOWN_NODE_ID)
 				{
 					repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
+					repmgrd_set_pid(local_conn, getpid(), pid_file);
 				}
 			}
 		}
@@ -1247,8 +1267,7 @@ monitor_streaming_witness(void)
 				  upstream_node_info.node_id);
 		log_hint(_("primary node must be running before repmgrd can start"));
 
-		close_connection(&local_conn);
-		exit(ERR_DB_CONN);
+		terminate(ERR_DB_CONN);
 	}
 
 	/* synchronise local copy of "repmgr.nodes", in case it was stale */
@@ -1561,6 +1580,7 @@ loop:
 				if (stored_local_node_id == UNKNOWN_NODE_ID)
 				{
 					repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
+					repmgrd_set_pid(local_conn, getpid(), pid_file);
 				}
 			}
 		}
@@ -2094,6 +2114,7 @@ do_upstream_standby_failover(void)
 
 	/* refresh shared memory settings which will have been zapped by the restart */
 	repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
+	repmgrd_set_pid(local_conn, getpid(), pid_file);
 
 	/*
 	 *
@@ -2564,6 +2585,7 @@ follow_new_primary(int new_primary_id)
 
 	/* refresh shared memory settings which will have been zapped by the restart */
 	repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
+	repmgrd_set_pid(local_conn, getpid(), pid_file);
 
 	initPQExpBuffer(&event_details);
 	appendPQExpBuffer(&event_details,
@@ -3088,6 +3110,7 @@ check_connection(t_node_info *node_info, PGconn **conn)
 			if (stored_local_node_id == UNKNOWN_NODE_ID)
 			{
 				repmgrd_set_local_node_id(*conn, config_file_options.node_id);
+				repmgrd_set_pid(local_conn, getpid(), pid_file);
 			}
 
 		}
diff --git a/repmgrd.c b/repmgrd.c
index 80ed5a79..06eba3ec 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -35,7 +35,7 @@
 
 static char *config_file = NULL;
 static bool verbose = false;
-static char pid_file[MAXPGPATH];
+char pid_file[MAXPGPATH];
 static bool daemonize = true;
 static bool show_pid_file = false;
 static bool no_pid_file = false;
@@ -488,6 +488,9 @@ main(int argc, char **argv)
 		check_and_create_pid_file(pid_file);
 	}
 
+	repmgrd_set_pid(local_conn, getpid(), pid_file);
+
+
 #ifndef WIN32
 	setup_event_handlers();
 #endif
@@ -901,6 +904,9 @@ print_monitoring_state(MonitoringState monitoring_state)
 void
 terminate(int retval)
 {
+	if (PQstatus(local_conn)  == CONNECTION_OK)
+		repmgrd_set_pid(local_conn, UNKNOWN_PID, NULL);
+
 	logger_shutdown();
 
 	if (pid_file[0] != '\0')
diff --git a/repmgrd.h b/repmgrd.h
index 0f8f3706..144ec9e8 100644
--- a/repmgrd.h
+++ b/repmgrd.h
@@ -20,6 +20,7 @@ extern t_configuration_options config_file_options;
 extern t_node_info local_node_info;
 extern PGconn *local_conn;
 extern bool startup_event_logged;
+extern char pid_file[MAXPGPATH];
 
 void		try_reconnect(PGconn **conn, t_node_info *node_info);