doc: add a link to the current documentation from the contents page

doc: fix typo
Per user report on mailing list.
2026-03-23 15:16:29 +00:00 · 2019-04-03 10:48:36 +09:00 · 2018-10-23 09:01:00 +09:00 · 2018-07-23 13:23:28 +09:00 · 2018-07-18 16:11:18 +09:00 · 2018-07-16 14:55:07 +09:00
39 changed files with 1492 additions and 746 deletions
--- a/19
+++ b/19
@@ -1,3 +1,22 @@
+4.0.6   2018-06-14
+        repmgr: (witness register) prevent registration of a witness server with the
+ 		  same name as an existing node (Ian)
+		repmgr: (standby follow) check node has actually connected to new primary
+		  before reporting success; GitHub #444 (Ian)
+		repmgr: (standby clone) improve handling of external configuration file copying,
+		  including consideration in --dry-run check; GitHub #443 (Ian)
+		repmgr: (standby clone) don't require presence of "user" parameter in
+		  conninfo string; GitHub #437 (Ian)
+		repmgr: (standby clone) improve documentation of --recovery-conf-only
+		  mode; GitHub #438 (Ian)
+		repmgr: (node rejoin) fix bug when parsing --config-files parameter;
+		  GitHub #442 (Ian)
+		repmgr: when using --dry-run, force log level to INFO to ensure output
+		  will always be displayed; GitHub #441 (Ian)
+        repmgr: (cluster matrix/crosscheck) return non-zero exit code if node
+           connection issues detected; GitHub #447 (Ian)
+		repmgrd: ensure local node is counted as quorum member; GitHub #439 (Ian)
+
 4.0.5   2018-05-02
        repmgr: poll demoted primary after restart as a standby during a
          switchover operation; GitHub #408 (Ian)
--- a/configfile.c
+++ b/configfile.c
@@ -319,13 +319,20 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 	options->use_primary_conninfo_password = false;
 	memset(options->passfile, 0, sizeof(options->passfile));

-	/*-----------------------
+	/*-------------------------
 	 * standby promote settings
-	 *------------------------
+	 *-------------------------
 	 */
 	options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
 	options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;

+	/*------------------------
+	 * standby follow settings
+	 *------------------------
+	 */
+	options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
+	options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
+
 	/*-----------------
 	 * repmgrd settings
 	 *-----------------
@@ -345,7 +352,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 	options->degraded_monitoring_timeout = -1;
 	options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
 	options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
-	options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
 	options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;

 	/*-------------
@@ -527,6 +533,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 		else if (strcmp(name, "promote_check_interval") == 0)
 			options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);

+		/* standby follow settings */
+		else if (strcmp(name, "primary_follow_timeout") == 0)
+			options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
+		else if (strcmp(name, "standby_follow_timeout") == 0)
+			options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
+
 		/* node check settings */
 		else if (strcmp(name, "archive_ready_warning") == 0)
 			options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
@@ -576,8 +588,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 			options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
 		else if (strcmp(name, "primary_notification_timeout") == 0)
 			options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
-		else if (strcmp(name, "primary_follow_timeout") == 0)
-			options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
 		else if (strcmp(name, "standby_reconnect_timeout") == 0)
 			options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);

--- a/configfile.h
+++ b/configfile.h
@@ -98,6 +98,10 @@ typedef struct
 	int			promote_check_timeout;
 	int			promote_check_interval;

+	/* standby follow settings */
+	int			primary_follow_timeout;
+	int			standby_follow_timeout;
+
 	/* node check settings */
 	int			archive_ready_warning;
 	int			archive_ready_critical;
@@ -120,7 +124,6 @@ typedef struct
 	int			degraded_monitoring_timeout;
 	int			async_query_timeout;
 	int			primary_notification_timeout;
-	int			primary_follow_timeout;
 	int			standby_reconnect_timeout;

 	/* BDR settings */
@@ -167,6 +170,9 @@ typedef struct
 		false, "", "", { NULL, NULL }, "", false, "", false, "", \
 		/* standby promote settings */ \
 		DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
+		/* standby follow settings */ \
+		DEFAULT_PRIMARY_FOLLOW_TIMEOUT,	\
+		DEFAULT_STANDBY_FOLLOW_TIMEOUT,	\
 		/* node check settings */ \
 		DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
 		DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
@@ -180,7 +186,6 @@ typedef struct
        false, -1, \
 		DEFAULT_ASYNC_QUERY_TIMEOUT, \
 		DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT,	\
-		DEFAULT_PRIMARY_FOLLOW_TIMEOUT,	\
 		DEFAULT_STANDBY_RECONNECT_TIMEOUT,	\
 		/* BDR settings */ \
 		false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
--- a/configure.in
+++ b/configure.in
@@ -1,4 +1,4 @@
-AC_INIT([repmgr], [4.0.5], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
+AC_INIT([repmgr], [4.0.6], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])

 AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.])

--- a/dbutils.c
+++ b/dbutils.c
@@ -23,6 +23,7 @@
 #include <sys/time.h>
 #include <sys/stat.h>
 #include <dirent.h>
+#include <arpa/inet.h>

 #include "repmgr.h"
 #include "dbutils.h"
@@ -370,6 +371,37 @@ get_conninfo_value(const char *conninfo, const char *keyword, char *output)
 }


+/*
+ * Get a default conninfo value for the provided parameter, and copy
+ * it to the 'output' buffer.
+ *
+ * Returns true on success, or false on failure (provided keyword not found).
+ *
+ */
+bool
+get_conninfo_default_value(const char *param, char *output, int maxlen)
+{
+	PQconninfoOption *defs = NULL;
+	PQconninfoOption *def = NULL;
+	bool found = false;
+
+	defs = PQconndefaults();
+
+	for (def = defs; def->keyword; def++)
+	{
+		if (strncmp(def->keyword, param, maxlen) == 0)
+		{
+			strncpy(output, def->val, maxlen);
+			found = true;
+		}
+	}
+
+	PQconninfoFree(defs);
+
+	return found;
+}
+
+
 void
 initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults)
 {
@@ -1733,7 +1765,7 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row)
 	strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN);
 	node_info->priority = atoi(PQgetvalue(res, row, 8));
 	node_info->active = atobool(PQgetvalue(res, row, 9));
-	strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXLEN);
+	strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXPGPATH);

 	/* This won't normally be set */
 	strncpy(node_info->upstream_node_name, PQgetvalue(res, row, 11), MAXLEN);
@@ -2146,8 +2178,9 @@ get_downstream_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoL
 					  "LEFT JOIN pg_catalog.pg_replication_slots rs "
 					  "       ON rs.slot_name = n.slot_name "
 					  "    WHERE n.slot_name IS NOT NULL"
-                      "      AND rs.slot_name IS NULL "
-                      "      AND n.upstream_node_id = %i ",
+					  "      AND rs.slot_name IS NULL "
+					  "      AND n.upstream_node_id = %i "
+					  "      AND n.type = 'standby'",
 					  this_node_id);

 	log_verbose(LOG_DEBUG, "get_all_node_records_with_missing_slot():\n%s", query.data);
@@ -2884,8 +2917,7 @@ get_datadir_configuration_files(PGconn *conn, KeyValueList *list)

 	for (i = 0; i < PQntuples(res); i++)
 	{
-		key_value_list_set(
-						   list,
+		key_value_list_set(list,
 						   PQgetvalue(res, i, 1),
 						   PQgetvalue(res, i, 0));
 	}
@@ -3110,6 +3142,8 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
 	char		event_timestamp[MAXLEN] = "";
 	bool		success = true;

+	log_verbose(LOG_DEBUG, "_create_event(): event is \"%s\" for node %i", event, node_id);
+
 	/*
 	 * Only attempt to write a record if a connection handle was provided.
 	 * Also check that the repmgr schema has been properly initialised - if
@@ -3620,7 +3654,7 @@ get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record)


 int
-get_free_replication_slots(PGconn *conn)
+get_free_replication_slot_count(PGconn *conn)
 {
 	PQExpBufferData query;
 	PGresult   *res = NULL;
@@ -3657,6 +3691,47 @@ get_free_replication_slots(PGconn *conn)
 }


+int
+get_inactive_replication_slots(PGconn *conn, KeyValueList *list)
+{
+	PQExpBufferData query;
+	PGresult   *res = NULL;
+	int			i, inactive_slots = 0;
+
+	initPQExpBuffer(&query);
+
+	appendPQExpBuffer(&query,
+					  "   SELECT slot_name, slot_type "
+					  "     FROM pg_catalog.pg_replication_slots "
+					  "    WHERE active IS FALSE "
+					  " ORDER BY slot_name ");
+
+	res = PQexec(conn, query.data);
+	termPQExpBuffer(&query);
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute replication slot query"));
+		log_detail("%s", PQerrorMessage(conn));
+		PQclear(res);
+		return -1;
+	}
+
+	inactive_slots = PQntuples(res);
+
+	for (i = 0; i < inactive_slots; i++)
+	{
+		key_value_list_set(list,
+						   PQgetvalue(res, i, 0),
+						   PQgetvalue(res, i, 1));
+	}
+
+	PQclear(res);
+	return inactive_slots;
+}
+
+
+
 /* ==================== */
 /* tablespace functions */
 /* ==================== */
--- a/dbutils.h
+++ b/dbutils.h
@@ -357,7 +357,7 @@ void		close_connection(PGconn **conn);

 /* conninfo manipulation functions */
 bool		get_conninfo_value(const char *conninfo, const char *keyword, char *output);
-
+bool		get_conninfo_default_value(const char *param, char *output, int maxlen);
 void		initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults);
 void		free_conninfo_params(t_conninfo_param_list *param_list);
 void		copy_conninfo_params(t_conninfo_param_list *dest_list, t_conninfo_param_list *source_list);
@@ -369,6 +369,7 @@ bool		parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *par
 char	   *param_list_to_string(t_conninfo_param_list *param_list);
 bool		has_passfile(void);

+
 /* transaction functions */
 bool		begin_transaction(PGconn *conn);
 bool		commit_transaction(PGconn *conn);
@@ -454,7 +455,8 @@ void		create_slot_name(char *slot_name, int node_id);
 bool		create_replication_slot(PGconn *conn, char *slot_name, int server_version_num, PQExpBufferData *error_msg);
 bool		drop_replication_slot(PGconn *conn, char *slot_name);
 RecordStatus get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record);
-int			get_free_replication_slots(PGconn *conn);
+int			get_free_replication_slot_count(PGconn *conn);
+int			get_inactive_replication_slots(PGconn *conn, KeyValueList *list);

 /* tablespace functions */
 bool		get_tablespace_name_by_location(PGconn *conn, const char *location, char *name);
--- a/doc/appendix-packages.sgml
+++ b/doc/appendix-packages.sgml
@@ -41,18 +41,19 @@
      <title>CentOS repositories</title>

      <para>
-        &repmgr; packages are available from the 2ndQuadrant repository, and also the PostgreSQL
-        community repository. The 2ndQuadrant repository is updated immediately after each
+        &repmgr; packages are available from the public 2ndQuadrant repository, and also the
+        PostgreSQL community repository. The 2ndQuadrant repository is updated immediately
+        after each
        &repmgr; release.
      </para>

      <table id="centos-2ndquadrant-repository">
-        <title>2ndQuadrant repository</title>
+        <title>2ndQuadrant public repository</title>
        <tgroup cols="2">
          <tbody>
            <row>
              <entry>Repository URL:</entry>
-              <entry><ulink url="http://packages.2ndquadrant.com/repmgr/">http://packages.2ndquadrant.com/repmgr/</ulink></entry>
+              <entry><ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink></entry>
            </row>
            <row>
              <entry>Repository documentation:</entry>
--- a/doc/appendix-release-notes.sgml
+++ b/doc/appendix-release-notes.sgml
@@ -15,6 +15,113 @@
    See also: <xref linkend="upgrading-repmgr">
  </para>

+  <sect1 id="release-4.0.6">
+    <title>Release 4.0.6</title>
+    <para><emphasis>June 14, 2018</emphasis></para>
+    <para>
+	  &repmgr; 4.0.6 contains a number of bug fixes and usability enhancements.
+    </para>
+	<para>
+	  We recommend upgrading to this version as soon as possible.
+	  This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.0.5;
+      <application>repmgrd</application> (if running) should be restarted. See <xref linkend="upgrading-repmgr">
+      for more details.
+	</para>
+
+    <sect2>
+      <title>Usability enhancements</title>
+
+      <para>
+        <itemizedlist>
+          <listitem>
+            <para>
+              <command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command> and
+              <command><link linkend="repmgr-cluster-matrix">repmgr cluster matrix</link></command>:
+              return non-zero exit code if node connection issues detected (GitHub #447)
+            </para>
+          </listitem>
+
+          <listitem>
+            <para>
+               <command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
+			   Improve handling of external configuration file copying,  including consideration in
+			   <option>--dry-run</option> check
+			   (GitHub #443)
+           </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              When using <option>--dry-run</option>, force log level to <literal>INFO</literal>
+			  to ensure output will always be displayed
+			  (GitHub #441)
+           </para>
+          </listitem>
+
+          <listitem>
+            <para>
+               <command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
+			   Improve documentation of <option>--recovery-conf-only</option> mode
+			   (GitHub #438)
+           </para>
+          </listitem>
+
+          <listitem>
+            <para>
+               <command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
+			   Don't require presence of <varname>user</varname> parameter in conninfo string
+			   (GitHub #437)
+           </para>
+          </listitem>
+
+
+        </itemizedlist>
+      </para>
+    </sect2>
+
+    <sect2>
+      <title>Bug fixes</title>
+      <para>
+
+        <itemizedlist>
+
+          <listitem>
+            <para>
+              <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>:
+              prevent registration of a witness server with the same name as an existing node
+           </para>
+          </listitem>
+
+
+          <listitem>
+            <para>
+               <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>:
+			   check node has actually connected to new primary before reporting success
+			   (GitHub #444)
+           </para>
+          </listitem>
+
+          <listitem>
+            <para>
+               <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>:
+			   Fix bug when parsing <option>--config-files</option> parameter
+			   (GitHub #442)
+           </para>
+          </listitem>
+
+          <listitem>
+            <para>
+              <application>repmgrd</application>: ensure local node is counted as quorum member
+			  (GitHub #439)
+           </para>
+          </listitem>
+
+        </itemizedlist>
+      </para>
+    </sect2>
+
+  </sect1>
+
  <sect1 id="release-4.0.5">
    <title>Release 4.0.5</title>
    <para><emphasis>Wed May 2, 2018</emphasis></para>
@@ -24,6 +131,7 @@
      generation and (in <application>repmgrd</application>) handling of various
      corner-case situations, as well as a number of bug fixes.
    </para>
+
    <sect2>
      <title>Usability enhancements</title>

--- a/doc/appendix-signatures.sgml
+++ b/doc/appendix-signatures.sgml
@@ -33,34 +33,5 @@

 </sect1>

- <sect1 id="repmgr-rpm-key" xreflabel="repmgr rpm key">
-   <title>repmgr RPM signing key</title>
-   <para>
-     The signing key ID used for <application>repmgr</application> source code bundles is:
-     <ulink url="http://packages.2ndquadrant.com/repmgr/RPM-GPG-KEY-repmgr">
-       <literal>0x702D883A</literal></ulink>.
-   </para>
-
-   <para>
-     To download the <application>repmgr</application> source key to your computer:
-     <programlisting>
-       curl -s http://packages.2ndquadrant.com/repmgr/RPM-GPG-KEY-repmgr | gpg --import
-       gpg --fingerprint 0x702D883A
-     </programlisting>
-     then verify that the fingerprint is the expected value:
-     <programlisting>
-       AE4E 390E A58E 0037 6148  3F29 888D 018B 702D 883A</programlisting>
-   </para>
-
-   <para>
-     To check a repository RPM, use <application>rpmkeys</application> to load the
-      packaging signing key into the RPM database then use <literal>rpm -K</literal>, e.g.:
-     <programlisting>
-       sudo rpmkeys --import http://packages.2ndquadrant.com/repmgr/RPM-GPG-KEY-repmgr
-       rpm -K postgresql-bdr94-2ndquadrant-redhat-1.0-2.noarch.rpm
-     </programlisting>
-   </para>
-
- </sect1>

 </appendix>
--- a/doc/configuration-service-commands.sgml
+++ b/doc/configuration-service-commands.sgml
@@ -25,7 +25,7 @@

  <note>
    <para>
-      If using <application>systemd</application>, ensure you have <varname>RemoteIPC</varname> set to <literal>off</literal>.
+      If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
      See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
      entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
    </para>
@@ -47,16 +47,24 @@
    service_restart_command
    service_reload_command</programlisting>
  </para>
-
  <note>
    <para>
-      It's also possible to specify a <varname>service_promote_command</varname>;
-      this overrides any value contained in the setting <varname>promote_command</varname>.
+      It's also possible to specify a <varname>service_promote_command</varname>.
      This is intended for systems which provide a package-level promote command,
-      such as Debian's <application>pg_ctlcluster</application>.
+      such as Debian's <application>pg_ctlcluster</application>, to promote the
+      PostgreSQL instance from standby to primary.
+    </para>
+    <para>
+      If your packaging system does not provide such a command, it can be left empty,
+      and &repmgr; will generate the appropriate <command>pg_ctl ... promote</command> command.
+    </para>
+    <para>
+      Do not confuse this with <varname>promote_command</varname>, which is used
+      by <application>repmgrd</application> to execute <xref linkend="repmgr-standby-promote">.
    </para>
  </note>

+
  <para>
    To confirm which command &repmgr; will execute for each action, use
    <command>repmgr node service --list --action=...</command>, e.g.:
--- a/doc/event-notifications.sgml
+++ b/doc/event-notifications.sgml
@@ -217,9 +217,6 @@
   <listitem>
    <simpara><literal>repmgrd_promote_error</literal></simpara>
   </listitem>
-   <listitem>
-    <simpara><literal>repmgrd_failover_promote</literal></simpara>
-   </listitem>
   <listitem>
    <simpara><literal>bdr_failover</literal></simpara>
   </listitem>
--- a/doc/install-packages.sgml
+++ b/doc/install-packages.sgml
@@ -5,26 +5,27 @@
  system.
 </para>

- <sect2 id="installation-packages-redhat" xreflabel="Installing from packages on RHEL, Fedora and CentOS">
+ <sect2 id="installation-packages-redhat" xreflabel="Installing from packages on RHEL, CentOS and Fedora">

  <indexterm>
   <primary>installation</primary>
   <secondary>on Red Hat/CentOS/Fedora etc.</secondary>
  </indexterm>

-  <title>RedHat/Fedora/CentOS</title>
+  <title>RedHat/CentOS/Fedora</title>
  <para>
-   RPM packages for &repmgr; are available via Yum through
+	&repmgr; RPM packages for RedHat/CentOS variants and Fedora are available from the
+	<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
+	<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>; see following
+	section for details.
+  </para>
+  <para>
+   RPM packages for &repmgr; are also available via Yum through
   the PostgreSQL Global Development Group RPM repository
   (<ulink url="https://yum.postgresql.org/">http://yum.postgresql.org/</ulink>).
   Follow the instructions for your distribution (RedHat, CentOS,
-   Fedora, etc.) and architecture as detailed there.
-  </para>
-  <para>
-   <ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> also provides its
-   own RPM packages which are made available
-   at the same time as each &repmgr; release, as it can take some days for
-   them to become available via the main PGDG repository. See following section for details:
+   Fedora, etc.) and architecture as detailed there. Note that it can take some days
+   for new &repmgr; packages to become available via this repository.
  </para>
  <note>
    <para>
@@ -43,59 +44,68 @@


  <sect3 id="installation-packages-redhat-2ndq">
-    <title>2ndQuadrant repmgr yum repository</title>
+    <title>2ndQuadrant public RPM yum repository</title>
+
+	<note>
+	  <para>
+		<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> previously provided a dedicated
+        &repmgr; repository at
+        <ulink url="http://packages.2ndquadrant.com/repmgr/">http://packages.2ndquadrant.com/repmgr/</ulink>.
+		This repository will be deprecated in a future release as it is now replaced by
+		the <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>
+		documented below.
+	  </para>
+	</note>
+
    <para>
-      Beginning with <ulink url="http://repmgr.org/release-notes-3.1.3.html">repmgr 3.1.3</ulink>,
+      Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
      <ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a dedicated <literal>yum</literal>
-      repository for &repmgr; releases. This repository complements the main
-      <ulink url="https://yum.postgresql.org/repopackages.php">PGDG community repository</ulink>,
-      but enables repmgr users to access the latest &repmgr; packages before they are
-      available via the PGDG repository, which can take several days to be updated following
-      a fresh  &repmgr; release.
-    </para>
+	  <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink> for 2ndQuadrant software,
+	  including &repmgr;. We recommend using this for all future &repmgr; releases.
+	</para>
+	<para>
+	  General instructions for using this repository can be found on its
+	  <ulink url="https://rpm.2ndquadrant.com/">homepage</ulink>. Specific instructions
+	  for installing &repmgr; follow below.
+	</para>
    <para>
      <emphasis>Installation</emphasis>

      <itemizedlist>
-        <listitem>
-          <para>
-            Import the repository public key (optional but recommended):
-            <programlisting>
-              rpm --import http://packages.2ndquadrant.com/repmgr/RPM-GPG-KEY-repmgr</programlisting>
-          </para>
-        </listitem>
+		<listitem>
+		  <para>
+			Locate the repository RPM for your PostgreSQL version from the list at:
+			<ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink>
+		  </para>
+		</listitem>

        <listitem>
          <para>
-            Install the repository RPM for your distribution (this enables the 2ndQuadrant
-            repository as a source of repmgr packages):
-            <itemizedlist>
-              <listitem>
-                <simpara>
-                  <emphasis>Fedora:</emphasis>
-                  <ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink>
-                </simpara>
-              </listitem>
-              <listitem>
-                <simpara>
-                  <emphasis>RHEL, CentOS etc:</emphasis>
-                  <ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink>
-                </simpara>
-              </listitem>
-            </itemizedlist>
-          </para>
-          <para>
-            e.g.:
-            <programlisting>
-              $ yum install http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</programlisting>
-          </para>
-        </listitem>
+            Install the repository RPM for your distribution and PostgreSQL version
+			(this enables the 2ndQuadrant repository as a source of &repmgr; packages).
+		  </para>
+		  <para>
+			For example, for PostgreSQL 10 on CentOS, execute:
+			<programlisting>
+sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-1.el7.noarch.rpm
+			</programlisting>
+		  </para>
+		  <para>
+			Verify that the repository is installed with:
+			<programlisting>
+sudo yum repolist</programlisting>
+			The output should contain two entries like this:
+			<programlisting>
+2ndquadrant-repo-10/7/x86_64         2ndQuadrant packages for PG10 for rhel 7 - x86_64           1
+2ndquadrant-repo-10-debug/7/x86_64   2ndQuadrant packages for PG10 for rhel 7 - x86_64 - Debug   1</programlisting>
+		  </para>
+		</listitem>

        <listitem>
          <para>
-            Install the repmgr version appropriate for your PostgreSQL version (e.g. <literal>repmgr96</literal>), e.g.:
+            Install the &repmgr; version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
            <programlisting>
-              $ yum install repmgr96</programlisting>
+$ yum install repmgr10</programlisting>
          </para>
        </listitem>
      </itemizedlist>
@@ -105,13 +115,13 @@
      <emphasis>Compatibility with PGDG Repositories</emphasis>
    </para>
    <para>
-        The 2ndQuadrant &repmgr; yum repository uses exactly the same package definitions as the
-        main PGDG repository and is effectively a selective mirror for &repmgr; packages only.
+      The 2ndQuadrant &repmgr; yum repository packages use the same definitions and file system layout as the
+      main PGDG repository.
    </para>
    <para>
-        Normally yum should prioritize the repository with the most recent &repmgr; version.
-        Once the PGDG repository has been updated, it doesn't matter which repository
-        the packages are installed from.
+      Normally <application>yum</application> will prioritize the repository with the most recent &repmgr; version.
+      Once the PGDG repository has been updated, it doesn't matter which repository
+      the packages are installed from.
    </para>
    <para>
      To ensure the 2ndQuadrant repository is always prioritised, install <literal>yum-plugin-priorities</literal>
@@ -125,30 +135,23 @@
      To install a specific package version, execute <command>yum --showduplicates list</command>
      for the package in question:
      <programlisting>
-        [root@localhost ~]# yum --showduplicates list repmgr96
+        [root@localhost ~]# yum --showduplicates list repmgr10
        Loaded plugins: fastestmirror
        Loading mirror speeds from cached hostfile
         * base: ftp.iij.ad.jp
         * extras: ftp.iij.ad.jp
         * updates: ftp.iij.ad.jp
        Available Packages
-        repmgr96.x86_64               3.2-1.el6                    2ndquadrant-repmgr
-        repmgr96.x86_64               3.2.1-1.el6                  2ndquadrant-repmgr
-        repmgr96.x86_64               3.3-1.el6                    2ndquadrant-repmgr
-        repmgr96.x86_64               3.3.1-1.el6                  2ndquadrant-repmgr
-        repmgr96.x86_64               3.3.2-1.el6                  2ndquadrant-repmgr
-        repmgr96.x86_64               3.3.2-1.rhel6                pgdg96
-        repmgr96.x86_64               4.0.0-1.el6                  2ndquadrant-repmgr
-        repmgr96.x86_64               4.0.0-1.rhel6                pgdg96</programlisting>
+		repmgr10.x86_64                       4.0.3-1.rhel7                        pgdg10
+		repmgr10.x86_64                       4.0.4-1.rhel7                        pgdg10
+		repmgr10.x86_64                       4.0.5-1.el7                          2ndquadrant-repo-10</programlisting>
      then append the appropriate version number to the package name with a hyphen, e.g.:
      <programlisting>
-        [root@localhost ~]# yum install repmgr96-3.3.2-1.el6</programlisting>
+        [root@localhost ~]# yum install repmgr10-4.0.3-1.rhel7</programlisting>
    </para>
  </sect3>
 </sect2>

-
-
 <sect2 id="installation-packages-debian" xreflabel="Installing from packages on Debian or Ubuntu">

  <indexterm>
@@ -168,6 +171,79 @@
    see the appendix section <xref linkend="packages-debian-ubuntu">.
  </para>

+  <sect3 id="installation-packages-debian-ubuntu-2ndq">
+    <title>2ndQuadrant public apt repository for Debian/Ubuntu</title>
+
+    <para>
+      Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
+      <ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a
+	  <ulink url="https://apt.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
+	  including &repmgr;.
+	</para>
+	<para>
+	  General instructions for using this repository can be found on its
+	  <ulink url="https://apt.2ndquadrant.com/">homepage</ulink>. Specific instructions
+	  for installing &repmgr; follow below.
+	</para>
+
+    <para>
+      <emphasis>Installation</emphasis>
+
+      <itemizedlist>
+
+		<listitem>
+		  <para>
+			If not already present, install the  <application>apt-transport-https</application> package:
+			<programlisting>
+sudo apt-get install apt-transport-https</programlisting>
+		  </para>
+		</listitem>
+
+		<listitem>
+		  <para>
+			Create <filename>/etc/apt/sources.list.d/2ndquadrant.list</filename> as follows:
+			<programlisting>
+sudo sh -c 'echo "deb https://apt.2ndquadrant.com/ $(lsb_release -cs)-2ndquadrant main" > /etc/apt/sources.list.d/2ndquadrant.list'</programlisting>
+		  </para>
+		</listitem>
+
+		<listitem>
+		  <para>
+			Install the 2ndQuadrant <ulink url="https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc">repository key</ulink>:
+			<programlisting>
+sudo apt-get install curl ca-certificates
+curl https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc | sudo apt-key add -</programlisting>
+		  </para>
+		</listitem>
+
+		<listitem>
+		  <para>
+			Update the package list:
+			<programlisting>
+ sudo apt-get update</programlisting>
+		  </para>
+		</listitem>
+
+		<listitem>
+		  <para>
+            Install the &repmgr; version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
+            <programlisting>
+$ apt-get install postgresql-10-repmgr</programlisting>
+		  </para>
+          <note>
+            <para>
+            For packages for PostgreSQL 9.6 and earlier, the package name includes
+            a period between major and minor version numbers, e.g.
+            <literal>postgresql-9.6-repmgr</literal>.
+            </para>
+          </note>
+		</listitem>
+
+	  </itemizedlist>
+
+	</para>
+
+  </sect3>
 </sect2>

 </sect1>
--- a/doc/install-source.sgml
+++ b/doc/install-source.sgml
@@ -80,7 +80,7 @@
   </para>

   <para>
-    There are also tags for each &repmgr; release, e.g. <filename>REL4_0_STABLE</filename>.
+    There are also tags for each &repmgr; release, e.g. <filename>4.0.5</filename>.
   </para>

   <para>
--- a/doc/repmgr-cluster-crosscheck.sgml
+++ b/doc/repmgr-cluster-crosscheck.sgml
@@ -38,5 +38,34 @@
      and therefore determine the state of outbound connections from that node.
    </para>
  </refsect1>
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      The following exit codes can be emitted by <command>repmgr cluster crosscheck</command>:
+    </para>
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            The check completed successfully and all nodes are reachable.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_CLUSTER_CHECK (25)</option></term>
+        <listitem>
+          <para>
+            One or more nodes could not be reached.
+          </para>
+        </listitem>
+      </varlistentry>
+
+   </variablelist>
+  </refsect1>
+
 </refentry>

--- a/doc/repmgr-cluster-matrix.sgml
+++ b/doc/repmgr-cluster-matrix.sgml
@@ -97,5 +97,35 @@
    useful result.
  </para>
  </refsect1>
+
+
+  <refsect1>
+    <title>Exit codes</title>
+    <para>
+      The following exit codes can be emitted by <command>repmgr cluster matrix</command>:
+    </para>
+    <variablelist>
+
+      <varlistentry>
+        <term><option>SUCCESS (0)</option></term>
+        <listitem>
+          <para>
+            The check completed successfully and all nodes are reachable.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+        <term><option>ERR_CLUSTER_CHECK (25)</option></term>
+        <listitem>
+          <para>
+            One or more nodes could not be reached.
+          </para>
+        </listitem>
+      </varlistentry>
+
+   </variablelist>
+  </refsect1>
+
 </refentry>

--- a/doc/repmgr-node-check.sgml
+++ b/doc/repmgr-node-check.sgml
@@ -79,9 +79,26 @@

    </itemizedlist>
  </para>
-  <para>
-   Individual checks can also be output in a Nagios-compatible format by additionally
-   providing the option <literal>--nagios</literal>.
-  </para>
+  </refsect1>
+
+  <refsect1>
+    <title>Output format</title>
+    <para>
+      <itemizedlist spacing="compact" mark="bullet">
+
+        <listitem>
+          <simpara>
+            <literal>--csv</literal>: generate output in CSV format (not available
+            for individual checks)
+          </simpara>
+        </listitem>
+
+        <listitem>
+          <simpara>
+            <literal>--nagios</literal>: generate output in a Nagios-compatible format
+          </simpara>
+        </listitem>
+      </itemizedlist>
+    </para>
  </refsect1>
 </refentry>
--- a/doc/repmgr-node-status.sgml
+++ b/doc/repmgr-node-status.sgml
@@ -24,7 +24,7 @@
    <title>Example</title>
    <para>
    <programlisting>
-        $ repmgr -f /etc/repmgr.comf node status
+        $ repmgr -f /etc/repmgr.conf node status
        Node "node1":
            PostgreSQL version: 10beta1
            Total data size: 30 MB
@@ -38,6 +38,20 @@
    </para>
  </refsect1>

+  <refsect1>
+    <title>Output format</title>
+    <para>
+      <itemizedlist spacing="compact" mark="bullet">
+
+        <listitem>
+          <simpara>
+            <literal>--csv</literal>: generate output in CSV format
+          </simpara>
+        </listitem>
+      </itemizedlist>
+    </para>
+  </refsect1>
+
  <refsect1>
    <title>See also</title>
    <para>
--- a/doc/repmgr-standby-clone.sgml
+++ b/doc/repmgr-standby-clone.sgml
@@ -124,7 +124,7 @@
     <para>
       We recommend using <ulink url="https://www.pgbarman.org/">Barman</ulink> to manage
       WAL file archiving. For more details on combining &repmgr; and <application>Barman</application>,
-       in particular using <varname>restore_command</varname> to configure Barman as a backu source of
+       in particular using <varname>restore_command</varname> to configure Barman as a backup source of
       WAL files, see <xref linkend="cloning-from-barman">.
     </para>
   </note>
@@ -177,12 +177,13 @@
   <title>Using a standby cloned by another method</title>
   <para>
     &repmgr; supports standbys cloned by another method (e.g. using <application>barman</application>'s
-     <command>barman recover</command> command).
+     <command><ulink url="http://docs.pgbarman.org/release/2.4/#recover">barman recover</ulink></command> command).
   </para>
   <para>
     To integrate the standby as a &repmgr; node, ensure the <filename>repmgr.conf</filename>
-     file is created for the node, then execute the command
-     <command>repmgr standby clone --recovery-conf-only</command>.
+     file is created for the node, and that it has been registered using
+     <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>.
+     Then execute the command <command>repmgr standby clone --recovery-conf-only</command>.
     This will create the <filename>recovery.conf</filename> file needed to attach
     the node to its upstream, and will also create a replication slot on the
     upstream node if required.
--- a/doc/repmgr-standby-follow.sgml
+++ b/doc/repmgr-standby-follow.sgml
@@ -26,10 +26,18 @@
      running. It can only be used to attach an active standby to the current primary node
   (and not to another standby).
    </para>
-    <para>
-      To re-add an inactive node to the replication cluster, see
-      <xref linkend="repmgr-node-rejoin">
-    </para>
+	<tip>
+      <para>
+		To re-add an inactive node to the replication cluster, use
+		<xref linkend="repmgr-node-rejoin">.
+      </para>
+	</tip>
+
+	<para>
+	  <command>repmgr standby follow</command> will wait up to
+	  <varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
+	  to verify the standby has actually connected to the new primary.
+	</para>

  </refsect1>

@@ -92,7 +100,7 @@
      A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated.
    </para>
    <para>
-      If provided, &repmgr; will subsitute the placeholders <literal>%p</literal> with the node ID of the primary
+      If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the primary
      being followed, <literal>%c</literal> with its <literal>conninfo</literal> string, and
      <literal>%a</literal> with its node name.
    </para>
--- a/doc/repmgr-standby-promote.sgml
+++ b/doc/repmgr-standby-promote.sgml
@@ -32,6 +32,7 @@
      check the promotion every <varname>promote_check_interval</varname> seconds (default: 1 second).
      Both values can be defined in <filename>repmgr.conf</filename>.
    </para>
+
  </refsect1>

  <refsect1>
--- a/doc/repmgr-standby-register.sgml
+++ b/doc/repmgr-standby-register.sgml
@@ -173,7 +173,7 @@
    </para>

    <para>
-      If provided, &repmgr; will subsitute the placeholders <literal>%p</literal> with the node ID of the
+      If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the
      primary node, <literal>%c</literal> with its <literal>conninfo</literal> string, and
      <literal>%a</literal> with its node name.
    </para>
--- a/doc/repmgr-standby-switchover.sgml
+++ b/doc/repmgr-standby-switchover.sgml
@@ -12,6 +12,7 @@
    <refpurpose>promote a standby to primary and demote the existing primary to a standby</refpurpose>
  </refnamediv>

+
  <refsect1>
    <title>Description</title>

@@ -39,6 +40,14 @@
      For more details on performing a switchover, including preparation and configuration,
      see section <xref linkend="performing-switchover">.
    </para>
+
+    <note>
+      <para>
+        <application>repmgrd</application> should not be active on any nodes while a switchover is being
+        executed. This restriction may be lifted in a later version.
+      </para>
+    </note>
+
  </refsect1>

  <refsect1>
@@ -171,10 +180,12 @@
      Execute with the <literal>--dry-run</literal> option to test the switchover as far as
      possible without actually changing the status of either node.
    </para>
-    <para>
-      <application>repmgrd</application> should not be active on any nodes while a switchover is being
-      executed. This restriction may be lifted in a later version.
-    </para>
+    <important>
+      <para>
+        <application>repmgrd</application> must be shut down on all nodes while a switchover is being
+        executed. This restriction will be removed in a future &repmgr; version.
+      </para>
+    </important>
    <para>
      External database connections, e.g. from an application, should not be permitted while
      the switchover is taking place. In particular, active transactions on the primary
@@ -199,7 +210,7 @@
  <refsect1>
    <title>Exit codes</title>
    <para>
-      Following exit codes can be emitted by <literal>repmgr standby switchover</literal>:
+      The following exit codes can be emitted by <command>repmgr standby switchover</command>:
    </para>
    <variablelist>

@@ -227,7 +238,7 @@
          <para>
            The switchover was executed but a problem was encountered.
            Typically this means the former primary could not be reattached
-            as a standby.
+            as a standby. Check preceding log messages for more information.
          </para>
        </listitem>
      </varlistentry>
--- a/doc/repmgr.sgml
+++ b/doc/repmgr.sgml
@@ -25,7 +25,13 @@
   <para>
   This is the official documentation of &repmgr; &repmgrversion; for
   use with PostgreSQL 9.3 - PostgreSQL 10.
-   It describes the functionality supported by the current version of &repmgr;.
+   </para>
+   <para>
+     &repmgr; is being continually developed and we strongly recommend using the
+     latest version. Please check the
+     <ulink url="https://repmgr.org/">repmgr website</ulink> for details
+     about the current &repmgr; version as well as the
+     <ulink url="https://repmgr.org/docs/current/index.html">current documentation</ulink>.
   </para>

   <para>
--- a/doc/repmgrd-bdr.sgml
+++ b/doc/repmgrd-bdr.sgml
@@ -99,15 +99,16 @@
      replication cluster. The database must be the BDR-enabled database.
    </para>
    <para>
-      If defined, the evenr <application>event_notifications</application> parameter
-      will restrict execution of <varname>event_notification_command</varname>
+      If defined, the <varname>event_notifications</varname> parameter will restrict
+      execution of the script defined in <varname>event_notification_command</varname>
      to the specified event(s).
    </para>
    <note>
      <simpara>
        <varname>event_notification_command</varname> is the script which does the actual "heavy lifting"
        of reconfiguring the proxy server/ connection pooler. It is fully
-        user-definable; a reference implementation is documented below.
+        user-definable; see section <xref linkend="bdr-event-notification-command"> for a reference
+        implementation.
      </simpara>
    </note>

@@ -169,8 +170,8 @@
    </para>
  </sect1>

-  <sect1 id="bdr-event-notification-command" xreflabel="BDR failover event notification command">
-    <title>Defining the "event_notification_command"</title>
+  <sect1 id="bdr-event-notification-command" xreflabel="Defining the BDR failover &quot;event_notification_command&quot;">
+    <title>Defining the BDR failover "event_notification_command"</title>
    <para>
      Key to "failover" execution is the <literal>event_notification_command</literal>,
      which is a user-definable script specified in <filename>repmpgr.conf</filename>
--- a/doc/repmgrd-configuration.sgml
+++ b/doc/repmgrd-configuration.sgml
@@ -34,6 +34,24 @@
      the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
    </para>

+    <para>
+      To apply configuration file changes to a running <application>repmgrd</application>
+      daemon, execute the operating system's <application>repmgrd</application> service reload command
+      (see <xref linkend="appendix-packages"> for examples),
+      or for instances which were manually started, execute <command>kill -HUP</command>, e.g.
+      <command>kill -HUP `cat /tmp/repmgrd.pid`</command>.
+    </para>
+    <note>
+      <para>
+        Check the <application>repmgrd</application> log to see what changes were
+        applied, or if any issues were encountered when reloading the configuration.
+      </para>
+    </note>
+    <para>
+      Note that only a subset of configuration file parameters can be changed on a
+      running <application>repmgrd</application> daemon.
+    </para>
+
    <sect2 id="repmgrd-automatic-failover-configuration">
      <title>automatic failover configuration</title>
      <para>
@@ -162,13 +180,6 @@
        repmgrd -f /etc/repmgr.conf --pid-file /tmp/repmgrd.pid --daemonize</programlisting>
      and stopped with <command>kill `cat /tmp/repmgrd.pid`</command>. Adjust paths as appropriate.
    </para>
-    <para>
-      To apply configuration file changes to a running <application>repmgrd</application>
-      daemon, execute the operating system's service reload command (for manually started
-      instances, execute <command>kill -HUP `cat /tmp/repmgrd.pid`</command>).
-      Note that only a subset of configuration file parameters can be changed on a
-      running <application>repmgrd</application> daemon.
-    </para>

    <sect2 id="repmgrd-configuration-debian-ubuntu">
      <indexterm>
--- a/doc/switchover.sgml
+++ b/doc/switchover.sgml
@@ -140,10 +140,12 @@
    manually with <command>repmgr node check --archive-ready</command>.
   </para>

-   <para>
-    Ensure that <application>repmgrd</application> is *not* running anywhere to prevent it unintentionally
-    promoting a node.
-   </para>
+   <note>
+     <para>
+       Ensure that <application>repmgrd</application> is *not* running anywhere to prevent it unintentionally
+       promoting a node. This restriction will be removed in a future &repmgr; version.
+     </para>
+   </note>

   <para>
    Finally, consider executing <command>repmgr standby switchover</command> with the
--- a/doc/version.sgml
+++ b/doc/version.sgml
@@ -1 +1 @@
-<!ENTITY repmgrversion "4.0.5">
+<!ENTITY repmgrversion "4.0.6">
--- a/errcode.h
+++ b/errcode.h
@@ -46,5 +46,6 @@
 #define ERR_SWITCHOVER_INCOMPLETE 22
 #define ERR_FOLLOW_FAIL 23
 #define ERR_REJOIN_FAIL 24
+#define ERR_CLUSTER_CHECK 25

 #endif							/* _ERRCODE_H_ */
--- a/log.c
+++ b/log.c
@@ -329,6 +329,13 @@ logger_set_terse(void)
 }


+void
+logger_set_min_level(int min_log_level)
+{
+	if (min_log_level > log_level)
+		log_level = min_log_level;
+}
+
 int
 detect_log_level(const char *level)
 {
--- a/log.h
+++ b/log.h
@@ -128,6 +128,7 @@ bool		logger_shutdown(void);

 void		logger_set_verbose(void);
 void		logger_set_terse(void);
+void		logger_set_min_level(int min_log_level);

 void
 log_detail(const char *fmt,...)
--- a/repmgr-action-cluster.c
+++ b/repmgr-action-cluster.c
@@ -569,6 +569,8 @@ do_cluster_crosscheck(void)

 	t_node_status_cube **cube;

+	bool		error_found = false;
+
 	n = build_cluster_crosscheck(&cube, &name_length);
 	if (runtime_options.output_mode == OM_CSV)
 	{
@@ -648,9 +650,11 @@ do_cluster_crosscheck(void)
 				{
 					case -2:
 						c = '?';
+						error_found = true;
 						break;
 					case -1:
 						c = 'x';
+						error_found = true;
 						break;
 					case 0:
 						c = '*';
@@ -689,6 +693,11 @@ do_cluster_crosscheck(void)

 		free(cube);
 	}
+
+	if (error_found == true)
+	{
+		exit(ERR_CLUSTER_CHECK);
+	}
 }


@@ -704,6 +713,8 @@ do_cluster_matrix()

 	t_node_matrix_rec **matrix_rec_list;

+	bool		error_found = false;
+
 	n = build_cluster_matrix(&matrix_rec_list, &name_length);

 	if (runtime_options.output_mode == OM_CSV)
@@ -742,9 +753,11 @@ do_cluster_matrix()
 				{
 					case -2:
 						c = '?';
+						error_found = true;
 						break;
 					case -1:
 						c = 'x';
+						error_found = true;
 						break;
 					case 0:
 						c = '*';
@@ -770,6 +783,11 @@ do_cluster_matrix()
 	}

 	free(matrix_rec_list);
+
+	if (error_found == true)
+	{
+		exit(ERR_CLUSTER_CHECK);
+	}
 }


--- a/repmgr-action-node.c
+++ b/repmgr-action-node.c
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -87,7 +87,7 @@ static void initialise_direct_clone(t_node_info *node_record);
 static int	run_basebackup(t_node_info *node_record);
 static int	run_file_backup(t_node_info *node_record);

-static void copy_configuration_files(void);
+static void copy_configuration_files(bool delete_after_copy);

 static void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);

@@ -498,7 +498,33 @@ do_standby_clone(void)

 			termPQExpBuffer(&msg);

-			/* TODO: check all files are readable */
+			/*
+			 * Here we'll attempt an initial test copy of the detected external
+			 * files, to detect any issues before we run the base backup.
+			 *
+			 * Note this will exit with an error, unless -F/--force supplied.
+			 *
+			 * TODO: put the files in a temporary directory and move to their final
+			 * destination once the database has been cloned.
+			 */
+
+			if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_SAMEPATH)
+			{
+				/*
+				 * Files will be placed in the same path as on the source server;
+				 * don't delete after copying.
+				 */
+				copy_configuration_files(false);
+
+			}
+			else
+			{
+				/*
+				 * Files will be placed in the data directory - delete after copying.
+				 * They'll be copied again later; see TODO above.
+				 */
+				copy_configuration_files(true);
+			}
 		}


@@ -597,7 +623,12 @@ do_standby_clone(void)
 	 */
 	if (runtime_options.copy_external_config_files == true && config_files.entries > 0)
 	{
-		copy_configuration_files();
+		/*
+		 * If "--copy-external-config-files=samepath" was used, the files will already
+		 * have been copied.
+		 */
+		if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_PGDATA)
+			copy_configuration_files(false);
 	}

 	/* Write the recovery.conf file */
@@ -938,7 +969,6 @@ _do_create_recovery_conf(void)
 			log_detail("%s", PQerrorMessage(source_conn));
 		}

-
 		exit(ERR_BAD_CONFIG);
 	}

@@ -955,7 +985,10 @@ _do_create_recovery_conf(void)
 		{
 			log_detail("%s", PQerrorMessage(source_conn));
 		}
-
+		else
+		{
+			log_hint(_("standby must be registered before a new recovery.conf file can be created"));
+		}

 		exit(ERR_BAD_CONFIG);
 	}
@@ -2126,7 +2159,13 @@ do_standby_follow(void)

 	log_verbose(LOG_DEBUG, "do_standby_follow()");

-	local_conn = establish_db_connection(config_file_options.conninfo, true);
+	local_conn = establish_db_connection(config_file_options.conninfo, false);
+
+	if (PQstatus(local_conn) != CONNECTION_OK)
+	{
+		log_hint(_("use \"repmgr node rejoin\" to re-add an inactive node to the replication cluster"));
+		exit(ERR_DB_CONN);
+	}

 	log_verbose(LOG_INFO, _("connected to local node"));

@@ -2218,7 +2257,7 @@ do_standby_follow(void)

 	if (config_file_options.use_replication_slots)
 	{
-		int free_slots = get_free_replication_slots(primary_conn);
+		int free_slots = get_free_replication_slot_count(primary_conn);
 		if (free_slots < 0)
 		{
 			log_error(_("unable to determine number of free replication slots on the primary"));
@@ -2313,6 +2352,74 @@ do_standby_follow(void)
 										 &follow_output,
 										 &follow_error_code);

+	/* unable to restart the standby */
+	if (success == false)
+	{
+		create_event_notification_extended(
+			primary_conn,
+			&config_file_options,
+			config_file_options.node_id,
+			"standby_follow",
+			success,
+			follow_output.data,
+			&event_info);
+
+		PQfinish(primary_conn);
+
+		log_notice(_("STANDBY FOLLOW failed"));
+		if (strlen( follow_output.data ))
+			log_detail("%s", follow_output.data);
+
+		termPQExpBuffer(&follow_output);
+		exit(follow_error_code);
+	}
+
+	termPQExpBuffer(&follow_output);
+
+	initPQExpBuffer(&follow_output);
+
+	/*
+	 * Wait up to "standby_follow_timeout" seconds for standby to connect to
+	 * upstream.
+	 * For 9.6 and later, we could check pg_stat_wal_receiver on the local node.
+	 */
+
+	/* assume success, necessary if standby_follow_timeout is zero */
+	success = true;
+
+	for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++)
+	{
+		success = is_downstream_node_attached(primary_conn, config_file_options.node_name);
+		if (success == true)
+			break;
+
+		log_verbose(LOG_DEBUG, "sleeping %i of max %i seconds waiting for standby to attach to primary",
+					timer + 1,
+					config_file_options.standby_follow_timeout);
+		sleep(1);
+	}
+
+	if (success == true)
+	{
+		log_notice(_("STANDBY FOLLOW successful"));
+		appendPQExpBuffer(&follow_output,
+						  "standby attached to upstream node \"%s\" (node ID: %i)",
+						  primary_node_record.node_name,
+						  primary_node_id);
+	}
+	else
+	{
+		log_error(_("STANDBY FOLLOW failed"));
+		appendPQExpBuffer(&follow_output,
+						  "standby did not attach to upstream node \"%s\" (node ID: %i) after %i seconds",
+						  primary_node_record.node_name,
+						  primary_node_id,
+						  config_file_options.standby_follow_timeout);
+
+	}
+
+	log_detail("%s", follow_output.data);
+
 	create_event_notification_extended(
 		primary_conn,
 		&config_file_options,
@@ -2324,20 +2431,11 @@ do_standby_follow(void)

 	PQfinish(primary_conn);

-	if (success == false)
-	{
-		log_notice(_("STANDBY FOLLOW failed"));
-		log_detail("%s", follow_output.data);
-
-		termPQExpBuffer(&follow_output);
-		exit(follow_error_code);
-	}
-
-	log_notice(_("STANDBY FOLLOW successful"));
-	log_detail("%s", follow_output.data);
-
 	termPQExpBuffer(&follow_output);

+	if (success == false)
+		exit(ERR_FOLLOW_FAIL);
+
 	return;
 }

@@ -3335,8 +3433,6 @@ do_standby_switchover(void)
 				}
 			}

-
-
 			/*
 			 * check there are sufficient free walsenders - obviously there's potential
 			 * for a later race condition if some walsenders come into use before the
@@ -3760,7 +3856,6 @@ do_standby_switchover(void)
 	 * If --siblings-follow specified, attempt to make them follow the new
 	 * primary
 	 */
-
 	if (runtime_options.siblings_follow == true && sibling_nodes.node_count > 0)
 	{
 		int			failed_follow_count = 0;
@@ -3787,8 +3882,17 @@ do_standby_switchover(void)
 			initPQExpBuffer(&remote_command_str);
 			make_remote_repmgr_path(&remote_command_str, &sibling_node_record);

-			appendPQExpBuffer(&remote_command_str,
-							  "standby follow 2>/dev/null && echo \"1\" || echo \"0\"");
+			if (sibling_node_record.type == WITNESS)
+			{
+				appendPQExpBuffer(&remote_command_str,
+								  "witness register -d \\'%s\\' --force 2>/dev/null && echo \"1\" || echo \"0\"",
+								  local_node_record.conninfo);
+			}
+			else
+			{
+				appendPQExpBuffer(&remote_command_str,
+								  "standby follow 2>/dev/null && echo \"1\" || echo \"0\"");
+			}
 			get_conninfo_value(cell->node_info->conninfo, "host", host);
 			log_debug("executing:\n  %s", remote_command_str.data);

@@ -3803,8 +3907,16 @@ do_standby_switchover(void)

 			if (success == false || command_output.data[0] == '0')
 			{
-				log_warning(_("STANDBY FOLLOW failed on node \"%s\""),
-							cell->node_info->node_name);
+				if (sibling_node_record.type == WITNESS)
+				{
+					log_warning(_("WITNESS REGISTER failed on node \"%s\""),
+								cell->node_info->node_name);
+				}
+				else
+				{
+					log_warning(_("STANDBY FOLLOW failed on node \"%s\""),
+								cell->node_info->node_name);
+				}
 				failed_follow_count++;
 			}

@@ -3909,6 +4021,8 @@ check_source_server()
 	PGconn	   *privileged_conn = NULL;

 	char		cluster_size[MAXLEN];
+	char	   *connstr = NULL;
+
 	t_node_info node_record = T_NODE_INFO_INITIALIZER;
 	RecordStatus record_status = RECORD_NOT_FOUND;
 	ExtensionStatus extension_status = REPMGR_UNKNOWN;
@@ -3917,8 +4031,11 @@ check_source_server()
 	log_verbose(LOG_DEBUG, "check_source_server()");
 	log_info(_("connecting to source node"));

-	source_conn = establish_db_connection_by_params(&source_conninfo, false);
+	connstr = param_list_to_string(&source_conninfo);
+	log_detail(_("connection string is: %s"), connstr);
+	pfree(connstr);

+	source_conn = establish_db_connection_by_params(&source_conninfo, false);
 	/*
 	 * Unless in barman mode, exit with an error;
 	 * establish_db_connection_by_params() will have already logged an error
@@ -4073,13 +4190,25 @@ check_source_server()
 		if (record_status == RECORD_FOUND)
 		{
 			t_conninfo_param_list upstream_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
+			char	   *upstream_conninfo_user;

 			initialize_conninfo_params(&upstream_conninfo, false);
 			parse_conninfo_string(node_record.conninfo, &upstream_conninfo, NULL, false);

 			strncpy(recovery_conninfo_str, node_record.conninfo, MAXLEN);
 			strncpy(upstream_repluser, node_record.repluser, NAMEDATALEN);
-			strncpy(upstream_user, param_get(&upstream_conninfo, "user"), NAMEDATALEN);
+
+			upstream_conninfo_user = param_get(&upstream_conninfo, "user");
+			if (upstream_conninfo_user != NULL)
+			{
+				strncpy(upstream_user, upstream_conninfo_user, NAMEDATALEN);
+			}
+			else
+			{
+				get_conninfo_default_value("user", upstream_user, NAMEDATALEN);
+			}
+
+			log_verbose(LOG_DEBUG, "upstream_user is \"%s\"", upstream_user);

 			upstream_conninfo_found = true;
 		}
@@ -4632,7 +4761,7 @@ initialise_direct_clone(t_node_info *node_record)
 		}
 		else
 		{
-			TablespaceListCell *cell = false;
+			TablespaceListCell *cell;
 			KeyValueList not_found = {NULL, NULL};
 			int			total = 0,
 						matched = 0;
@@ -5690,7 +5819,7 @@ get_barman_property(char *dst, char *name, char *local_repmgr_directory)


 static void
-copy_configuration_files(void)
+copy_configuration_files(bool delete_after_copy)
 {
 	int			i,
 				r;
@@ -5735,13 +5864,35 @@ copy_configuration_files(void)
 		r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
 							  file->filepath, dest_path.data, false, source_server_version_num);

-		termPQExpBuffer(&dest_path);
+		/*
+		 * TODO: collate errors into list
+		 */

 		if (WEXITSTATUS(r))
 		{
 			log_error(_("standby clone: unable to copy config file \"%s\""),
 					  file->filename);
+			log_hint(_("see preceding messages for details"));
+
+			if (runtime_options.force == false)
+				exit(ERR_BAD_RSYNC);
 		}
+
+		/*
+		 * This is to check we can actually copy the files before running the
+		 * main clone operation
+		 */
+		if (delete_after_copy == true)
+		{
+			/* this is very unlikely to happen, but log in case it does */
+			if (unlink(dest_path.data) < 0 && errno != ENOENT)
+			{
+				log_warning(_("unable to delete %s"), dest_path.data);
+				log_detail("%s", strerror(errno));
+			}
+		}
+
+		termPQExpBuffer(&dest_path);
 	}

 	return;
--- a/repmgr-action-witness.c
+++ b/repmgr-action-witness.c
@@ -137,7 +137,7 @@ do_witness_register(void)
 	}

 	/*
-	 * TODO:sanity check witness node is not part of main cluster; we could
+	 * TODO: sanity check witness node is not part of main cluster; we could
 	 * add a random application_name to the respective connections,
 	 * and do a simple check of pg_stat_activity
 	 */
@@ -193,8 +193,26 @@ do_witness_register(void)
 		}
 	}

+	/*
+	 * Check that an active node with the same node_name doesn't exist already
+	 */

-	// XXX check other node with same name does not exist
+	record_status = get_node_record_by_name(primary_conn,
+											config_file_options.node_name,
+											&node_record);
+
+
+	if (record_status == RECORD_FOUND)
+	{
+		if (node_record.active == true && node_record.node_id != config_file_options.node_id)
+		{
+			log_error(_("node %i exists already with node_name \"%s\""),
+					  node_record.node_id,
+					  config_file_options.node_name);
+			PQfinish(primary_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+	}

 	/*
 	 * if repmgr.nodes contains entries, delete if -F/--force provided,
@@ -225,6 +243,7 @@ do_witness_register(void)
 		PQfinish(witness_conn);
 		exit(SUCCESS);
 	}
+
 	/* create record on primary */

 	/*
--- a/repmgr-client.c
+++ b/repmgr-client.c
@@ -634,7 +634,7 @@ main(int argc, char **argv)
 	 * If -d/--dbname appears to be a conninfo string, validate by attempting
 	 * to parse it (and if successful, store the parsed parameters)
 	 */
-	if (runtime_options.dbname)
+	if (runtime_options.dbname[0])
 	{
 		if (strncmp(runtime_options.dbname, "postgresql://", 13) == 0 ||
 			strncmp(runtime_options.dbname, "postgres://", 11) == 0 ||
@@ -1010,7 +1010,6 @@ main(int argc, char **argv)
 		runtime_options.output_mode = OM_OPTFORMAT;
 	}

-
 	/*
 	 * Check for configuration file items which can be overriden by runtime
 	 * options
@@ -1068,6 +1067,17 @@ main(int argc, char **argv)
 	if (runtime_options.terse)
 		logger_set_terse();

+	/*
+	 * If --dry-run specified, ensure log_level is at least LOG_INFO, regardless
+	 * of what's in the configuration file or -L/--log-level parameter, otherwise
+	 * some output might not be displayed.
+	 */
+	if (runtime_options.dry_run == true)
+	{
+		logger_set_min_level(LOG_INFO);
+	}
+
+
 	/*
 	 * Node configuration information is not needed for all actions, with
 	 * STANDBY CLONE being the main exception.
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -98,7 +98,7 @@
 #log_facility=STDERR		 # Logging facility: possible values are STDERR, or for
 				 # syslog integration, one of LOCAL0, LOCAL1, ..., LOCAL7, USER

-#log_file=''			 # stderr can be redirected to an arbitrary file:
+#log_file=''			 # stderr can be redirected to an arbitrary file
 #log_status_interval=300	 # interval (in seconds) for repmgrd to log a status message


@@ -213,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 # These settings apply when instructing a standby to follow the new primary
 # ("repmgr standby follow").

-#primary_follow_timeout=60		# The length of time (in seconds) to wait
+#primary_follow_timeout=60		# The max length of time (in seconds) to wait
 					# for the new primary to become available
+#standby_follow_timeout=30		# The max length of time (in seconds) to wait
+					# for the standby to connect to the primary


 #------------------------------------------------------------------------------
@@ -251,11 +253,11 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 					# primary (or other upstream node)
 #reconnect_interval=10			# Interval between attempts  to reconnect to an unreachable
 					# primary (or other upstream node)
-#promote_command=			# command to execute when promoting a new primary; use something like:
+#promote_command=			# command repmgrd executes when promoting a new primary; use something like:
 					#
 					#     repmgr standby promote -f /etc/repmgr.conf
 					#
-#follow_command=			# command to execute when instructing a standby to follow a new primary;
+#follow_command=			# command repmgrd executes when instructing a standby to follow a new primary;
 					# use something like:
 					#
 					#     repmgr standby follow -f /etc/repmgr.conf -W --upstream-node-id=%n
@@ -308,11 +310,11 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 #service_stop_command = ''
 #service_restart_command = ''
 #service_reload_command = ''
-#service_promote_command = ''		# Note: this overrides any value contained in the setting
-					# "promote_command". This is intended for systems which
-					# provide a package-level promote command, such as Debian's
-					# "pg_ctlcluster"
-
+#service_promote_command = ''		# This parameter is intended for systems which provide a
+					# package-level promote command, such as Debian's
+					# "pg_ctlcluster". *IMPORTANT*: it is *not* a substitute
+					# for "promote_command"; do not use "repmgr standby promote"
+					# (or a script which executes "repmgr standby promote") here.

 #------------------------------------------------------------------------------
 # Status check thresholds
--- a/repmgr.h
+++ b/repmgr.h
@@ -70,6 +70,7 @@
 #define DEFAULT_ASYNC_QUERY_TIMEOUT          60  /* seconds */
 #define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60  /* seconds */
 #define DEFAULT_PRIMARY_FOLLOW_TIMEOUT       60  /* seconds */
+#define DEFAULT_STANDBY_FOLLOW_TIMEOUT       30  /* seconds */
 #define DEFAULT_BDR_RECOVERY_TIMEOUT         30  /* seconds */
 #define DEFAULT_ARCHIVE_READY_WARNING        16  /* WAL files */
 #define DEFAULT_ARCHIVE_READY_CRITICAL       128 /* WAL files */
--- a/repmgr_version.h.in
+++ b/repmgr_version.h.in
@@ -1,3 +1,3 @@
 #define REPMGR_VERSION_DATE ""
-#define REPMGR_VERSION "4.0.5"
+#define REPMGR_VERSION "4.0.6"

--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -58,7 +58,7 @@ static FailoverState failover_state = FAILOVER_STATE_UNKNOWN;

 static int	primary_node_id = UNKNOWN_NODE_ID;
 static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
-static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER;
+static NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;


 static ElectionResult do_election(void);
@@ -816,6 +816,29 @@ monitor_streaming_standby(void)
 		{
 			int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

+			if (config_file_options.degraded_monitoring_timeout > 0
+				&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
+			{
+				initPQExpBuffer(&event_details);
+
+				appendPQExpBuffer(&event_details,
+								  _("degraded monitoring timeout (%i seconds) exceeded, terminating"),
+								  degraded_monitoring_elapsed);
+
+				log_notice("%s", event_details.data);
+
+				create_event_notification(NULL,
+										  &config_file_options,
+										  config_file_options.node_id,
+										  "repmgrd_shutdown",
+										  true,
+										  event_details.data);
+
+				termPQExpBuffer(&event_details);
+				terminate(ERR_MONITORING_TIMEOUT);
+			}
+
+
 			log_debug("monitoring node %i in degraded state for %i seconds",
 					  upstream_node_info.node_id,
 					  degraded_monitoring_elapsed);
@@ -918,8 +941,8 @@ monitor_streaming_standby(void)
 						get_active_sibling_node_records(local_conn,
 														local_node_info.node_id,
 														former_upstream_node_id,
-														&standby_nodes);
-						notify_followers(&standby_nodes, local_node_info.node_id);
+														&sibling_nodes);
+						notify_followers(&sibling_nodes, local_node_info.node_id);

 						/* this will restart monitoring in primary mode */
 						monitoring_state = MS_NORMAL;
@@ -958,12 +981,12 @@ monitor_streaming_standby(void)
 					get_active_sibling_node_records(local_conn,
 													local_node_info.node_id,
 													local_node_info.upstream_node_id,
-													&standby_nodes);
+													&sibling_nodes);

-					if (standby_nodes.node_count > 0)
+					if (sibling_nodes.node_count > 0)
 					{
-						log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
-						for (cell = standby_nodes.head; cell; cell = cell->next)
+						log_debug("scanning %i node records to detect new primary...", sibling_nodes.node_count);
+						for (cell = sibling_nodes.head; cell; cell = cell->next)
 						{
 							/* skip local node check, we did that above */
 							if (cell->node_info->node_id == local_node_info.node_id)
@@ -993,7 +1016,7 @@ monitor_streaming_standby(void)
 							follow_new_primary(follow_node_id);
 						}
 					}
-					clear_node_info_list(&standby_nodes);
+					clear_node_info_list(&sibling_nodes);
 				}
 			}
 		}
@@ -1395,12 +1418,12 @@ monitor_streaming_witness(void)
 				get_active_sibling_node_records(local_conn,
 												local_node_info.node_id,
 												local_node_info.upstream_node_id,
-												&standby_nodes);
+												&sibling_nodes);

-				if (standby_nodes.node_count > 0)
+				if (sibling_nodes.node_count > 0)
 				{
-					log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
-					for (cell = standby_nodes.head; cell; cell = cell->next)
+					log_debug("scanning %i node records to detect new primary...", sibling_nodes.node_count);
+					for (cell = sibling_nodes.head; cell; cell = cell->next)
 					{
 						/* skip local node check, we did that above */
 						if (cell->node_info->node_id == local_node_info.node_id)
@@ -1430,7 +1453,7 @@ monitor_streaming_witness(void)
 						witness_follow_new_primary(follow_node_id);
 					}
 				}
-				clear_node_info_list(&standby_nodes);
+				clear_node_info_list(&sibling_nodes);
 			}
 		}
 loop:
@@ -1531,7 +1554,7 @@ do_primary_failover(void)
 	}
 	else if (election_result == ELECTION_WON)
 	{
-		if (standby_nodes.node_count > 0)
+		if (sibling_nodes.node_count > 0)
 		{
 			log_notice("this node is the winner, will now promote itself and inform other nodes");
 		}
@@ -1576,7 +1599,7 @@ do_primary_failover(void)
 				get_active_sibling_node_records(local_conn,
 												local_node_info.node_id,
 												upstream_node_info.node_id,
-												&standby_nodes);
+												&sibling_nodes);

 			}
 			else if (config_file_options.failover == FAILOVER_MANUAL)
@@ -1638,10 +1661,10 @@ do_primary_failover(void)
 	{
 		case FAILOVER_STATE_PROMOTED:
 			/* notify former siblings that they should now follow this node */
-			notify_followers(&standby_nodes, local_node_info.node_id);
+			notify_followers(&sibling_nodes, local_node_info.node_id);

 			/* we no longer care about our former siblings */
-			clear_node_info_list(&standby_nodes);
+			clear_node_info_list(&sibling_nodes);

 			/* pass control back down to start_monitoring() */
 			log_info(_("switching to primary monitoring mode"));
@@ -1655,10 +1678,10 @@ do_primary_failover(void)
 			 * notify siblings that they should resume following the original
 			 * primary
 			 */
-			notify_followers(&standby_nodes, upstream_node_info.node_id);
+			notify_followers(&sibling_nodes, upstream_node_info.node_id);

 			/* we no longer care about our former siblings */
-			clear_node_info_list(&standby_nodes);
+			clear_node_info_list(&sibling_nodes);

 			/* pass control back down to start_monitoring() */
 			log_info(_("resuming standby monitoring mode"));
@@ -2543,6 +2566,7 @@ do_election(void)

 	/* we're visible */
 	int			visible_nodes = 1;
+	int			total_nodes = 0;

 	NodeInfoListCell *cell = NULL;

@@ -2593,14 +2617,16 @@ do_election(void)
 	get_active_sibling_node_records(local_conn,
 									local_node_info.node_id,
 									upstream_node_info.node_id,
-									&standby_nodes);
+									&sibling_nodes);
+
+	total_nodes = sibling_nodes.node_count + 1;

 	log_debug("do_election(): primary location is %s", upstream_node_info.location);

 	local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;

 	/* fast path if no other standbys (or witness) exists - normally win by default */
-	if (standby_nodes.node_count == 0)
+	if (sibling_nodes.node_count == 0)
 	{
 		if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
 		{
@@ -2628,7 +2654,7 @@ do_election(void)
 	}
 	else
 	{
-		/* standby nodes found - check if we're in the primary location befor checking theirs */
+		/* standby nodes found - check if we're in the primary location before checking theirs */
 		if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
 		{
 			primary_location_seen = true;
@@ -2643,7 +2669,7 @@ do_election(void)
 	/* pointer to "winning" node, initially self */
 	candidate_node = &local_node_info;

-	for (cell = standby_nodes.head; cell; cell = cell->next)
+	for (cell = sibling_nodes.head; cell; cell = cell->next)
 	{
 		/* assume the worst case */
 		cell->node_info->node_status = NODE_STATUS_UNKNOWN;
@@ -2698,7 +2724,7 @@ do_election(void)
 			candidate_node = cell->node_info;
 		}
 		/* LSN is same - tiebreak on priority, then node_id */
-		else if(cell->node_info->last_wal_receive_lsn == candidate_node->last_wal_receive_lsn)
+		else if (cell->node_info->last_wal_receive_lsn == candidate_node->last_wal_receive_lsn)
 		{
 			log_verbose(LOG_DEBUG, "node %i has same LSN as current candidate %i",
 						cell->node_info->node_id,
@@ -2750,9 +2776,9 @@ do_election(void)

 	log_debug("visible nodes: %i; total nodes: %i",
 			  visible_nodes,
-			  standby_nodes.node_count);
+			  total_nodes);

-	if (visible_nodes <= (standby_nodes.node_count / 2.0))
+	if (visible_nodes <= (total_nodes / 2.0))
 	{
 		log_notice(_("unable to reach a qualified majority of nodes"));
 		log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
Author	SHA1	Message	Date
Ian Barwick	ee1a6f9d0f	doc: add a link to the current documentation from the contents page	2019-04-03 10:48:36 +09:00
Ian Barwick	49eb408873	doc: fix typo Per user report on mailing list.	2018-10-23 09:01:00 +09:00
Ian Barwick	fba3d29514	doc: clarify BDR repmgrd configuration Link directly to section about configuring the "event_notification_command".	2018-07-23 13:23:28 +09:00
Ian Barwick	77200e5030	doc: remove duplicate item in list of event notifications	2018-07-18 16:11:18 +09:00
Ian Barwick	4589b8d439	doc: update documentation of "promote_command" and "service_promote_command" See commit `63242e2277`	2018-07-16 14:55:07 +09:00
Ian Barwick	048f7c3310	doc: add extra emphasis about not running repmgrd during switchover One day this will no longer be an issue, until then let's hope the fine documentation is read.	2018-07-11 09:55:37 +09:00
Ian Barwick	1e5f63792f	node check: implement CSV output This is advertised in the --help output and placeholder code was in place, but it wasn't actually implemented.	2018-06-22 15:46:50 +09:00
Ian Barwick	d26989bd12	node status: improve output and documentation In the default text output mode, list inactive slots. In CSV output mode, list inactive slots as additional information; add output line with number of missing slots and a list thereof. Also document --csv output mode.	2018-06-22 15:46:44 +09:00
Ian Barwick	f999c810a7	node check: clarify status information for witness server Previously the output gave the impression the server was a primary, which is technically the case, but it's not the actual cluster primary. Also output an error if the node is in recovery, which is unlikely but you never know.	2018-06-22 15:46:40 +09:00
Ian Barwick	81077d4bc2	standby switchover: fix behaviour if witness node is a sibling The witness node is not a streaming replication standby, so executing "repmgr standby follow" will fail. Instead, execute "repmgr witness register --force" to update the witness node record on the primary and its local copy of all node records. Addresses GitHub #453.	2018-06-21 17:16:18 +09:00
Ian Barwick	a549941d4f	repmgr: don't count witness node as a standby when running "node status" Addresses GitHub #451.	2018-06-21 14:27:47 +09:00
Ian Barwick	2f6c159f9a	"repmgr node ...": update comments and formatting	2018-06-21 14:27:42 +09:00
Ian Barwick	2eca1a0311	repmgr: don't count witness node as a standby when running "node check" Addresses GitHub #451.	2018-06-21 11:31:09 +09:00
Ian Barwick	f6377084ec	doc: remove info about old RPM package repository	2018-06-15 11:14:10 +09:00
Ian Barwick	d85c02b92b	doc: finalize release notes	2018-06-15 10:52:51 +09:00
Ian Barwick	d9ba41fc35	doc: emphasize that repmgrd should not be running during a switchover	2018-06-11 15:31:22 +09:00
Ian Barwick	afdaf9be66	_create_event(): log event and node ID for debugging	2018-06-11 15:20:01 +09:00
Ian Barwick	8067924c3e	repmgr: consolidate code in "standby switchover" Commit `41274f5525` left us with two if statements in sequence with exactly the same condition, so consolidate both into a single statement. Clarify code comments while we're at it.	2018-06-11 15:14:40 +09:00
Ian Barwick	e94a6eefde	repmgr: cluster check commands - non-zero exit code if node(s) unavailable Return ERR_CLUSTER_CHECK if one or more nodes were not reachable. Implements GitHub #447.	2018-06-11 12:41:19 +09:00
Ian Barwick	69d7b6f7eb	doc: 4.0.6 release notes	2018-06-07 17:14:50 +09:00
Ian Barwick	8ec3b2a536	Bump version 4.0.6	2018-06-07 15:08:48 +09:00
Ian Barwick	68a9745e7e	standby follow: check node has connected to new primary After restarting the standby, poll pg_stat_replication on the upstream until the standby connects, and exit with an error if it doesn't by the timeout defined in "standby_follow_timeout". Implements GitHub #444.	2018-06-07 14:41:05 +09:00
Ian Barwick	20ce53e2d2	doc: update release notes	2018-06-07 12:48:54 +09:00
Ian Barwick	638a119c85	standby follow: add hint about using "node rejoin" If "repmgr standby follow" is executed on a node which isn't running, point out "repmgr node rejoin" should probably be used instead.	2018-06-07 11:02:32 +09:00
Ian Barwick	053863cdd0	doc: fix typos	2018-06-07 10:40:30 +09:00
Ian Barwick	009cc0480c	witness_register: check for existing node with same name	2018-06-07 10:04:26 +09:00
Ian Barwick	63bdc19132	repmgrd: ensure local node is counted as quorum member Rename "standby_nodes" to "sibling_nodes" to make it clearer in the code what total is actually provided by the struct. Addresses GitHub #439.	2018-06-01 17:19:40 +09:00
Ian Barwick	fbd389d0b3	doc: fix typo	2018-06-01 13:07:19 +09:00
Ian Barwick	4aef4ea11e	standby clone: improve external configuration file copying If --copy-external-config-files was provided, check that we can copy the files before cloning the standby, and abort if an error is encountered. This will give the user the opportunity to fix any issues before running the entire (and potentially lengthy) clone. Previously errors were logged but no action taken, and the final message indicated the clone operation was successful. Addresses GitHub #443.	2018-06-01 13:00:07 +09:00
Ian Barwick	0ffaff75df	repmgrd: ensure degraded monitoring timeout works on standby Parameter "degraded_monitoring_timeout" was not being acted on when monitoring a streaming replication standby. Addresses GitHub #439.	2018-05-31 17:53:31 +09:00
Ian Barwick	c54bb73fb2	If --dry-run specified, ensure minimum log level is INFO When executed with --dry-run, repmgr outputs detail about what would happen using log level INFO. If the log_level is configured to NOTICE or higher, it's possible some or all of the --dry-run output might not be displayed. Addresses GitHub #441.	2018-05-31 15:30:26 +09:00
Ian Barwick	28ea2e48de	node rejoin: avoid outputting empty DETAIL message	2018-05-31 15:10:51 +09:00
Ian Barwick	41274f5525	node rejoin: improve handling of --config-file parameter Fixes bug when parsing --config-file values (GitHub #442). Also improves handling in --dry-run mode, as some checks for the provided files were being skipped if --dry-run supplied, even though they are intended to work with --dry-run.	2018-05-31 11:44:31 +09:00
Ian Barwick	edceb32ccb	standby clone: --recovery-conf-only expects the standby to be registered Note this in the documentation, and add a HINT about registering it if the standby record is not available. Related to GitHub #438.	2018-05-29 11:54:38 +09:00
Ian Barwick	3dba8336e9	standby clone: don't assume existence of "user" in upstream conninfo Usually a separate user (typically "repmgr") is set up specifically to manage the repmgr metadata, however there's no compelling requirement to do this, and it's possible the database owner (usually: "postgres") will be used, in which case it's possible the username will be left out of the conninfo string. Addresses GitHub #437.	2018-05-24 15:51:41 +09:00
Ian Barwick	97d0cee259	"config_file" is MAXPGPATH, not MAXLEN The two values are the same anyway, so change is more for consistency.	2018-05-22 17:19:55 +09:00
Martín Marqués	2dfe1d18e9	Fix typo in a code comment	2018-05-19 12:29:04 -03:00
Ian Barwick	55bb93bd3f	"standby clone": log actual connection string used to connect to upstream Useful for diagnostic purposes.	2018-05-10 11:58:48 +09:00
Ian Barwick	4c49954cd4	Fix check for -d/--dbname parameter Not a bug per-se, just meant some unnecessary processing was done on an empty string. Per note from petere.	2018-05-10 11:57:02 +09:00
Ian Barwick	a880b6ce16	Include "arpa/inet.h" in dbutils.c Needed for htonl() on FreeBSD.	2018-05-10 11:25:52 +09:00
Ian Barwick	c51a2283dd	Minor documentation fixes	2018-05-10 10:27:25 +09:00
Ian Barwick	717828e73e	doc: update 2ndQuadrant repository information Canonical link for each repository should not include any directories.	2018-05-03 17:21:29 +09:00
Ian Barwick	c7477d7a9c	doc: update repository information	2018-05-03 15:22:33 +09:00
Ian Barwick	1db8d3904f	doc: update package installation information Document the new public 2ndQuadrant apt repository	2018-05-03 15:07:26 +09:00
Ian Barwick	362f478d55	doc: update package installation information Document the new, public 2ndQuadrant RPM repository.	2018-05-03 14:12:29 +09:00