From 9439467958f15136d7fb0673dcd9f0c70061fc83 Mon Sep 17 00:00:00 2001
From: Ian Barwick <ian@2ndquadrant.com>
Date: Tue, 25 Sep 2018 11:53:36 +0900
Subject: [PATCH] doc: add troubleshooting section to switchover documentation

---
 doc/appendix-release-notes.sgml    |  7 +++
 doc/repmgr-standby-switchover.sgml |  9 +++-
 doc/switchover.sgml                | 69 ++++++++++++++++++++++++++++++
 repmgr-action-standby.c            |  4 +-
 4 files changed, 85 insertions(+), 4 deletions(-)
diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml
index 6c055e78..35c252c3 100644
--- a/doc/appendix-release-notes.sgml
+++ b/doc/appendix-release-notes.sgml
@@ -15,6 +15,13 @@
     See also: <xref linkend="upgrading-repmgr">
   </para>
 
+  <sect1 id="release-4.2">
+    <title>Release 4.2</title>
+    <para><emphasis>???, 2018</emphasis></para>
+    <para>
+    </para>
+  </sect1>
+
   <sect1 id="release-4.1.1">
     <title>Release 4.1.1</title>
     <para><emphasis>Wed September 5, 2018</emphasis></para>
diff --git a/doc/repmgr-standby-switchover.sgml b/doc/repmgr-standby-switchover.sgml
index 89140bb4..cbd5d7a1 100644
--- a/doc/repmgr-standby-switchover.sgml
+++ b/doc/repmgr-standby-switchover.sgml
@@ -156,9 +156,14 @@
            <literal>shutdown_check_timeout</literal>: maximum number of seconds to wait for the
            demotion candidate (current primary) to shut down, before aborting the switchover.
          </simpara>
+         <simpara>
+           Note that this parameter is set on the node where <command>repmgr standby switchover</command>
+           is executed (promotion candidate); setting it on the demotion candidate (former primary) will
+           have no effect.
+         </simpara>
          <note>
            <para>
-             In versions prior to &repmgr; 4.2, <command>repmgr standby switchover</command> would
+             In versions prior to <link linkend="release-4.2">&repmgr; 4.2</link>, <command>repmgr standby switchover</command> would
              use the values defined in <literal>reconnect_attempts</literal> and <literal>reconnect_interval</literal>
              to determine the timeout for demotion candidate shutdown.
            </para>
@@ -168,7 +173,7 @@
        <listitem>
          <simpara>
            <literal>standby_reconnect_timeout</literal>:
-           maximum number of seconds to attempt to wait for the demoted primary
+           maximum number of seconds to wait for the demotion candidate (former primary)
            to reconnect to the promoted primary (default: 60 seconds)
          </simpara>
        </listitem>
diff --git a/doc/switchover.sgml b/doc/switchover.sgml
index 84444e69..e3999112 100644
--- a/doc/switchover.sgml
+++ b/doc/switchover.sgml
@@ -342,4 +342,73 @@
    We hope to remove some of these restrictions in future versions of &repmgr;.
   </para>
  </sect1>
+
+ <sect1 id="switchover-troubleshooting" xreflabel="Troubleshooting">
+   <indexterm>
+     <primary>switchover</primary>
+     <secondary>troubleshooting</secondary>
+   </indexterm>
+   <title>Troubleshooting switchover issues</title>
+
+   <para>
+     As <link linkend="performing-switchover">emphasised previously</link>, performing a switchover
+     is a non-trivial operation and there are a number of potential issues which can occur.
+     While &repmgr; attempts to perform sanity checks, there's no guaranteed way of determining the success of
+     a switchover without actually carrying it out.
+   </para>
+
+   <sect2 id="switchover-troubleshooting-primary-shutdown">
+     <title>Demotion candidate (old primary) does not shut down</title>
+     <para>
+       &repmgr; may abort a switchover with a message like:
+       <programlisting>
+ERROR: shutdown of the primary server could not be confirmed
+HINT: check the primary server status before performing any further actions</programlisting>
+     </para>
+     <para>
+       This means the shutdown of the old primary has taken longer than &repmgr; expected,
+       and it has given up waiting.
+     </para>
+     <para>
+       In this case, check the PostgreSQL log on the primary server to see what is going
+       on. It's entirely possible the shutdown process is just taking longer than the
+       timeout set by the configuration parameter <varname>shutdown_check_timeout</varname>
+       (default: 60 seconds), in which case you may need to adjust this parameter.
+     </para>
+     <note>
+       <para>
+         Note that <varname>shutdown_check_timeout</varname> is set on the node where
+         <command>repmgr standby switchover</command> is executed (promotion candidate); setting it on the
+         demotion candidate (former primary) will have no effect.
+       </para>
+     </note>
+     <para>
+       If the primary server has shut down cleanly, and no other node has been promoted,
+       it is safe to restart it, in which case the replication cluster will be restored
+       to its original configuration.
+     </para>
+   </sect2>
+
+   <sect2 id="switchover-troubleshooting-exclusive-backup">
+     <title>Switchover aborts with an &quot;exclusive backup&quot; error</title>
+     <para>
+       &repmgr; may abort a switchover with a message like:
+       <programlisting>
+ERROR: unable to perform a switchover while primary server is in exclusive backup mode
+HINT: stop backup before attempting the switchover</programlisting>
+     </para>
+     <para>
+       This means an exclusive backup is running on the current primary; interrupting this
+       will not only abort the backup, but potentially leave the primary with an ambiguous
+       backup state.
+     </para>
+     <para>
+       To proceed, either wait until the backup has finished, or cancel it with the command
+       <command>SELECT pg_stop_backup()</command>. For more details see the PostgreSQL
+       documentation section
+       <ulink url="https://www.postgresql.org/docs/current/static/continuous-archiving.html#BACKUP-LOWLEVEL-BASE-BACKUP-EXCLUSIVE">Making an exclusive low level backup</ulink>.
+     </para>
+   </sect2>
+ </sect1>
+
 </chapter>
diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c
index e6f91e75..167be896 100644
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -2934,11 +2934,11 @@ do_standby_switchover(void)
 	}
 
 	/*
-	 * Check that there's no exclusive backups running on the primary.
+	 * Check that there are no exclusive backups running on the primary.
 	 * We don't want to end up damaging the backup and also leaving the server in an
 	 * state where there's control data saying it's in backup mode but there's no
 	 * backup_label in PGDATA.
-	 * If the DBA wants to do the switchover anyway, he should first stop the
+	 * If the user wants to do the switchover anyway, they should first stop the
 	 * backup that's running.
 	 */
 	if (server_in_exclusive_backup_mode(remote_conn) != BACKUP_STATE_NO_BACKUP)