From 58efb0f15857e121b01fe099e4d110b54237af0f Mon Sep 17 00:00:00 2001
From: Ian Barwick <ian@2ndquadrant.com>
Date: Mon, 21 Jan 2019 14:14:14 +0900
Subject: [PATCH] repmgrd: on a cascaded standby, don't fail over if
 "failover=manual"

Addresses GitHub #531.
---
 HISTORY                                |  2 ++
 doc/appendix-release-notes.sgml        |  7 +++++
 doc/repmgrd-cascading-replication.sgml |  4 ++-
 repmgrd-physical.c                     | 43 +++++++++++++++++++++-----
 4 files changed, 47 insertions(+), 9 deletions(-)
diff --git a/HISTORY b/HISTORY
index 4a7acce9..5a9ac271 100644
--- a/HISTORY
+++ b/HISTORY
@@ -8,6 +8,8 @@
         repmgr: "standby switchover": improve handling of connection URIs when
           executing "node rejoin" on the demotion candidate; GitHub #525 (Ian)
         repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
+        repmgrd: on a cascaded standby, don't fail over if "failover=manual";
+          GitHub #531 (Ian)
 
 4.2.1   2018-??-??
         repmgr: add sanity check for correct extension version (Ian)
diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml
index aec8b864..b5cd7e17 100644
--- a/doc/appendix-release-notes.sgml
+++ b/doc/appendix-release-notes.sgml
@@ -83,6 +83,13 @@
             </para>
           </listitem>
 
+          <listitem>
+            <para>
+              <application>repmgrd</application>: on a cascaded standby, don't fail over if
+              <literal>failover=manual</literal>. GitHub #531.
+            </para>
+          </listitem>
+
         </itemizedlist>
       </para>
     </sect2>
diff --git a/doc/repmgrd-cascading-replication.sgml b/doc/repmgrd-cascading-replication.sgml
index dfa8809b..7d4dfbc3 100644
--- a/doc/repmgrd-cascading-replication.sgml
+++ b/doc/repmgrd-cascading-replication.sgml
@@ -17,6 +17,8 @@
   is promoted, a standby connected to another standby will not be affected
   and continue working as normal (even if the upstream standby it's connected
   to becomes the primary node). If however the node's direct upstream fails,
-  the "cascaded standby" will attempt to reconnect to that node's parent.
+  the &quot;cascaded standby&quot; will attempt to reconnect to that node's parent
+  (unless <varname>failover</varname> is set to <literal>manual</literal> in
+  <filename>repmgr.conf</filename>).
  </para>
 </chapter>
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index 46182f6c..648787bc 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -950,7 +950,14 @@ monitor_streaming_standby(void)
 						}
 						else if (upstream_node_info.type == STANDBY)
 						{
+
 							failover_done = do_upstream_standby_failover();
+
+							if (failover_done == false)
+							{
+								monitoring_state = MS_DEGRADED;
+								INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+							}
 						}
 
 						/*
@@ -1225,7 +1232,15 @@ loop:
 						log_detail(_("waiting for upstream or another primary to reappear"));
 					}
 				}
-				else if (config_file_options.monitoring_history == true)
+
+				/*
+				 * Write a monitoring history update, if enabled.
+				 *
+				 * Note: with cascaded replication, it's possible we're still able to write
+				 * monitoring history to the primary even if the upstream is no longer reachable.
+				 */
+
+				if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
 				{
 					if (INSTR_TIME_IS_ZERO(last_monitoring_update))
 					{
@@ -2194,15 +2209,18 @@ update_monitoring_history(void)
 /*
  * do_upstream_standby_failover()
  *
- * Attach cascaded standby to primary
+ * Attach cascaded standby to another node, currently the primary.
  *
- * Currently we will try to attach to the cluster primary, as "repmgr
- * standby follow" doesn't support attaching to another node.
+ * Note that in contrast to a primary failover, where one of the downstream
+ * standby nodes will become a primary, a cascaded standby failover (where the
+ * upstream standby has gone away) is "just" a case of attaching the standby to
+ * another node.
  *
- * If this becomes supported, it might be worth providing a selection
- * of reconnection strategies as different behaviour might be desirable
- * in different situations;
- * or maybe the option not to reconnect might be required?
+ * Currently we will try to attach the node to the cluster primary.
+ *
+ * TODO: As of repmgr 4.3, "repmgr standby follow" supports attaching a standby to another
+ * standby node. We need to provide a selection of reconnection strategies as different
+ * behaviour might be desirable in different situations.
  */
 
 static bool
@@ -2217,6 +2235,15 @@ do_upstream_standby_failover(void)
 
 	close_connection(&upstream_conn);
 
+	/*
+	 * Don't attempt to reattach this node if "failover" is set to "manual".
+	 */
+	if (config_file_options.failover == FAILOVER_MANUAL)
+	{
+		log_notice(_("this node is not configured for automatic failover"));
+		return false;
+	}
+
 	if (get_primary_node_record(local_conn, &primary_node_info) == false)
 	{
 		log_error(_("unable to retrieve primary node record"));