From ce229beff8e1485c0c93efe296f16b3c8e978a26 Mon Sep 17 00:00:00 2001
From: Ian Barwick <ian@2ndquadrant.com>
Date: Tue, 29 Sep 2020 14:13:03 +0900
Subject: [PATCH] repmgrd: add configuration option "always_promote"

In certain corner cases, it's possible repmgrd may end up monitoring
a standby which was a former primary, but the node record has not
yet been updated.

Previously repmgrd would abort the promotion with a cryptic message
about being unable to find a node record for node_id -1 (the
default value for an unknown node id).

This commit adds a new configuration option "always_promote", which
determines whether repmgrd should promote the node in this case.
The default is "false", to effectively maintain the existing behaviour.

Logging output has also been improved to make it clearer what has
happened when this situation occurs.
---
 configdata.c                   | 12 +++++-
 configfile.c                   | 10 +++++
 configfile.h                   |  1 +
 doc/appendix-release-notes.xml |  8 ++++
 doc/repmgrd-configuration.xml  | 33 +++++++++++++++
 repmgr.conf.sample             |  1 +
 repmgr.h                       |  3 +-
 repmgrd-physical.c             | 73 +++++++++++++++++++++++++++++-----
 8 files changed, 130 insertions(+), 11 deletions(-)
diff --git a/configdata.c b/configdata.c
index 4b482945..45e25a19 100644
--- a/configdata.c
+++ b/configdata.c
@@ -606,7 +606,17 @@ struct ConfigFileSetting config_file_settings[] =
 		"primary_visibility_consensus",
 		CONFIG_BOOL,
 		{ .boolptr = &config_file_options.primary_visibility_consensus },
-		{ .booldefault =  DEFAULT_PRIMARY_VISIBILITY_CONSENSUS },
+		{ .booldefault = DEFAULT_PRIMARY_VISIBILITY_CONSENSUS },
+		{},
+		{},
+		{}
+	},
+	/* always_promote */
+	{
+		"always_promote",
+		CONFIG_BOOL,
+		{ .boolptr = &config_file_options.always_promote },
+		{ .booldefault = DEFAULT_ALWAYS_PROMOTE },
 		{},
 		{},
 		{}
diff --git a/configfile.c b/configfile.c
index e6e06625..62439a57 100644
--- a/configfile.c
+++ b/configfile.c
@@ -867,6 +867,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
  * - monitoring_history
  * - primary_notification_timeout
  * - primary_visibility_consensus
+ * - always_promote
  * - promote_command
  * - reconnect_attempts
  * - reconnect_interval
@@ -1239,6 +1240,15 @@ reload_config(t_server_type server_type)
 								format_bool(config_file_options.primary_visibility_consensus));
 	}
 
+	/* always_promote */
+	if (config_file_options.always_promote != orig_config_file_options.always_promote)
+	{
+		item_list_append_format(&config_changes,
+								_("\"always_promote\" changed from \"%s\" to \"%s\""),
+								format_bool(orig_config_file_options.always_promote),
+								format_bool(config_file_options.always_promote));
+	}
+
 	/* failover_validation_command */
 	if (strncmp(config_file_options.failover_validation_command, orig_config_file_options.failover_validation_command, sizeof(config_file_options.failover_validation_command)) != 0)
 	{
diff --git a/configfile.h b/configfile.h
index b0791bb7..12440897 100644
--- a/configfile.h
+++ b/configfile.h
@@ -201,6 +201,7 @@ typedef struct
 	int			sibling_nodes_disconnect_timeout;
 	ConnectionCheckType connection_check_type;
 	bool		primary_visibility_consensus;
+	bool		always_promote;
 	char		failover_validation_command[MAXPGPATH];
 	int			election_rerun_interval;
 	int			child_nodes_check_interval;
diff --git a/doc/appendix-release-notes.xml b/doc/appendix-release-notes.xml
index 940bb7cd..91440806 100644
--- a/doc/appendix-release-notes.xml
+++ b/doc/appendix-release-notes.xml
@@ -127,6 +127,14 @@
               </itemizedlist>
             </para>
           </listitem>
+
+          <listitem>
+            <para>
+              Configuration option <varname>always_promote</varname> (default: <literal>false</literal>)
+              to control whether a node should be promoted if the &repmgr; metadata is not up-to-date
+              on that node.
+            </para>
+          </listitem>
         </itemizedlist>
       </para>
     </sect2>
diff --git a/doc/repmgrd-configuration.xml b/doc/repmgrd-configuration.xml
index 647a1f94..fae0847a 100644
--- a/doc/repmgrd-configuration.xml
+++ b/doc/repmgrd-configuration.xml
@@ -419,6 +419,33 @@
           </listitem>
         </varlistentry>
 
+        <varlistentry>
+          <term><option>always_promote</option></term>
+
+          <listitem>
+            <indexterm>
+              <primary>always_promote</primary>
+            </indexterm>
+
+            <para>
+              Default: <literal>false</literal>.
+            </para>
+            <para>
+              If <literal>true</literal>, promote the local node even if its
+              &repmgr; metadata is not up-to-date.
+            </para>
+            <para>
+              Normally &repmgr; expects its metadata (stored in the <varname>repmgr.nodes</varname>
+              table) to be up-to-date so &repmgrd; can take the correct action during a failover.
+              However, it's possible that updates made on the primary may not
+              have propagated to the standby (promotion candidate). In this case &repmgrd; will
+              default to not promoting the standby. This behaviour can be overridden by setting
+              <option>always_promote</option> to <literal>true</literal>.
+            </para>
+          </listitem>
+        </varlistentry>
+
+
         <varlistentry>
 
           <term><option>standby_disconnect_on_failover</option></term>
@@ -765,6 +792,12 @@ repmgrd_service_stop_command='sudo systemctl repmgr12 stop'
           </simpara>
         </listitem>
 
+        <listitem>
+          <simpara>
+            <varname>always_promote</varname>
+          </simpara>
+        </listitem>
+
         <listitem>
           <simpara>
             <varname>promote_command</varname>
diff --git a/repmgr.conf.sample b/repmgr.conf.sample
index cea903d1..b6c69302 100644
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -342,6 +342,7 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 					# WAL receivers
 #primary_visibility_consensus=false	# If "true", only continue with failover if no standbys have seen
 					# the primary node recently. *Must* be the same on all nodes.
+#always_promote=false			# Always promote a node, even if repmgr metadata is outdated
 #failover_validation_command=''		# Script to execute for an external mechanism to validate the failover
 					# decision made by repmgrd. One or both of the following parameter placeholders
 					# should be provided, which will be replaced by repmgrd with the appropriate
diff --git a/repmgr.h b/repmgr.h
index e54fc941..6e875cbe 100644
--- a/repmgr.h
+++ b/repmgr.h
@@ -126,7 +126,7 @@
 #define DEFAULT_WITNESS_SYNC_INTERVAL        15  /* seconds */
 #define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT    30  /* seconds */
 #define DEFAULT_LOCATION                     "default"
-#define DEFAULT_PRIORITY		             100
+#define DEFAULT_PRIORITY                     100
 #define DEFAULT_MONITORING_INTERVAL          2	 /* seconds */
 #define DEFAULT_RECONNECTION_ATTEMPTS        6	 /* seconds */
 #define DEFAULT_RECONNECTION_INTERVAL        10  /* seconds */
@@ -139,6 +139,7 @@
 #define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
 #define DEFAULT_CONNECTION_CHECK_TYPE        CHECK_PING
 #define DEFAULT_PRIMARY_VISIBILITY_CONSENSUS false
+#define DEFAULT_ALWAYS_PROMOTE               false
 #define DEFAULT_ELECTION_RERUN_INTERVAL      15  /* seconds */
 #define DEFAULT_CHILD_NODES_CHECK_INTERVAL   5   /* seconds */
 #define DEFAULT_CHILD_NODES_DISCONNECT_MIN_COUNT -1
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index 321ad2c5..72dff5db 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -764,9 +764,25 @@ check_primary_status(int degraded_monitoring_elapsed)
 		}
 		else
 		{
-			appendPQExpBuffer(&event_details,
-							  _("node has become a standby, monitoring connection to upstream node %i"),
-							  local_node_info.upstream_node_id);
+			if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
+			{
+				/*
+				 * If upstream_node_id is not set, it's possible that following a switchover
+				 * of some kind (possibly forced in some way), the updated node record has
+				 * not yet propagated to the local node. In this case however we can safely
+				 * assume we're monitoring the primary.
+				 */
+
+				appendPQExpBuffer(&event_details,
+								  _("node has become a standby, monitoring connection to primary node %i"),
+								  primary_node_id);
+			}
+			else
+			{
+				appendPQExpBuffer(&event_details,
+								  _("node has become a standby, monitoring connection to upstream node %i"),
+								  local_node_info.upstream_node_id);
+			}
 		}
 
 		create_event_notification(new_primary_conn,
@@ -3199,6 +3215,7 @@ update_monitoring_history(void)
 	if (primary_last_wal_location >= replication_info.last_wal_receive_lsn)
 	{
 		replication_lag_bytes = (long long unsigned int) (primary_last_wal_location - replication_info.last_wal_receive_lsn);
+		log_debug("replication lag in bytes is: %llu", replication_lag_bytes);
 	}
 	else
 	{
@@ -3482,6 +3499,14 @@ do_upstream_standby_failover(void)
 }
 
 
+/*
+ * This promotes the local node using the "promote_command" configuration
+ * parameter, which must be either "repmgr standby promote" or a script which
+ * at some point executes "repmgr standby promote".
+ *
+ * TODO: make "promote_command" optional and execute the same code used by
+ * "repmgr standby promote".
+ */
 static FailoverState
 promote_self(void)
 {
@@ -3504,13 +3529,43 @@ promote_self(void)
 		sleep(config_file_options.promote_delay);
 	}
 
-	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
-
-	if (record_status != RECORD_FOUND)
+	if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
 	{
-		log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
-				  local_node_info.upstream_node_id);
-		return FAILOVER_STATE_PROMOTION_FAILED;
+		/*
+		 * This is a corner-case situation where the repmgr metadata on the
+		 * promotion candidate is outdated and the local node's upstream_node_id
+		 * is not set. This is often an indication of potentially serious issues,
+		 * such as the local node being very far behind the primary, or not being
+		 * attached at all.
+		 *
+		 * In this case it may be desirable to restore the original primary.
+		 * This behaviour can be controlled by the "always_promote" configuration option.
+		 */
+		if (config_file_options.always_promote == false)
+		{
+			log_error(_("this node (ID: %i) does not have its upstream_node_id set, not promoting"),
+					  local_node_info.node_id);
+			log_detail(_("the local node's metadata has not been updated since it became a standby"));
+			log_hint(_("set \"always_promote\" to \"true\" to force promotion in this situation"));
+			return FAILOVER_STATE_PROMOTION_FAILED;
+		}
+		else
+		{
+			log_warning(_("this node (ID: %i) does not have its upstream_node_id set, promoting anyway"),
+						local_node_info.node_id);
+			log_detail(_("\"always_promote\" is set to \"true\" "));
+		}
+	}
+	else
+	{
+		record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
+
+		if (record_status != RECORD_FOUND)
+		{
+			log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
+					  local_node_info.upstream_node_id);
+			return FAILOVER_STATE_PROMOTION_FAILED;
+		}
 	}
 
 	/* the presence of this command has been established already */