repmgrd: add configuration option "always_promote"

In certain corner cases, it's possible repmgrd may end up monitoring a standby which was a former primary, but the node record has not yet been updated. Previously repmgrd would abort the promotion with a cryptic message about being unable to find a node record for node_id -1 (the default value for an unknown node id). This commit adds a new configuration option "always_promote", which determines whether repmgrd should promote the node in this case. The default is "false", to effectively maintain the existing behaviour. Logging output has also been improved to make it clearer what has happened when this situation occurs.
2026-05-31 19:39:04 +00:00 · 2020-09-29 14:13:03 +09:00
parent 16eeae700c
commit ce229beff8
8 changed files with 130 additions and 11 deletions
@@ -606,7 +606,17 @@ struct ConfigFileSetting config_file_settings[] =
 		"primary_visibility_consensus",
 		CONFIG_BOOL,
 		{ .boolptr = &config_file_options.primary_visibility_consensus },
-		{ .booldefault =  DEFAULT_PRIMARY_VISIBILITY_CONSENSUS },
+		{ .booldefault = DEFAULT_PRIMARY_VISIBILITY_CONSENSUS },
+		{},
+		{},
+		{}
+	},
+	/* always_promote */
+	{
+		"always_promote",
+		CONFIG_BOOL,
+		{ .boolptr = &config_file_options.always_promote },
+		{ .booldefault = DEFAULT_ALWAYS_PROMOTE },
 		{},
 		{},
 		{}
@@ -867,6 +867,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
 * - monitoring_history
 * - primary_notification_timeout
 * - primary_visibility_consensus
+ * - always_promote
 * - promote_command
 * - reconnect_attempts
 * - reconnect_interval
@@ -1239,6 +1240,15 @@ reload_config(t_server_type server_type)
 								format_bool(config_file_options.primary_visibility_consensus));
 	}

+	/* always_promote */
+	if (config_file_options.always_promote != orig_config_file_options.always_promote)
+	{
+		item_list_append_format(&config_changes,
+								_("\"always_promote\" changed from \"%s\" to \"%s\""),
+								format_bool(orig_config_file_options.always_promote),
+								format_bool(config_file_options.always_promote));
+	}
+
 	/* failover_validation_command */
 	if (strncmp(config_file_options.failover_validation_command, orig_config_file_options.failover_validation_command, sizeof(config_file_options.failover_validation_command)) != 0)
 	{
@@ -201,6 +201,7 @@ typedef struct
 	int			sibling_nodes_disconnect_timeout;
 	ConnectionCheckType connection_check_type;
 	bool		primary_visibility_consensus;
+	bool		always_promote;
 	char		failover_validation_command[MAXPGPATH];
 	int			election_rerun_interval;
 	int			child_nodes_check_interval;
@@ -127,6 +127,14 @@
              </itemizedlist>
            </para>
          </listitem>
+
+          <listitem>
+            <para>
+              Configuration option <varname>always_promote</varname> (default: <literal>false</literal>)
+              to control whether a node should be promoted if the &repmgr; metadata is not up-to-date
+              on that node.
+            </para>
+          </listitem>
        </itemizedlist>
      </para>
    </sect2>
@@ -419,6 +419,33 @@
          </listitem>
        </varlistentry>

+        <varlistentry>
+          <term><option>always_promote</option></term>
+
+          <listitem>
+            <indexterm>
+              <primary>always_promote</primary>
+            </indexterm>
+
+            <para>
+              Default: <literal>false</literal>.
+            </para>
+            <para>
+              If <literal>true</literal>, promote the local node even if its
+              &repmgr; metadata is not up-to-date.
+            </para>
+            <para>
+              Normally &repmgr; expects its metadata (stored in the <varname>repmgr.nodes</varname>
+              table) to be up-to-date so &repmgrd; can take the correct action during a failover.
+              However, it's possible that updates made on the primary may not
+              have propagated to the standby (promotion candidate). In this case &repmgrd; will
+              default to not promoting the standby. This behaviour can be overridden by setting
+              <option>always_promote</option> to <literal>true</literal>.
+            </para>
+          </listitem>
+        </varlistentry>
+
+
        <varlistentry>

          <term><option>standby_disconnect_on_failover</option></term>
@@ -765,6 +792,12 @@ repmgrd_service_stop_command='sudo systemctl repmgr12 stop'
          </simpara>
        </listitem>

+        <listitem>
+          <simpara>
+            <varname>always_promote</varname>
+          </simpara>
+        </listitem>
+
        <listitem>
          <simpara>
            <varname>promote_command</varname>
@@ -342,6 +342,7 @@ ssh_options='-q -o ConnectTimeout=10'	# Options to append to "ssh"
 					# WAL receivers
 #primary_visibility_consensus=false	# If "true", only continue with failover if no standbys have seen
 					# the primary node recently. *Must* be the same on all nodes.
+#always_promote=false			# Always promote a node, even if repmgr metadata is outdated
 #failover_validation_command=''		# Script to execute for an external mechanism to validate the failover
 					# decision made by repmgrd. One or both of the following parameter placeholders
 					# should be provided, which will be replaced by repmgrd with the appropriate
@@ -126,7 +126,7 @@
 #define DEFAULT_WITNESS_SYNC_INTERVAL        15  /* seconds */
 #define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT    30  /* seconds */
 #define DEFAULT_LOCATION                     "default"
-#define DEFAULT_PRIORITY		             100
+#define DEFAULT_PRIORITY                     100
 #define DEFAULT_MONITORING_INTERVAL          2	 /* seconds */
 #define DEFAULT_RECONNECTION_ATTEMPTS        6	 /* seconds */
 #define DEFAULT_RECONNECTION_INTERVAL        10  /* seconds */
@@ -139,6 +139,7 @@
 #define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
 #define DEFAULT_CONNECTION_CHECK_TYPE        CHECK_PING
 #define DEFAULT_PRIMARY_VISIBILITY_CONSENSUS false
+#define DEFAULT_ALWAYS_PROMOTE               false
 #define DEFAULT_ELECTION_RERUN_INTERVAL      15  /* seconds */
 #define DEFAULT_CHILD_NODES_CHECK_INTERVAL   5   /* seconds */
 #define DEFAULT_CHILD_NODES_DISCONNECT_MIN_COUNT -1
@@ -764,9 +764,25 @@ check_primary_status(int degraded_monitoring_elapsed)
 		}
 		else
 		{
-			appendPQExpBuffer(&event_details,
-							  _("node has become a standby, monitoring connection to upstream node %i"),
-							  local_node_info.upstream_node_id);
+			if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
+			{
+				/*
+				 * If upstream_node_id is not set, it's possible that following a switchover
+				 * of some kind (possibly forced in some way), the updated node record has
+				 * not yet propagated to the local node. In this case however we can safely
+				 * assume we're monitoring the primary.
+				 */
+
+				appendPQExpBuffer(&event_details,
+								  _("node has become a standby, monitoring connection to primary node %i"),
+								  primary_node_id);
+			}
+			else
+			{
+				appendPQExpBuffer(&event_details,
+								  _("node has become a standby, monitoring connection to upstream node %i"),
+								  local_node_info.upstream_node_id);
+			}
 		}

 		create_event_notification(new_primary_conn,
@@ -3199,6 +3215,7 @@ update_monitoring_history(void)
 	if (primary_last_wal_location >= replication_info.last_wal_receive_lsn)
 	{
 		replication_lag_bytes = (long long unsigned int) (primary_last_wal_location - replication_info.last_wal_receive_lsn);
+		log_debug("replication lag in bytes is: %llu", replication_lag_bytes);
 	}
 	else
 	{
@@ -3482,6 +3499,14 @@ do_upstream_standby_failover(void)
 }


+/*
+ * This promotes the local node using the "promote_command" configuration
+ * parameter, which must be either "repmgr standby promote" or a script which
+ * at some point executes "repmgr standby promote".
+ *
+ * TODO: make "promote_command" optional and execute the same code used by
+ * "repmgr standby promote".
+ */
 static FailoverState
 promote_self(void)
 {
@@ -3504,13 +3529,43 @@ promote_self(void)
 		sleep(config_file_options.promote_delay);
 	}

-	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
-
-	if (record_status != RECORD_FOUND)
+	if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
 	{
-		log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
-				  local_node_info.upstream_node_id);
-		return FAILOVER_STATE_PROMOTION_FAILED;
+		/*
+		 * This is a corner-case situation where the repmgr metadata on the
+		 * promotion candidate is outdated and the local node's upstream_node_id
+		 * is not set. This is often an indication of potentially serious issues,
+		 * such as the local node being very far behind the primary, or not being
+		 * attached at all.
+		 *
+		 * In this case it may be desirable to restore the original primary.
+		 * This behaviour can be controlled by the "always_promote" configuration option.
+		 */
+		if (config_file_options.always_promote == false)
+		{
+			log_error(_("this node (ID: %i) does not have its upstream_node_id set, not promoting"),
+					  local_node_info.node_id);
+			log_detail(_("the local node's metadata has not been updated since it became a standby"));
+			log_hint(_("set \"always_promote\" to \"true\" to force promotion in this situation"));
+			return FAILOVER_STATE_PROMOTION_FAILED;
+		}
+		else
+		{
+			log_warning(_("this node (ID: %i) does not have its upstream_node_id set, promoting anyway"),
+						local_node_info.node_id);
+			log_detail(_("\"always_promote\" is set to \"true\" "));
+		}
+	}
+	else
+	{
+		record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
+
+		if (record_status != RECORD_FOUND)
+		{
+			log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
+					  local_node_info.upstream_node_id);
+			return FAILOVER_STATE_PROMOTION_FAILED;
+		}
 	}

 	/* the presence of this command has been established already */