Have repmgrd parse "standby follow --upstream-node-id=%n"

2026-05-31 19:39:04 +00:00 · 2017-09-04 13:42:50 +09:00
parent 9a0f45d7d3
commit 78e6bdeebe
7 changed files with 105 additions and 15 deletions
@@ -1250,12 +1250,18 @@ Additionally the following `repmgrd` options *must* be set in `repmgr.conf`

    failover=automatic
    promote_command='repmgr standby promote -f /etc/repmgr.conf --log-to-file'
-    follow_command='repmgr standby follow -f /etc/repmgr.conf --log-to-file'
+    follow_command='repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'

 Note that the `--log-to-file` option will cause `repmgr`'s output to be logged to
 the destination configured to receive log output for `repmgrd`.
 See `repmgr.conf.sample` for further `repmgrd`-specific settings.

+The `follow_command` should provide the `--upstream-node-id=%n` option to
+`repmgr standby follow`; the `%n` will be replaced by `repmgrd` with the ID
+of the new primary. If this is not provided and the original primary comes
+back online after the new primary is promoted, there is a risk that
+`repmgr standby follow` will follow the original primary.
+
 When `failover` is set to `automatic`, upon detecting failure of the current
 primary, `repmgrd` will execute one of `promote_command` or `follow_command`,
 depending on whether the current server is to become the new primary, or
@@ -2977,6 +2977,14 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
 			{
 				switch (src_ptr[1])
 				{
+					case '%':
+						/* %%: replace with % */
+						if (dst_ptr < end_ptr)
+						{
+							src_ptr++;
+							*dst_ptr++ = *src_ptr;
+						}
+						break;
 					case 'n':
 						/* %n: node id */
 						src_ptr++;
@@ -1431,7 +1431,13 @@ do_standby_follow(void)

 	if (runtime_options.upstream_node_id != NO_UPSTREAM_NODE)
 	{
-		// XXX check not self!
+		/* check not self! */
+		if (runtime_options.upstream_node_id == config_file_options.node_id)
+		{
+			log_error(_("provided \"--upstream-node-id\" %i is the current node!"),
+					  runtime_options.upstream_node_id);
+			exit(ERR_BAD_CONFIG);
+		}

 		record_status = get_node_record(local_conn, runtime_options.upstream_node_id, &primary_node_record);

@@ -1445,10 +1451,11 @@ do_standby_follow(void)

 		for (timer = 0; timer < config_file_options.primary_follow_timeout; timer++)
 		{
-			primary_conn = establish_db_connection(config_file_options.conninfo, true);
+			primary_conn = establish_db_connection(primary_node_record.conninfo, true);

 			if (PQstatus(primary_conn) == CONNECTION_OK || runtime_options.wait == false)
 			{
+				log_debug("setting primary id to %i", runtime_options.upstream_node_id);
 				primary_id = runtime_options.upstream_node_id;
 				break;
 			}
@@ -1690,7 +1697,6 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
 	/* start/restart the service */

 	// XXX here check if service is running!! if not, start
-	//     ensure that problem with pg_ctl output is caught here

 	{
 		char		server_command[MAXLEN] = "";
@@ -210,15 +210,22 @@ ssh_options='-q'                        # Options to append to "ssh"
                                        # 'manual': repmgrd will take no action and the node will require
                                        #    manual attention to reattach it to replication
                                        # (does not apply to BDR mode)
-#monitoring_history=no
-
+#promote_command=                       # command to execute when promoting a new primary; use something like:
+                                        #
+                                        #     repmgr standby promote -f /etc/repmgr.conf
+                                        #
+#follow_command=                        # command to execute when instructing a standby to follow a new primary;
+                                        # use something like:
+                                        #
+                                        #     repmgr standby follow -f /etc/repmgr.conf -W --upstream-node-id=%n
+                                        #
 #primary_notification_timeout=60        # Interval (in seconds) which repmgrd on a standby
                                        # will wait for a notification from the new primary,
-                                        # before falling back to degraded monitoriong
-#degraded_monitoring_timeout=-1         # Interval (in seconds) after which repmgrd
-                                        # will terminate if the server being monitored
-                                        # is no longer available. -1 (default) disables the
-                                        # timeout completely.
+                                        # before falling back to degraded monitoring
+#monitoring_history=no
+#degraded_monitoring_timeout=-1         # Interval (in seconds) after which repmgrd will terminate if the
+                                        # server being monitored is no longer available. -1 (default)
+                                        # disables the timeout completely.
 #async_query_timeout=60                 # Interval (in seconds) which repmgrd will wait before
                                        # cancelling an asynchronous query.

@@ -1661,6 +1661,8 @@ wait_primary_notification(int *new_primary_id)
 static FailoverState
 follow_new_primary(int new_primary_id)
 {
+	char		parsed_follow_command[MAXPGPATH] = "";
+
 	PQExpBufferData event_details;
 	int r;

@@ -1695,9 +1697,6 @@ follow_new_primary(int new_primary_id)
 		fflush(stderr);
 	}

-	log_debug(_("standby follow command is:\n  \"%s\""),
-			  config_file_options.follow_command);
-
 	upstream_conn = establish_db_connection(new_primary.conninfo, false);

 	if (PQstatus(upstream_conn) == CONNECTION_OK)
@@ -1709,6 +1708,7 @@ follow_new_primary(int new_primary_id)
 		}
 		else
 		{
+			new_primary_ok = false;
 			log_warning(_("new primary is not in recovery"));
 			PQfinish(upstream_conn);
 		}
@@ -1727,8 +1727,18 @@ follow_new_primary(int new_primary_id)
 	PQfinish(local_conn);
 	local_conn = NULL;

+	/*
+	 * replace %n in "config_file_options.follow_command" with ID of primary
+	 * to follow.
+	 */
+	parse_follow_command(parsed_follow_command, config_file_options.follow_command, new_primary_id);
+
+	log_debug(_("standby follow command is:\n  \"%s\""),
+			  parsed_follow_command);
+
+
 	/* execute the follow command */
-	r = system(config_file_options.follow_command);
+	r = system(parsed_follow_command);

 	if (r != 0)
 	{
@@ -410,3 +410,54 @@ trim(char *s)

 	return s;
 }
+
+/*
+ * parse_follow_command()
+ *
+ * Copy "template" into "parsed_command", expanding placeholders:
+ *
+ *   %n - replaced with "node_id" formatted as a decimal integer
+ *   %% - replaced with a single '%'
+ *
+ * A '%' followed by any other character (or by the end of the string) is
+ * copied through unchanged.
+ *
+ * "parsed_command" must point to a buffer of at least MAXPGPATH bytes: a
+ * terminator is written at offset MAXPGPATH - 1 unconditionally. Output
+ * which does not fit is silently dropped; the result is always
+ * NUL-terminated. "template" is only read.
+ */
+void
+parse_follow_command(char *parsed_command, char *template, int node_id)
+{
+	const char *src_ptr = NULL;
+	char	   *dst_ptr = NULL;
+	char	   *end_ptr = NULL;
+	/*
+	 * dst_ptr is the current write position; end_ptr is the last byte of
+	 * the buffer, which is reserved for the terminator and set up front so
+	 * that a strlen() starting at end_ptr is always well-defined.
+	 */
+	dst_ptr = parsed_command;
+	end_ptr = parsed_command + MAXPGPATH - 1;
+	*end_ptr = '\0';
+	/* walk the template one character at a time until its terminator */
+	for(src_ptr = template; *src_ptr; src_ptr++)
+	{
+		if (*src_ptr == '%')
+		{
+			switch (src_ptr[1])
+			{
+				case '%':
+					/* %%: replace with %; when there is room, step onto the
+					 * second '%' and store that one */
+					if (dst_ptr < end_ptr)
+					{
+						src_ptr++;
+						*dst_ptr++ = *src_ptr;
+					}
+					break;
+				case 'n':
+					/* %n: node id; skip the 'n', then let snprintf() format
+					 * the ID into the remaining space (it truncates rather
+					 * than overruns) and advance dst_ptr by however many
+					 * characters were actually stored */
+					src_ptr++;
+					snprintf(dst_ptr, end_ptr - dst_ptr, "%i", node_id);
+					dst_ptr += strlen(dst_ptr);
+					break;
+				default:
+					/* otherwise treat the % as not special: copy it as-is and
+					 * leave the following character for the next iteration */
+					if (dst_ptr < end_ptr)
+						*dst_ptr++ = *src_ptr;
+					break;
+			}
+		}
+		else
+		{
+			if (dst_ptr < end_ptr)
+				*dst_ptr++ = *src_ptr;
+		}
+	}
+	/* terminate at the current write position, which never exceeds end_ptr */
+	*dst_ptr = '\0';
+
+	return;
+}
@@ -142,5 +142,7 @@ extern char

 extern char	*trim(char *s);

+extern void
+parse_follow_command(char *parsed_command, char *template, int node_id);

 #endif	 /* _STRUTIL_H_ */