"standby clone": add --recovery-conf-only option

This generates "recovery.conf" for an existing standby. The typical use case is a standby cloned manually from an external data source (e.g. Barman), where "recovery.conf" (and, if required, a replication slot) still needs to be created. The --dry-run option will check the prerequisites but not actually create "recovery.conf" or a replication slot. This requires that the upstream node is running, that a replication connection can be made and that, if required, a replication slot can be created. Implements GitHub #382.
2026-03-26 16:46:28 +00:00 · 2018-02-21 14:37:01 +09:00
parent 829cf5cca4
commit 3a764f678a
5 changed files with 424 additions and 25 deletions
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -73,6 +73,7 @@ static char datadir_list_filename[MAXLEN];
 static char barman_command_buf[MAXLEN] = "";

 static void _do_standby_promote_internal(PGconn *conn, const char *data_dir);
+static void _do_create_recovery_conf(void);

 static void check_barman_config(void);
 static void check_source_server(void);
@@ -119,6 +120,7 @@ static ConnectionStatus parse_remote_node_replication_connection(const char *nod
 *  --recovery-min-apply-delay
 *  --replication-user (only required if no upstream record)
 *  --without-barman
+ *  --recovery-conf-only
 */

 void
@@ -130,6 +132,14 @@ do_standby_clone(void)
 	/* dummy node record */
 	t_node_info node_record = T_NODE_INFO_INITIALIZER;

+	/*
+	 * --recovery-conf-only provided - we'll handle that separately
+	 */
+	if (runtime_options.recovery_conf_only == true)
+	{
+		return _do_create_recovery_conf();
+	}
+
 	/*
 	 * conninfo params for the actual upstream node (which might be different
 	 * to the node we're cloning from) to write to recovery.conf
@@ -789,6 +799,372 @@ check_barman_config(void)
 }


+/*
+ * _do_create_recovery_conf()
+ *
+ * Create "recovery.conf" for a previously cloned instance; any check
+ * which fails in this function terminates with ERR_BAD_CONFIG.
+ *
+ * Prerequisites:
+ * - data directory must be provided
+ * - the instance should not be running (override with -F/--force)
+ * - an existing "recovery.conf" file can only be overwritten with
+ *   -F/--force
+ * - connection parameters for an existing, running node must be provided
+ * - the node given by --upstream-node-id (otherwise the primary node) is
+ *   used for "primary_conninfo"; its node record must exist and it must
+ *   accept connections (replication connection optional with -F/--force)
+ * - if replication slots are in use and no slot exists for this node,
+ *   one is created on the upstream node (respecting --dry-run)
+ *
+ * Not compatible with --no-upstream-connection.
+ */
+
+static void
+_do_create_recovery_conf(void)
+{
+	t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
+	t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;
+
+	RecordStatus record_status = RECORD_NOT_FOUND;
+	char		recovery_file_path[MAXPGPATH] = "";
+	struct stat st;
+	bool		node_is_running = false;
+	bool		slot_creation_required = false;
+	PGconn	   *upstream_conn = NULL;
+	PGconn	   *upstream_repl_conn = NULL;
+
+	get_node_data_directory(local_data_directory);
+
+	if (local_data_directory[0] == '\0')
+	{
+		log_error(_("no data directory provided"));
+		log_hint(_("provide the node's \"repmgr.conf\" file with -f/--config-file or the data directory with -D/--pgdata"));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/*
+	 * Do some sanity checks on the data directory to make sure
+	 * it contains a valid but dormant instance
+	 */
+	switch (check_dir(local_data_directory))
+	{
+		case DIR_ERROR:
+			log_error(_("unable to access specified data directory \"%s\""), local_data_directory);
+			log_detail("%s", strerror(errno));
+			exit(ERR_BAD_CONFIG);
+			break;
+		case DIR_NOENT:
+			log_error(_("specified data directory \"%s\" does not exist"), local_data_directory);
+			exit(ERR_BAD_CONFIG);
+			break;
+		case DIR_EMPTY:
+			log_error(_("specified data directory \"%s\" is empty"), local_data_directory);
+			exit(ERR_BAD_CONFIG);
+			break;
+		case DIR_NOT_EMPTY:
+			/* Present but not empty */
+			if (!is_pg_dir(local_data_directory))
+			{
+				log_error(_("specified data directory \"%s\" does not contain a PostgreSQL instance"), local_data_directory);
+				exit(ERR_BAD_CONFIG);
+			}
+
+			if (is_pg_running(local_data_directory))
+			{
+				if (runtime_options.force == false)
+				{
+					log_error(_("specified data directory \"%s\" appears to contain a running PostgreSQL instance"),
+							  local_data_directory);
+					log_hint(_("use -F/--force to create \"recovery.conf\" anyway"));
+					exit(ERR_BAD_CONFIG);
+				}
+
+				node_is_running = true;
+
+				if (runtime_options.dry_run == true)
+				{
+					log_warning(_("\"recovery.conf\" would be created in an active data directory"));
+				}
+				else
+				{
+					log_warning(_("creating \"recovery.conf\" in an active data directory"));
+				}
+			}
+			break;
+		default:
+			break;
+	}
+
+	/* check connection */
+	source_conn = establish_db_connection_by_params(&source_conninfo, true);
+
+	/* determine node for primary_conninfo */
+	if (runtime_options.upstream_node_id != UNKNOWN_NODE_ID)
+	{
+		upstream_node_id = runtime_options.upstream_node_id;
+	}
+	else
+	{
+		/* if --upstream-node-id not specifically supplied, get primary node id */
+		upstream_node_id = get_primary_node_id(source_conn);
+
+		if (upstream_node_id == NODE_NOT_FOUND)
+		{
+			log_error(_("unable to determine primary node for this replication cluster"));
+			PQfinish(source_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+
+		log_debug("primary node determined as: %i", upstream_node_id);
+	}
+
+	/* attempt to retrieve upstream node record */
+	record_status = get_node_record(source_conn,
+									upstream_node_id,
+									&upstream_node_record);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve node record for upstream node %i"), upstream_node_id);
+
+		if (record_status == RECORD_ERROR)
+		{
+			log_detail("%s", PQerrorMessage(source_conn));
+		}
+
+		PQfinish(source_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/* attempt to retrieve local node record */
+	record_status = get_node_record(source_conn,
+									config_file_options.node_id,
+									&local_node_record);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve node record for local node %i"), config_file_options.node_id);
+
+		if (record_status == RECORD_ERROR)
+		{
+			log_detail("%s", PQerrorMessage(source_conn));
+		}
+
+		PQfinish(source_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	PQfinish(source_conn);
+
+	/* connect to upstream (which could be different to source) */
+	upstream_conn = establish_db_connection(upstream_node_record.conninfo, false);
+	if (PQstatus(upstream_conn) != CONNECTION_OK)
+	{
+		log_error(_("unable to connect to upstream node \"%s\" (ID: %i)"),
+				  upstream_node_record.node_name,
+				  upstream_node_id);
+		PQfinish(upstream_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/* Set the application name to this node's name */
+	if (config_file_options.node_name[0] != '\0')
+		param_set(&recovery_conninfo, "application_name", config_file_options.node_name);
+
+	/* Set the replication user from the primary node record */
+	param_set(&recovery_conninfo, "user", upstream_node_record.repluser);
+
+	/* NOTE(review): runs after the param_set() calls above - confirm this order is intended */
+	initialize_conninfo_params(&recovery_conninfo, false);
+
+	/* We ignore any application_name set in the primary's conninfo */
+	parse_conninfo_string(upstream_node_record.conninfo, &recovery_conninfo, NULL, true);
+
+	/* check that a replication connection can be made (--force = override) */
+	upstream_repl_conn = establish_db_connection_by_params(&recovery_conninfo, false);
+
+	if (PQstatus(upstream_repl_conn) != CONNECTION_OK && runtime_options.force == false)
+	{
+		log_error(_("unable to initiate replication connection to upstream node \"%s\" (ID: %i)"),
+				  upstream_node_record.node_name,
+				  upstream_node_id);
+		PQfinish(upstream_repl_conn);
+		PQfinish(upstream_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/* the replication connection was only needed for the check above */
+	PQfinish(upstream_repl_conn);
+
+	/* if replication slots are in use, perform some checks */
+	if (config_file_options.use_replication_slots == true)
+	{
+		PQExpBufferData msg;
+		t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER;
+
+		record_status = get_slot_record(upstream_conn, local_node_record.slot_name, &slot_info);
+
+		/* check if replication slot exists*/
+		if (record_status == RECORD_FOUND)
+		{
+			if (slot_info.active == true)
+			{
+				initPQExpBuffer(&msg);
+
+				appendPQExpBuffer(&msg,
+								  _("an active replication slot named \"%s\" already exists on upstream node \"%s\" (ID: %i)"),
+								  local_node_record.slot_name,
+								  upstream_node_record.node_name,
+								  upstream_node_id);
+				if (runtime_options.force == false && runtime_options.dry_run == false)
+				{
+					log_error("%s", msg.data);
+					log_hint(_("use -F/--force to continue anyway"));
+					termPQExpBuffer(&msg);
+					PQfinish(upstream_conn);
+					exit(ERR_BAD_CONFIG);
+				}
+
+				log_warning("%s", msg.data);
+				termPQExpBuffer(&msg);
+			}
+			else
+			{
+				log_info(_("an inactive replication slot for this node exists on the upstream node"));
+			}
+		}
+		/* if not, check whether one can and should be created */
+		else
+		{
+			get_node_replication_stats(upstream_conn, UNKNOWN_SERVER_VERSION_NUM, &upstream_node_record);
+
+			if (upstream_node_record.max_replication_slots > upstream_node_record.total_replication_slots)
+			{
+				slot_creation_required = true;
+			}
+			else
+			{
+				initPQExpBuffer(&msg);
+
+				appendPQExpBuffer(&msg,
+								  _("insufficient free replication slots on upstream node \"%s\" (ID: %i)"),
+								  upstream_node_record.node_name,
+								  upstream_node_id);
+
+				if (runtime_options.force == false && runtime_options.dry_run == false)
+				{
+					log_error("%s", msg.data);
+					log_hint(_("use -F/--force to continue anyway"));
+					termPQExpBuffer(&msg);
+					PQfinish(upstream_conn);
+					exit(ERR_BAD_CONFIG);
+				}
+
+				log_warning("%s", msg.data);
+				termPQExpBuffer(&msg);
+			}
+		}
+	}
+
+	/* check if recovery.conf exists */
+
+	maxpath_snprintf(recovery_file_path, "%s/%s", local_data_directory, RECOVERY_COMMAND_FILE);
+
+	if (stat(recovery_file_path, &st) == -1)
+	{
+		if (errno != ENOENT)
+		{
+			log_error(_("unable to check for existing \"recovery.conf\" file in \"%s\""),
+					  local_data_directory);
+			log_detail("%s", strerror(errno));
+			PQfinish(upstream_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+	}
+	else
+	{
+		if (runtime_options.force == false)
+		{
+			log_error(_("\"recovery.conf\" already exists in \"%s\""),
+					  local_data_directory);
+			log_hint(_("use -F/--force to overwrite an existing \"recovery.conf\" file"));
+			PQfinish(upstream_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+
+		if (runtime_options.dry_run == true)
+		{
+			log_warning(_("the existing \"recovery.conf\" file would be overwritten"));
+		}
+		else
+		{
+			log_warning(_("the existing \"recovery.conf\" file will be overwritten"));
+		}
+	}
+
+	if (runtime_options.dry_run == true)
+	{
+		log_info(_("would create \"recovery.conf\" file"));
+		log_detail(_("data directory is: \"%s\""), local_data_directory);
+	}
+	else
+	{
+		if (!create_recovery_file(&upstream_node_record, &recovery_conninfo, local_data_directory))
+		{
+			/* don't go on to create a slot for a standby we could not configure */
+			log_error(_("unable to create \"recovery.conf\""));
+			PQfinish(upstream_conn);
+			exit(ERR_BAD_CONFIG);
+		}
+
+		log_notice(_("\"recovery.conf\" created as \"%s\""), recovery_file_path);
+
+		if (node_is_running == true)
+		{
+			log_hint(_("node must be restarted for the new file to take effect"));
+		}
+	}
+
+	/* add replication slot, if required */
+	if (slot_creation_required == true)
+	{
+		if (runtime_options.dry_run == true)
+		{
+			log_info(_("would create replication slot \"%s\" on upstream node \"%s\" (ID: %i)"),
+					 local_node_record.slot_name,
+					 upstream_node_record.node_name,
+					 upstream_node_id);
+		}
+		else
+		{
+			PQExpBufferData msg;
+			initPQExpBuffer(&msg);
+
+			if (create_replication_slot(upstream_conn,
+										local_node_record.slot_name,
+										UNKNOWN_SERVER_VERSION_NUM,
+										&msg) == false)
+			{
+				log_error("%s", msg.data);
+				PQfinish(upstream_conn);
+				termPQExpBuffer(&msg);
+				exit(ERR_BAD_CONFIG);
+			}
+
+			termPQExpBuffer(&msg);
+
+			log_notice(_("replication slot \"%s\" created on upstream node \"%s\" (ID: %i)"),
+					   local_node_record.slot_name,
+					   upstream_node_record.node_name,
+					   upstream_node_id);
+		}
+	}
+
+	PQfinish(upstream_conn);
+}
+
+
 /*
 * do_standby_register()
 *
@@ -4895,7 +5271,7 @@ run_file_backup(t_node_info *node_record)
 				if (unlink(tblspc_symlink.data) < 0 && errno != ENOENT)
 				{
 					log_error(_("unable to remove tablespace symlink %s"), tblspc_symlink.data);
-
+					log_detail("%s", strerror(errno));
 					r = ERR_BAD_BASEBACKUP;
 					goto stop_backup;
 				}
@@ -4935,9 +5311,9 @@ run_file_backup(t_node_info *node_record)
 		 */
 		if (unlink(tablespace_map_filename.data) < 0 && errno != ENOENT)
 		{
-			log_error(_("unable to remove tablespace_map file %s: %s"),
-					  tablespace_map_filename.data,
-					  strerror(errno));
+			log_error(_("unable to remove tablespace_map file \"%s\""),
+					  tablespace_map_filename.data);
+			log_detail("%s", strerror(errno));

 			r = ERR_BAD_BASEBACKUP;
 			goto stop_backup;
@@ -5771,6 +6147,8 @@ do_standby_help(void)
 			 "                                        when the intended upstream server does not yet exist\n"));
 	printf(_("  --upstream-node-id                  ID of the upstream node to replicate from (optional, defaults to primary node)\n"));
 	printf(_("  --without-barman                    do not use Barman even if configured\n"));
+	printf(_("  --recovery-conf-only                create \"recovery.conf\" file for a previously cloned instance\n"));
+
 	puts("");

 	printf(_("STANDBY REGISTER\n"));