From ee98a3a58e575224564ded03fc74fb463d6e73eb Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 21 Feb 2018 14:37:01 +0900 Subject: [PATCH] "standby clone": add --recovery-conf-only option This will generate "recovery.conf" for an existing standby. Typical use-case is a standby cloned manually from an external data source (e.g. Barman), where "recovery.conf" needs to be created (and if required a replication slot). The --dry-run option will check the pre-requisites but not actually create "recovery.conf" or a replication slot. This requires that the upstream node is running, a replication connection can be made and if required a replication slot can be created. Implements GitHub #382. --- dbutils.c | 3 + repmgr-action-standby.c | 386 +++++++++++++++++++++++++++++++++++++++- repmgr-client-global.h | 9 +- repmgr-client.c | 49 +++-- repmgr-client.h | 2 + 5 files changed, 424 insertions(+), 25 deletions(-) diff --git a/dbutils.c b/dbutils.c index e6317432..ab5c3432 100644 --- a/dbutils.c +++ b/dbutils.c @@ -3486,6 +3486,9 @@ create_replication_slot(PGconn *conn, char *slot_name, int server_version_num, P PGresult *res = NULL; t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER; + if (server_version_num == UNKNOWN_SERVER_VERSION_NUM) + server_version_num = get_server_version(conn, NULL); + /* * Check whether slot exists already; if it exists and is active, that * means another active standby is using it, which creates an error diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index bd1c857d..94c0e0bc 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -73,6 +73,7 @@ static char datadir_list_filename[MAXLEN]; static char barman_command_buf[MAXLEN] = ""; static void _do_standby_promote_internal(PGconn *conn, const char *data_dir); +static void _do_create_recovery_conf(void); static void check_barman_config(void); static void check_source_server(void); @@ -119,6 +120,7 @@ static ConnectionStatus 
parse_remote_node_replication_connection(const char *nod * --recovery-min-apply-delay * --replication-user (only required if no upstream record) * --without-barman + * --recovery-conf-only */ void @@ -130,6 +132,14 @@ do_standby_clone(void) /* dummy node record */ t_node_info node_record = T_NODE_INFO_INITIALIZER; + /* + * --recovery-conf-only provided - we'll handle that separately + */ + if (runtime_options.recovery_conf_only == true) + { + return _do_create_recovery_conf(); + } + /* * conninfo params for the actual upstream node (which might be different * to the node we're cloning from) to write to recovery.conf @@ -789,6 +799,372 @@ check_barman_config(void) } +/* + * _do_create_recovery_conf() + * + * Create recovery.conf for a previously cloned instance. + * + * Prerequisites: + * + * - data directory must be provided + * - the instance should not be running + * - an existing "recovery.conf" file can only be overwritten with + * -F/--force + * - connection parameters for an existing, running node must be provided + * - --upstream-node-id, if provided, will be "primary_conninfo", + * otherwise primary node id; node must exist; unless -F/--force + * provided, must be active and connection possible + * - if replication slots in use, create (respect --dry-run) + * + * not compatible with --no-upstream-connection + * + */ + +static void +_do_create_recovery_conf(void) +{ + t_node_info local_node_record = T_NODE_INFO_INITIALIZER; + t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER; + + RecordStatus record_status = RECORD_NOT_FOUND; + char recovery_file_path[MAXPGPATH] = ""; + struct stat st; + bool node_is_running = false; + bool slot_creation_required = false; + PGconn *upstream_conn = NULL; + PGconn *upstream_repl_conn = NULL; + + get_node_data_directory(local_data_directory); + + if (local_data_directory[0] == '\0') + { + log_error(_("no data directory provided")); + log_hint(_("provide the node's \"repmgr.conf\" file with -f/--config-file or the data 
directory with -D/--pgdata")); + exit(ERR_BAD_CONFIG); + } + + /* + * Do some sanity checks on the data directory to make sure + * it contains a valid but dormant instance + */ + switch (check_dir(local_data_directory)) + { + case DIR_ERROR: + log_error(_("unable to access specified data directory \"%s\""), local_data_directory); + log_detail("%s", strerror(errno)); + exit(ERR_BAD_CONFIG); + break; + case DIR_NOENT: + log_error(_("specified data directory \"%s\" does not exist"), local_data_directory); + exit(ERR_BAD_CONFIG); + break; + case DIR_EMPTY: + log_error(_("specified data directory \"%s\" is empty"), local_data_directory); + exit(ERR_BAD_CONFIG); + break; + case DIR_NOT_EMPTY: + /* Present but not empty */ + if (!is_pg_dir(local_data_directory)) + { + log_error(_("specified data directory \"%s\" does not contain a PostgreSQL instance"), local_data_directory); + exit(ERR_BAD_CONFIG); + } + + if (is_pg_running(local_data_directory)) + { + if (runtime_options.force == false) + { + log_error(_("specified data directory \"%s\" appears to contain a running PostgreSQL instance"), + local_data_directory); + log_hint(_("use -F/--force to create \"recovery.conf\" anyway")); + exit(ERR_BAD_CONFIG); + } + + node_is_running = true; + + if (runtime_options.dry_run == true) + { + log_warning(_("\"recovery.conf\" would be created in an active data directory")); + } + else + { + log_warning(_("creating \"recovery.conf\" in an active data directory")); + } + } + break; + default: + break; + } + + /* check connection */ + source_conn = establish_db_connection_by_params(&source_conninfo, true); + + /* determine node for primary_conninfo */ + + if (runtime_options.upstream_node_id != UNKNOWN_NODE_ID) + { + upstream_node_id = runtime_options.upstream_node_id; + } + else + { + /* if --upstream-node-id not specifically supplied, get primary node id */ + upstream_node_id = get_primary_node_id(source_conn); + + if (upstream_node_id == NODE_NOT_FOUND) + { + log_error(_("unable to 
determine primary node for this replication cluster")); + PQfinish(source_conn); + exit(ERR_BAD_CONFIG); + } + + log_debug("primary node determined as: %i", upstream_node_id); + } + + /* attempt to retrieve upstream node record */ + record_status = get_node_record(source_conn, + upstream_node_id, + &upstream_node_record); + + if (record_status != RECORD_FOUND) + { + log_error(_("unable to retrieve node record for upstream node %i"), upstream_node_id); + + if (record_status == RECORD_ERROR) + { + log_detail("%s", PQerrorMessage(source_conn)); + } + + + exit(ERR_BAD_CONFIG); + } + + /* attempt to retrieve local node record */ + record_status = get_node_record(source_conn, + config_file_options.node_id, + &local_node_record); + + if (record_status != RECORD_FOUND) + { + log_error(_("unable to retrieve node record for local node %i"), config_file_options.node_id); + + if (record_status == RECORD_ERROR) + { + log_detail("%s", PQerrorMessage(source_conn)); + } + + + exit(ERR_BAD_CONFIG); + } + + PQfinish(source_conn); + + + /* connect to upstream (which could be different to source) */ + + upstream_conn = establish_db_connection(upstream_node_record.conninfo, false); + if (PQstatus(upstream_conn) != CONNECTION_OK) + { + log_error(_("unable to connect to upstream node \"%s\" (ID: %i)"), + upstream_node_record.node_name, + upstream_node_id); + exit(ERR_BAD_CONFIG); + } + + /* Set the application name to this node's name */ + if (config_file_options.node_name[0] != '\0') + param_set(&recovery_conninfo, "application_name", config_file_options.node_name); + + /* Set the replication user from the primary node record */ + param_set(&recovery_conninfo, "user", upstream_node_record.repluser); + + initialize_conninfo_params(&recovery_conninfo, false); + + /* We ignore any application_name set in the primary's conninfo */ + parse_conninfo_string(upstream_node_record.conninfo, &recovery_conninfo, NULL, true); + + /* check that a replication connection can be made (--force = 
override) */ + upstream_repl_conn = establish_db_connection_by_params(&recovery_conninfo, false); + + if (PQstatus(upstream_repl_conn) != CONNECTION_OK) + { + if (runtime_options.force == false) + { + log_error(_("unable to initiate replication connection to upstream node \"%s\" (ID: %i)"), + upstream_node_record.node_name, + upstream_node_id); + PQfinish(upstream_conn); + exit(ERR_BAD_CONFIG); + } + } + + /* if replication slots are in use, perform some checks */ + if (config_file_options.use_replication_slots == true) + { + PQExpBufferData msg; + t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER; + + record_status = get_slot_record(upstream_conn, local_node_record.slot_name, &slot_info); + + /* check if replication slot exists */ + if (record_status == RECORD_FOUND) + { + if (slot_info.active == true) + { + initPQExpBuffer(&msg); + + appendPQExpBuffer(&msg, + _("an active replication slot named \"%s\" already exists on upstream node \"%s\" (ID: %i)"), + local_node_record.slot_name, + upstream_node_record.node_name, + upstream_node_id); + if (runtime_options.force == false && runtime_options.dry_run == false) + { + log_error("%s", msg.data); + log_hint(_("use -F/--force to continue anyway")); + termPQExpBuffer(&msg); + PQfinish(upstream_conn); + exit(ERR_BAD_CONFIG); + } + + log_warning("%s", msg.data); + termPQExpBuffer(&msg); + } + else + { + log_info(_("an inactive replication slot for this node exists on the upstream node")); + } + } + /* if not, check whether one can and should be created */ + else + { + get_node_replication_stats(upstream_conn, UNKNOWN_SERVER_VERSION_NUM, &upstream_node_record); + + if (upstream_node_record.max_replication_slots > upstream_node_record.total_replication_slots) + { + slot_creation_required = true; + } + else + { + initPQExpBuffer(&msg); + + appendPQExpBuffer(&msg, + _("insufficient free replication slots on upstream node \"%s\" (ID: %i)"), + upstream_node_record.node_name, + upstream_node_id); + + if 
(runtime_options.force == false && runtime_options.dry_run == false) + { + log_error("%s", msg.data); + log_hint(_("use -F/--force to continue anyway")); + termPQExpBuffer(&msg); + PQfinish(upstream_conn); + exit(ERR_BAD_CONFIG); + } + + log_warning("%s", msg.data); + termPQExpBuffer(&msg); + } + } + } + + /* check if recovery.conf exists */ + + maxpath_snprintf(recovery_file_path, "%s/%s", local_data_directory, RECOVERY_COMMAND_FILE); + + if (stat(recovery_file_path, &st) == -1) + { + if (errno != ENOENT) + { + log_error(_("unable to check for existing \"recovery.conf\" file in \"%s\""), + local_data_directory); + log_detail("%s", strerror(errno)); + exit(ERR_BAD_CONFIG); + } + } + else + { + if (runtime_options.force == false) + { + log_error(_("\"recovery.conf\" already exists in \"%s\""), + local_data_directory); + log_hint(_("use -F/--force to overwrite an existing \"recovery.conf\" file")); + exit(ERR_BAD_CONFIG); + } + + if (runtime_options.dry_run == true) + { + log_warning(_("the existing \"recovery.conf\" file would be overwritten")); + } + else + { + log_warning(_("the existing \"recovery.conf\" file will be overwritten")); + } + } + + if (runtime_options.dry_run == true) + { + log_info(_("would create \"recovery.conf\" file")); + log_detail(_("data directory is: \"%s\""), local_data_directory); + } + else + { + if (!create_recovery_file(&upstream_node_record, &recovery_conninfo, local_data_directory)) + { + log_error(_("unable to create \"recovery.conf\"")); + } + else + { + log_notice(_("\"recovery.conf\" created as \"%s\""), recovery_file_path); + + if (node_is_running == true) + { + log_hint(_("node must be restarted for the new file to take effect")); + } + } + } + + /* add replication slot, if required */ + if (slot_creation_required == true) + { + if (runtime_options.dry_run == true) + { + log_info(_("would create replication slot \"%s\" on upstream node \"%s\" (ID: %i)"), + local_node_record.slot_name, + upstream_node_record.node_name, + 
upstream_node_id); + } + else + { + PQExpBufferData msg; + initPQExpBuffer(&msg); + + if (create_replication_slot(upstream_conn, + local_node_record.slot_name, + UNKNOWN_SERVER_VERSION_NUM, + &msg) == false) + { + log_error("%s", msg.data); + PQfinish(upstream_conn); + termPQExpBuffer(&msg); + exit(ERR_BAD_CONFIG); + } + + termPQExpBuffer(&msg); + + log_notice(_("replication slot \"%s\" created on upstream node \"%s\" (ID: %i)"), + local_node_record.slot_name, + upstream_node_record.node_name, + upstream_node_id); + } + } + + + PQfinish(upstream_conn); + + return; +} + + /* * do_standby_register() * @@ -4796,7 +5172,7 @@ run_file_backup(t_node_info *node_record) if (unlink(tblspc_symlink.data) < 0 && errno != ENOENT) { log_error(_("unable to remove tablespace symlink %s"), tblspc_symlink.data); - + log_detail("%s", strerror(errno)); r = ERR_BAD_BASEBACKUP; goto stop_backup; } @@ -4836,9 +5212,9 @@ run_file_backup(t_node_info *node_record) */ if (unlink(tablespace_map_filename.data) < 0 && errno != ENOENT) { - log_error(_("unable to remove tablespace_map file %s: %s"), - tablespace_map_filename.data, - strerror(errno)); + log_error(_("unable to remove tablespace_map file \"%s\""), + tablespace_map_filename.data); + log_detail("%s", strerror(errno)); r = ERR_BAD_BASEBACKUP; goto stop_backup; @@ -5672,6 +6048,8 @@ do_standby_help(void) " when the intended upstream server does not yet exist\n")); printf(_(" --upstream-node-id ID of the upstream node to replicate from (optional, defaults to primary node)\n")); printf(_(" --without-barman do not use Barman even if configured\n")); + printf(_(" --recovery-conf-only create \"recovery.conf\" file for a previously cloned instance\n")); + puts(""); printf(_("STANDBY REGISTER\n")); diff --git a/repmgr-client-global.h b/repmgr-client-global.h index c5810f64..e6b24303 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -80,6 +80,7 @@ typedef struct char replication_user[MAXLEN]; char upstream_conninfo[MAXLEN]; 
bool without_barman; + bool recovery_conf_only; /* "standby clone"/"standby follow" options */ int upstream_node_id; @@ -138,14 +139,14 @@ typedef struct /* output options */ \ false, false, false, \ /* database connection options */ \ - "", "", "", "", \ + "", "", "", "", \ /* other connection options */ \ - "", "", \ + "", "", \ /* general node options */ \ UNKNOWN_NODE_ID, "", "", UNKNOWN_NODE_ID, \ /* "standby clone" options */ \ false, CONFIG_FILE_SAMEPATH, false, false, false, "", "", "", \ - false, \ + false, false, \ /* "standby clone"/"standby follow" options */ \ NO_UPSTREAM_NODE, \ /* "standby register" options */ \ @@ -164,7 +165,7 @@ typedef struct false, "", CLUSTER_EVENT_LIMIT, \ /* "cluster cleanup" options */ \ 0, \ - /* Following options for internal use */ \ + /* following options for internal use */ \ "/tmp", OM_TEXT \ } diff --git a/repmgr-client.c b/repmgr-client.c index e6b5304c..139d37aa 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -388,6 +388,11 @@ main(int argc, char **argv) runtime_options.without_barman = true; break; + case OPT_RECOVERY_CONF_ONLY: + runtime_options.recovery_conf_only = true; + break; + + /*--------------------------- * "standby register" options *--------------------------- @@ -1014,10 +1019,7 @@ main(int argc, char **argv) /* * Check for configuration file items which can be overriden by runtime * options - */ - - /* - * ============================================================================ + * ===================================================================== */ /* @@ -1495,19 +1497,6 @@ check_cli_parameters(const int action) } } - if (runtime_options.event[0]) - { - switch (action) - { - case CLUSTER_EVENT: - break; - default: - item_list_append_format(&cli_warnings, - _("--event not required when executing %s"), - action_name(action)); - } - } - if (runtime_options.replication_user[0]) { switch (action) @@ -1527,6 +1516,32 @@ check_cli_parameters(const int action) } } + if 
(runtime_options.recovery_conf_only == true) + { + switch (action) + { + case STANDBY_CLONE: + break; + default: + item_list_append_format(&cli_warnings, + _("--recovery-conf-only will be ignored when executing %s"), + action_name(action)); + } + } + + if (runtime_options.event[0]) + { + switch (action) + { + case CLUSTER_EVENT: + break; + default: + item_list_append_format(&cli_warnings, + _("--event not required when executing %s"), + action_name(action)); + } + } + if (runtime_options.limit_provided) { switch (action) diff --git a/repmgr-client.h b/repmgr-client.h index bad1ec5a..8ec68c69 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -85,6 +85,7 @@ #define OPT_WAIT_START 1036 #define OPT_REPL_CONN 1037 #define OPT_REMOTE_NODE_ID 1038 +#define OPT_RECOVERY_CONF_ONLY 1039 /* deprecated since 3.3 */ #define OPT_DATA_DIR 999 @@ -139,6 +140,7 @@ static struct option long_options[] = {"upstream-conninfo", required_argument, NULL, OPT_UPSTREAM_CONNINFO}, {"upstream-node-id", required_argument, NULL, OPT_UPSTREAM_NODE_ID}, {"without-barman", no_argument, NULL, OPT_WITHOUT_BARMAN}, + {"recovery-conf-only", no_argument, NULL, OPT_RECOVERY_CONF_ONLY}, /* "standby register" options */ {"wait-start", required_argument, NULL, OPT_WAIT_START},