From d9db4f6c4543dbdaaa4543ed3c003a46c87b8357 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 25 Oct 2017 11:01:58 +0900 Subject: [PATCH] repmgr node rejoin: add --dry-run option --- doc/repmgr-node-rejoin.sgml | 77 +++++++++++++++++++++---- repmgr-action-node.c | 112 ++++++++++++++++++++++++------------ 2 files changed, 140 insertions(+), 49 deletions(-) diff --git a/doc/repmgr-node-rejoin.sgml b/doc/repmgr-node-rejoin.sgml index 6c4b8450..72797755 100644 --- a/doc/repmgr-node-rejoin.sgml +++ b/doc/repmgr-node-rejoin.sgml @@ -43,17 +43,72 @@ repmgr.conf for the stopped node *must* be supplied explicitly if not otherwise available. - - repmgr node rejoin can optionally use pg_rewind to re-integrate a - node which has diverged from the rest of the cluster, typically a failed primary. - pg_rewind is available in PostgreSQL 9.5 and later. - - - - pg_rewind *requires* that either wal_log_hints is enabled, or that - data checksums were enabled when the cluster was initialized. See the - pg_rewind documentation for details. + + Using <command>pg_rewind</command> + + repmgr node rejoin can optionally use pg_rewind to re-integrate a + node which has diverged from the rest of the cluster, typically a failed primary. + pg_rewind is available in PostgreSQL 9.5 and later. - + + + pg_rewind *requires* that either wal_log_hints is enabled, or that + data checksums were enabled when the cluster was initialized. See the + pg_rewind documentation for details. + + + + + To have repmgr node rejoin use pg_rewind if required, + pass the command line option --force-rewind, which will tell &repmgr; + to execute pg_rewind to ensure the node can be rejoined successfully. + + + + Be aware that if pg_rewind is executed and actually performs a + rewind operation, any configuration files in the PostgreSQL data directory will be + overwritten with those from the source server. 
+ + + To prevent this happening, provide a comma-separated list of files to retain + using the --config-files command line option; the specified files + will be archived in a temporary directory (whose parent directory can be specified with + --config-archive-dir) and restored once the rewind operation is + complete. + + + + Example, first using --dry-run, then actually executing the + node rejoin command. + + $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \ + --force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose --dry-run + NOTICE: using provided configuration file "/etc/repmgr.conf" + INFO: prerequisites for using pg_rewind are met + INFO: file "postgresql.local.conf" would be copied to "/tmp/repmgr-config-archive-node1/postgresql.local.conf" + INFO: file "postgresql.conf" would be copied to "/tmp/repmgr-config-archive-node1/postgresql.conf" + INFO: 2 files would have been copied to "/tmp/repmgr-config-archive-node1" + INFO: directory "/tmp/repmgr-config-archive-node1" deleted + INFO: pg_rewind would now be executed + DETAIL: pg_rewind command is: + pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node1 dbname=repmgr user=repmgr' + + $ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \ + --force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose + NOTICE: using provided configuration file "/etc/repmgr.conf" + INFO: prerequisites for using pg_rewind are met + INFO: 2 files copied to "/tmp/repmgr-config-archive-node1" + NOTICE: executing pg_rewind + NOTICE: 2 files copied to /space/sda1/ibarwick/repmgr-test/node_1/data + INFO: directory "/tmp/repmgr-config-archive-node1" deleted + INFO: deleting "recovery.done" + INFO: setting node 1's primary to node 2 + NOTICE: starting server using "pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' start" + waiting for server to start.... 
done + server started + NOTICE: NODE REJOIN successful + DETAIL: node 1 is now attached to node 2 + + diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 83596212..8313a65d 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -1821,7 +1821,7 @@ do_node_rejoin(void) /* - * Intended mainly for "internal" use by `node rejoin` on the local node when + * For "internal" use by `node rejoin` on the local node when * called by "standby switchover" from the remote node. * * This archives any configuration files in the data directory, which may be @@ -1865,8 +1865,6 @@ _do_node_archive_config(void) termPQExpBuffer(&archive_dir); exit(ERR_BAD_CONFIG); } - - } else if (!S_ISDIR(statbuf.st_mode)) { @@ -1876,7 +1874,6 @@ _do_node_archive_config(void) exit(ERR_BAD_CONFIG); } - arcdir = opendir(archive_dir.data); if (arcdir == NULL) @@ -1888,42 +1885,46 @@ _do_node_archive_config(void) exit(ERR_BAD_CONFIG); } - /* - * attempt to remove any existing files in the directory TODO: collate - * problem files into list - */ - while ((arcdir_ent = readdir(arcdir)) != NULL) + if (runtime_options.dry_run == false) { - PQExpBufferData arcdir_ent_path; - initPQExpBuffer(&arcdir_ent_path); - - appendPQExpBuffer(&arcdir_ent_path, - "%s/%s", - archive_dir.data, - arcdir_ent->d_name); - - if (stat(arcdir_ent_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + /* + * attempt to remove any existing files in the directory TODO: collate + * problem files into list + */ + while ((arcdir_ent = readdir(arcdir)) != NULL) { + PQExpBufferData arcdir_ent_path; + + initPQExpBuffer(&arcdir_ent_path); + + appendPQExpBuffer(&arcdir_ent_path, + "%s/%s", + archive_dir.data, + arcdir_ent->d_name); + + if (stat(arcdir_ent_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) + { + termPQExpBuffer(&arcdir_ent_path); + continue; + } + + if (unlink(arcdir_ent_path.data) == -1) + { + log_error(_("unable to delete file in temporary archive directory")); + log_detail(_("file is: \"%s\""), 
arcdir_ent_path.data); + log_detail("%s", strerror(errno)); + closedir(arcdir); + termPQExpBuffer(&arcdir_ent_path); + exit(ERR_BAD_CONFIG); + } + termPQExpBuffer(&arcdir_ent_path); - continue; } - if (unlink(arcdir_ent_path.data) == -1) - { - log_error(_("unable to delete file in temporary archive directory")); - log_detail(_("file is: \"%s\""), arcdir_ent_path.data); - log_detail("%s", strerror(errno)); - closedir(arcdir); - termPQExpBuffer(&arcdir_ent_path); - exit(ERR_BAD_CONFIG); - } - - termPQExpBuffer(&arcdir_ent_path); + closedir(arcdir); } - closedir(arcdir); - /* * extract list of config files from --config-files */ @@ -1999,18 +2000,53 @@ _do_node_archive_config(void) } else { - log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"", - cell->key, dest_file.data); - copy_file(cell->value, dest_file.data); - copied_count++; + if (runtime_options.dry_run == true) + { + log_info("file \"%s\" would be copied to \"%s\"", + cell->key, dest_file.data); + copied_count++; + } + else + { + log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"", + cell->key, dest_file.data); + copy_file(cell->value, dest_file.data); + copied_count++; + } } termPQExpBuffer(&dest_file); } + if (runtime_options.dry_run == true) + { + log_verbose(LOG_INFO, _("%i files would have been copied to \"%s\""), + copied_count, archive_dir.data); + } + else + { + log_verbose(LOG_INFO, _("%i files copied to \"%s\""), + copied_count, archive_dir.data); + } - log_verbose(LOG_INFO, _("%i files copied to \"%s\""), - copied_count, archive_dir.data); + if (runtime_options.dry_run == true) + { + /* + * Delete directory in --dry-run mode - it should be empty unless it's been + * interfered with for some reason, in which case manual intervention is + * required + */ + if (rmdir(archive_dir.data) != 0 && errno != EEXIST) + { + log_warning(_("unable to delete directory \"%s\""), archive_dir.data); + log_detail("%s", strerror(errno)); + log_hint(_("directory may need to be manually removed")); + } + else + { + 
log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data); + } + } termPQExpBuffer(&archive_dir); }