From fa10fd84933c2c885f0526729076cb0aa6e37095 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 28 Sep 2016 14:04:59 +0900 Subject: [PATCH] repmgr: add option `--wait-sync` for standby register Causes repmgr to wait for the updated node record to propagate to the standby before exiting. This can be used to ensure that actions which depend on the standby's node record being synchronised (such as starting repmgrd) are not carried out prematurely. Addresses GitHub #103 --- HISTORY | 3 ++ README.md | 36 +++++++++--------- errcode.h | 2 + repmgr.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- repmgr.h | 5 ++- 5 files changed, 136 insertions(+), 21 deletions(-) diff --git a/HISTORY b/HISTORY index c70ae831..4d43e290 100644 --- a/HISTORY +++ b/HISTORY @@ -9,6 +9,9 @@ repmgr: before cloning with pg_basebackup, check that sufficient free walsenders are available (Ian) improve "repmgr-auto" Debian package (Gianni) + repmgr: add option `--wait-sync` for `standby register` which causes + repmgr to wait for the registered node record to synchronise to + the standby 3.1.5 2016-08-15 repmgrd: in a failover situation, prevent endless looping when diff --git a/README.md b/README.md index 366e4002..cbe71ce4 100644 --- a/README.md +++ b/README.md @@ -1587,23 +1587,25 @@ which contains connection details for the local database. `repmgr` or `repmgrd` will return one of the following error codes on program exit: -* SUCCESS (0) Program ran successfully. 
-* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid -* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error (repmgr only) -* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed -* ERR_DB_CON (6) Error when trying to connect to a database -* ERR_DB_QUERY (7) Error while executing a database query -* ERR_PROMOTED (8) Exiting program because the node has been promoted to master -* ERR_STR_OVERFLOW (10) String overflow error -* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only) -* ERR_BAD_SSH (12) Error when connecting to remote host via SSH (repmgr only) -* ERR_SYS_FAILURE (13) Error when forking (repmgrd only) -* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup (repmgr only) -* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only) -* ERR_BAD_BACKUP_LABEL (17) Corrupt or unreadable backup label encountered (repmgr only) -* ERR_SWITCHOVER_FAIL (18) Error encountered during switchover (repmgr only) -* ERR_BARMAN (19) Unrecoverable error while accessing the barman server (repmgr only) - +* SUCCESS (0) Program ran successfully. 
+* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid +* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error + (repmgr only) +* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed +* ERR_DB_CON (6) Error when trying to connect to a database +* ERR_DB_QUERY (7) Error while executing a database query +* ERR_PROMOTED (8) Exiting program because the node has been promoted to master +* ERR_STR_OVERFLOW (10) String overflow error +* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only) +* ERR_BAD_SSH (12) Error when connecting to remote host via SSH (repmgr only) +* ERR_SYS_FAILURE (13) Error when forking (repmgrd only) +* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup (repmgr only) +* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only) +* ERR_BAD_BACKUP_LABEL (17) Corrupt or unreadable backup label encountered (repmgr only) +* ERR_SWITCHOVER_FAIL (18) Error encountered during switchover (repmgr only) +* ERR_BARMAN (19) Unrecoverable error while accessing the barman server (repmgr only) +* ERR_REGISTRATION_SYNC (20) After registering a standby, local node record was not + synchronised (repmgr only, with --wait-sync option) Support and Assistance ---------------------- diff --git a/errcode.h b/errcode.h index 45c43c77..783501f8 100644 --- a/errcode.h +++ b/errcode.h @@ -39,5 +39,7 @@ #define ERR_BAD_BACKUP_LABEL 17 #define ERR_SWITCHOVER_FAIL 18 #define ERR_BARMAN 19 +#define ERR_REGISTRATION_SYNC 20 + #endif /* _ERRCODE_H_ */ diff --git a/repmgr.c b/repmgr.c index 62cc0faa..4efcfdae 100644 --- a/repmgr.c +++ b/repmgr.c @@ -227,6 +227,7 @@ main(int argc, char **argv) {"without-barman", no_argument, NULL, OPT_WITHOUT_BARMAN}, {"no-upstream-connection", no_argument, NULL, OPT_NO_UPSTREAM_CONNECTION}, {"copy-external-config-files", optional_argument, NULL, OPT_COPY_EXTERNAL_CONFIG_FILES}, + {"wait-sync", optional_argument, NULL, OPT_REGISTER_WAIT}, 
{"version", no_argument, NULL, 'V'}, /* Following options deprecated */ {"local-port", required_argument, NULL, 'l'}, @@ -524,7 +525,13 @@ main(int argc, char **argv) case OPT_NO_UPSTREAM_CONNECTION: runtime_options.no_upstream_connection = true; break; - + case OPT_REGISTER_WAIT: + runtime_options.wait_register_sync = true; + if (optarg != NULL) + { + runtime_options.wait_register_sync_seconds = repmgr_atoi(optarg, "--wait-sync", &cli_errors, false); + } + break; /* deprecated options - output a warning */ case 'l': /* -l/--local-port is deprecated */ @@ -1441,6 +1448,94 @@ do_standby_register(void) true, NULL); + /* if --wait-sync option set, wait for the records to synchronise */ + + if (runtime_options.wait_register_sync) + { + bool sync_ok = false; + int timer = 0; + int node_record_result; + t_node_info node_record_on_master = T_NODE_INFO_INITIALIZER; + t_node_info node_record_on_standby = T_NODE_INFO_INITIALIZER; + + node_record_result = get_node_record(master_conn, + options.cluster_name, + options.node, + &node_record_on_master); + + if (node_record_result != 1) + { + log_err(_("unable to retrieve node record from master\n")); + PQfinish(master_conn); + PQfinish(conn); + exit(ERR_REGISTRATION_SYNC); + } + + for (;;) + { + bool records_match = true; + + if (runtime_options.wait_register_sync_seconds && runtime_options.wait_register_sync_seconds == timer) + break; + + // XXX check result + node_record_result = get_node_record(conn, + options.cluster_name, + options.node, + &node_record_on_standby); + + if (node_record_result == 0) + { + /* no record available yet on standby*/ + records_match = false; + } + else if (node_record_result == 1) + { + /* compare relevant fields */ + if (node_record_on_standby.upstream_node_id != node_record_on_master.upstream_node_id) + records_match = false; + + if (node_record_on_standby.type != node_record_on_master.type) + records_match = false; + + if (node_record_on_standby.priority != node_record_on_master.priority) + 
records_match = false; + + if (node_record_on_standby.active != node_record_on_master.active) + records_match = false; + + if (strcmp(node_record_on_standby.name, node_record_on_master.name) != 0) + records_match = false; + + if (strcmp(node_record_on_standby.conninfo_str, node_record_on_master.conninfo_str) != 0) + records_match = false; + + if (strcmp(node_record_on_standby.slot_name, node_record_on_master.slot_name) != 0) + records_match = false; + + if (records_match == true) + { + sync_ok = true; + break; + } + } + + sleep(1); + timer ++; + } + + if (sync_ok == false) + { + log_err(_("node record was not synchronised after %i seconds\n"), + runtime_options.wait_register_sync_seconds); + PQfinish(master_conn); + PQfinish(conn); + exit(ERR_REGISTRATION_SYNC); + } + + log_info(_("node record on standby synchronised from master\n")); + } + PQfinish(master_conn); PQfinish(conn); @@ -6125,7 +6220,16 @@ check_parameters_for_action(const int action) } } - /* Warn about parameters which apply to STANDBY SWITCHOVER only */ + /* Warn about parameters which apply to STANDBY REGISTER only */ + if (action != STANDBY_REGISTER) + { + if (runtime_options.wait_register_sync) + { + item_list_append(&cli_warnings, _("--wait-sync can only be used when executing STANDBY REGISTER")); + } + } + + /* Warn about parameters which apply to STANDBY SWITCHOVER only */ if (action != STANDBY_SWITCHOVER) { if (pg_rewind_supplied == true) @@ -6134,6 +6238,7 @@ check_parameters_for_action(const int action) } } + /* Warn about parameters which apply to WITNESS UNREGISTER only */ if (action != WITNESS_UNREGISTER) { if (runtime_options.node) @@ -6142,7 +6247,7 @@ check_parameters_for_action(const int action) } } - /* Warn about parameters which apply to CLUSTER SHOW only */ + /* Warn about parameters which apply to CLUSTER SHOW only */ if (action != CLUSTER_SHOW) { if (runtime_options.csv_mode) diff --git a/repmgr.h b/repmgr.h index 68807ece..48c7f85e 100644 --- a/repmgr.h +++ b/repmgr.h @@ -60,6 
+60,7 @@ #define OPT_NODE 9 #define OPT_WITHOUT_BARMAN 10 #define OPT_NO_UPSTREAM_CONNECTION 11 +#define OPT_REGISTER_WAIT 12 /* deprecated command line options */ #define OPT_INITDB_NO_PWPROMPT 999 @@ -93,6 +94,8 @@ typedef struct bool no_upstream_connection; bool copy_external_config_files; int copy_external_config_files_destination; + bool wait_register_sync; + int wait_register_sync_seconds; char masterport[MAXLEN]; /* * configuration file parameters which can be overridden on the @@ -116,7 +119,7 @@ typedef struct char recovery_min_apply_delay[MAXLEN]; } t_runtime_options; -#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, false, false, false, CONFIG_FILE_SAMEPATH, "", "", "", "", "fast", "", 0, 0, "", ""} +#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, false, false, false, CONFIG_FILE_SAMEPATH, false, 0, "", "", "", "", "fast", "", 0, 0, "", ""} struct BackupLabel {