From 927bf038a0bbfe577990e1075ab9a2137cfb4f2c Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 8 Feb 2018 16:59:30 +0900 Subject: [PATCH] "standby switchover": check demotion candidate can make replication connection Check it's actually possible for the demotion candidate to attach to the promotion candidate before executing the switchover. As with other checks of this nature, there's a faint possibility the situation could change between the time the check is carried out and the demotion candidate is restarted to connect to the promotion candidate, but there's not a lot we can do about that. The main purpose is to be able to catch existing misconfigurations before anything gets changed. Implements GitHub #370. --- dbutils.h | 8 +++ repmgr-action-node.c | 70 ++++++++++++++++++++++- repmgr-action-standby.c | 119 +++++++++++++++++++++++++++++++++++++--- repmgr-client-global.h | 8 ++- repmgr-client.c | 10 +++- repmgr-client.h | 4 ++ 6 files changed, 205 insertions(+), 14 deletions(-) diff --git a/dbutils.h b/dbutils.h index 9e4cd4d0..95e8abd0 100644 --- a/dbutils.h +++ b/dbutils.h @@ -79,6 +79,14 @@ typedef enum NODE_STATUS_UNCLEAN_SHUTDOWN } NodeStatus; +typedef enum +{ + CONN_UNKNOWN = -1, + CONN_OK, + CONN_BAD, + CONN_ERROR +} ConnectionStatus; + typedef enum { SLOT_UNKNOWN = -1, diff --git a/repmgr-action-node.c b/repmgr-action-node.c index c3002672..329872fa 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -41,6 +41,7 @@ static void _do_node_status_is_shutdown_cleanly(void); static void _do_node_archive_config(void); static void _do_node_restore_config(void); +static void do_node_check_replication_connection(void); static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output); static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output); static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); @@ -463,8 +464,7 @@ _do_node_status_is_shutdown_cleanly(void) initPQExpBuffer(&output); - appendPQExpBuffer( - &output, + appendPQExpBuffer(&output, "--state="); /* sanity-check we're dealing with a PostgreSQL directory */ @@ -580,6 +580,11 @@ do_node_check(void) exit(return_code); } + if (runtime_options.replication_connection == true) + { + do_node_check_replication_connection(); + exit(SUCCESS); + } if (strlen(config_file_options.conninfo)) conn = establish_db_connection(config_file_options.conninfo, true); @@ -883,6 +888,67 @@ do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, Check } +static void +do_node_check_replication_connection(void) +{ + PGconn *local_conn = NULL; + PGconn *repl_conn = NULL; + t_node_info node_record = T_NODE_INFO_INITIALIZER; + RecordStatus record_status = RECORD_NOT_FOUND; + t_conninfo_param_list remote_conninfo; + PQExpBufferData output; + + + initPQExpBuffer(&output); + appendPQExpBuffer(&output, + "--connection="); + + if (runtime_options.remote_node_id == UNKNOWN_NODE_ID) + { + appendPQExpBuffer(&output, "UNKNOWN"); + printf("%s\n", output.data); + termPQExpBuffer(&output); + return; + } + + local_conn = establish_db_connection(config_file_options.conninfo, true); + + record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record); + PQfinish(local_conn); + + if (record_status != RECORD_FOUND) + { + appendPQExpBuffer(&output, "UNKNOWN"); + printf("%s\n", output.data); + termPQExpBuffer(&output); + return; + } + + initialize_conninfo_params(&remote_conninfo, false); + parse_conninfo_string(node_record.conninfo, &remote_conninfo, NULL, false); + + param_set(&remote_conninfo, "replication", "1"); + param_set(&remote_conninfo, "user", node_record.repluser); + + repl_conn = establish_db_connection_by_params(&remote_conninfo, false); + + if (PQstatus(repl_conn) != CONNECTION_OK) + { + appendPQExpBuffer(&output, "BAD"); + printf("%s\n", output.data); + termPQExpBuffer(&output); + return; + } + + PQfinish(repl_conn); + + appendPQExpBuffer(&output, "OK"); + printf("%s\n", output.data); + termPQExpBuffer(&output); + + return; +} + static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output) { diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 25a77477..01dcf9f6 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -101,6 +101,7 @@ static bool write_recovery_file_line(FILE *recovery_file, char *recovery_file_pa static NodeStatus parse_node_status_is_shutdown_cleanly(const char *node_status_output, XLogRecPtr *checkPoint); static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold); +static ConnectionStatus parse_remote_node_replication_connection(const char *node_check_output); /* * STANDBY CLONE @@ -2373,8 +2374,7 @@ do_standby_switchover(void) appendPQExpBuffer(&remote_command_str, "--version 2>/dev/null && echo \"1\" || echo \"0\""); initPQExpBuffer(&command_output); - command_success = remote_command( - remote_host, + command_success = remote_command(remote_host, runtime_options.remote_user, remote_command_str.data, &command_output); @@ -2394,20 +2394,17 @@ do_standby_switchover(void) termPQExpBuffer(&command_output); initPQExpBuffer(&hint); - appendPQExpBuffer( - &hint, + appendPQExpBuffer(&hint, _("check \"pg_bindir\" is set to the correct path in \"repmgr.conf\"; current value: ")); if (strlen(config_file_options.pg_bindir)) { - appendPQExpBuffer( - &hint, + appendPQExpBuffer(&hint, "\"%s\"", config_file_options.pg_bindir); } else { - appendPQExpBuffer( - &hint, + appendPQExpBuffer(&hint, "(not set)"); } @@ -2428,6 +2425,49 @@ do_standby_switchover(void) } termPQExpBuffer(&command_output); + /* check demotion candidate can make replication connection to promotion candidate */ + { + initPQExpBuffer(&remote_command_str); + make_remote_repmgr_path(&remote_command_str, &remote_node_record); + appendPQExpBuffer(&remote_command_str, + "node check --remote-node-id=%i --replication-connection", + local_node_record.node_id); + + initPQExpBuffer(&command_output); + + command_success = remote_command(remote_host, + runtime_options.remote_user, + remote_command_str.data, + &command_output); + + termPQExpBuffer(&remote_command_str); + + if (command_success == true) + { + ConnectionStatus conn_status = parse_remote_node_replication_connection(command_output.data); + + switch(conn_status) + { + case CONN_OK: + if (runtime_options.dry_run == true) + { + log_info(_("demotion candidate is able to make replication connection to promotion candidate")); + } + break; + case CONN_BAD: + log_error(_("demotion candidate is unable to make replication connection to promotion candidate")); + exit(ERR_BAD_CONFIG); + break; + default: + log_error(_("unable to deterimine whether candidate is able to make replication connection to promotion candidate")); + exit(ERR_BAD_CONFIG); + break; + } + + termPQExpBuffer(&command_output); + } + } + /* check archive/replication status */ { int lag_seconds = 0; @@ -5447,6 +5487,69 @@ parse_node_status_is_shutdown_cleanly(const char *node_status_output, XLogRecPtr } +static ConnectionStatus +parse_remote_node_replication_connection(const char *node_check_output) +{ + ConnectionStatus conn_status = CONN_UNKNOWN; + + int c = 0, + argc_item = 0; + char **argv_array = NULL; + int optindex = 0; + + /* We're only interested in these options */ + struct option node_check_options[] = + { + {"connection", required_argument, NULL, 'c'}, + {NULL, 0, NULL, 0} + }; + + /* Don't attempt to tokenise an empty string */ + if (!strlen(node_check_output)) + { + return CONN_UNKNOWN; + } + + argc_item = parse_output_to_argv(node_check_output, &argv_array); + + /* Reset getopt's optind variable */ + optind = 0; + + /* Prevent getopt from emitting errors */ + opterr = 0; + + while ((c = getopt_long(argc_item, argv_array, "L:S:", node_check_options, + &optindex)) != -1) + { + switch (c) + { + + /* --connection */ + case 'c': + { + if (strncmp(optarg, "OK", MAXLEN) == 0) + { + conn_status = CONN_OK; + } + else if (strncmp(optarg, "BAD", MAXLEN) == 0) + { + conn_status = CONN_BAD; + } + else if (strncmp(optarg, "UNKNOWN", MAXLEN) == 0) + { + conn_status = CONN_UNKNOWN; + } + } + break; + } + } + + free_parsed_argv(&argv_array); + + return conn_status; +} + + static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold) { diff --git a/repmgr-client-global.h b/repmgr-client-global.h index c254a6ec..c5810f64 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -68,6 +68,7 @@ typedef struct int node_id; char node_name[MAXLEN]; char data_dir[MAXPGPATH]; + int remote_node_id; /* "standby clone" options */ bool copy_external_config_files; @@ -103,6 +104,7 @@ typedef struct bool role; bool slots; bool has_passfile; + bool replication_connection; /* "node join" options */ char config_files[MAXLEN]; @@ -139,8 +141,8 @@ typedef struct "", "", "", "", \ /* other connection options */ \ "", "", \ - /* node options */ \ - UNKNOWN_NODE_ID, "", "", \ + /* general node options */ \ + UNKNOWN_NODE_ID, "", "", UNKNOWN_NODE_ID, \ /* "standby clone" options */ \ false, CONFIG_FILE_SAMEPATH, false, false, false, "", "", "", \ false, \ @@ -153,7 +155,7 @@ typedef struct /* "node status" options */ \ false, \ /* "node check" options */ \ - false, false, false, false, false, false, \ + false, false, false, false, false, false, false, \ /* "node join" options */ \ "", \ /* "node service" options */ \ diff --git a/repmgr-client.c b/repmgr-client.c index 50a28b08..e6b5304c 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -60,7 +60,6 @@ #include "repmgr-action-witness.h" #include "repmgr-action-bdr.h" #include "repmgr-action-node.h" - #include "repmgr-action-cluster.h" #include /* for PG_TEMP_FILE_PREFIX */ @@ -329,6 +328,11 @@ main(int argc, char **argv) strncpy(runtime_options.node_name, optarg, MAXLEN); break; + /* --remote-node-id */ + case OPT_REMOTE_NODE_ID: + runtime_options.remote_node_id = repmgr_atoi(optarg, "--remote-node-id", &cli_errors, false); + break; + /* * standby options * --------------- */ @@ -455,6 +459,10 @@ main(int argc, char **argv) runtime_options.has_passfile = true; break; + case OPT_REPL_CONN: + runtime_options.replication_connection = true; + break; + /*-------------------- * "node rejoin" options *-------------------- diff --git a/repmgr-client.h b/repmgr-client.h index cc901ee3..bad1ec5a 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -83,6 +83,8 @@ #define OPT_CONFIG_ARCHIVE_DIR 1034 #define OPT_HAS_PASSFILE 1035 #define OPT_WAIT_START 1036 +#define OPT_REPL_CONN 1037 +#define OPT_REMOTE_NODE_ID 1038 /* deprecated since 3.3 */ #define OPT_DATA_DIR 999 @@ -115,6 +117,7 @@ static struct option long_options[] = {"pgdata", required_argument, NULL, 'D'}, {"node-id", required_argument, NULL, OPT_NODE_ID}, {"node-name", required_argument, NULL, OPT_NODE_NAME}, + {"remote-node-id", required_argument, NULL, OPT_REMOTE_NODE_ID}, /* logging options */ {"log-level", required_argument, NULL, 'L'}, @@ -158,6 +161,7 @@ static struct option long_options[] = {"role", no_argument, NULL, OPT_ROLE}, {"slots", no_argument, NULL, OPT_SLOTS}, {"has-passfile", no_argument, NULL, OPT_HAS_PASSFILE}, + {"replication-connection", no_argument, NULL, OPT_REPL_CONN}, /* "node rejoin" options */ {"config-files", required_argument, NULL, OPT_CONFIG_FILES},