diff --git a/README.md b/README.md index 7afdbdb2..393d44e5 100644 --- a/README.md +++ b/README.md @@ -37,11 +37,13 @@ The following commands are available: repmgr standby unregister repmgr standby promote repmgr standby follow + repmgr standby switchover repmgr bdr register repmgr bdr unregister repmgr node status + repmgr node check repmgr cluster show repmgr cluster event [--all] [--node-id] [--node-name] [--event] [--event-matching] @@ -55,6 +57,16 @@ The following commands are available: `master register` can be used as an alias for `primary register`. +* `standby switchover` + + ... + + If other standbys (siblings of the promotion candidate) are connected + to the demotion candidate and `--siblings-follow` is specified, `repmgr` + can instruct these to follow the new primary. Note this can only work + if the configuration file on each sibling is at the same path as specified + in -f/--config-file or -C/--remote-config-file. + * `cluster show` Displays information about each active node in the replication cluster. This diff --git a/dbutils.h b/dbutils.h index 58b99d68..f84ca51c 100644 --- a/dbutils.h +++ b/dbutils.h @@ -72,6 +72,7 @@ typedef enum { */ typedef struct s_node_info { + /* contents of "repmgr.nodes" */ int node_id; int upstream_node_id; t_server_type type; @@ -91,6 +92,7 @@ typedef struct s_node_info PGconn *conn; /* for ad-hoc use e.g. when working with a list of nodes */ char details[MAXLEN]; + bool reachable; /* various statistics */ int max_wal_senders; int attached_wal_receivers; @@ -101,6 +103,7 @@ typedef struct s_node_info #define T_NODE_INFO_INITIALIZER { \ + /* contents of "repmgr.nodes" */ \ NODE_NOT_FOUND, \ NO_UPSTREAM_NODE, \ UNKNOWN, \ @@ -112,12 +115,15 @@ typedef struct s_node_info DEFAULT_PRIORITY, \ true, \ "", \ + /* used during failover to track node status */ \ InvalidXLogRecPtr, \ NODE_STATUS_UNKNOWN, \ RECTYPE_UNKNOWN, \ MS_NORMAL, \ NULL, \ - "", \ + /* for ad-hoc use e.g.
when working with a list of nodes */ \ + "", true \ + /* various statistics */ \ -1, -1, -1, -1, -1 \ } diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 49fa6a3e..419133f0 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -52,7 +52,6 @@ do_node_status(void) return _do_node_status_is_shutdown(); } - if (strlen(config_file_options.conninfo)) conn = establish_db_connection(config_file_options.conninfo, true); else @@ -979,7 +978,9 @@ do_node_rejoin(void) /* check provided upstream connection */ - upstream_conn = establish_db_connection(runtime_options.upstream_conninfo, true); + upstream_conn = establish_db_connection_by_params(&source_conninfo, true); + +/* establish_db_connection(runtime_options.upstream_conninfo, true); */ if (get_primary_node_record(upstream_conn, &primary_node_record) == false) { @@ -1030,7 +1031,7 @@ do_node_rejoin(void) appendPQExpBuffer( &command, " --source-server='%s'", - runtime_options.upstream_conninfo); + primary_node_record.conninfo); log_notice(_("executing pg_rewind")); log_debug("pg_rewind command is:\n %s", diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index de3e27a9..6a358ec3 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -1360,8 +1360,6 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor } } - - /* Initialise connection parameters to write as `primary_conninfo` */ initialize_conninfo_params(&recovery_conninfo, false); @@ -1540,17 +1538,15 @@ do_standby_switchover(void) PGconn *local_conn; PGconn *remote_conn; - t_node_info local_node_record = T_NODE_INFO_INITIALIZER; - /* the remote server is the primary to be demoted */ char remote_conninfo[MAXCONNINFO] = ""; char remote_host[MAXLEN] = ""; int remote_node_id; t_node_info remote_node_record = T_NODE_INFO_INITIALIZER; - RecordStatus record_status; + RecordStatus record_status; RecoveryType recovery_type; PQExpBufferData remote_command_str; PQExpBufferData command_output; @@ -1565,6 
+1561,10 @@ do_standby_switchover(void) /* store list of configuration files on the demotion candidate */ KeyValueList remote_config_files = { NULL, NULL }; + /* store list of sibling nodes if --siblings-follow specified */ + NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER; + int unreachable_sibling_node_count = 0; + /* * SANITY CHECKS * @@ -1847,9 +1847,6 @@ do_standby_switchover(void) } PQfinish(remote_conn); - PQfinish(local_conn); - - /* Determine the remote's configuration file location */ /* -------------------------------------------------- */ @@ -1884,6 +1881,7 @@ do_standby_switchover(void) log_error(_("unable to find the specified repmgr configuration file on remote server")); log_detail(_("remote configuration file is \"%s\""), runtime_options.remote_config_file); + PQfinish(local_conn); exit(ERR_BAD_CONFIG); } @@ -1952,10 +1950,82 @@ do_standby_switchover(void) { log_error(_("no remote configuration file supplied or found in a default location - terminating")); log_hint(_("specify the remote configuration file with -C/--remote-config-file")); + PQfinish(local_conn); exit(ERR_BAD_CONFIG); } } + /* + * If --siblings-follow specified, get list and check they're reachable + */ + + if (runtime_options.siblings_follow == true) + { + char host[MAXLEN] = ""; + NodeInfoListCell *cell; + + get_active_sibling_node_records(local_conn, + local_node_record.node_id, + local_node_record.upstream_node_id, + &sibling_nodes); + + log_verbose(LOG_INFO, _("%i active sibling nodes found"), + sibling_nodes.node_count); + + for (cell = sibling_nodes.head; cell; cell = cell->next) + { + /* get host from node record */ + get_conninfo_value(cell->node_info->conninfo, "host", host); + r = test_ssh_connection(host, runtime_options.remote_user); + + if (r != 0) + { + cell->node_info->reachable = false; + unreachable_sibling_node_count++; + } + else + { + cell->node_info->reachable = true; + } + } + + if (unreachable_sibling_node_count > 0) + { + if (runtime_options.force 
== false) + { + log_error(_("%i of %i sibling nodes unreachable via SSH:"), + unreachable_sibling_node_count, + sibling_nodes.node_count); + } + else + { + log_warning(_("%i of %i sibling nodes unreachable via SSH:"), + unreachable_sibling_node_count, + sibling_nodes.node_count); + } + + for (cell = sibling_nodes.head; cell; cell = cell->next) + { + if (cell->node_info->reachable == true) + continue; + log_detail(" %s (ID: %i)", + cell->node_info->node_name, + cell->node_info->node_id); + } + + if (runtime_options.force == false) + { + log_hint(_("use -F/--force to proceed in any case")); + PQfinish(local_conn); + exit(ERR_BAD_CONFIG); + } + + + log_detail(_("F/--force specified, proceeding anyway")); + } + } + PQfinish(local_conn); + /* * Sanity checks completed - prepare for the switchover @@ -2165,7 +2235,7 @@ do_standby_switchover(void) make_remote_repmgr_path(&remote_command_str); appendPQExpBuffer(&remote_command_str, - "%s--upstream-conninfo=\\'%s\\' node rejoin", + "%s-d \\'%s\\' node rejoin", node_rejoin_options.data, local_node_record.conninfo); @@ -2218,6 +2288,57 @@ do_standby_switchover(void) log_detail(_("node \"%s\" is now primary"), local_node_record.node_name); + /* + * If --siblings-follow specified, attempt to make them follow the + * new primary + */ + + if (runtime_options.siblings_follow == true) + { + int failed_follow_count = 0; + char host[MAXLEN] = ""; + NodeInfoListCell *cell; + log_notice(_("executing STANDBY FOLLOW on %i of %i siblings"), + sibling_nodes.node_count - unreachable_sibling_node_count, + sibling_nodes.node_count); + + for (cell = sibling_nodes.head; cell; cell = cell->next) + { + int r = 0; + log_debug("XXX %s", cell->node_info->node_name); + /* skip nodes previously determined as unreachable */ + if (cell->node_info->reachable == false) + { + log_debug(" XXX unreachable!"); + continue; + } + + initPQExpBuffer(&remote_command_str); + make_remote_repmgr_path(&remote_command_str); + + appendPQExpBuffer(&remote_command_str, +
"standby follow"); + get_conninfo_value(cell->node_info->conninfo, "host", host); + log_debug("executing:\n \"%s\"", remote_command_str.data); + r = remote_command( + host, + runtime_options.remote_user, + remote_command_str.data, + NULL); + if (r != 0) + { + log_warning(_("STANDBY FOLLOW failed on node \"%s\""), + cell->node_info->node_name); + failed_follow_count++; + } + termPQExpBuffer(&remote_command_str); + } + + if (failed_follow_count == 0) + { + log_info(_("STANDBY FOLLOW")); + } + } return; } @@ -3381,7 +3502,7 @@ copy_configuration_files(void) { int i, r; t_configfile_info *file; - char *host; + char *host = NULL; /* get host from upstream record */ host = param_get(&recovery_conninfo, "host"); diff --git a/repmgr-client-global.h b/repmgr-client-global.h index bca484c7..71a9977e 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -86,6 +86,7 @@ typedef struct char remote_config_file[MAXPGPATH]; bool always_promote; bool force_rewind; + bool siblings_follow; /* "node status" options */ bool is_shutdown; @@ -134,7 +135,7 @@ typedef struct /* "standby register" options */ \ false, 0, \ /* "standby switchover" options */ \ - "", false, false, \ + "", false, false, false, \ /* "node status" options */ \ false, \ /* "node check" options */ \ diff --git a/repmgr-client.c b/repmgr-client.c index 94c886be..d18e9228 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -401,6 +401,7 @@ main(int argc, char **argv) /* "standby switchover" options * * ---------------------------- */ + /* -C/--remote-config-file */ case 'C': strncpy(runtime_options.remote_config_file, optarg, MAXPGPATH); break; @@ -413,6 +414,10 @@ main(int argc, char **argv) runtime_options.force_rewind = true; break; + case OPT_SIBLINGS_FOLLOW: + runtime_options.siblings_follow = true; + break; + /* "node status" options * * --------------------- */ @@ -1178,11 +1183,11 @@ check_cli_parameters(const int action) break; case NODE_REJOIN: - if (runtime_options.upstream_conninfo[0] == 
'\0') + if (runtime_options.connection_param_provided == false) { item_list_append( &cli_errors, - "--upstream-conninfo must be provided with NODE REJOIN"); + "database connection parameters for an available node must be provided when executing NODE REJOIN"); } break; case CLUSTER_SHOW: diff --git a/repmgr-client.h b/repmgr-client.h index 63e87a39..7a74eb45 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -69,6 +69,7 @@ #define OPT_OPTFORMAT 1033 #define OPT_REPLICATION_LAG 1034 #define OPT_CONFIG_FILES 1035 +#define OPT_SIBLINGS_FOLLOW 1036 /* deprecated since 3.3 */ #define OPT_DATA_DIR 999 #define OPT_NO_CONNINFO_PASSWORD 998 @@ -130,6 +131,7 @@ static struct option long_options[] = {"remote-config-file", required_argument, NULL, 'C'}, {"always-promote", no_argument, NULL, OPT_ALWAYS_PROMOTE }, {"force-rewind", no_argument, NULL, OPT_FORCE_REWIND }, + {"siblings-follow", no_argument, NULL, OPT_SIBLINGS_FOLLOW }, /* "node status" options */ {"is-shutdown", no_argument, NULL, OPT_IS_SHUTDOWN },