After switchover, enable sibling standbys to follow new primary

This commit is contained in:
Ian Barwick
2017-08-10 00:06:16 +09:00
parent 4930c95ef7
commit a57fb5b50c
7 changed files with 165 additions and 17 deletions

View File

@@ -37,11 +37,13 @@ The following commands are available:
repmgr standby unregister repmgr standby unregister
repmgr standby promote repmgr standby promote
repmgr standby follow repmgr standby follow
repmgr standby switchover
repmgr bdr register repmgr bdr register
repmgr bdr unregister repmgr bdr unregister
repmgr node status repmgr node status
repmgr node check
repmgr cluster show repmgr cluster show
repmgr cluster event [--all] [--node-id] [--node-name] [--event] [--event-matching] repmgr cluster event [--all] [--node-id] [--node-name] [--event] [--event-matching]
@@ -55,6 +57,16 @@ The following commands are available:
`master register` can be used as an alias for `primary register`. `master register` can be used as an alias for `primary register`.
* `standby switchover`
...
If other standbys (siblings of the promotion candidate) are connected
to the demotion candidate and `--siblings-follow` is specified, `repmgr`
can instruct these to follow the new primary. Note this can only work
if the configuration file on each sibling is located at the same path as
specified in -f/--config-file or -C/--remote-config-file.
* `cluster show` * `cluster show`
Displays information about each active node in the replication cluster. This Displays information about each active node in the replication cluster. This

View File

@@ -72,6 +72,7 @@ typedef enum {
*/ */
typedef struct s_node_info typedef struct s_node_info
{ {
/* contents of "repmgr.nodes" */
int node_id; int node_id;
int upstream_node_id; int upstream_node_id;
t_server_type type; t_server_type type;
@@ -91,6 +92,7 @@ typedef struct s_node_info
PGconn *conn; PGconn *conn;
/* for ad-hoc use e.g. when working with a list of nodes */ /* for ad-hoc use e.g. when working with a list of nodes */
char details[MAXLEN]; char details[MAXLEN];
bool reachable;
/* various statistics */ /* various statistics */
int max_wal_senders; int max_wal_senders;
int attached_wal_receivers; int attached_wal_receivers;
@@ -101,6 +103,7 @@ typedef struct s_node_info
#define T_NODE_INFO_INITIALIZER { \ #define T_NODE_INFO_INITIALIZER { \
/* contents of "repmgr.nodes" */ \
NODE_NOT_FOUND, \ NODE_NOT_FOUND, \
NO_UPSTREAM_NODE, \ NO_UPSTREAM_NODE, \
UNKNOWN, \ UNKNOWN, \
@@ -112,12 +115,15 @@ typedef struct s_node_info
DEFAULT_PRIORITY, \ DEFAULT_PRIORITY, \
true, \ true, \
"", \ "", \
/* used during failover to track node status */ \
InvalidXLogRecPtr, \ InvalidXLogRecPtr, \
NODE_STATUS_UNKNOWN, \ NODE_STATUS_UNKNOWN, \
RECTYPE_UNKNOWN, \ RECTYPE_UNKNOWN, \
MS_NORMAL, \ MS_NORMAL, \
NULL, \ NULL, \
"", \ /* for ad-hoc use e.g. when working with a list of nodes */ \
"", true \
/* various statistics */ \
-1, -1, -1, -1, -1 \ -1, -1, -1, -1, -1 \
} }

View File

@@ -52,7 +52,6 @@ do_node_status(void)
return _do_node_status_is_shutdown(); return _do_node_status_is_shutdown();
} }
if (strlen(config_file_options.conninfo)) if (strlen(config_file_options.conninfo))
conn = establish_db_connection(config_file_options.conninfo, true); conn = establish_db_connection(config_file_options.conninfo, true);
else else
@@ -979,7 +978,9 @@ do_node_rejoin(void)
/* check provided upstream connection */ /* check provided upstream connection */
upstream_conn = establish_db_connection(runtime_options.upstream_conninfo, true); upstream_conn = establish_db_connection_by_params(&source_conninfo, true);
/* establish_db_connection(runtime_options.upstream_conninfo, true); */
if (get_primary_node_record(upstream_conn, &primary_node_record) == false) if (get_primary_node_record(upstream_conn, &primary_node_record) == false)
{ {
@@ -1030,7 +1031,7 @@ do_node_rejoin(void)
appendPQExpBuffer( appendPQExpBuffer(
&command, &command,
" --source-server='%s'", " --source-server='%s'",
runtime_options.upstream_conninfo); primary_node_record.conninfo);
log_notice(_("executing pg_rewind")); log_notice(_("executing pg_rewind"));
log_debug("pg_rewind command is:\n %s", log_debug("pg_rewind command is:\n %s",

View File

@@ -1360,8 +1360,6 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
} }
} }
/* Initialise connection parameters to write as `primary_conninfo` */ /* Initialise connection parameters to write as `primary_conninfo` */
initialize_conninfo_params(&recovery_conninfo, false); initialize_conninfo_params(&recovery_conninfo, false);
@@ -1540,10 +1538,8 @@ do_standby_switchover(void)
PGconn *local_conn; PGconn *local_conn;
PGconn *remote_conn; PGconn *remote_conn;
t_node_info local_node_record = T_NODE_INFO_INITIALIZER; t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
/* the remote server is the primary to be demoted */ /* the remote server is the primary to be demoted */
char remote_conninfo[MAXCONNINFO] = ""; char remote_conninfo[MAXCONNINFO] = "";
char remote_host[MAXLEN] = ""; char remote_host[MAXLEN] = "";
@@ -1565,6 +1561,10 @@ do_standby_switchover(void)
/* store list of configuration files on the demotion candidate */ /* store list of configuration files on the demotion candidate */
KeyValueList remote_config_files = { NULL, NULL }; KeyValueList remote_config_files = { NULL, NULL };
/* store list of sibling nodes if --siblings-follow specified */
NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
int unreachable_sibling_node_count = 0;
/* /*
* SANITY CHECKS * SANITY CHECKS
* *
@@ -1847,9 +1847,6 @@ do_standby_switchover(void)
} }
PQfinish(remote_conn); PQfinish(remote_conn);
PQfinish(local_conn);
/* Determine the remote's configuration file location */ /* Determine the remote's configuration file location */
/* -------------------------------------------------- */ /* -------------------------------------------------- */
@@ -1884,6 +1881,7 @@ do_standby_switchover(void)
log_error(_("unable to find the specified repmgr configuration file on remote server")); log_error(_("unable to find the specified repmgr configuration file on remote server"));
log_detail(_("remote configuration file is \"%s\""), log_detail(_("remote configuration file is \"%s\""),
runtime_options.remote_config_file); runtime_options.remote_config_file);
PQfinish(local_conn);
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
@@ -1952,10 +1950,82 @@ do_standby_switchover(void)
{ {
log_error(_("no remote configuration file supplied or found in a default location - terminating")); log_error(_("no remote configuration file supplied or found in a default location - terminating"));
log_hint(_("specify the remote configuration file with -C/--remote-config-file")); log_hint(_("specify the remote configuration file with -C/--remote-config-file"));
PQfinish(local_conn);
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
} }
/*
* If --siblings-follow specified, get list and check they're reachable
*/
if (runtime_options.siblings_follow == true)
{
char host[MAXLEN] = "";
NodeInfoListCell *cell;
get_active_sibling_node_records(local_conn,
local_node_record.node_id,
local_node_record.upstream_node_id,
&sibling_nodes);
log_verbose(LOG_INFO, _("%i active sibling nodes found"),
sibling_nodes.node_count);
for (cell = sibling_nodes.head; cell; cell = cell->next)
{
/* get host from node record */
get_conninfo_value(cell->node_info->conninfo, "host", host);
r = test_ssh_connection(host, runtime_options.remote_user);
if (r != 0)
{
cell->node_info->reachable = false;
unreachable_sibling_node_count++;
}
else
{
cell->node_info->reachable = true;
}
}
if (unreachable_sibling_node_count > 0)
{
if (runtime_options.force == false)
{
log_error(_("%i of %i sibling nodes unreachable via SSH:"),
unreachable_sibling_node_count,
sibling_nodes.node_count);
}
else
{
log_warning(_("%i of %i sibling nodes unreachable via SSH:"),
unreachable_sibling_node_count,
sibling_nodes.node_count);
}
for (cell = sibling_nodes.head; cell; cell = cell->next)
{
if (cell->node_info->reachable == true)
continue;
log_detail(" %s (ID: %i)",
cell->node_info->node_name,
cell->node_info->node_id);
}
if (runtime_options.force == false)
{
log_hint(_("use -F/--force to proceed in any case"));
PQfinish(local_conn);
exit(ERR_BAD_CONFIG);
}
log_detail(_("F/--force specified, proceeding anyway"));
}
}
PQfinish(local_conn);
/* /*
* Sanity checks completed - prepare for the switchover * Sanity checks completed - prepare for the switchover
@@ -2165,7 +2235,7 @@ do_standby_switchover(void)
make_remote_repmgr_path(&remote_command_str); make_remote_repmgr_path(&remote_command_str);
appendPQExpBuffer(&remote_command_str, appendPQExpBuffer(&remote_command_str,
"%s--upstream-conninfo=\\'%s\\' node rejoin", "%s-d \\'%s\\' node rejoin",
node_rejoin_options.data, node_rejoin_options.data,
local_node_record.conninfo); local_node_record.conninfo);
@@ -2218,6 +2288,57 @@ do_standby_switchover(void)
log_detail(_("node \"%s\" is now primary"), log_detail(_("node \"%s\" is now primary"),
local_node_record.node_name); local_node_record.node_name);
/*
* If --siblings-follow specified, attempt to make them follow the
* new primary
*/
if (runtime_options.siblings_follow == true)
{
int failed_follow_count = 0;
char host[MAXLEN] = "";
NodeInfoListCell *cell;
log_notice(_("executing STANDBY FOLLOW on %i of %i siblings"),
sibling_nodes.node_count - unreachable_sibling_node_count,
sibling_nodes.node_count);
for (cell = sibling_nodes.head; cell; cell = cell->next)
{
int r = 0;
log_debug("XXX %s", cell->node_info->node_name);
/* skip nodes previously determined as unreachable */
if (cell->node_info->reachable == false)
{
log_debug(" XXX unreachable!");
continue;
}
initPQExpBuffer(&remote_command_str);
make_remote_repmgr_path(&remote_command_str);
appendPQExpBuffer(&remote_command_str,
"standby follow");
get_conninfo_value(cell->node_info->conninfo, "host", host);
log_debug("executing:\n \"%s\"", remote_command_str.data);
r = remote_command(
host,
runtime_options.remote_user,
remote_command_str.data,
NULL);
if (r != 0)
{
log_warning(_("STANDBY FOLLOW failed on node \"%s\""),
cell->node_info->node_name);
failed_follow_count++;
}
termPQExpBuffer(&remote_command_str);
}
if (failed_follow_count == 0)
{
log_info(_("STANDBY FOLLOW"));
}
}
return; return;
} }
@@ -3381,7 +3502,7 @@ copy_configuration_files(void)
{ {
int i, r; int i, r;
t_configfile_info *file; t_configfile_info *file;
char *host; char *host = NULL;
/* get host from upstream record */ /* get host from upstream record */
host = param_get(&recovery_conninfo, "host"); host = param_get(&recovery_conninfo, "host");

View File

@@ -86,6 +86,7 @@ typedef struct
char remote_config_file[MAXPGPATH]; char remote_config_file[MAXPGPATH];
bool always_promote; bool always_promote;
bool force_rewind; bool force_rewind;
bool siblings_follow;
/* "node status" options */ /* "node status" options */
bool is_shutdown; bool is_shutdown;
@@ -134,7 +135,7 @@ typedef struct
/* "standby register" options */ \ /* "standby register" options */ \
false, 0, \ false, 0, \
/* "standby switchover" options */ \ /* "standby switchover" options */ \
"", false, false, \ "", false, false, false, \
/* "node status" options */ \ /* "node status" options */ \
false, \ false, \
/* "node check" options */ \ /* "node check" options */ \

View File

@@ -401,6 +401,7 @@ main(int argc, char **argv)
/* "standby switchover" options * /* "standby switchover" options *
* ---------------------------- */ * ---------------------------- */
/* -C/--remote-config-file */
case 'C': case 'C':
strncpy(runtime_options.remote_config_file, optarg, MAXPGPATH); strncpy(runtime_options.remote_config_file, optarg, MAXPGPATH);
break; break;
@@ -413,6 +414,10 @@ main(int argc, char **argv)
runtime_options.force_rewind = true; runtime_options.force_rewind = true;
break; break;
case OPT_SIBLINGS_FOLLOW:
runtime_options.siblings_follow = true;
break;
/* "node status" options * /* "node status" options *
* --------------------- */ * --------------------- */
@@ -1178,11 +1183,11 @@ check_cli_parameters(const int action)
break; break;
case NODE_REJOIN: case NODE_REJOIN:
if (runtime_options.upstream_conninfo[0] == '\0') if (runtime_options.connection_param_provided == false)
{ {
item_list_append( item_list_append(
&cli_errors, &cli_errors,
"--upstream-conninfo must be provided with NODE REJOIN"); "database connection parameters for an available node must be provided when executing NODE REJOIN");
} }
break; break;
case CLUSTER_SHOW: case CLUSTER_SHOW:

View File

@@ -69,6 +69,7 @@
#define OPT_OPTFORMAT 1033 #define OPT_OPTFORMAT 1033
#define OPT_REPLICATION_LAG 1034 #define OPT_REPLICATION_LAG 1034
#define OPT_CONFIG_FILES 1035 #define OPT_CONFIG_FILES 1035
#define OPT_SIBLINGS_FOLLOW 1036
/* deprecated since 3.3 */ /* deprecated since 3.3 */
#define OPT_DATA_DIR 999 #define OPT_DATA_DIR 999
#define OPT_NO_CONNINFO_PASSWORD 998 #define OPT_NO_CONNINFO_PASSWORD 998
@@ -130,6 +131,7 @@ static struct option long_options[] =
{"remote-config-file", required_argument, NULL, 'C'}, {"remote-config-file", required_argument, NULL, 'C'},
{"always-promote", no_argument, NULL, OPT_ALWAYS_PROMOTE }, {"always-promote", no_argument, NULL, OPT_ALWAYS_PROMOTE },
{"force-rewind", no_argument, NULL, OPT_FORCE_REWIND }, {"force-rewind", no_argument, NULL, OPT_FORCE_REWIND },
{"siblings-follow", no_argument, NULL, OPT_SIBLINGS_FOLLOW },
/* "node status" options */ /* "node status" options */
{"is-shutdown", no_argument, NULL, OPT_IS_SHUTDOWN }, {"is-shutdown", no_argument, NULL, OPT_IS_SHUTDOWN },