Check replication lag before attempting switchover

This commit is contained in:
Ian Barwick
2017-08-08 10:16:47 +09:00
parent fd5dfa2ebc
commit f2cf46bba3
12 changed files with 605 additions and 48 deletions

View File

@@ -82,6 +82,7 @@ static char *make_barman_ssh_command(char *buf);
static NodeStatus parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint);
static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold);
static CheckStatus parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold);
/*
* do_standby_clone()
@@ -1711,8 +1712,6 @@ do_standby_switchover(void)
termPQExpBuffer(&reason);
}
PQfinish(remote_conn);
PQfinish(local_conn);
/*
* Check that we can connect by SSH to the remote (current primary) server
@@ -1725,65 +1724,149 @@ do_standby_switchover(void)
{
log_error(_("unable to connect via SSH to host \"%s\", user \"%s\""),
remote_host, runtime_options.remote_user);
PQfinish(remote_conn);
PQfinish(local_conn);
exit(ERR_BAD_CONFIG);
}
/* check replication status */
/* check archive/replication status */
{
bool command_success;
int files = 0;
int threshold = 0;
CheckStatus status;
int lag_seconds = 0;
CheckStatus status = CHECK_STATUS_UNKNOWN;
initPQExpBuffer(&remote_command_str);
make_remote_repmgr_path(&remote_command_str);
appendPQExpBuffer(&remote_command_str,
"node check --terse -LERROR --archiver --optformat");
/* archive status - check when "archive_mode" is activated */
initPQExpBuffer(&command_output);
command_success = remote_command(
remote_host,
runtime_options.remote_user,
remote_command_str.data,
&command_output);
termPQExpBuffer(&remote_command_str);
status = parse_node_check_archiver(command_output.data, &files, &threshold);
log_debug("%i %i; '%s'", files, threshold, command_output.data);
if (status == CHECK_STATUS_CRITICAL)
if (guc_set(remote_conn, "archive_mode", "!=", "off"))
{
if (runtime_options.force == false)
int files = 0;
int threshold = 0;
bool command_success;
initPQExpBuffer(&remote_command_str);
make_remote_repmgr_path(&remote_command_str);
appendPQExpBuffer(&remote_command_str,
"node check --terse -LERROR --archiver --optformat");
initPQExpBuffer(&command_output);
command_success = remote_command(
remote_host,
runtime_options.remote_user,
remote_command_str.data,
&command_output);
termPQExpBuffer(&remote_command_str);
if (command_success == true)
{
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
remote_node_record.node_name);
log_detail(_("%i pending archive files (critical threshold: %i)"),
files, threshold);
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
exit(ERR_SWITCHOVER_FAIL);
status = parse_node_check_archiver(command_output.data, &files, &threshold);
log_debug("%i %i; '%s'", files, threshold, command_output.data);
}
else
termPQExpBuffer(&command_output);
if (status == CHECK_STATUS_UNKNOWN)
{
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
if (runtime_options.force == false)
{
log_error(_("unable to check number of pending archive files on demotion candidate \"%s\""),
remote_node_record.node_name);
log_hint(_("use -F/--force to continue anyway"));
PQfinish(remote_conn);
PQfinish(local_conn);
exit(ERR_SWITCHOVER_FAIL);
}
log_warning(_("unable to check number of pending archive files on demotion candidate \"%s\""),
remote_node_record.node_name);
log_notice(_("-F/--force set, continuing with switchover"));
}
else if (status == CHECK_STATUS_CRITICAL)
{
if (runtime_options.force == false)
{
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
remote_node_record.node_name);
log_detail(_("%i pending archive files (critical threshold: %i)"),
files, threshold);
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
PQfinish(remote_conn);
PQfinish(local_conn);
exit(ERR_SWITCHOVER_FAIL);
}
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
remote_node_record.node_name);
log_detail(_("%i pending archive files (critical threshold: %i)"),
files, threshold);
log_notice(_("-F/--force set, continuing with switchover"));
}
else if (status == CHECK_STATUS_WARNING)
{
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
remote_node_record.node_name);
log_detail(_("%i pending archive files (warning threshold: %i)"),
files, threshold);
log_hint(_("PostgreSQL will not shut down until all files are archived"));
}
}
else if (status == CHECK_STATUS_WARNING)
/* check replication lag */
lag_seconds = get_replication_lag_seconds(local_conn);
log_debug("lag is %i ", lag_seconds);
termPQExpBuffer(&command_output);
if (lag_seconds >= config_file_options.replication_lag_critical)
{
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
remote_node_record.node_name);
log_detail(_("%i pending archive files (warning threshold: %i)"),
files, threshold);
log_hint(_("PostgreSQL will not shut down until all files are archived"));
if (runtime_options.force == false)
{
log_error(_("replication lag on this node is critical"));
log_detail(_("lag is %i seconds (critical threshold: %i)"),
lag_seconds, config_file_options.replication_lag_critical);
log_hint(_("PostgreSQL on the demotion candidate will not shut down until pending WAL is flushed to the standby; use -F/--force to continue anyway"));
PQfinish(remote_conn);
PQfinish(local_conn);
exit(ERR_SWITCHOVER_FAIL);
}
log_warning(_("replication lag on this node is critical"));
log_detail(_("lag is %i seconds (critical threshold: %i)"),
lag_seconds, config_file_options.replication_lag_critical);
log_notice(_("-F/--force set, continuing with switchover"));
}
else if (lag_seconds >= config_file_options.replication_lag_warning)
{
log_warning(_("replication lag on this node is warning"));
log_detail(_("lag is %i seconds (warning threshold: %i)"),
lag_seconds, config_file_options.replication_lag_warning);
}
else if (lag_seconds < 0)
{
if (runtime_options.force == false)
{
log_error(_("unable to check replication lag on local node"));
log_hint(_("use -F/--force to continue anyway"));
PQfinish(remote_conn);
PQfinish(local_conn);
exit(ERR_SWITCHOVER_FAIL);
}
log_warning(_("unable to check replication lag on local node"));
log_notice(_("-F/--force set, continuing with switchover"));
}
}
PQfinish(remote_conn);
PQfinish(local_conn);
@@ -3476,7 +3559,7 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
}
/* TODO: consolidate code in below functions */
static NodeStatus
parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint)
{
@@ -3599,7 +3682,6 @@ parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkP
}
static CheckStatus
parse_node_check_archiver(const char *node_check_output, int *files, int *threshold)
{
@@ -3738,3 +3820,144 @@ parse_node_check_archiver(const char *node_check_output, int *files, int *thresh
return status;
}
static CheckStatus
parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold)
{
int options_len = 0;
char *options_string = NULL;
char *options_string_ptr = NULL;
CheckStatus status = CHECK_STATUS_UNKNOWN;
/*
* Add parsed options to this list, then copy to an array
* to pass to getopt
*/
static ItemList option_argv = { NULL, NULL };
char *argv_item;
int c, argc_item = 1;
char **argv_array;
ItemListCell *cell;
int optindex = 0;
/* We're only interested in these options */
static struct option long_options[] =
{
{"status", required_argument, NULL, 'S'},
{"lag", required_argument, NULL, 'l'},
{"threshold", required_argument, NULL, 't'},
{NULL, 0, NULL, 0}
};
*seconds = 0;
*threshold = 0;
/* Don't attempt to tokenise an empty string */
if (!strlen(node_check_output))
{
return status;
}
options_len = strlen(node_check_output) + 1;
options_string = pg_malloc(options_len);
options_string_ptr = options_string;
/* Copy the string before operating on it with strtok() */
strncpy(options_string, node_check_output, options_len);
/* Extract arguments into a list and keep a count of the total */
while ((argv_item = strtok(options_string_ptr, " ")) != NULL)
{
item_list_append(&option_argv, argv_item);
argc_item++;
if (options_string_ptr != NULL)
options_string_ptr = NULL;
}
/*
* Array of argument values to pass to getopt_long - this will need to
* include an empty string as the first value (normally this would be
* the program name)
*/
argv_array = pg_malloc0(sizeof(char *) * (argc_item + 2));
/* Insert a blank dummy program name at the start of the array */
argv_array[0] = pg_malloc0(1);
c = 1;
/*
* Copy the previously extracted arguments from our list to the array
*/
for (cell = option_argv.head; cell; cell = cell->next)
{
int argv_len = strlen(cell->string) + 1;
argv_array[c] = pg_malloc0(argv_len);
strncpy(argv_array[c], cell->string, argv_len);
c++;
}
argv_array[c] = NULL;
/* Reset getopt's optind variable */
optind = 0;
/* Prevent getopt from emitting errors */
opterr = 0;
while ((c = getopt_long(argc_item, argv_array, "l:S:t:", long_options,
&optindex)) != -1)
{
switch (c)
{
/* --files */
case 'l':
*seconds = atoi(optarg);
break;
case 't':
*threshold = atoi(optarg);
break;
/* --status */
case 'S':
{
if (strncmp(optarg, "OK", MAXLEN) == 0)
{
status = CHECK_STATUS_OK;
}
else if (strncmp(optarg, "WARNING", MAXLEN) == 0)
{
status = CHECK_STATUS_WARNING;
}
else if (strncmp(optarg, "CRITICAL", MAXLEN) == 0)
{
status = CHECK_STATUS_CRITICAL;
}
else if (strncmp(optarg, "UNKNOWN", MAXLEN) == 0)
{
status = CHECK_STATUS_UNKNOWN;
}
else
{
status = CHECK_STATUS_UNKNOWN;
}
}
break;
}
}
return status;
}