mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
Check replication lag before attempting switchover
This commit is contained in:
@@ -82,6 +82,7 @@ static char *make_barman_ssh_command(char *buf);
|
||||
|
||||
static NodeStatus parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint);
|
||||
static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold);
|
||||
static CheckStatus parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold);
|
||||
|
||||
/*
|
||||
* do_standby_clone()
|
||||
@@ -1711,8 +1712,6 @@ do_standby_switchover(void)
|
||||
termPQExpBuffer(&reason);
|
||||
}
|
||||
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
/*
|
||||
* Check that we can connect by SSH to the remote (current primary) server
|
||||
@@ -1725,65 +1724,149 @@ do_standby_switchover(void)
|
||||
{
|
||||
log_error(_("unable to connect via SSH to host \"%s\", user \"%s\""),
|
||||
remote_host, runtime_options.remote_user);
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* check replication status */
|
||||
/* check archive/replication status */
|
||||
{
|
||||
bool command_success;
|
||||
int files = 0;
|
||||
int threshold = 0;
|
||||
CheckStatus status;
|
||||
int lag_seconds = 0;
|
||||
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
initPQExpBuffer(&remote_command_str);
|
||||
make_remote_repmgr_path(&remote_command_str);
|
||||
appendPQExpBuffer(&remote_command_str,
|
||||
"node check --terse -LERROR --archiver --optformat");
|
||||
/* archive status - check when "archive_mode" is activated */
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
command_success = remote_command(
|
||||
remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
|
||||
status = parse_node_check_archiver(command_output.data, &files, &threshold);
|
||||
|
||||
log_debug("%i %i; '%s'", files, threshold, command_output.data);
|
||||
if (status == CHECK_STATUS_CRITICAL)
|
||||
if (guc_set(remote_conn, "archive_mode", "!=", "off"))
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
int files = 0;
|
||||
int threshold = 0;
|
||||
bool command_success;
|
||||
|
||||
initPQExpBuffer(&remote_command_str);
|
||||
make_remote_repmgr_path(&remote_command_str);
|
||||
appendPQExpBuffer(&remote_command_str,
|
||||
"node check --terse -LERROR --archiver --optformat");
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
command_success = remote_command(
|
||||
remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
|
||||
if (command_success == true)
|
||||
{
|
||||
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
status = parse_node_check_archiver(command_output.data, &files, &threshold);
|
||||
|
||||
log_debug("%i %i; '%s'", files, threshold, command_output.data);
|
||||
}
|
||||
else
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
if (status == CHECK_STATUS_UNKNOWN)
|
||||
{
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("unable to check number of pending archive files on demotion candidate \"%s\""),
|
||||
remote_node_record.node_name);
|
||||
log_hint(_("use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("unable to check number of pending archive files on demotion candidate \"%s\""),
|
||||
remote_node_record.node_name);
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
|
||||
}
|
||||
else if (status == CHECK_STATUS_CRITICAL)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||
files, threshold);
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
}
|
||||
else if (status == CHECK_STATUS_WARNING)
|
||||
{
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (warning threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived"));
|
||||
}
|
||||
|
||||
}
|
||||
else if (status == CHECK_STATUS_WARNING)
|
||||
|
||||
/* check replication lag */
|
||||
lag_seconds = get_replication_lag_seconds(local_conn);
|
||||
|
||||
log_debug("lag is %i ", lag_seconds);
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
if (lag_seconds >= config_file_options.replication_lag_critical)
|
||||
{
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (warning threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived"));
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("replication lag on this node is critical"));
|
||||
log_detail(_("lag is %i seconds (critical threshold: %i)"),
|
||||
lag_seconds, config_file_options.replication_lag_critical);
|
||||
log_hint(_("PostgreSQL on the demotion candidate will not shut down until pending WAL is flushed to the standby; use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("replication lag on this node is critical"));
|
||||
log_detail(_("lag is %i seconds (critical threshold: %i)"),
|
||||
lag_seconds, config_file_options.replication_lag_critical);
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
}
|
||||
else if (lag_seconds >= config_file_options.replication_lag_warning)
|
||||
{
|
||||
log_warning(_("replication lag on this node is warning"));
|
||||
log_detail(_("lag is %i seconds (warning threshold: %i)"),
|
||||
lag_seconds, config_file_options.replication_lag_warning);
|
||||
}
|
||||
else if (lag_seconds < 0)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("unable to check replication lag on local node"));
|
||||
log_hint(_("use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("unable to check replication lag on local node"));
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
}
|
||||
}
|
||||
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
|
||||
|
||||
@@ -3476,7 +3559,7 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* TODO: consolidate code in below functions */
|
||||
static NodeStatus
|
||||
parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint)
|
||||
{
|
||||
@@ -3599,7 +3682,6 @@ parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkP
|
||||
}
|
||||
|
||||
|
||||
|
||||
static CheckStatus
|
||||
parse_node_check_archiver(const char *node_check_output, int *files, int *threshold)
|
||||
{
|
||||
@@ -3738,3 +3820,144 @@ parse_node_check_archiver(const char *node_check_output, int *files, int *thresh
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static CheckStatus
|
||||
parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold)
|
||||
{
|
||||
int options_len = 0;
|
||||
char *options_string = NULL;
|
||||
char *options_string_ptr = NULL;
|
||||
|
||||
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
|
||||
/*
|
||||
* Add parsed options to this list, then copy to an array
|
||||
* to pass to getopt
|
||||
*/
|
||||
static ItemList option_argv = { NULL, NULL };
|
||||
|
||||
char *argv_item;
|
||||
int c, argc_item = 1;
|
||||
|
||||
char **argv_array;
|
||||
ItemListCell *cell;
|
||||
|
||||
int optindex = 0;
|
||||
|
||||
/* We're only interested in these options */
|
||||
static struct option long_options[] =
|
||||
{
|
||||
{"status", required_argument, NULL, 'S'},
|
||||
{"lag", required_argument, NULL, 'l'},
|
||||
{"threshold", required_argument, NULL, 't'},
|
||||
{NULL, 0, NULL, 0}
|
||||
};
|
||||
|
||||
*seconds = 0;
|
||||
*threshold = 0;
|
||||
|
||||
/* Don't attempt to tokenise an empty string */
|
||||
if (!strlen(node_check_output))
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
options_len = strlen(node_check_output) + 1;
|
||||
options_string = pg_malloc(options_len);
|
||||
options_string_ptr = options_string;
|
||||
|
||||
/* Copy the string before operating on it with strtok() */
|
||||
strncpy(options_string, node_check_output, options_len);
|
||||
|
||||
/* Extract arguments into a list and keep a count of the total */
|
||||
while ((argv_item = strtok(options_string_ptr, " ")) != NULL)
|
||||
{
|
||||
item_list_append(&option_argv, argv_item);
|
||||
|
||||
argc_item++;
|
||||
|
||||
if (options_string_ptr != NULL)
|
||||
options_string_ptr = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Array of argument values to pass to getopt_long - this will need to
|
||||
* include an empty string as the first value (normally this would be
|
||||
* the program name)
|
||||
*/
|
||||
argv_array = pg_malloc0(sizeof(char *) * (argc_item + 2));
|
||||
|
||||
/* Insert a blank dummy program name at the start of the array */
|
||||
argv_array[0] = pg_malloc0(1);
|
||||
|
||||
c = 1;
|
||||
|
||||
/*
|
||||
* Copy the previously extracted arguments from our list to the array
|
||||
*/
|
||||
for (cell = option_argv.head; cell; cell = cell->next)
|
||||
{
|
||||
int argv_len = strlen(cell->string) + 1;
|
||||
|
||||
argv_array[c] = pg_malloc0(argv_len);
|
||||
|
||||
strncpy(argv_array[c], cell->string, argv_len);
|
||||
|
||||
c++;
|
||||
}
|
||||
|
||||
argv_array[c] = NULL;
|
||||
|
||||
/* Reset getopt's optind variable */
|
||||
optind = 0;
|
||||
|
||||
/* Prevent getopt from emitting errors */
|
||||
opterr = 0;
|
||||
|
||||
while ((c = getopt_long(argc_item, argv_array, "l:S:t:", long_options,
|
||||
&optindex)) != -1)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
/* --files */
|
||||
case 'l':
|
||||
*seconds = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 't':
|
||||
*threshold = atoi(optarg);
|
||||
break;
|
||||
|
||||
/* --status */
|
||||
case 'S':
|
||||
{
|
||||
if (strncmp(optarg, "OK", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_OK;
|
||||
}
|
||||
else if (strncmp(optarg, "WARNING", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_WARNING;
|
||||
}
|
||||
else if (strncmp(optarg, "CRITICAL", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_CRITICAL;
|
||||
}
|
||||
else if (strncmp(optarg, "UNKNOWN", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_UNKNOWN;
|
||||
}
|
||||
else
|
||||
{
|
||||
status = CHECK_STATUS_UNKNOWN;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user