mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Check replication lag before attempting switchover
This commit is contained in:
@@ -395,6 +395,10 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->archiver_lag_warning = repmgr_atoi(value, name, error_list, 1);
|
||||
/*
 * "archiver_lag_critical": the key was previously only matched under the
 * misspelling "archiver_lag_critcial", so the correctly spelled parameter
 * was never recognised here. Both spellings are accepted so configuration
 * files written against the old behaviour keep working.
 */
else if (strcmp(name, "archiver_lag_critical") == 0 ||
		 strcmp(name, "archiver_lag_critcial") == 0)
	options->archiver_lag_critical = repmgr_atoi(value, name, error_list, 1);
|
||||
else if (strcmp(name, "replication_lag_warning") == 0)
|
||||
options->replication_lag_warning = repmgr_atoi(value, name, error_list, 1);
|
||||
else if (strcmp(name, "replication_lag_critical") == 0)
|
||||
options->replication_lag_critical = repmgr_atoi(value, name, error_list, 1);
|
||||
|
||||
/* repmgrd settings */
|
||||
else if (strcmp(name, "failover_mode") == 0)
|
||||
@@ -619,6 +623,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
_("\archiver_lag_critical\" must be greater than \"archiver_lag_warning\""));
|
||||
}
|
||||
|
||||
/*
 * The critical threshold must be strictly greater than the warning
 * threshold; otherwise record a configuration error.
 *
 * The message previously began with "\r" (a carriage-return escape) where
 * an escaped double quote was intended.
 */
if (options->replication_lag_warning >= options->replication_lag_critical)
{
	item_list_append(error_list,
					 _("\"replication_lag_critical\" must be greater than \"replication_lag_warning\""));
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -75,6 +75,8 @@ typedef struct
|
||||
/* node check settings */
|
||||
int archiver_lag_warning;
|
||||
int archiver_lag_critical;
|
||||
int replication_lag_warning;
|
||||
int replication_lag_critical;
|
||||
|
||||
/* repmgrd settings */
|
||||
failover_mode_opt failover_mode;
|
||||
@@ -130,6 +132,7 @@ typedef struct
|
||||
false, "", "", "", "", { NULL, NULL }, \
|
||||
/* node check settings */ \
|
||||
DEFAULT_ARCHIVER_LAG_WARNING, DEFAULT_ARCHIVER_LAG_CRITICAL, \
|
||||
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
||||
/* repmgrd settings */ \
|
||||
FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \
|
||||
DEFAULT_MONITORING_INTERVAL, \
|
||||
|
||||
56
dbutils.c
56
dbutils.c
@@ -1309,7 +1309,7 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
|
||||
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
|
||||
server_version_num = get_server_version(conn, NULL);
|
||||
|
||||
if (server_version_num >= 1000000)
|
||||
if (server_version_num >= 100000)
|
||||
{
|
||||
snprintf(archive_status_dir, MAXPGPATH,
|
||||
"%s/pg_wal/archive_status",
|
||||
@@ -1376,6 +1376,60 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
get_replication_lag_seconds(PGconn *conn)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res;
|
||||
int lag_seconds = 0;
|
||||
|
||||
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
|
||||
server_version_num = get_server_version(conn, NULL);
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
if (server_version_num >= 100000)
|
||||
{
|
||||
appendPQExpBuffer(
|
||||
&query,
|
||||
" SELECT CASE WHEN (pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn()) ");
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(
|
||||
&query,
|
||||
" SELECT CASE WHEN (pg_catalog.pg_last_xlog_receive_location() = pg_catalog.pg_last_xlog_replay_location()) ");
|
||||
}
|
||||
|
||||
appendPQExpBuffer(
|
||||
&query,
|
||||
" THEN 0 "
|
||||
" ELSE EXTRACT(epoch FROM (clock_timestamp() - pg_catalog.pg_last_xact_replay_timestamp()))::INT "
|
||||
" END "
|
||||
" AS lag_seconds");
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_node_record():\n%s", query.data);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res))
|
||||
{
|
||||
log_warning("%s", PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
/* XXX magic number */
|
||||
return -1;
|
||||
}
|
||||
|
||||
lag_seconds = atoi(PQgetvalue(res, 0, 0));
|
||||
|
||||
PQclear(res);
|
||||
return lag_seconds;
|
||||
}
|
||||
|
||||
|
||||
/* ================ */
|
||||
/* result functions */
|
||||
/* ================ */
|
||||
|
||||
@@ -324,6 +324,7 @@ int get_primary_node_id(PGconn *conn);
|
||||
bool get_replication_info(PGconn *conn, ReplInfo *replication_info);
|
||||
bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
||||
int get_ready_archive_files(PGconn *conn, const char *data_directory);
|
||||
int get_replication_lag_seconds(PGconn *conn);
|
||||
|
||||
/* extension functions */
|
||||
ExtensionStatus get_repmgr_extension_status(PGconn *conn);
|
||||
|
||||
@@ -395,15 +395,29 @@ do_node_check(void)
|
||||
PQfinish(conn);
|
||||
return;
|
||||
}
|
||||
|
||||
if (runtime_options.replication_lag == true)
|
||||
{
|
||||
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, NULL);
|
||||
PQfinish(conn);
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool
|
||||
CheckStatus
|
||||
do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||
{
|
||||
bool own_buffer = false;
|
||||
int ready_archive_files = 0;
|
||||
PQExpBufferData buf;
|
||||
bool check_ok = true;
|
||||
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
if (mode == OM_CSV)
|
||||
{
|
||||
log_error(_("--csv output not provided with --archiver option"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (output == NULL)
|
||||
{
|
||||
@@ -412,10 +426,13 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||
own_buffer = true;
|
||||
}
|
||||
|
||||
|
||||
ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory);
|
||||
|
||||
if (ready_archive_files > config_file_options.archiver_lag_critical)
|
||||
{
|
||||
status = CHECK_STATUS_CRITICAL;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
@@ -425,12 +442,29 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||
ready_archive_files,
|
||||
config_file_options.archiver_lag_critical);
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_ARCHIVER CRITICAL: %i pending files (critical: %i)",
|
||||
ready_archive_files,
|
||||
config_file_options.archiver_lag_critical);
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"CRITICAL - %i pending files (threshold: %i)",
|
||||
ready_archive_files,
|
||||
config_file_options.archiver_lag_critical);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (ready_archive_files > config_file_options.archiver_lag_warning)
|
||||
{
|
||||
status = CHECK_STATUS_WARNING;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
@@ -440,12 +474,55 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||
ready_archive_files,
|
||||
config_file_options.archiver_lag_warning);
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_ARCHIVER WARNING: %i pending files (warning: %i)",
|
||||
ready_archive_files,
|
||||
config_file_options.archiver_lag_warning);
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"WARNING - %i pending files (threshold: %i)",
|
||||
ready_archive_files,
|
||||
config_file_options.archiver_lag_warning);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (ready_archive_files < 0)
|
||||
{
|
||||
status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"--status=UNKNOWN");
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_ARCHIVER UNKNOWN: unable to check archive_status directory");
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"UNKNOWN - unable to check archive_status directory");
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
status = CHECK_STATUS_OK;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
@@ -454,19 +531,192 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||
"--status=OK --files=%i",
|
||||
ready_archive_files);
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_ARCHIVER OK: %i pending files",
|
||||
ready_archive_files);
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"OK - %i pending files",
|
||||
ready_archive_files);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (own_buffer == true)
|
||||
{
|
||||
printf("%s\n", buf.data);
|
||||
termPQExpBuffer(&buf);
|
||||
}
|
||||
|
||||
return check_ok;
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
CheckStatus
|
||||
do_node_check_replication_lag(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||
{
|
||||
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||
bool own_buffer = false;
|
||||
PQExpBufferData buf;
|
||||
int lag_seconds;
|
||||
|
||||
if (mode == OM_CSV)
|
||||
{
|
||||
log_error(_("--csv output not provided with --replication-lag option"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (output == NULL)
|
||||
{
|
||||
initPQExpBuffer(&buf);
|
||||
output = &buf;
|
||||
own_buffer = true;
|
||||
}
|
||||
|
||||
lag_seconds = get_replication_lag_seconds(conn);
|
||||
|
||||
log_debug("lag seconds: %i", lag_seconds);
|
||||
|
||||
if (lag_seconds >= config_file_options.replication_lag_critical)
|
||||
{
|
||||
status = CHECK_STATUS_CRITICAL;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"--status=CRITICAL --lag=%i --threshold=%i",
|
||||
lag_seconds,
|
||||
config_file_options.replication_lag_critical);
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_REPLICATION_LAG CRITICAL: %i seconds (critical: %i)",
|
||||
lag_seconds,
|
||||
config_file_options.replication_lag_critical);
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"CRITICAL - %i seconds (threshold: %i)",
|
||||
lag_seconds,
|
||||
config_file_options.replication_lag_critical);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (lag_seconds > config_file_options.replication_lag_warning)
|
||||
{
|
||||
status = CHECK_STATUS_WARNING;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"--status=WARNING --lag=%i --threshold=%i",
|
||||
lag_seconds,
|
||||
config_file_options.replication_lag_warning);
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_REPLICATION_LAG WARNING: %i seconds (warning: %i)",
|
||||
lag_seconds,
|
||||
config_file_options.replication_lag_warning);
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"WARNING - %i seconds (threshold: %i)",
|
||||
lag_seconds,
|
||||
config_file_options.replication_lag_warning);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (lag_seconds < 0)
|
||||
{
|
||||
status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"--status=UNKNOWN");
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_REPLICATION_LAG UNKNOWN: unable to query replication lag");
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"UNKNOWN - unable to query replication lag");
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
status = CHECK_STATUS_OK;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case OM_OPTFORMAT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"--status=OK --files=%i",
|
||||
lag_seconds);
|
||||
break;
|
||||
case OM_NAGIOS:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"PG_REPLICATION_LAG OK: %i seconds",
|
||||
lag_seconds);
|
||||
break;
|
||||
case OM_TEXT:
|
||||
appendPQExpBuffer(
|
||||
output,
|
||||
"OK - %i seconds",
|
||||
lag_seconds);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
if (own_buffer == true)
|
||||
{
|
||||
printf("%s\n", buf.data);
|
||||
termPQExpBuffer(&buf);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
// --action=...
|
||||
// --check
|
||||
// --list -> list what would be executed for each action, filter to --action
|
||||
|
||||
@@ -8,7 +8,9 @@
|
||||
|
||||
extern void do_node_status(void);
|
||||
extern void do_node_check(void);
|
||||
extern bool do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output);
|
||||
extern CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output);
|
||||
extern CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, PQExpBufferData *output);
|
||||
|
||||
|
||||
extern void do_node_archive_config(void);
|
||||
extern void do_node_restore_config(void);
|
||||
|
||||
@@ -82,6 +82,7 @@ static char *make_barman_ssh_command(char *buf);
|
||||
|
||||
static NodeStatus parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint);
|
||||
static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold);
|
||||
static CheckStatus parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold);
|
||||
|
||||
/*
|
||||
* do_standby_clone()
|
||||
@@ -1711,8 +1712,6 @@ do_standby_switchover(void)
|
||||
termPQExpBuffer(&reason);
|
||||
}
|
||||
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
/*
|
||||
* Check that we can connect by SSH to the remote (current primary) server
|
||||
@@ -1725,65 +1724,149 @@ do_standby_switchover(void)
|
||||
{
|
||||
log_error(_("unable to connect via SSH to host \"%s\", user \"%s\""),
|
||||
remote_host, runtime_options.remote_user);
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* check replication status */
|
||||
/* check archive/replication status */
|
||||
{
|
||||
bool command_success;
|
||||
int files = 0;
|
||||
int threshold = 0;
|
||||
CheckStatus status;
|
||||
int lag_seconds = 0;
|
||||
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
initPQExpBuffer(&remote_command_str);
|
||||
make_remote_repmgr_path(&remote_command_str);
|
||||
appendPQExpBuffer(&remote_command_str,
|
||||
"node check --terse -LERROR --archiver --optformat");
|
||||
/* archive status - check when "archive_mode" is activated */
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
command_success = remote_command(
|
||||
remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
|
||||
status = parse_node_check_archiver(command_output.data, &files, &threshold);
|
||||
|
||||
log_debug("%i %i; '%s'", files, threshold, command_output.data);
|
||||
if (status == CHECK_STATUS_CRITICAL)
|
||||
if (guc_set(remote_conn, "archive_mode", "!=", "off"))
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
int files = 0;
|
||||
int threshold = 0;
|
||||
bool command_success;
|
||||
|
||||
initPQExpBuffer(&remote_command_str);
|
||||
make_remote_repmgr_path(&remote_command_str);
|
||||
appendPQExpBuffer(&remote_command_str,
|
||||
"node check --terse -LERROR --archiver --optformat");
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
command_success = remote_command(
|
||||
remote_host,
|
||||
runtime_options.remote_user,
|
||||
remote_command_str.data,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&remote_command_str);
|
||||
|
||||
if (command_success == true)
|
||||
{
|
||||
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
status = parse_node_check_archiver(command_output.data, &files, &threshold);
|
||||
|
||||
log_debug("%i %i; '%s'", files, threshold, command_output.data);
|
||||
}
|
||||
else
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
if (status == CHECK_STATUS_UNKNOWN)
|
||||
{
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("unable to check number of pending archive files on demotion candidate \"%s\""),
|
||||
remote_node_record.node_name);
|
||||
log_hint(_("use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("unable to check number of pending archive files on demotion candidate \"%s\""),
|
||||
remote_node_record.node_name);
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
|
||||
}
|
||||
else if (status == CHECK_STATUS_CRITICAL)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||
files, threshold);
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
}
|
||||
else if (status == CHECK_STATUS_WARNING)
|
||||
{
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (warning threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived"));
|
||||
}
|
||||
|
||||
}
|
||||
else if (status == CHECK_STATUS_WARNING)
|
||||
|
||||
/* check replication lag */
|
||||
lag_seconds = get_replication_lag_seconds(local_conn);
|
||||
|
||||
log_debug("lag is %i ", lag_seconds);
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
if (lag_seconds >= config_file_options.replication_lag_critical)
|
||||
{
|
||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
|
||||
remote_node_record.node_name);
|
||||
log_detail(_("%i pending archive files (warning threshold: %i)"),
|
||||
files, threshold);
|
||||
log_hint(_("PostgreSQL will not shut down until all files are archived"));
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("replication lag on this node is critical"));
|
||||
log_detail(_("lag is %i seconds (critical threshold: %i)"),
|
||||
lag_seconds, config_file_options.replication_lag_critical);
|
||||
log_hint(_("PostgreSQL on the demotion candidate will not shut down until pending WAL is flushed to the standby; use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("replication lag on this node is critical"));
|
||||
log_detail(_("lag is %i seconds (critical threshold: %i)"),
|
||||
lag_seconds, config_file_options.replication_lag_critical);
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
}
|
||||
else if (lag_seconds >= config_file_options.replication_lag_warning)
|
||||
{
|
||||
log_warning(_("replication lag on this node is warning"));
|
||||
log_detail(_("lag is %i seconds (warning threshold: %i)"),
|
||||
lag_seconds, config_file_options.replication_lag_warning);
|
||||
}
|
||||
else if (lag_seconds < 0)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("unable to check replication lag on local node"));
|
||||
log_hint(_("use -F/--force to continue anyway"));
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
log_warning(_("unable to check replication lag on local node"));
|
||||
log_notice(_("-F/--force set, continuing with switchover"));
|
||||
}
|
||||
}
|
||||
|
||||
PQfinish(remote_conn);
|
||||
PQfinish(local_conn);
|
||||
|
||||
|
||||
|
||||
@@ -3476,7 +3559,7 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* TODO: consolidate code in below functions */
|
||||
static NodeStatus
|
||||
parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint)
|
||||
{
|
||||
@@ -3599,7 +3682,6 @@ parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkP
|
||||
}
|
||||
|
||||
|
||||
|
||||
static CheckStatus
|
||||
parse_node_check_archiver(const char *node_check_output, int *files, int *threshold)
|
||||
{
|
||||
@@ -3738,3 +3820,144 @@ parse_node_check_archiver(const char *node_check_output, int *files, int *thresh
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static CheckStatus
|
||||
parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold)
|
||||
{
|
||||
int options_len = 0;
|
||||
char *options_string = NULL;
|
||||
char *options_string_ptr = NULL;
|
||||
|
||||
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||
|
||||
|
||||
/*
|
||||
* Add parsed options to this list, then copy to an array
|
||||
* to pass to getopt
|
||||
*/
|
||||
static ItemList option_argv = { NULL, NULL };
|
||||
|
||||
char *argv_item;
|
||||
int c, argc_item = 1;
|
||||
|
||||
char **argv_array;
|
||||
ItemListCell *cell;
|
||||
|
||||
int optindex = 0;
|
||||
|
||||
/* We're only interested in these options */
|
||||
static struct option long_options[] =
|
||||
{
|
||||
{"status", required_argument, NULL, 'S'},
|
||||
{"lag", required_argument, NULL, 'l'},
|
||||
{"threshold", required_argument, NULL, 't'},
|
||||
{NULL, 0, NULL, 0}
|
||||
};
|
||||
|
||||
*seconds = 0;
|
||||
*threshold = 0;
|
||||
|
||||
/* Don't attempt to tokenise an empty string */
|
||||
if (!strlen(node_check_output))
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
options_len = strlen(node_check_output) + 1;
|
||||
options_string = pg_malloc(options_len);
|
||||
options_string_ptr = options_string;
|
||||
|
||||
/* Copy the string before operating on it with strtok() */
|
||||
strncpy(options_string, node_check_output, options_len);
|
||||
|
||||
/* Extract arguments into a list and keep a count of the total */
|
||||
while ((argv_item = strtok(options_string_ptr, " ")) != NULL)
|
||||
{
|
||||
item_list_append(&option_argv, argv_item);
|
||||
|
||||
argc_item++;
|
||||
|
||||
if (options_string_ptr != NULL)
|
||||
options_string_ptr = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Array of argument values to pass to getopt_long - this will need to
|
||||
* include an empty string as the first value (normally this would be
|
||||
* the program name)
|
||||
*/
|
||||
argv_array = pg_malloc0(sizeof(char *) * (argc_item + 2));
|
||||
|
||||
/* Insert a blank dummy program name at the start of the array */
|
||||
argv_array[0] = pg_malloc0(1);
|
||||
|
||||
c = 1;
|
||||
|
||||
/*
|
||||
* Copy the previously extracted arguments from our list to the array
|
||||
*/
|
||||
for (cell = option_argv.head; cell; cell = cell->next)
|
||||
{
|
||||
int argv_len = strlen(cell->string) + 1;
|
||||
|
||||
argv_array[c] = pg_malloc0(argv_len);
|
||||
|
||||
strncpy(argv_array[c], cell->string, argv_len);
|
||||
|
||||
c++;
|
||||
}
|
||||
|
||||
argv_array[c] = NULL;
|
||||
|
||||
/* Reset getopt's optind variable */
|
||||
optind = 0;
|
||||
|
||||
/* Prevent getopt from emitting errors */
|
||||
opterr = 0;
|
||||
|
||||
while ((c = getopt_long(argc_item, argv_array, "l:S:t:", long_options,
|
||||
&optindex)) != -1)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
/* --files */
|
||||
case 'l':
|
||||
*seconds = atoi(optarg);
|
||||
break;
|
||||
|
||||
case 't':
|
||||
*threshold = atoi(optarg);
|
||||
break;
|
||||
|
||||
/* --status */
|
||||
case 'S':
|
||||
{
|
||||
if (strncmp(optarg, "OK", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_OK;
|
||||
}
|
||||
else if (strncmp(optarg, "WARNING", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_WARNING;
|
||||
}
|
||||
else if (strncmp(optarg, "CRITICAL", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_CRITICAL;
|
||||
}
|
||||
else if (strncmp(optarg, "UNKNOWN", MAXLEN) == 0)
|
||||
{
|
||||
status = CHECK_STATUS_UNKNOWN;
|
||||
}
|
||||
else
|
||||
{
|
||||
status = CHECK_STATUS_UNKNOWN;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -92,6 +92,7 @@ typedef struct
|
||||
|
||||
/* "node check" options */
|
||||
bool archiver;
|
||||
bool replication_lag;
|
||||
|
||||
/* "node service" options */
|
||||
char action[MAXLEN];
|
||||
@@ -134,7 +135,7 @@ typedef struct
|
||||
/* "node status" options */ \
|
||||
false, \
|
||||
/* "node check" options */ \
|
||||
false, \
|
||||
false, false, \
|
||||
/* "node service" options */ \
|
||||
"", false, false, false, \
|
||||
/* "cluster event" options */ \
|
||||
|
||||
@@ -427,6 +427,10 @@ main(int argc, char **argv)
|
||||
runtime_options.archiver = true;
|
||||
break;
|
||||
|
||||
case OPT_REPLICATION_LAG:
|
||||
runtime_options.replication_lag = true;
|
||||
break;
|
||||
|
||||
/* "node service" options *
|
||||
* ---------------------- */
|
||||
|
||||
|
||||
@@ -68,6 +68,7 @@
|
||||
#define OPT_NAGIOS 1031
|
||||
#define OPT_ARCHIVER 1032
|
||||
#define OPT_OPTFORMAT 1033
|
||||
#define OPT_REPLICATION_LAG 1034
|
||||
/* deprecated since 3.3 */
|
||||
#define OPT_DATA_DIR 999
|
||||
#define OPT_NO_CONNINFO_PASSWORD 998
|
||||
@@ -135,6 +136,7 @@ static struct option long_options[] =
|
||||
|
||||
/* "node check" options */
|
||||
{"archiver", no_argument, NULL, OPT_ARCHIVER },
|
||||
{"replication-lag", no_argument, NULL, OPT_REPLICATION_LAG },
|
||||
|
||||
/* "node service" options */
|
||||
{"action", required_argument, NULL, OPT_ACTION},
|
||||
|
||||
@@ -266,6 +266,12 @@ ssh_options='-q' # Options to append to "ssh"
|
||||
# "repmgr standby switchover" to warn about potential
|
||||
# issues with shutting down the demotion candidate.
|
||||
|
||||
#replication_lag_warning=300 # repmgr node check --replication-lag: warning threshold (seconds)
|
||||
#replication_lag_critical=600 # repmgr node check --replication-lag: critical threshold (seconds)
|
||||
# Note that these values will be checked when executing
|
||||
# "repmgr standby switchover" to warn about potential
|
||||
# issues with shutting down the demotion candidate.
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# BDR monitoring options
|
||||
|
||||
2
repmgr.h
2
repmgr.h
@@ -49,6 +49,8 @@
|
||||
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_ARCHIVER_LAG_WARNING 16 /* WAL files */
|
||||
#define DEFAULT_ARCHIVER_LAG_CRITICAL 128 /* WAL files */
|
||||
#define DEFAULT_REPLICATION_LAG_WARNING 300 /* seconds */
|
||||
#define DEFAULT_REPLICATION_LAG_CRITICAL 600 /* seconds */
|
||||
|
||||
#define FAILOVER_NODES_MAX_CHECK 50
|
||||
|
||||
|
||||
Reference in New Issue
Block a user