mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Check replication lag before attempting switchover
This commit is contained in:
@@ -395,6 +395,10 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->archiver_lag_warning = repmgr_atoi(value, name, error_list, 1);
|
options->archiver_lag_warning = repmgr_atoi(value, name, error_list, 1);
|
||||||
else if (strcmp(name, "archiver_lag_critcial") == 0)
|
else if (strcmp(name, "archiver_lag_critcial") == 0)
|
||||||
options->archiver_lag_critical = repmgr_atoi(value, name, error_list, 1);
|
options->archiver_lag_critical = repmgr_atoi(value, name, error_list, 1);
|
||||||
|
else if (strcmp(name, "replication_lag_warning") == 0)
|
||||||
|
options->replication_lag_warning = repmgr_atoi(value, name, error_list, 1);
|
||||||
|
else if (strcmp(name, "replication_lag_critical") == 0)
|
||||||
|
options->replication_lag_critical = repmgr_atoi(value, name, error_list, 1);
|
||||||
|
|
||||||
/* repmgrd settings */
|
/* repmgrd settings */
|
||||||
else if (strcmp(name, "failover_mode") == 0)
|
else if (strcmp(name, "failover_mode") == 0)
|
||||||
@@ -619,6 +623,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
_("\archiver_lag_critical\" must be greater than \"archiver_lag_warning\""));
|
_("\archiver_lag_critical\" must be greater than \"archiver_lag_warning\""));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * The critical threshold must be strictly greater than the warning
 * threshold; otherwise record a configuration error.
 *
 * Note the message must begin with an escaped double quote (\"); the
 * previous literal began with "\r", which the compiler treats as a
 * carriage return escape.
 */
if (options->replication_lag_warning >= options->replication_lag_critical)
{
	item_list_append(error_list,
					 _("\"replication_lag_critical\" must be greater than \"replication_lag_warning\""));
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -75,6 +75,8 @@ typedef struct
|
|||||||
/* node check settings */
|
/* node check settings */
|
||||||
int archiver_lag_warning;
|
int archiver_lag_warning;
|
||||||
int archiver_lag_critical;
|
int archiver_lag_critical;
|
||||||
|
int replication_lag_warning;
|
||||||
|
int replication_lag_critical;
|
||||||
|
|
||||||
/* repmgrd settings */
|
/* repmgrd settings */
|
||||||
failover_mode_opt failover_mode;
|
failover_mode_opt failover_mode;
|
||||||
@@ -130,6 +132,7 @@ typedef struct
|
|||||||
false, "", "", "", "", { NULL, NULL }, \
|
false, "", "", "", "", { NULL, NULL }, \
|
||||||
/* node check settings */ \
|
/* node check settings */ \
|
||||||
DEFAULT_ARCHIVER_LAG_WARNING, DEFAULT_ARCHIVER_LAG_CRITICAL, \
|
DEFAULT_ARCHIVER_LAG_WARNING, DEFAULT_ARCHIVER_LAG_CRITICAL, \
|
||||||
|
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
||||||
/* repmgrd settings */ \
|
/* repmgrd settings */ \
|
||||||
FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \
|
FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \
|
||||||
DEFAULT_MONITORING_INTERVAL, \
|
DEFAULT_MONITORING_INTERVAL, \
|
||||||
|
|||||||
56
dbutils.c
56
dbutils.c
@@ -1309,7 +1309,7 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
|
|||||||
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
|
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
|
||||||
server_version_num = get_server_version(conn, NULL);
|
server_version_num = get_server_version(conn, NULL);
|
||||||
|
|
||||||
if (server_version_num >= 1000000)
|
if (server_version_num >= 100000)
|
||||||
{
|
{
|
||||||
snprintf(archive_status_dir, MAXPGPATH,
|
snprintf(archive_status_dir, MAXPGPATH,
|
||||||
"%s/pg_wal/archive_status",
|
"%s/pg_wal/archive_status",
|
||||||
@@ -1376,6 +1376,60 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
get_replication_lag_seconds(PGconn *conn)
|
||||||
|
{
|
||||||
|
PQExpBufferData query;
|
||||||
|
PGresult *res;
|
||||||
|
int lag_seconds = 0;
|
||||||
|
|
||||||
|
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
|
||||||
|
server_version_num = get_server_version(conn, NULL);
|
||||||
|
|
||||||
|
initPQExpBuffer(&query);
|
||||||
|
|
||||||
|
if (server_version_num >= 100000)
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(
|
||||||
|
&query,
|
||||||
|
" SELECT CASE WHEN (pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn()) ");
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(
|
||||||
|
&query,
|
||||||
|
" SELECT CASE WHEN (pg_catalog.pg_last_xlog_receive_location() = pg_catalog.pg_last_xlog_replay_location()) ");
|
||||||
|
}
|
||||||
|
|
||||||
|
appendPQExpBuffer(
|
||||||
|
&query,
|
||||||
|
" THEN 0 "
|
||||||
|
" ELSE EXTRACT(epoch FROM (clock_timestamp() - pg_catalog.pg_last_xact_replay_timestamp()))::INT "
|
||||||
|
" END "
|
||||||
|
" AS lag_seconds");
|
||||||
|
|
||||||
|
res = PQexec(conn, query.data);
|
||||||
|
termPQExpBuffer(&query);
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "get_node_record():\n%s", query.data);
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res))
|
||||||
|
{
|
||||||
|
log_warning("%s", PQerrorMessage(conn));
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
/* XXX magic number */
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
lag_seconds = atoi(PQgetvalue(res, 0, 0));
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
return lag_seconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* ================ */
|
/* ================ */
|
||||||
/* result functions */
|
/* result functions */
|
||||||
/* ================ */
|
/* ================ */
|
||||||
|
|||||||
@@ -324,6 +324,7 @@ int get_primary_node_id(PGconn *conn);
|
|||||||
bool get_replication_info(PGconn *conn, ReplInfo *replication_info);
|
bool get_replication_info(PGconn *conn, ReplInfo *replication_info);
|
||||||
bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
||||||
int get_ready_archive_files(PGconn *conn, const char *data_directory);
|
int get_ready_archive_files(PGconn *conn, const char *data_directory);
|
||||||
|
int get_replication_lag_seconds(PGconn *conn);
|
||||||
|
|
||||||
/* extension functions */
|
/* extension functions */
|
||||||
ExtensionStatus get_repmgr_extension_status(PGconn *conn);
|
ExtensionStatus get_repmgr_extension_status(PGconn *conn);
|
||||||
|
|||||||
@@ -395,15 +395,29 @@ do_node_check(void)
|
|||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (runtime_options.replication_lag == true)
|
||||||
|
{
|
||||||
|
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, NULL);
|
||||||
|
PQfinish(conn);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
CheckStatus
|
||||||
do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||||
{
|
{
|
||||||
bool own_buffer = false;
|
bool own_buffer = false;
|
||||||
int ready_archive_files = 0;
|
int ready_archive_files = 0;
|
||||||
PQExpBufferData buf;
|
PQExpBufferData buf;
|
||||||
bool check_ok = true;
|
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||||
|
|
||||||
|
if (mode == OM_CSV)
|
||||||
|
{
|
||||||
|
log_error(_("--csv output not provided with --archiver option"));
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
if (output == NULL)
|
if (output == NULL)
|
||||||
{
|
{
|
||||||
@@ -412,10 +426,13 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
|||||||
own_buffer = true;
|
own_buffer = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory);
|
ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory);
|
||||||
|
|
||||||
if (ready_archive_files > config_file_options.archiver_lag_critical)
|
if (ready_archive_files > config_file_options.archiver_lag_critical)
|
||||||
{
|
{
|
||||||
|
status = CHECK_STATUS_CRITICAL;
|
||||||
|
|
||||||
switch (mode)
|
switch (mode)
|
||||||
{
|
{
|
||||||
case OM_OPTFORMAT:
|
case OM_OPTFORMAT:
|
||||||
@@ -425,12 +442,29 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
|||||||
ready_archive_files,
|
ready_archive_files,
|
||||||
config_file_options.archiver_lag_critical);
|
config_file_options.archiver_lag_critical);
|
||||||
break;
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_ARCHIVER CRITICAL: %i pending files (critical: %i)",
|
||||||
|
ready_archive_files,
|
||||||
|
config_file_options.archiver_lag_critical);
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"CRITICAL - %i pending files (threshold: %i)",
|
||||||
|
ready_archive_files,
|
||||||
|
config_file_options.archiver_lag_critical);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (ready_archive_files > config_file_options.archiver_lag_warning)
|
else if (ready_archive_files > config_file_options.archiver_lag_warning)
|
||||||
{
|
{
|
||||||
|
status = CHECK_STATUS_WARNING;
|
||||||
|
|
||||||
switch (mode)
|
switch (mode)
|
||||||
{
|
{
|
||||||
case OM_OPTFORMAT:
|
case OM_OPTFORMAT:
|
||||||
@@ -440,12 +474,55 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
|||||||
ready_archive_files,
|
ready_archive_files,
|
||||||
config_file_options.archiver_lag_warning);
|
config_file_options.archiver_lag_warning);
|
||||||
break;
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_ARCHIVER WARNING: %i pending files (warning: %i)",
|
||||||
|
ready_archive_files,
|
||||||
|
config_file_options.archiver_lag_warning);
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"WARNING - %i pending files (threshold: %i)",
|
||||||
|
ready_archive_files,
|
||||||
|
config_file_options.archiver_lag_warning);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (ready_archive_files < 0)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_UNKNOWN;
|
||||||
|
|
||||||
|
switch (mode)
|
||||||
|
{
|
||||||
|
case OM_OPTFORMAT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"--status=UNKNOWN");
|
||||||
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_ARCHIVER UNKNOWN: unable to check archive_status directory");
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"UNKNOWN - unable to check archive_status directory");
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
status = CHECK_STATUS_OK;
|
||||||
|
|
||||||
switch (mode)
|
switch (mode)
|
||||||
{
|
{
|
||||||
case OM_OPTFORMAT:
|
case OM_OPTFORMAT:
|
||||||
@@ -454,19 +531,192 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
|||||||
"--status=OK --files=%i",
|
"--status=OK --files=%i",
|
||||||
ready_archive_files);
|
ready_archive_files);
|
||||||
break;
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_ARCHIVER OK: %i pending files",
|
||||||
|
ready_archive_files);
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"OK - %i pending files",
|
||||||
|
ready_archive_files);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (own_buffer == true)
|
if (own_buffer == true)
|
||||||
{
|
{
|
||||||
printf("%s\n", buf.data);
|
printf("%s\n", buf.data);
|
||||||
termPQExpBuffer(&buf);
|
termPQExpBuffer(&buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
return check_ok;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
CheckStatus
|
||||||
|
do_node_check_replication_lag(PGconn *conn, OutputMode mode, PQExpBufferData *output)
|
||||||
|
{
|
||||||
|
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||||
|
bool own_buffer = false;
|
||||||
|
PQExpBufferData buf;
|
||||||
|
int lag_seconds;
|
||||||
|
|
||||||
|
if (mode == OM_CSV)
|
||||||
|
{
|
||||||
|
log_error(_("--csv output not provided with --replication-lag option"));
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (output == NULL)
|
||||||
|
{
|
||||||
|
initPQExpBuffer(&buf);
|
||||||
|
output = &buf;
|
||||||
|
own_buffer = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
lag_seconds = get_replication_lag_seconds(conn);
|
||||||
|
|
||||||
|
log_debug("lag seconds: %i", lag_seconds);
|
||||||
|
|
||||||
|
if (lag_seconds >= config_file_options.replication_lag_critical)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_CRITICAL;
|
||||||
|
|
||||||
|
switch (mode)
|
||||||
|
{
|
||||||
|
case OM_OPTFORMAT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"--status=CRITICAL --lag=%i --threshold=%i",
|
||||||
|
lag_seconds,
|
||||||
|
config_file_options.replication_lag_critical);
|
||||||
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_REPLICATION_LAG CRITICAL: %i seconds (critical: %i)",
|
||||||
|
lag_seconds,
|
||||||
|
config_file_options.replication_lag_critical);
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"CRITICAL - %i seconds (threshold: %i)",
|
||||||
|
lag_seconds,
|
||||||
|
config_file_options.replication_lag_critical);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (lag_seconds > config_file_options.replication_lag_warning)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_WARNING;
|
||||||
|
|
||||||
|
switch (mode)
|
||||||
|
{
|
||||||
|
case OM_OPTFORMAT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"--status=WARNING --lag=%i --threshold=%i",
|
||||||
|
lag_seconds,
|
||||||
|
config_file_options.replication_lag_warning);
|
||||||
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_REPLICATION_LAG WARNING: %i seconds (warning: %i)",
|
||||||
|
lag_seconds,
|
||||||
|
config_file_options.replication_lag_warning);
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"WARNING - %i seconds (threshold: %i)",
|
||||||
|
lag_seconds,
|
||||||
|
config_file_options.replication_lag_warning);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (lag_seconds < 0)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_UNKNOWN;
|
||||||
|
|
||||||
|
switch (mode)
|
||||||
|
{
|
||||||
|
case OM_OPTFORMAT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"--status=UNKNOWN");
|
||||||
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_REPLICATION_LAG UNKNOWN: unable to query replication lag");
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"UNKNOWN - unable to query replication lag");
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_OK;
|
||||||
|
|
||||||
|
switch (mode)
|
||||||
|
{
|
||||||
|
case OM_OPTFORMAT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"--status=OK --files=%i",
|
||||||
|
lag_seconds);
|
||||||
|
break;
|
||||||
|
case OM_NAGIOS:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"PG_REPLICATION_LAG OK: %i seconds",
|
||||||
|
lag_seconds);
|
||||||
|
break;
|
||||||
|
case OM_TEXT:
|
||||||
|
appendPQExpBuffer(
|
||||||
|
output,
|
||||||
|
"OK - %i seconds",
|
||||||
|
lag_seconds);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if (own_buffer == true)
|
||||||
|
{
|
||||||
|
printf("%s\n", buf.data);
|
||||||
|
termPQExpBuffer(&buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// --action=...
|
// --action=...
|
||||||
// --check
|
// --check
|
||||||
// --list -> list what would be executed for each action, filter to --action
|
// --list -> list what would be executed for each action, filter to --action
|
||||||
|
|||||||
@@ -8,7 +8,9 @@
|
|||||||
|
|
||||||
extern void do_node_status(void);
|
extern void do_node_status(void);
|
||||||
extern void do_node_check(void);
|
extern void do_node_check(void);
|
||||||
extern bool do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output);
|
extern CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output);
|
||||||
|
extern CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, PQExpBufferData *output);
|
||||||
|
|
||||||
|
|
||||||
extern void do_node_archive_config(void);
|
extern void do_node_archive_config(void);
|
||||||
extern void do_node_restore_config(void);
|
extern void do_node_restore_config(void);
|
||||||
|
|||||||
@@ -82,6 +82,7 @@ static char *make_barman_ssh_command(char *buf);
|
|||||||
|
|
||||||
static NodeStatus parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint);
|
static NodeStatus parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint);
|
||||||
static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold);
|
static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold);
|
||||||
|
static CheckStatus parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* do_standby_clone()
|
* do_standby_clone()
|
||||||
@@ -1711,8 +1712,6 @@ do_standby_switchover(void)
|
|||||||
termPQExpBuffer(&reason);
|
termPQExpBuffer(&reason);
|
||||||
}
|
}
|
||||||
|
|
||||||
PQfinish(remote_conn);
|
|
||||||
PQfinish(local_conn);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check that we can connect by SSH to the remote (current primary) server
|
* Check that we can connect by SSH to the remote (current primary) server
|
||||||
@@ -1725,65 +1724,149 @@ do_standby_switchover(void)
|
|||||||
{
|
{
|
||||||
log_error(_("unable to connect via SSH to host \"%s\", user \"%s\""),
|
log_error(_("unable to connect via SSH to host \"%s\", user \"%s\""),
|
||||||
remote_host, runtime_options.remote_user);
|
remote_host, runtime_options.remote_user);
|
||||||
|
PQfinish(remote_conn);
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check replication status */
|
/* check archive/replication status */
|
||||||
{
|
{
|
||||||
bool command_success;
|
int lag_seconds = 0;
|
||||||
int files = 0;
|
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||||
int threshold = 0;
|
|
||||||
CheckStatus status;
|
|
||||||
|
|
||||||
initPQExpBuffer(&remote_command_str);
|
/* archive status - check when "archive_mode" is activated */
|
||||||
make_remote_repmgr_path(&remote_command_str);
|
|
||||||
appendPQExpBuffer(&remote_command_str,
|
|
||||||
"node check --terse -LERROR --archiver --optformat");
|
|
||||||
|
|
||||||
initPQExpBuffer(&command_output);
|
if (guc_set(remote_conn, "archive_mode", "!=", "off"))
|
||||||
|
|
||||||
command_success = remote_command(
|
|
||||||
remote_host,
|
|
||||||
runtime_options.remote_user,
|
|
||||||
remote_command_str.data,
|
|
||||||
&command_output);
|
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
|
||||||
|
|
||||||
status = parse_node_check_archiver(command_output.data, &files, &threshold);
|
|
||||||
|
|
||||||
log_debug("%i %i; '%s'", files, threshold, command_output.data);
|
|
||||||
if (status == CHECK_STATUS_CRITICAL)
|
|
||||||
{
|
{
|
||||||
if (runtime_options.force == false)
|
int files = 0;
|
||||||
|
int threshold = 0;
|
||||||
|
bool command_success;
|
||||||
|
|
||||||
|
initPQExpBuffer(&remote_command_str);
|
||||||
|
make_remote_repmgr_path(&remote_command_str);
|
||||||
|
appendPQExpBuffer(&remote_command_str,
|
||||||
|
"node check --terse -LERROR --archiver --optformat");
|
||||||
|
|
||||||
|
initPQExpBuffer(&command_output);
|
||||||
|
|
||||||
|
command_success = remote_command(
|
||||||
|
remote_host,
|
||||||
|
runtime_options.remote_user,
|
||||||
|
remote_command_str.data,
|
||||||
|
&command_output);
|
||||||
|
|
||||||
|
termPQExpBuffer(&remote_command_str);
|
||||||
|
|
||||||
|
if (command_success == true)
|
||||||
{
|
{
|
||||||
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
status = parse_node_check_archiver(command_output.data, &files, &threshold);
|
||||||
remote_node_record.node_name);
|
|
||||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
log_debug("%i %i; '%s'", files, threshold, command_output.data);
|
||||||
files, threshold);
|
|
||||||
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
|
|
||||||
exit(ERR_SWITCHOVER_FAIL);
|
|
||||||
}
|
}
|
||||||
else
|
|
||||||
|
termPQExpBuffer(&command_output);
|
||||||
|
if (status == CHECK_STATUS_UNKNOWN)
|
||||||
{
|
{
|
||||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
if (runtime_options.force == false)
|
||||||
|
{
|
||||||
|
log_error(_("unable to check number of pending archive files on demotion candidate \"%s\""),
|
||||||
|
remote_node_record.node_name);
|
||||||
|
log_hint(_("use -F/--force to continue anyway"));
|
||||||
|
PQfinish(remote_conn);
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
|
exit(ERR_SWITCHOVER_FAIL);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warning(_("unable to check number of pending archive files on demotion candidate \"%s\""),
|
||||||
remote_node_record.node_name);
|
remote_node_record.node_name);
|
||||||
|
log_notice(_("-F/--force set, continuing with switchover"));
|
||||||
|
|
||||||
|
}
|
||||||
|
else if (status == CHECK_STATUS_CRITICAL)
|
||||||
|
{
|
||||||
|
if (runtime_options.force == false)
|
||||||
|
{
|
||||||
|
log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||||
|
remote_node_record.node_name);
|
||||||
|
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||||
|
files, threshold);
|
||||||
|
log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway"));
|
||||||
|
PQfinish(remote_conn);
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
|
exit(ERR_SWITCHOVER_FAIL);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"),
|
||||||
|
remote_node_record.node_name);
|
||||||
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
log_detail(_("%i pending archive files (critical threshold: %i)"),
|
||||||
files, threshold);
|
files, threshold);
|
||||||
log_notice(_("-F/--force set, continuing with switchover"));
|
log_notice(_("-F/--force set, continuing with switchover"));
|
||||||
}
|
}
|
||||||
|
else if (status == CHECK_STATUS_WARNING)
|
||||||
|
{
|
||||||
|
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
|
||||||
|
remote_node_record.node_name);
|
||||||
|
log_detail(_("%i pending archive files (warning threshold: %i)"),
|
||||||
|
files, threshold);
|
||||||
|
log_hint(_("PostgreSQL will not shut down until all files are archived"));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
else if (status == CHECK_STATUS_WARNING)
|
|
||||||
|
/* check replication lag */
|
||||||
|
lag_seconds = get_replication_lag_seconds(local_conn);
|
||||||
|
|
||||||
|
log_debug("lag is %i ", lag_seconds);
|
||||||
|
|
||||||
|
termPQExpBuffer(&command_output);
|
||||||
|
|
||||||
|
if (lag_seconds >= config_file_options.replication_lag_critical)
|
||||||
{
|
{
|
||||||
log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"),
|
if (runtime_options.force == false)
|
||||||
remote_node_record.node_name);
|
{
|
||||||
log_detail(_("%i pending archive files (warning threshold: %i)"),
|
log_error(_("replication lag on this node is critical"));
|
||||||
files, threshold);
|
log_detail(_("lag is %i seconds (critical threshold: %i)"),
|
||||||
log_hint(_("PostgreSQL will not shut down until all files are archived"));
|
lag_seconds, config_file_options.replication_lag_critical);
|
||||||
|
log_hint(_("PostgreSQL on the demotion candidate will not shut down until pending WAL is flushed to the standby; use -F/--force to continue anyway"));
|
||||||
|
PQfinish(remote_conn);
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
|
exit(ERR_SWITCHOVER_FAIL);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warning(_("replication lag on this node is critical"));
|
||||||
|
log_detail(_("lag is %i seconds (critical threshold: %i)"),
|
||||||
|
lag_seconds, config_file_options.replication_lag_critical);
|
||||||
|
log_notice(_("-F/--force set, continuing with switchover"));
|
||||||
|
}
|
||||||
|
else if (lag_seconds >= config_file_options.replication_lag_warning)
|
||||||
|
{
|
||||||
|
log_warning(_("replication lag on this node is warning"));
|
||||||
|
log_detail(_("lag is %i seconds (warning threshold: %i)"),
|
||||||
|
lag_seconds, config_file_options.replication_lag_warning);
|
||||||
|
}
|
||||||
|
else if (lag_seconds < 0)
|
||||||
|
{
|
||||||
|
if (runtime_options.force == false)
|
||||||
|
{
|
||||||
|
log_error(_("unable to check replication lag on local node"));
|
||||||
|
log_hint(_("use -F/--force to continue anyway"));
|
||||||
|
PQfinish(remote_conn);
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
|
exit(ERR_SWITCHOVER_FAIL);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warning(_("unable to check replication lag on local node"));
|
||||||
|
log_notice(_("-F/--force set, continuing with switchover"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PQfinish(remote_conn);
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -3476,7 +3559,7 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* TODO: consolidate code in below functions */
|
||||||
static NodeStatus
|
static NodeStatus
|
||||||
parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint)
|
parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint)
|
||||||
{
|
{
|
||||||
@@ -3599,7 +3682,6 @@ parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkP
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static CheckStatus
|
static CheckStatus
|
||||||
parse_node_check_archiver(const char *node_check_output, int *files, int *threshold)
|
parse_node_check_archiver(const char *node_check_output, int *files, int *threshold)
|
||||||
{
|
{
|
||||||
@@ -3738,3 +3820,144 @@ parse_node_check_archiver(const char *node_check_output, int *files, int *thresh
|
|||||||
|
|
||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
static CheckStatus
|
||||||
|
parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold)
|
||||||
|
{
|
||||||
|
int options_len = 0;
|
||||||
|
char *options_string = NULL;
|
||||||
|
char *options_string_ptr = NULL;
|
||||||
|
|
||||||
|
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add parsed options to this list, then copy to an array
|
||||||
|
* to pass to getopt
|
||||||
|
*/
|
||||||
|
static ItemList option_argv = { NULL, NULL };
|
||||||
|
|
||||||
|
char *argv_item;
|
||||||
|
int c, argc_item = 1;
|
||||||
|
|
||||||
|
char **argv_array;
|
||||||
|
ItemListCell *cell;
|
||||||
|
|
||||||
|
int optindex = 0;
|
||||||
|
|
||||||
|
/* We're only interested in these options */
|
||||||
|
static struct option long_options[] =
|
||||||
|
{
|
||||||
|
{"status", required_argument, NULL, 'S'},
|
||||||
|
{"lag", required_argument, NULL, 'l'},
|
||||||
|
{"threshold", required_argument, NULL, 't'},
|
||||||
|
{NULL, 0, NULL, 0}
|
||||||
|
};
|
||||||
|
|
||||||
|
*seconds = 0;
|
||||||
|
*threshold = 0;
|
||||||
|
|
||||||
|
/* Don't attempt to tokenise an empty string */
|
||||||
|
if (!strlen(node_check_output))
|
||||||
|
{
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|
||||||
|
options_len = strlen(node_check_output) + 1;
|
||||||
|
options_string = pg_malloc(options_len);
|
||||||
|
options_string_ptr = options_string;
|
||||||
|
|
||||||
|
/* Copy the string before operating on it with strtok() */
|
||||||
|
strncpy(options_string, node_check_output, options_len);
|
||||||
|
|
||||||
|
/* Extract arguments into a list and keep a count of the total */
|
||||||
|
while ((argv_item = strtok(options_string_ptr, " ")) != NULL)
|
||||||
|
{
|
||||||
|
item_list_append(&option_argv, argv_item);
|
||||||
|
|
||||||
|
argc_item++;
|
||||||
|
|
||||||
|
if (options_string_ptr != NULL)
|
||||||
|
options_string_ptr = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Array of argument values to pass to getopt_long - this will need to
|
||||||
|
* include an empty string as the first value (normally this would be
|
||||||
|
* the program name)
|
||||||
|
*/
|
||||||
|
argv_array = pg_malloc0(sizeof(char *) * (argc_item + 2));
|
||||||
|
|
||||||
|
/* Insert a blank dummy program name at the start of the array */
|
||||||
|
argv_array[0] = pg_malloc0(1);
|
||||||
|
|
||||||
|
c = 1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copy the previously extracted arguments from our list to the array
|
||||||
|
*/
|
||||||
|
for (cell = option_argv.head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
int argv_len = strlen(cell->string) + 1;
|
||||||
|
|
||||||
|
argv_array[c] = pg_malloc0(argv_len);
|
||||||
|
|
||||||
|
strncpy(argv_array[c], cell->string, argv_len);
|
||||||
|
|
||||||
|
c++;
|
||||||
|
}
|
||||||
|
|
||||||
|
argv_array[c] = NULL;
|
||||||
|
|
||||||
|
/* Reset getopt's optind variable */
|
||||||
|
optind = 0;
|
||||||
|
|
||||||
|
/* Prevent getopt from emitting errors */
|
||||||
|
opterr = 0;
|
||||||
|
|
||||||
|
while ((c = getopt_long(argc_item, argv_array, "l:S:t:", long_options,
|
||||||
|
&optindex)) != -1)
|
||||||
|
{
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
/* --files */
|
||||||
|
case 'l':
|
||||||
|
*seconds = atoi(optarg);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 't':
|
||||||
|
*threshold = atoi(optarg);
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* --status */
|
||||||
|
case 'S':
|
||||||
|
{
|
||||||
|
if (strncmp(optarg, "OK", MAXLEN) == 0)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_OK;
|
||||||
|
}
|
||||||
|
else if (strncmp(optarg, "WARNING", MAXLEN) == 0)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_WARNING;
|
||||||
|
}
|
||||||
|
else if (strncmp(optarg, "CRITICAL", MAXLEN) == 0)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_CRITICAL;
|
||||||
|
}
|
||||||
|
else if (strncmp(optarg, "UNKNOWN", MAXLEN) == 0)
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_UNKNOWN;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
status = CHECK_STATUS_UNKNOWN;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return status;
|
||||||
|
}
|
||||||
|
|||||||
@@ -92,6 +92,7 @@ typedef struct
|
|||||||
|
|
||||||
/* "node check" options */
|
/* "node check" options */
|
||||||
bool archiver;
|
bool archiver;
|
||||||
|
bool replication_lag;
|
||||||
|
|
||||||
/* "node service" options */
|
/* "node service" options */
|
||||||
char action[MAXLEN];
|
char action[MAXLEN];
|
||||||
@@ -134,7 +135,7 @@ typedef struct
|
|||||||
/* "node status" options */ \
|
/* "node status" options */ \
|
||||||
false, \
|
false, \
|
||||||
/* "node check" options */ \
|
/* "node check" options */ \
|
||||||
false, \
|
false, false, \
|
||||||
/* "node service" options */ \
|
/* "node service" options */ \
|
||||||
"", false, false, false, \
|
"", false, false, false, \
|
||||||
/* "cluster event" options */ \
|
/* "cluster event" options */ \
|
||||||
|
|||||||
@@ -427,6 +427,10 @@ main(int argc, char **argv)
|
|||||||
runtime_options.archiver = true;
|
runtime_options.archiver = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case OPT_REPLICATION_LAG:
|
||||||
|
runtime_options.replication_lag = true;
|
||||||
|
break;
|
||||||
|
|
||||||
/* "node service" options *
|
/* "node service" options *
|
||||||
* ---------------------- */
|
* ---------------------- */
|
||||||
|
|
||||||
|
|||||||
@@ -68,6 +68,7 @@
|
|||||||
#define OPT_NAGIOS 1031
|
#define OPT_NAGIOS 1031
|
||||||
#define OPT_ARCHIVER 1032
|
#define OPT_ARCHIVER 1032
|
||||||
#define OPT_OPTFORMAT 1033
|
#define OPT_OPTFORMAT 1033
|
||||||
|
#define OPT_REPLICATION_LAG 1034
|
||||||
/* deprecated since 3.3 */
|
/* deprecated since 3.3 */
|
||||||
#define OPT_DATA_DIR 999
|
#define OPT_DATA_DIR 999
|
||||||
#define OPT_NO_CONNINFO_PASSWORD 998
|
#define OPT_NO_CONNINFO_PASSWORD 998
|
||||||
@@ -135,6 +136,7 @@ static struct option long_options[] =
|
|||||||
|
|
||||||
/* "node check" options */
|
/* "node check" options */
|
||||||
{"archiver", no_argument, NULL, OPT_ARCHIVER },
|
{"archiver", no_argument, NULL, OPT_ARCHIVER },
|
||||||
|
{"replication-lag", no_argument, NULL, OPT_REPLICATION_LAG },
|
||||||
|
|
||||||
/* "node service" options */
|
/* "node service" options */
|
||||||
{"action", required_argument, NULL, OPT_ACTION},
|
{"action", required_argument, NULL, OPT_ACTION},
|
||||||
|
|||||||
@@ -266,6 +266,12 @@ ssh_options='-q' # Options to append to "ssh"
|
|||||||
# "repmgr standby switchover" to warn about potential
|
# "repmgr standby switchover" to warn about potential
|
||||||
# issues with shutting down the demotion candidate.
|
# issues with shutting down the demotion candidate.
|
||||||
|
|
||||||
|
#replication_lag_warning=300 # repmgr node check --replication-lag
|
||||||
|
#replication_lag_critical=600 #
|
||||||
|
# Note that these values will be checked when executing
|
||||||
|
# "repmgr standby switchover" to warn about potential
|
||||||
|
# issues with shutting down the demotion candidate.
|
||||||
|
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# BDR monitoring options
|
# BDR monitoring options
|
||||||
|
|||||||
2
repmgr.h
2
repmgr.h
@@ -49,6 +49,8 @@
|
|||||||
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
|
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
|
||||||
#define DEFAULT_ARCHIVER_LAG_WARNING 16 /* WAL files */
|
#define DEFAULT_ARCHIVER_LAG_WARNING 16 /* WAL files */
|
||||||
#define DEFAULT_ARCHIVER_LAG_CRITICAL 128 /* WAL files */
|
#define DEFAULT_ARCHIVER_LAG_CRITICAL 128 /* WAL files */
|
||||||
|
#define DEFAULT_REPLICATION_LAG_WARNING 300 /* seconds */
|
||||||
|
#define DEFAULT_REPLICATION_LAG_CRITICAL 600 /* seconds */
|
||||||
|
|
||||||
#define FAILOVER_NODES_MAX_CHECK 50
|
#define FAILOVER_NODES_MAX_CHECK 50
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user