diff --git a/configfile.c b/configfile.c index 191bbea1..ded9f013 100644 --- a/configfile.c +++ b/configfile.c @@ -395,6 +395,10 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->archiver_lag_warning = repmgr_atoi(value, name, error_list, 1); else if (strcmp(name, "archiver_lag_critcial") == 0) options->archiver_lag_critical = repmgr_atoi(value, name, error_list, 1); + else if (strcmp(name, "replication_lag_warning") == 0) + options->replication_lag_warning = repmgr_atoi(value, name, error_list, 1); + else if (strcmp(name, "replication_lag_critical") == 0) + options->replication_lag_critical = repmgr_atoi(value, name, error_list, 1); /* repmgrd settings */ else if (strcmp(name, "failover_mode") == 0) @@ -619,6 +623,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * _("\archiver_lag_critical\" must be greater than \"archiver_lag_warning\"")); } + if (options->replication_lag_warning >= options->replication_lag_critical) + { + item_list_append(error_list, + _("\"replication_lag_critical\" must be greater than \"replication_lag_warning\"")); + } } diff --git a/configfile.h b/configfile.h index f9794856..22555dc8 100644 --- a/configfile.h +++ b/configfile.h @@ -75,6 +75,8 @@ typedef struct /* node check settings */ int archiver_lag_warning; int archiver_lag_critical; + int replication_lag_warning; + int replication_lag_critical; /* repmgrd settings */ failover_mode_opt failover_mode; @@ -130,6 +132,7 @@ typedef struct false, "", "", "", "", { NULL, NULL }, \ /* node check settings */ \ DEFAULT_ARCHIVER_LAG_WARNING, DEFAULT_ARCHIVER_LAG_CRITICAL, \ + DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \ /* repmgrd settings */ \ FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \ DEFAULT_MONITORING_INTERVAL, \ diff --git a/dbutils.c b/dbutils.c index 2addc162..937f7b84 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1309,7 +1309,7 @@ get_ready_archive_files(PGconn *conn, 
const char *data_directory) if (server_version_num == UNKNOWN_SERVER_VERSION_NUM) server_version_num = get_server_version(conn, NULL); - if (server_version_num >= 1000000) + if (server_version_num >= 100000) { snprintf(archive_status_dir, MAXPGPATH, "%s/pg_wal/archive_status", @@ -1376,6 +1376,66 @@ get_ready_archive_files(PGconn *conn, const char *data_directory) } +/* + * get_replication_lag_seconds() + * + * Returns 0 if the last received and last replayed WAL locations match, otherwise the number of + * seconds elapsed since the timestamp of the last replayed transaction; -1 if the query fails or returns no rows. + */ +int +get_replication_lag_seconds(PGconn *conn) +{ + PQExpBufferData query; + PGresult *res; + int lag_seconds = 0; + + if (server_version_num == UNKNOWN_SERVER_VERSION_NUM) + server_version_num = get_server_version(conn, NULL); + + initPQExpBuffer(&query); + + if (server_version_num >= 100000) + { + appendPQExpBuffer( + &query, + " SELECT CASE WHEN (pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn()) "); + + } + else + { + appendPQExpBuffer( + &query, + " SELECT CASE WHEN (pg_catalog.pg_last_xlog_receive_location() = pg_catalog.pg_last_xlog_replay_location()) "); + } + + appendPQExpBuffer( + &query, + " THEN 0 " + " ELSE EXTRACT(epoch FROM (clock_timestamp() - pg_catalog.pg_last_xact_replay_timestamp()))::INT " + " END " + " AS lag_seconds"); + + log_verbose(LOG_DEBUG, "get_replication_lag_seconds():\n%s", query.data); + + res = PQexec(conn, query.data); + termPQExpBuffer(&query); + + if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res)) + { + log_warning("%s", PQerrorMessage(conn)); + PQclear(res); + + /* XXX magic number: callers treat any negative value as "lag unknown" */ + return -1; + } + + lag_seconds = atoi(PQgetvalue(res, 0, 0)); + + PQclear(res); + return lag_seconds; +} + + /* ================ */ /* result functions */ /* ================ */ diff --git a/dbutils.h b/dbutils.h index 0d8677cb..58b99d68 100644 --- a/dbutils.h +++ b/dbutils.h @@ -324,6 +324,7 @@ int get_primary_node_id(PGconn *conn); bool get_replication_info(PGconn *conn, ReplInfo *replication_info); bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason); int get_ready_archive_files(PGconn *conn, const char 
*data_directory); +int get_replication_lag_seconds(PGconn *conn); /* extension functions */ ExtensionStatus get_repmgr_extension_status(PGconn *conn); diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 69cc7f41..7b673b51 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -395,15 +395,29 @@ do_node_check(void) PQfinish(conn); return; } + + if (runtime_options.replication_lag == true) + { + (void) do_node_check_replication_lag(conn, runtime_options.output_mode, NULL); + PQfinish(conn); + return; + } + } -bool +CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output) { bool own_buffer = false; int ready_archive_files = 0; PQExpBufferData buf; - bool check_ok = true; + CheckStatus status = CHECK_STATUS_UNKNOWN; + + if (mode == OM_CSV) + { + log_error(_("--csv output not provided with --archiver option")); + exit(ERR_BAD_CONFIG); + } if (output == NULL) { @@ -412,10 +426,13 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output) own_buffer = true; } + ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory); if (ready_archive_files > config_file_options.archiver_lag_critical) { + status = CHECK_STATUS_CRITICAL; + switch (mode) { case OM_OPTFORMAT: @@ -425,12 +442,29 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output) ready_archive_files, config_file_options.archiver_lag_critical); break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_ARCHIVER CRITICAL: %i pending files (critical: %i)", + ready_archive_files, + config_file_options.archiver_lag_critical); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "CRITICAL - %i pending files (threshold: %i)", + ready_archive_files, + config_file_options.archiver_lag_critical); + break; + default: break; } } else if (ready_archive_files > config_file_options.archiver_lag_warning) { + status = CHECK_STATUS_WARNING; + switch (mode) { case OM_OPTFORMAT: @@ -440,12 +474,55 @@ 
do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output) ready_archive_files, config_file_options.archiver_lag_warning); break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_ARCHIVER WARNING: %i pending files (warning: %i)", + ready_archive_files, + config_file_options.archiver_lag_warning); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "WARNING - %i pending files (threshold: %i)", + ready_archive_files, + config_file_options.archiver_lag_warning); + break; + + default: + break; + } + } + else if (ready_archive_files < 0) + { + status = CHECK_STATUS_UNKNOWN; + + switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( + output, + "--status=UNKNOWN"); + break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_ARCHIVER UNKNOWN: unable to check archive_status directory"); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "UNKNOWN - unable to check archive_status directory"); + break; + default: break; } } else { + status = CHECK_STATUS_OK; + switch (mode) { case OM_OPTFORMAT: @@ -454,19 +531,192 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output) "--status=OK --files=%i", ready_archive_files); break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_ARCHIVER OK: %i pending files", + ready_archive_files); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "OK - %i pending files", + ready_archive_files); + break; + default: break; } } + if (own_buffer == true) { printf("%s\n", buf.data); termPQExpBuffer(&buf); } - return check_ok; + return status; } + +/* + * Check replication lag (seconds) against the configured warning/critical thresholds and write the result to "output" (or stdout if NULL) in the requested mode; a negative lag is reported as UNKNOWN + */ +CheckStatus +do_node_check_replication_lag(PGconn *conn, OutputMode mode, PQExpBufferData *output) +{ + CheckStatus status = CHECK_STATUS_UNKNOWN; + bool own_buffer = false; + PQExpBufferData buf; + int lag_seconds; + + if (mode == OM_CSV) + { + log_error(_("--csv output not provided with --replication-lag option")); + exit(ERR_BAD_CONFIG); + } + + if (output == NULL) + { + initPQExpBuffer(&buf); + output = &buf; + 
own_buffer = true; + } + + lag_seconds = get_replication_lag_seconds(conn); + + log_debug("lag seconds: %i", lag_seconds); + + if (lag_seconds >= config_file_options.replication_lag_critical) + { + status = CHECK_STATUS_CRITICAL; + + switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( + output, + "--status=CRITICAL --lag=%i --threshold=%i", + lag_seconds, + config_file_options.replication_lag_critical); + break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_REPLICATION_LAG CRITICAL: %i seconds (critical: %i)", + lag_seconds, + config_file_options.replication_lag_critical); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "CRITICAL - %i seconds (threshold: %i)", + lag_seconds, + config_file_options.replication_lag_critical); + break; + + default: + break; + } + } + else if (lag_seconds >= config_file_options.replication_lag_warning) + { + status = CHECK_STATUS_WARNING; + + switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( + output, + "--status=WARNING --lag=%i --threshold=%i", + lag_seconds, + config_file_options.replication_lag_warning); + break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_REPLICATION_LAG WARNING: %i seconds (warning: %i)", + lag_seconds, + config_file_options.replication_lag_warning); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "WARNING - %i seconds (threshold: %i)", + lag_seconds, + config_file_options.replication_lag_warning); + break; + + default: + break; + } + } + else if (lag_seconds < 0) + { + status = CHECK_STATUS_UNKNOWN; + + switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( + output, + "--status=UNKNOWN"); + break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_REPLICATION_LAG UNKNOWN: unable to query replication lag"); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "UNKNOWN - unable to query replication lag"); + break; + + default: + break; + } + } + else + { + status = CHECK_STATUS_OK; + + switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( 
+ output, + "--status=OK --lag=%i", + lag_seconds); + break; + case OM_NAGIOS: + appendPQExpBuffer( + output, + "PG_REPLICATION_LAG OK: %i seconds", + lag_seconds); + break; + case OM_TEXT: + appendPQExpBuffer( + output, + "OK - %i seconds", + lag_seconds); + break; + + default: + break; + } + } + + if (own_buffer == true) + { + printf("%s\n", buf.data); + termPQExpBuffer(&buf); + } + + return status; +} + + // --action=... // --check // --list -> list what would be executed for each action, filter to --action diff --git a/repmgr-action-node.h b/repmgr-action-node.h index 0c5e8f52..75120343 100644 --- a/repmgr-action-node.h +++ b/repmgr-action-node.h @@ -8,7 +8,9 @@ extern void do_node_status(void); extern void do_node_check(void); -extern bool do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output); +extern CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, PQExpBufferData *output); +extern CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, PQExpBufferData *output); + extern void do_node_archive_config(void); extern void do_node_restore_config(void); diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 247b2f26..01a049f4 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -82,6 +82,7 @@ static char *make_barman_ssh_command(char *buf); static NodeStatus parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint); static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold); +static CheckStatus parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold); /* * do_standby_clone() @@ -1711,8 +1712,6 @@ do_standby_switchover(void) termPQExpBuffer(&reason); } - PQfinish(remote_conn); - PQfinish(local_conn); /* * Check that we can connect by SSH to the remote (current primary) server @@ -1725,65 +1724,149 @@ do_standby_switchover(void) { log_error(_("unable to connect 
via SSH to host \"%s\", user \"%s\""), remote_host, runtime_options.remote_user); + PQfinish(remote_conn); + PQfinish(local_conn); + exit(ERR_BAD_CONFIG); } - /* check replication status */ + /* check archive/replication status */ { - bool command_success; - int files = 0; - int threshold = 0; - CheckStatus status; + int lag_seconds = 0; + CheckStatus status = CHECK_STATUS_UNKNOWN; - initPQExpBuffer(&remote_command_str); - make_remote_repmgr_path(&remote_command_str); - appendPQExpBuffer(&remote_command_str, - "node check --terse -LERROR --archiver --optformat"); + /* archive status - check when "archive_mode" is activated */ - initPQExpBuffer(&command_output); - - command_success = remote_command( - remote_host, - runtime_options.remote_user, - remote_command_str.data, - &command_output); - - termPQExpBuffer(&remote_command_str); - - status = parse_node_check_archiver(command_output.data, &files, &threshold); - - log_debug("%i %i; '%s'", files, threshold, command_output.data); - if (status == CHECK_STATUS_CRITICAL) + if (guc_set(remote_conn, "archive_mode", "!=", "off")) { - if (runtime_options.force == false) + int files = 0; + int threshold = 0; + bool command_success; + + initPQExpBuffer(&remote_command_str); + make_remote_repmgr_path(&remote_command_str); + appendPQExpBuffer(&remote_command_str, + "node check --terse -LERROR --archiver --optformat"); + + initPQExpBuffer(&command_output); + + command_success = remote_command( + remote_host, + runtime_options.remote_user, + remote_command_str.data, + &command_output); + + termPQExpBuffer(&remote_command_str); + + if (command_success == true) { - log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"), - remote_node_record.node_name); - log_detail(_("%i pending archive files (critical threshold: %i)"), - files, threshold); - log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway")); - exit(ERR_SWITCHOVER_FAIL); + status = 
parse_node_check_archiver(command_output.data, &files, &threshold); + + log_debug("%i %i; '%s'", files, threshold, command_output.data); } - else + + termPQExpBuffer(&command_output); + if (status == CHECK_STATUS_UNKNOWN) { - log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"), + if (runtime_options.force == false) + { + log_error(_("unable to check number of pending archive files on demotion candidate \"%s\""), + remote_node_record.node_name); + log_hint(_("use -F/--force to continue anyway")); + PQfinish(remote_conn); + PQfinish(local_conn); + + exit(ERR_SWITCHOVER_FAIL); + } + + log_warning(_("unable to check number of pending archive files on demotion candidate \"%s\""), remote_node_record.node_name); + log_notice(_("-F/--force set, continuing with switchover")); + + } + else if (status == CHECK_STATUS_CRITICAL) + { + if (runtime_options.force == false) + { + log_error(_("number of pending archive files on demotion candidate \"%s\" is critical"), + remote_node_record.node_name); + log_detail(_("%i pending archive files (critical threshold: %i)"), + files, threshold); + log_hint(_("PostgreSQL will not shut down until all files are archived; use -F/--force to continue anyway")); + PQfinish(remote_conn); + PQfinish(local_conn); + + exit(ERR_SWITCHOVER_FAIL); + } + + log_warning(_("number of pending archive files on demotion candidate \"%s\" is critical"), + remote_node_record.node_name); log_detail(_("%i pending archive files (critical threshold: %i)"), files, threshold); log_notice(_("-F/--force set, continuing with switchover")); } + else if (status == CHECK_STATUS_WARNING) + { + log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"), + remote_node_record.node_name); + log_detail(_("%i pending archive files (warning threshold: %i)"), + files, threshold); + log_hint(_("PostgreSQL will not shut down until all files are archived")); + } } - else if (status == CHECK_STATUS_WARNING) + + /* check 
replication lag */ + lag_seconds = get_replication_lag_seconds(local_conn); + + log_debug("lag is %i ", lag_seconds); + + /* command_output is initialised and released only inside the archive check above; nothing to free here */ + + if (lag_seconds >= config_file_options.replication_lag_critical) { - log_warning(_("number of pending archive files on demotion candidate \"%s\" is warning"), - remote_node_record.node_name); - log_detail(_("%i pending archive files (warning threshold: %i)"), - files, threshold); - log_hint(_("PostgreSQL will not shut down until all files are archived")); + if (runtime_options.force == false) + { + log_error(_("replication lag on this node is critical")); + log_detail(_("lag is %i seconds (critical threshold: %i)"), + lag_seconds, config_file_options.replication_lag_critical); + log_hint(_("PostgreSQL on the demotion candidate will not shut down until pending WAL is flushed to the standby; use -F/--force to continue anyway")); + PQfinish(remote_conn); + PQfinish(local_conn); + + exit(ERR_SWITCHOVER_FAIL); + } + + log_warning(_("replication lag on this node is critical")); + log_detail(_("lag is %i seconds (critical threshold: %i)"), + lag_seconds, config_file_options.replication_lag_critical); + log_notice(_("-F/--force set, continuing with switchover")); + } + else if (lag_seconds >= config_file_options.replication_lag_warning) + { + log_warning(_("replication lag on this node is warning")); + log_detail(_("lag is %i seconds (warning threshold: %i)"), + lag_seconds, config_file_options.replication_lag_warning); + } + else if (lag_seconds < 0) + { + if (runtime_options.force == false) + { + log_error(_("unable to check replication lag on local node")); + log_hint(_("use -F/--force to continue anyway")); + PQfinish(remote_conn); + PQfinish(local_conn); + + exit(ERR_SWITCHOVER_FAIL); + } + + log_warning(_("unable to check replication lag on local node")); + log_notice(_("-F/--force set, continuing with switchover")); } } + PQfinish(remote_conn); + PQfinish(local_conn); @@ -3476,7 +3559,7 @@ 
drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name) } - +/* TODO: consolidate code in below functions */ static NodeStatus parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkPoint) { @@ -3599,7 +3682,6 @@ parse_node_status_is_shutdown(const char *node_status_output, XLogRecPtr *checkP } - static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold) { @@ -3738,3 +3820,144 @@ parse_node_check_archiver(const char *node_check_output, int *files, int *thresh return status; } + +/* + * Parse "--status=... --lag=... --threshold=..." as emitted by "node check --replication-lag --optformat"; sets *seconds and *threshold (0 if absent) and returns the parsed status (CHECK_STATUS_UNKNOWN for empty or unrecognised input) + */ +static CheckStatus +parse_node_check_replication_lag(const char *node_check_output, int *seconds, int *threshold) +{ + int options_len = 0; + char *options_string = NULL; + char *options_string_ptr = NULL; + + CheckStatus status = CHECK_STATUS_UNKNOWN; + + /* + * Add parsed options to this list, then copy to an array + * to pass to getopt (not static: the list must start empty on every call) + */ + ItemList option_argv = { NULL, NULL }; + + char *argv_item; + int c, argc_item = 1; + + char **argv_array; + ItemListCell *cell; + + int optindex = 0; + + /* We're only interested in these options */ + static struct option long_options[] = + { + {"status", required_argument, NULL, 'S'}, + {"lag", required_argument, NULL, 'l'}, + {"threshold", required_argument, NULL, 't'}, + {NULL, 0, NULL, 0} + }; + + *seconds = 0; + *threshold = 0; + + /* Don't attempt to tokenise an empty string */ + if (!strlen(node_check_output)) + { + return status; + } + + options_len = strlen(node_check_output) + 1; + options_string = pg_malloc(options_len); + options_string_ptr = options_string; + + /* Copy the string before operating on it with strtok() */ + strncpy(options_string, node_check_output, options_len); + + /* Extract arguments into a list and keep a count of the total */ + while ((argv_item = strtok(options_string_ptr, " ")) != NULL) + { + item_list_append(&option_argv, argv_item); + + argc_item++; + + if (options_string_ptr != NULL) + options_string_ptr = 
NULL; + } + + /* + * Array of argument values to pass to getopt_long - this will need to + * include an empty string as the first value (normally this would be + * the program name) + */ + argv_array = pg_malloc0(sizeof(char *) * (argc_item + 2)); + + /* Insert a blank dummy program name at the start of the array */ + argv_array[0] = pg_malloc0(1); + + c = 1; + + /* + * Copy the previously extracted arguments from our list to the array + */ + for (cell = option_argv.head; cell; cell = cell->next) + { + int argv_len = strlen(cell->string) + 1; + + argv_array[c] = pg_malloc0(argv_len); + + strncpy(argv_array[c], cell->string, argv_len); + + c++; + } + + argv_array[c] = NULL; + + /* Reset getopt's optind variable */ + optind = 0; + + /* Prevent getopt from emitting errors */ + opterr = 0; + + while ((c = getopt_long(argc_item, argv_array, "l:S:t:", long_options, + &optindex)) != -1) + { + switch (c) + { + /* --lag */ + case 'l': + *seconds = atoi(optarg); + break; + + case 't': + *threshold = atoi(optarg); + break; + + /* --status */ + case 'S': + { + if (strncmp(optarg, "OK", MAXLEN) == 0) + { + status = CHECK_STATUS_OK; + } + else if (strncmp(optarg, "WARNING", MAXLEN) == 0) + { + status = CHECK_STATUS_WARNING; + } + else if (strncmp(optarg, "CRITICAL", MAXLEN) == 0) + { + status = CHECK_STATUS_CRITICAL; + } + else if (strncmp(optarg, "UNKNOWN", MAXLEN) == 0) + { + status = CHECK_STATUS_UNKNOWN; + } + else + { + status = CHECK_STATUS_UNKNOWN; + } + } + break; + } + } + + return status; +} diff --git a/repmgr-client-global.h b/repmgr-client-global.h index 58e7891e..37999b15 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -92,6 +92,7 @@ typedef struct /* "node check" options */ bool archiver; + bool replication_lag; /* "node service" options */ char action[MAXLEN]; @@ -134,7 +135,7 @@ typedef struct /* "node status" options */ \ false, \ /* "node check" options */ \ - false, \ + false, false, \ /* "node service" options */ \ "", false, false, 
false, \ /* "cluster event" options */ \ diff --git a/repmgr-client.c b/repmgr-client.c index f1dd225a..3a31a0ef 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -427,6 +427,10 @@ main(int argc, char **argv) runtime_options.archiver = true; break; + case OPT_REPLICATION_LAG: + runtime_options.replication_lag = true; + break; + /* "node service" options * * ---------------------- */ diff --git a/repmgr-client.h b/repmgr-client.h index 5517a7b3..3fa2f752 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -68,6 +68,7 @@ #define OPT_NAGIOS 1031 #define OPT_ARCHIVER 1032 #define OPT_OPTFORMAT 1033 +#define OPT_REPLICATION_LAG 1034 /* deprecated since 3.3 */ #define OPT_DATA_DIR 999 #define OPT_NO_CONNINFO_PASSWORD 998 @@ -135,6 +136,7 @@ static struct option long_options[] = /* "node check" options */ {"archiver", no_argument, NULL, OPT_ARCHIVER }, + {"replication-lag", no_argument, NULL, OPT_REPLICATION_LAG }, /* "node service" options */ {"action", required_argument, NULL, OPT_ACTION}, diff --git a/repmgr.conf.sample b/repmgr.conf.sample index 8df195ab..c66a9b65 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -266,6 +266,12 @@ ssh_options='-q' # Options to append to "ssh" # "repmgr standby switchover" to warn about potential # issues with shutting down the demotion candidate. +#replication_lag_warning=300 # repmgr node check --replication-lag +#replication_lag_critical=600 # + # Note that these values will be checked when executing + # "repmgr standby switchover" to warn about potential + # issues with shutting down the demotion candidate. 
+ #------------------------------------------------------------------------------ # BDR monitoring options diff --git a/repmgr.h b/repmgr.h index 5d3d78b9..f8cf1973 100644 --- a/repmgr.h +++ b/repmgr.h @@ -49,6 +49,8 @@ #define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */ #define DEFAULT_ARCHIVER_LAG_WARNING 16 /* WAL files */ #define DEFAULT_ARCHIVER_LAG_CRITICAL 128 /* WAL files */ +#define DEFAULT_REPLICATION_LAG_WARNING 300 /* seconds */ +#define DEFAULT_REPLICATION_LAG_CRITICAL 600 /* seconds */ #define FAILOVER_NODES_MAX_CHECK 50