From 4c0d719cdbe757ffb4c0cafe30e8ed1431c7a5f2 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 16 Aug 2017 11:17:02 +0900 Subject: [PATCH] Add replication slot check to "repmgr node check" --- README.md | 38 ++++- dbutils.c | 10 +- dbutils.h | 3 +- repmgr-action-node.c | 319 ++++++++++++++++++++++++++++------------- repmgr-client-global.h | 3 +- repmgr-client.c | 4 + repmgr-client.h | 2 + 7 files changed, 276 insertions(+), 103 deletions(-) diff --git a/README.md b/README.md index 393d44e5..9f131e64 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,13 @@ operations. `repmgr 4` is a complete rewrite of the existing `repmgr` codebase. Supports PostgreSQL 9.5 and later; support for PostgreSQL 9.3 and 9.4 has been -dropped. To use `repmgr 4` with BDR 2.0, PostgreSQL 9.6 is required. +dropped. Please continue to use repmgrd 3.x for those versions. + +### BDR support + +`repmgr 4` supports monitoring of a two-node BDR 2.0 cluster. PostgreSQL 9.6 is +required for BDR 2.0. Note that BDR 2.0 is not publicly available; please contact +2ndQuadrant for details. `repmgr 4` will support future public BDR releases. Building from source -------------------- @@ -67,6 +73,36 @@ The following commands are available: if the configuration file on each sibling is the same path as specifed in -f/--config-file or -C/--remote-config-file. +* `node status` + +* `node check` + + Performs some health checks on a node from a replication perspective. 
+ + Sample output (execute `repmgr node check`): + + Node "node1": + Server role: OK (node is primary) + Replication lag: OK (N/A - node is primary) + WAL archiving: OK (0 pending files) + Downstream servers: OK (2 of 2 downstream nodes attached) + Replication slots: OK (node has no replication slots) + + Additionally each check can be performed individually by supplying + an additional command line parameter, e.g.: + + $ repmgr node check --role + OK (node is primary) + + Parameters for individual checks are as follows: + + * `--role`: checks if the node has the expected role + * `--replication-lag`: checks if the node is lagging by more than + `replication_lag_warning` or `replication_lag_critical` seconds. + * `--archiver`: checks for WAL files which have not yet been archived + * `--downstream`: checks that the expected downstream nodes are attached + * `--slots`: checks there are no inactive replication slots + * `cluster show` Displays information about each active node in the replication cluster. 
This diff --git a/dbutils.c b/dbutils.c index 085996a5..82269e9f 100644 --- a/dbutils.c +++ b/dbutils.c @@ -2400,8 +2400,10 @@ get_node_replication_stats(PGconn *conn, t_node_info *node_info) " SELECT current_setting('max_wal_senders')::INT AS max_wal_senders, " " (SELECT COUNT(*) FROM pg_catalog.pg_stat_replication) AS attached_wal_receivers, " " current_setting('max_replication_slots')::INT AS max_replication_slots, " + " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, " " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = TRUE) AS active_replication_slots, " - " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = FALSE) AS inactive_replication_slots "); + " (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = FALSE) AS inactive_replication_slots, " + " pg_catalog.pg_is_in_recovery() AS in_recovery"); res = PQexec(conn, query.data); termPQExpBuffer(&query); @@ -2417,8 +2419,10 @@ get_node_replication_stats(PGconn *conn, t_node_info *node_info) node_info->max_wal_senders = atoi(PQgetvalue(res, 0, 0)); node_info->attached_wal_receivers = atoi(PQgetvalue(res, 0, 1)); node_info->max_replication_slots = atoi(PQgetvalue(res, 0, 2)); - node_info->active_replication_slots = atoi(PQgetvalue(res, 0, 3)); - node_info->inactive_replication_slots = atoi(PQgetvalue(res, 0, 4)); + node_info->total_replication_slots = atoi(PQgetvalue(res, 0, 3)); + node_info->active_replication_slots = atoi(PQgetvalue(res, 0, 4)); + node_info->inactive_replication_slots = atoi(PQgetvalue(res, 0, 5)); + node_info->recovery_type = strcmp(PQgetvalue(res, 0, 6), "f") == 0 ? 
RECTYPE_PRIMARY : RECTYPE_STANDBY; PQclear(res); diff --git a/dbutils.h b/dbutils.h index 7fe439d4..c8e946f4 100644 --- a/dbutils.h +++ b/dbutils.h @@ -102,6 +102,7 @@ typedef struct s_node_info int max_wal_senders; int attached_wal_receivers; int max_replication_slots; + int total_replication_slots; int active_replication_slots; int inactive_replication_slots; } t_node_info; @@ -130,7 +131,7 @@ typedef struct s_node_info /* for ad-hoc use e.g. when working with a list of nodes */ \ "", true, true \ /* various statistics */ \ - -1, -1, -1, -1, -1 \ + -1, -1, -1, -1, -1, -1 \ } diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 4b1044f8..0e8f802f 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -29,11 +29,11 @@ static void _do_node_status_is_shutdown(void); static void _do_node_archive_config(void); static void _do_node_restore_config(void); -static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, CheckStatusList *list_output); -static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, CheckStatusList *list_output); static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output); - +static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); +static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); +static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); void do_node_status(void) @@ -536,6 +536,9 @@ do_node_check(void) exit(ERR_BAD_CONFIG); } + /* add replication statistics to node record */ + get_node_replication_stats(conn, &node_info); + /* handle specific checks * ====================== */ if (runtime_options.archiver == true) 
@@ -545,9 +548,17 @@ do_node_check(void) return; } + if (runtime_options.downstream == true) + { + (void) do_node_check_downstream(conn, runtime_options.output_mode, NULL); + PQfinish(conn); + return; + } + + if (runtime_options.replication_lag == true) { - (void) do_node_check_replication_lag(conn, runtime_options.output_mode, NULL); + (void) do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); return; } @@ -559,20 +570,24 @@ do_node_check(void) return; } - if (runtime_options.downstream == true) + if (runtime_options.slots == true) { - (void) do_node_check_downstream(conn, runtime_options.output_mode, NULL); + (void) do_node_check_slots(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); return; } + + /* output general overview */ initPQExpBuffer(&output); + /* order functions are called is also output order */ (void) do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list); - (void) do_node_check_replication_lag(conn, runtime_options.output_mode, &status_list); + (void) do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list); (void) do_node_check_archiver(conn, runtime_options.output_mode, &status_list); (void) do_node_check_downstream(conn, runtime_options.output_mode, &status_list); + (void) do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list); if (runtime_options.output_mode == OM_CSV) { @@ -640,6 +655,12 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS &details, _("node is registered as primary but running as standby")); } + else + { + appendPQExpBuffer( + &details, + _("node is primary")); + } break; case STANDBY: if (recovery_type == RECTYPE_PRIMARY) @@ -649,6 +670,12 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS &details, _("node is registered as standby but running as primary")); } + else + { + appendPQExpBuffer( + &details, + _("node 
is standby")); + } break; case BDR: { @@ -696,10 +723,9 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS } else { - printf("%s", output_check_status(status)); - if (strlen(details.data)) - printf(" (%s)", details.data); - puts(""); + printf("%s (%s)\n", + output_check_status(status), + details.data); } default: break; @@ -710,6 +736,70 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS } + +static CheckStatus +do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) +{ + CheckStatus status = CHECK_STATUS_OK; + PQExpBufferData details; + + initPQExpBuffer(&details); + + if (node_info->total_replication_slots == 0) + { + appendPQExpBuffer( + &details, + _("node has no replication slots")); + } + else if (node_info->inactive_replication_slots == 0) + { + appendPQExpBuffer( + &details, + _("%i of %i replication slots are active"), + node_info->total_replication_slots, + node_info->total_replication_slots); + } + else if (node_info->inactive_replication_slots > 0) + { + status = CHECK_STATUS_CRITICAL; + + appendPQExpBuffer( + &details, + _("%i of %i replication slots are inactive"), + node_info->inactive_replication_slots, + node_info->total_replication_slots); + } + + switch (mode) + { + case OM_NAGIOS: + printf("PG_INACTIVE_SLOTS %s: %s\n", + output_check_status(status), + details.data); + break; + case OM_TEXT: + if (list_output != NULL) + { + check_status_list_set(list_output, + "Replication slots", + status, + details.data); + } + else + { + printf("%s (%s)\n", + output_check_status(status), + details.data); + } + default: + break; + } + + termPQExpBuffer(&details); + return status; +} + + static CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, CheckStatusList *list_output) { @@ -866,9 +956,9 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, CheckStatusList *list_outp static CheckStatus -do_node_check_replication_lag(PGconn 
*conn, OutputMode mode, CheckStatusList *list_output) +do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { - CheckStatus status = CHECK_STATUS_UNKNOWN; + CheckStatus status = CHECK_STATUS_OK; int lag_seconds = 0; PQExpBufferData details; @@ -881,109 +971,135 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, CheckStatusList *li initPQExpBuffer(&details); - lag_seconds = get_replication_lag_seconds(conn); - - log_debug("lag seconds: %i", lag_seconds); - - if (lag_seconds >= config_file_options.replication_lag_critical) + if (node_info->recovery_type == RECTYPE_PRIMARY) { - status = CHECK_STATUS_CRITICAL; - switch (mode) { case OM_OPTFORMAT: appendPQExpBuffer( &details, - "--lag=%i --threshold=%i", - lag_seconds, config_file_options.replication_lag_critical); + "--lag=0"); break; case OM_NAGIOS: appendPQExpBuffer( &details, - "%i seconds (critical: %i)", - lag_seconds, config_file_options.replication_lag_critical); + "0 seconds"); break; case OM_TEXT: appendPQExpBuffer( &details, - "%i seconds, threshold: %i)", - lag_seconds, config_file_options.replication_lag_critical); + "N/A - node is primary"); break; - - default: - break; - } - } - else if (lag_seconds > config_file_options.replication_lag_warning) - { - status = CHECK_STATUS_WARNING; - - switch (mode) - { - case OM_OPTFORMAT: - appendPQExpBuffer( - &details, - "--lag=%i --threshold=%i", - lag_seconds, config_file_options.replication_lag_warning); - break; - case OM_NAGIOS: - appendPQExpBuffer( - &details, - "%i seconds (warning: %i)", - lag_seconds, config_file_options.replication_lag_warning); - break; - case OM_TEXT: - appendPQExpBuffer( - &details, - "%i seconds, threshold: %i)", - lag_seconds, config_file_options.replication_lag_warning); - break; - - default: - break; - } - } - else if (lag_seconds < 0) - { - status = CHECK_STATUS_UNKNOWN; - - switch (mode) - { - case OM_OPTFORMAT: - break; - case OM_NAGIOS: - case OM_TEXT: - 
appendPQExpBuffer( - &details, - "unable to query replication lag"); - break; - default: break; } } else { - status = CHECK_STATUS_OK; + lag_seconds = get_replication_lag_seconds(conn); - switch (mode) + log_debug("lag seconds: %i", lag_seconds); + + if (lag_seconds >= config_file_options.replication_lag_critical) { - case OM_OPTFORMAT: - appendPQExpBuffer( - &details, - "--lag=%i", - lag_seconds); - break; - case OM_NAGIOS: - case OM_TEXT: - appendPQExpBuffer( - &details, - "%i seconds", - lag_seconds); - break; + status = CHECK_STATUS_CRITICAL; - default: - break; + switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( + &details, + "--lag=%i --threshold=%i", + lag_seconds, config_file_options.replication_lag_critical); + break; + case OM_NAGIOS: + appendPQExpBuffer( + &details, + "%i seconds (critical: %i)", + lag_seconds, config_file_options.replication_lag_critical); + break; + case OM_TEXT: + appendPQExpBuffer( + &details, + "%i seconds, threshold: %i)", + lag_seconds, config_file_options.replication_lag_critical); + break; + + default: + break; + } + } + else if (lag_seconds > config_file_options.replication_lag_warning) + { + status = CHECK_STATUS_WARNING; + + switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( + &details, + "--lag=%i --threshold=%i", + lag_seconds, config_file_options.replication_lag_warning); + break; + case OM_NAGIOS: + appendPQExpBuffer( + &details, + "%i seconds (warning: %i)", + lag_seconds, config_file_options.replication_lag_warning); + break; + case OM_TEXT: + appendPQExpBuffer( + &details, + "%i seconds, threshold: %i)", + lag_seconds, config_file_options.replication_lag_warning); + break; + + default: + break; + } + } + else if (lag_seconds < 0) + { + status = CHECK_STATUS_UNKNOWN; + + switch (mode) + { + case OM_OPTFORMAT: + break; + case OM_NAGIOS: + case OM_TEXT: + appendPQExpBuffer( + &details, + "unable to query replication lag"); + break; + + default: + break; + } + } + else + { + status = CHECK_STATUS_OK; + + 
switch (mode) + { + case OM_OPTFORMAT: + appendPQExpBuffer( + &details, + "--lag=%i", + lag_seconds); + break; + case OM_NAGIOS: + case OM_TEXT: + appendPQExpBuffer( + &details, + "%i seconds", + lag_seconds); + break; + + default: + break; + } } } @@ -1072,16 +1188,23 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou appendPQExpBuffer( &details, - "%i of %i downstream nodes not attached (missing: ", + "%i of %i downstream nodes not attached", missing_nodes_count, downstream_nodes.node_count); + if (mode == OM_NAGIOS) + appendPQExpBuffer( + &details, " (missing: "); + else + appendPQExpBuffer( + &details, "; missing: "); + for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next) { if (first == false) appendPQExpBuffer( &details, - ","); + ", "); else first = false; @@ -1090,6 +1213,10 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou &details, "%s", missing_cell->string); } + + if (mode == OM_NAGIOS) + appendPQExpBufferChar( + &details, ')'); } switch (mode) @@ -1118,7 +1245,7 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou } termPQExpBuffer(&details); - + clear_node_info_list(&downstream_nodes); return status; } @@ -1840,5 +1967,3 @@ copy_file(const char *src_file, const char *dest_file) return true; } - - diff --git a/repmgr-client-global.h b/repmgr-client-global.h index 54aefb20..bcb51b8f 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -88,6 +88,7 @@ typedef struct bool downstream; bool replication_lag; bool role; + bool slots; /* "node join" options */ char config_files[MAXLEN]; @@ -133,7 +134,7 @@ typedef struct /* "node status" options */ \ false, \ /* "node check" options */ \ - false, false, false, false,\ + false, false, false, false, false, \ /* "node join" options */ \ "", \ /* "node service" options */ \ diff --git a/repmgr-client.c b/repmgr-client.c index ac5007f2..b536f25f 100644 --- 
a/repmgr-client.c +++ b/repmgr-client.c @@ -438,6 +438,10 @@ main(int argc, char **argv) runtime_options.role = true; break; + case OPT_SLOTS: + runtime_options.slots = true; + break; + /* "node join" options * * ------------------- */ case OPT_CONFIG_FILES: diff --git a/repmgr-client.h b/repmgr-client.h index b4df881b..db01dcf2 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -72,6 +72,7 @@ #define OPT_SIBLINGS_FOLLOW 1036 #define OPT_ROLE 1037 #define OPT_DOWNSTREAM 1038 +#define OPT_SLOTS 1039 /* deprecated since 3.3 */ #define OPT_DATA_DIR 999 #define OPT_NO_CONNINFO_PASSWORD 998 @@ -143,6 +144,7 @@ static struct option long_options[] = {"downstream", no_argument, NULL, OPT_DOWNSTREAM }, {"replication-lag", no_argument, NULL, OPT_REPLICATION_LAG }, {"role", no_argument, NULL, OPT_ROLE }, + {"slots", no_argument, NULL, OPT_SLOTS }, /* "node join" options */ {"config-files", required_argument, NULL, OPT_CONFIG_FILES },