Add replication slot check to "repmgr node check"

This commit is contained in:
Ian Barwick
2017-08-16 11:17:02 +09:00
parent 3e9ce6fe38
commit 4c0d719cdb
7 changed files with 276 additions and 103 deletions

View File

@@ -10,7 +10,13 @@ operations.
`repmgr 4` is a complete rewrite of the existing `repmgr` codebase.
Supports PostgreSQL 9.5 and later; support for PostgreSQL 9.3 and 9.4 has been
dropped. To use `repmgr 4` with BDR 2.0, PostgreSQL 9.6 is required.
dropped. Please continue to use repmgr 3.x for those versions.
### BDR support
`repmgr 4` supports monitoring of a two-node BDR 2.0 cluster. PostgreSQL 9.6 is
required for BDR 2.0. Note that BDR 2.0 is not publicly available; please contact
2ndQuadrant for details. `repmgr 4` will support future public BDR releases.
Building from source
--------------------
@@ -67,6 +73,36 @@ The following commands are available:
if the configuration file on each sibling is the same path as specified
in -f/--config-file or -C/--remote-config-file.
* `node status`
* `node check`
Performs some health checks on a node from a replication perspective.
Sample output (execute `repmgr node check`):
Node "node1":
Server role: OK (node is primary)
Replication lag: OK (N/A - node is primary)
WAL archiving: OK (0 pending files)
Downstream servers: OK (2 of 2 downstream nodes attached)
Replication slots: OK (node has no replication slots)
Additionally each check can be performed individually by supplying
an additional command line parameter, e.g.:
$ repmgr node check --role
OK (node is primary)
Parameters for individual checks are as follows:
* `--role`: checks if the node has the expected role
* `--replication-lag`: checks if the node is lagging by more than
`replication_lag_warning` or `replication_lag_critical` seconds.
* `--archiver`: checks for WAL files which have not yet been archived
* `--downstream`: checks that the expected downstream nodes are attached
* `--slots`: checks there are no inactive replication slots
* `cluster show`
Displays information about each active node in the replication cluster. This

View File

@@ -2400,8 +2400,10 @@ get_node_replication_stats(PGconn *conn, t_node_info *node_info)
" SELECT current_setting('max_wal_senders')::INT AS max_wal_senders, "
" (SELECT COUNT(*) FROM pg_catalog.pg_stat_replication) AS attached_wal_receivers, "
" current_setting('max_replication_slots')::INT AS max_replication_slots, "
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, "
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = TRUE) AS active_replication_slots, "
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = FALSE) AS inactive_replication_slots ");
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active = FALSE) AS inactive_replication_slots, "
" pg_catalog.pg_is_in_recovery() AS in_recovery");
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
@@ -2417,8 +2419,10 @@ get_node_replication_stats(PGconn *conn, t_node_info *node_info)
node_info->max_wal_senders = atoi(PQgetvalue(res, 0, 0));
node_info->attached_wal_receivers = atoi(PQgetvalue(res, 0, 1));
node_info->max_replication_slots = atoi(PQgetvalue(res, 0, 2));
node_info->active_replication_slots = atoi(PQgetvalue(res, 0, 3));
node_info->inactive_replication_slots = atoi(PQgetvalue(res, 0, 4));
node_info->total_replication_slots = atoi(PQgetvalue(res, 0, 3));
node_info->active_replication_slots = atoi(PQgetvalue(res, 0, 4));
node_info->inactive_replication_slots = atoi(PQgetvalue(res, 0, 5));
node_info->recovery_type = strcmp(PQgetvalue(res, 0, 6), "f") == 0 ? RECTYPE_PRIMARY : RECTYPE_STANDBY;
PQclear(res);

View File

@@ -102,6 +102,7 @@ typedef struct s_node_info
int max_wal_senders;
int attached_wal_receivers;
int max_replication_slots;
int total_replication_slots;
int active_replication_slots;
int inactive_replication_slots;
} t_node_info;
@@ -130,7 +131,7 @@ typedef struct s_node_info
/* for ad-hoc use e.g. when working with a list of nodes */ \
"", true, true \
/* various statistics */ \
-1, -1, -1, -1, -1 \
-1, -1, -1, -1, -1, -1 \
}

View File

@@ -29,11 +29,11 @@ static void _do_node_status_is_shutdown(void);
static void _do_node_archive_config(void);
static void _do_node_restore_config(void);
static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
static CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
void
do_node_status(void)
@@ -536,6 +536,9 @@ do_node_check(void)
exit(ERR_BAD_CONFIG);
}
/* add replication statistics to node record */
get_node_replication_stats(conn, &node_info);
/* handle specific checks
* ====================== */
if (runtime_options.archiver == true)
@@ -545,9 +548,17 @@ do_node_check(void)
return;
}
if (runtime_options.downstream == true)
{
(void) do_node_check_downstream(conn, runtime_options.output_mode, NULL);
PQfinish(conn);
return;
}
if (runtime_options.replication_lag == true)
{
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, NULL);
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, NULL);
PQfinish(conn);
return;
}
@@ -559,20 +570,24 @@ do_node_check(void)
return;
}
if (runtime_options.downstream == true)
if (runtime_options.slots == true)
{
(void) do_node_check_downstream(conn, runtime_options.output_mode, NULL);
(void) do_node_check_slots(conn, runtime_options.output_mode, &node_info, NULL);
PQfinish(conn);
return;
}
/* output general overview */
initPQExpBuffer(&output);
/* order functions are called is also output order */
(void) do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list);
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, &status_list);
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list);
(void) do_node_check_archiver(conn, runtime_options.output_mode, &status_list);
(void) do_node_check_downstream(conn, runtime_options.output_mode, &status_list);
(void) do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list);
if (runtime_options.output_mode == OM_CSV)
{
@@ -640,6 +655,12 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS
&details,
_("node is registered as primary but running as standby"));
}
else
{
appendPQExpBuffer(
&details,
_("node is primary"));
}
break;
case STANDBY:
if (recovery_type == RECTYPE_PRIMARY)
@@ -649,6 +670,12 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS
&details,
_("node is registered as standby but running as primary"));
}
else
{
appendPQExpBuffer(
&details,
_("node is standby"));
}
break;
case BDR:
{
@@ -696,10 +723,9 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS
}
else
{
printf("%s", output_check_status(status));
if (strlen(details.data))
printf(" (%s)", details.data);
puts("");
printf("%s (%s)\n",
output_check_status(status),
details.data);
}
default:
break;
@@ -710,6 +736,70 @@ do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckS
}
/*
 * do_node_check_slots()
 *
 * Report on the node's replication slots, based on the slot counts already
 * present in "node_info" (total_replication_slots and
 * inactive_replication_slots); no query is issued here and "conn" is not
 * used by this function.
 *
 * Returns CHECK_STATUS_CRITICAL if one or more slots are inactive, otherwise
 * CHECK_STATUS_OK.
 *
 * Output depends on "mode":
 *  - OM_NAGIOS: a single "PG_INACTIVE_SLOTS ..." line is printed
 *  - OM_TEXT:   the result is added to "list_output" if one was provided,
 *               otherwise it is printed as "STATUS (details)"
 *  - any other mode: nothing is emitted
 */
static CheckStatus
do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
{
	const int	slots_total = node_info->total_replication_slots;
	const int	slots_inactive = node_info->inactive_replication_slots;
	CheckStatus result = CHECK_STATUS_OK;
	PQExpBufferData detail_buf;

	initPQExpBuffer(&detail_buf);

	/*
	 * NOTE(review): if the counts are negative (e.g. the statistics were
	 * never populated), none of the branches below match, so the status
	 * stays OK and the detail text is empty - confirm whether this should
	 * be reported as an unknown state instead.
	 */
	if (slots_total != 0 && slots_inactive > 0)
	{
		/* at least one existing slot is not currently in use */
		result = CHECK_STATUS_CRITICAL;

		appendPQExpBuffer(
			&detail_buf,
			_("%i of %i replication slots are inactive"),
			slots_inactive,
			slots_total);
	}
	else if (slots_total != 0 && slots_inactive == 0)
	{
		/* every slot is active, so the total doubles as the active count */
		appendPQExpBuffer(
			&detail_buf,
			_("%i of %i replication slots are active"),
			slots_total,
			slots_total);
	}
	else if (slots_total == 0)
	{
		appendPQExpBuffer(
			&detail_buf,
			_("node has no replication slots"));
	}

	if (mode == OM_NAGIOS)
	{
		printf("PG_INACTIVE_SLOTS %s: %s\n",
			   output_check_status(result),
			   detail_buf.data);
	}
	else if (mode == OM_TEXT)
	{
		if (list_output == NULL)
		{
			/* individual check requested: print the result directly */
			printf("%s (%s)\n",
				   output_check_status(result),
				   detail_buf.data);
		}
		else
		{
			/* part of the general overview: let the caller format it */
			check_status_list_set(list_output,
								  "Replication slots",
								  result,
								  detail_buf.data);
		}
	}
	/* all other output modes intentionally produce no output */

	termPQExpBuffer(&detail_buf);

	return result;
}
static CheckStatus
do_node_check_archiver(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
{
@@ -866,9 +956,9 @@ do_node_check_archiver(PGconn *conn, OutputMode mode, CheckStatusList *list_outp
static CheckStatus
do_node_check_replication_lag(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
{
CheckStatus status = CHECK_STATUS_UNKNOWN;
CheckStatus status = CHECK_STATUS_OK;
int lag_seconds = 0;
PQExpBufferData details;
@@ -881,109 +971,135 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, CheckStatusList *li
initPQExpBuffer(&details);
lag_seconds = get_replication_lag_seconds(conn);
log_debug("lag seconds: %i", lag_seconds);
if (lag_seconds >= config_file_options.replication_lag_critical)
if (node_info->recovery_type == RECTYPE_PRIMARY)
{
status = CHECK_STATUS_CRITICAL;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(
&details,
"--lag=%i --threshold=%i",
lag_seconds, config_file_options.replication_lag_critical);
"--lag=0");
break;
case OM_NAGIOS:
appendPQExpBuffer(
&details,
"%i seconds (critical: %i)",
lag_seconds, config_file_options.replication_lag_critical);
"0 seconds");
break;
case OM_TEXT:
appendPQExpBuffer(
&details,
"%i seconds, threshold: %i)",
lag_seconds, config_file_options.replication_lag_critical);
"N/A - node is primary");
break;
default:
break;
}
}
else if (lag_seconds > config_file_options.replication_lag_warning)
{
status = CHECK_STATUS_WARNING;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(
&details,
"--lag=%i --threshold=%i",
lag_seconds, config_file_options.replication_lag_warning);
break;
case OM_NAGIOS:
appendPQExpBuffer(
&details,
"%i seconds (warning: %i)",
lag_seconds, config_file_options.replication_lag_warning);
break;
case OM_TEXT:
appendPQExpBuffer(
&details,
"%i seconds, threshold: %i)",
lag_seconds, config_file_options.replication_lag_warning);
break;
default:
break;
}
}
else if (lag_seconds < 0)
{
status = CHECK_STATUS_UNKNOWN;
switch (mode)
{
case OM_OPTFORMAT:
break;
case OM_NAGIOS:
case OM_TEXT:
appendPQExpBuffer(
&details,
"unable to query replication lag");
break;
default:
break;
}
}
else
{
status = CHECK_STATUS_OK;
lag_seconds = get_replication_lag_seconds(conn);
switch (mode)
log_debug("lag seconds: %i", lag_seconds);
if (lag_seconds >= config_file_options.replication_lag_critical)
{
case OM_OPTFORMAT:
appendPQExpBuffer(
&details,
"--lag=%i",
lag_seconds);
break;
case OM_NAGIOS:
case OM_TEXT:
appendPQExpBuffer(
&details,
"%i seconds",
lag_seconds);
break;
status = CHECK_STATUS_CRITICAL;
default:
break;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(
&details,
"--lag=%i --threshold=%i",
lag_seconds, config_file_options.replication_lag_critical);
break;
case OM_NAGIOS:
appendPQExpBuffer(
&details,
"%i seconds (critical: %i)",
lag_seconds, config_file_options.replication_lag_critical);
break;
case OM_TEXT:
appendPQExpBuffer(
&details,
"%i seconds, threshold: %i)",
lag_seconds, config_file_options.replication_lag_critical);
break;
default:
break;
}
}
else if (lag_seconds > config_file_options.replication_lag_warning)
{
status = CHECK_STATUS_WARNING;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(
&details,
"--lag=%i --threshold=%i",
lag_seconds, config_file_options.replication_lag_warning);
break;
case OM_NAGIOS:
appendPQExpBuffer(
&details,
"%i seconds (warning: %i)",
lag_seconds, config_file_options.replication_lag_warning);
break;
case OM_TEXT:
appendPQExpBuffer(
&details,
"%i seconds, threshold: %i)",
lag_seconds, config_file_options.replication_lag_warning);
break;
default:
break;
}
}
else if (lag_seconds < 0)
{
status = CHECK_STATUS_UNKNOWN;
switch (mode)
{
case OM_OPTFORMAT:
break;
case OM_NAGIOS:
case OM_TEXT:
appendPQExpBuffer(
&details,
"unable to query replication lag");
break;
default:
break;
}
}
else
{
status = CHECK_STATUS_OK;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(
&details,
"--lag=%i",
lag_seconds);
break;
case OM_NAGIOS:
case OM_TEXT:
appendPQExpBuffer(
&details,
"%i seconds",
lag_seconds);
break;
default:
break;
}
}
}
@@ -1072,16 +1188,23 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou
appendPQExpBuffer(
&details,
"%i of %i downstream nodes not attached (missing: ",
"%i of %i downstream nodes not attached",
missing_nodes_count,
downstream_nodes.node_count);
if (mode == OM_NAGIOS)
appendPQExpBuffer(
&details, " (missing: ");
else
appendPQExpBuffer(
&details, "; missing: ");
for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
{
if (first == false)
appendPQExpBuffer(
&details,
",");
", ");
else
first = false;
@@ -1090,6 +1213,10 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou
&details,
"%s", missing_cell->string);
}
if (mode == OM_NAGIOS)
appendPQExpBufferChar(
&details, ')');
}
switch (mode)
@@ -1118,7 +1245,7 @@ do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_ou
}
termPQExpBuffer(&details);
clear_node_info_list(&downstream_nodes);
return status;
}
@@ -1840,5 +1967,3 @@ copy_file(const char *src_file, const char *dest_file)
return true;
}

View File

@@ -88,6 +88,7 @@ typedef struct
bool downstream;
bool replication_lag;
bool role;
bool slots;
/* "node join" options */
char config_files[MAXLEN];
@@ -133,7 +134,7 @@ typedef struct
/* "node status" options */ \
false, \
/* "node check" options */ \
false, false, false, false,\
false, false, false, false, false, \
/* "node join" options */ \
"", \
/* "node service" options */ \

View File

@@ -438,6 +438,10 @@ main(int argc, char **argv)
runtime_options.role = true;
break;
case OPT_SLOTS:
runtime_options.slots = true;
break;
/* "node join" options *
* ------------------- */
case OPT_CONFIG_FILES:

View File

@@ -72,6 +72,7 @@
#define OPT_SIBLINGS_FOLLOW 1036
#define OPT_ROLE 1037
#define OPT_DOWNSTREAM 1038
#define OPT_SLOTS 1039
/* deprecated since 3.3 */
#define OPT_DATA_DIR 999
#define OPT_NO_CONNINFO_PASSWORD 998
@@ -143,6 +144,7 @@ static struct option long_options[] =
{"downstream", no_argument, NULL, OPT_DOWNSTREAM },
{"replication-lag", no_argument, NULL, OPT_REPLICATION_LAG },
{"role", no_argument, NULL, OPT_ROLE },
{"slots", no_argument, NULL, OPT_SLOTS },
/* "node join" options */
{"config-files", required_argument, NULL, OPT_CONFIG_FILES },