From ae60caacddc254b78fbfe94e10a6bccd7c0e5cff Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 5 Jul 2018 14:31:06 +0900 Subject: [PATCH] repmgr: make "node check" and "node status" return ERR_NODE_STATUS when appropriate If any issue is detected (and "node check" is not being executed with a specific individual check), "ERR_NODE_STATUS" is returned. --- HISTORY | 30 +++++++------ doc/appendix-release-notes.sgml | 18 ++++++-- doc/repmgr-cluster-show.sgml | 7 +++ doc/repmgr-node-check.sgml | 80 ++++++++++++++++++++++++++++++++- doc/repmgr-node-status.sgml | 3 +- repmgr-action-node.c | 35 ++++++++++++--- 6 files changed, 147 insertions(+), 26 deletions(-) diff --git a/HISTORY b/HISTORY index a853e75f..a7996d6e 100644 --- a/HISTORY +++ b/HISTORY @@ -1,28 +1,30 @@ 4.1.0 2018-??-?? - repmgr: add "--missing-slots" check to "repmgr node check" (Ian) + repmgr: add "--missing-slots" check to "repmgr node check" (Ian) repmgr: improve command line error handling; GitHub #464 (Ian) repmgr: fix "standby register --wait-sync" when no timeout provided (Ian) repmgr: "cluster show" returns non-zero value if an issue encountered; GitHub #456 (Ian) + repmgr: "node check" and "node status" return non-zero value if an issue + encountered (Ian) repmgr: "node status" returns non-zero value if an issue encountered (Ian) repmgrd: create a PID file by default; GitHub #457 (Ian) repmgrd: daemonize process by default; GitHub #458 (Ian) 4.0.6 2018-06-14 repmgr: (witness register) prevent registration of a witness server with the - same name as an existing node (Ian) - repmgr: (standby follow) check node has actually connected to new primary - before reporting success; GitHub #444 (Ian) - repmgr: (standby clone) improve handling of external configuration file copying, - including consideration in --dry-run check; GitHub #443 (Ian) - repmgr: (standby clone) don't require presence of "user" parameter in - conninfo string; GitHub #437 (Ian) - repmgr: (standby clone) improve documentation of 
--recovery-conf-only - mode; GitHub #438 (Ian) - repmgr: (node rejoin) fix bug when parsing --config-files parameter; - GitHub #442 (Ian) - repmgr: when using --dry-run, force log level to INFO to ensure output - will always be displayed; GitHub #441 (Ian) + same name as an existing node (Ian) + repmgr: (standby follow) check node has actually connected to new primary + before reporting success; GitHub #444 (Ian) + repmgr: (standby clone) improve handling of external configuration file copying, + including consideration in --dry-run check; GitHub #443 (Ian) + repmgr: (standby clone) don't require presence of "user" parameter in + conninfo string; GitHub #437 (Ian) + repmgr: (standby clone) improve documentation of --recovery-conf-only + mode; GitHub #438 (Ian) + repmgr: (node rejoin) fix bug when parsing --config-files parameter; + GitHub #442 (Ian) + repmgr: when using --dry-run, force log level to INFO to ensure output + will always be displayed; GitHub #441 (Ian) repmgr: (cluster matrix/crosscheck) return non-zero exit code if node connection issues detected; GitHub #447 (Ian) repmgrd: ensure local node is counted as quorum member; GitHub #439 (Ian) diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index f8049e92..da3bcb90 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -34,11 +34,11 @@ - repmgrd enhancements + repmgr enhancements - + repmgr: always exit with an error if an unrecognised command line option is provided. This matches the behaviour of other PostgreSQL @@ -48,10 +48,20 @@ - repmgr cluster-show - returns non-zero exit code if node status issues detected (GitHub #456). + repmgr cluster-show, + repmgr node check and + repmgr node status + return non-zero exit code if node status issues detected. (GitHub #456). 
+ + + + + + repmgrd enhancements + + diff --git a/doc/repmgr-cluster-show.sgml b/doc/repmgr-cluster-show.sgml index 227108e3..50fcad45 100644 --- a/doc/repmgr-cluster-show.sgml +++ b/doc/repmgr-cluster-show.sgml @@ -142,4 +142,11 @@ + + See also + + , + + + diff --git a/doc/repmgr-node-check.sgml b/doc/repmgr-node-check.sgml index 9a80e949..eed85577 100644 --- a/doc/repmgr-node-check.sgml +++ b/doc/repmgr-node-check.sgml @@ -61,7 +61,9 @@ - --archive-ready: checks for WAL files which have not yet been archived + --archive-ready: checks for WAL files which have not yet been archived, + and returns WARNING or CRITICAL if the number + exceeds archive_ready_warning or archive_ready_critical respectively. @@ -107,4 +109,80 @@ + + + Exit codes + + + When executing repmgr node check with one of the individual + checks listed above, &repmgr; will emit one of the following Nagios-style exit codes + (even if --nagios is not supplied): + + + + + + 0: OK + + + + + + 1: WARNING + + + + + + 2: CRITICAL + + + + + + 3: UNKNOWN + + + + + + + + + + The following exit codes can be emitted by repmgr node check + if no individual check was specified. + + + + + + + + No issues were detected. + + + + + + + + + One or more issues were detected. + + + + + + + + + + + See also + + , + + + diff --git a/doc/repmgr-node-status.sgml b/doc/repmgr-node-status.sgml index d85f819b..76cdf6b2 100644 --- a/doc/repmgr-node-status.sgml +++ b/doc/repmgr-node-status.sgml @@ -84,7 +84,8 @@ See also - See to diagnose issues. + See to diagnose issues and + for an overview of all nodes in the cluster. diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 88493343..0950fbe8 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -170,11 +170,17 @@ do_node_status(void) } else { + /* "archive_mode" is not "off", i.e. 
one of "on", "always" */ bool enabled = true; PQExpBufferData archiving_status; char archive_command[MAXLEN] = ""; initPQExpBuffer(&archiving_status); + + /* + * if the node is a standby, and "archive_mode" is "on", archiving will + * actually be disabled. + */ if (recovery_type == RECTYPE_STANDBY) { if (guc_set(conn, "archive_mode", "=", "on")) @@ -642,6 +648,7 @@ do_node_check(void) CheckStatusList status_list = {NULL, NULL}; CheckStatusListCell *cell = NULL; + bool issue_detected = false; /* for internal use */ if (runtime_options.has_passfile == true) @@ -750,12 +757,23 @@ do_node_check(void) initPQExpBuffer(&output); /* order functions are called is also output order */ - (void) do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list); - (void) do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list); - (void) do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list); - (void) do_node_check_downstream(conn, runtime_options.output_mode, &status_list); - (void) do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list); - (void) do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list); + if (do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) + issue_detected = true; + + if (do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) + issue_detected = true; + + if (do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK) + issue_detected = true; + + if (do_node_check_downstream(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK) + issue_detected = true; + + if (do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) + issue_detected = true; + + if (do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list) 
!= CHECK_STATUS_OK) + issue_detected = true; if (runtime_options.output_mode == OM_CSV) { @@ -812,6 +830,11 @@ do_node_check(void) check_status_list_free(&status_list); PQfinish(conn); + + if (issue_detected == true) + { + exit(ERR_NODE_STATUS); + } }