From 3870768d80cdf2c9eaadf4a5a602a38541f89f5a Mon Sep 17 00:00:00 2001
From: Ian Barwick
Date: Mon, 27 Sep 2021 17:46:33 +0900
Subject: [PATCH] Add --repmgrd option to "repmgr node check"

This provides a simple way for checking whether the node's repmgrd
is running.

GitHub #719.
---
 dbutils.c                      | 49 ++++++++++++++++
 dbutils.h                      |  3 ++
 doc/appendix-release-notes.xml |  8 ++++
 doc/repmgr-node-check.xml      | 21 ++++++++-
 doc/repmgrd-configuration.xml  | 23 ++++++++++
 repmgr-action-node.c           | 81 +++++++++++++++++++++++++++++++++-
 repmgr-client-global.h         |  3 +-
 repmgr-client.c                |  4 ++
 repmgr-client.h                |  2 +
 strutil.c                      |  1 -
 10 files changed, 190 insertions(+), 5 deletions(-)

diff --git a/dbutils.c b/dbutils.c
index 64e438ee..d8d7f404 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -6008,6 +6008,55 @@ is_wal_replay_paused(PGconn *conn, bool check_pending_wal)
 	return is_paused;
 }
 
+/* repmgrd status functions */
+
+/*
+ * Query the node for the state of its repmgrd and map it to a CheckStatus.
+ *
+ * The query returns 0 if repmgrd is running, 1 if it is running but paused
+ * and 2 if it is not running; that number is returned as-is, on the
+ * assumption that it lines up with CHECK_STATUS_OK, CHECK_STATUS_WARNING
+ * and CHECK_STATUS_CRITICAL (TODO: confirm against the enum definition).
+ *
+ * If the query cannot be executed, the error is logged and
+ * CHECK_STATUS_UNKNOWN is returned: a failed query says nothing about
+ * whether repmgrd is running, so it must not be reported as "not running".
+ */
+CheckStatus
+get_repmgrd_status(PGconn *conn)
+{
+	PQExpBufferData query;
+	PGresult *res = NULL;
+	CheckStatus repmgrd_status = CHECK_STATUS_UNKNOWN;
+
+	initPQExpBuffer(&query);
+
+	appendPQExpBufferStr(&query,
+						 " SELECT "
+						 " CASE "
+						 " WHEN repmgr.repmgrd_is_running() "
+						 " THEN "
+						 " CASE "
+						 " WHEN repmgr.repmgrd_is_paused() THEN 1 ELSE 0 "
+						 " END "
+						 " ELSE 2 "
+						 " END AS repmgrd_status");
+	res = PQexec(conn, query.data);
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_db_error(conn, query.data, _("unable to execute repmgrd status query"));
+	}
+	else
+	{
+		repmgrd_status = atoi(PQgetvalue(res, 0, 0));
+	}
+
+	termPQExpBuffer(&query);
+	PQclear(res);
+	return repmgrd_status;
+}
+
 /* miscellaneous debugging functions */
 
 
diff --git a/dbutils.h b/dbutils.h
index 62c21d9e..e2783cea 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -602,6 +602,9 @@ int get_upstream_last_seen(PGconn *conn, t_server_type node_type);
 
 bool is_wal_replay_paused(PGconn *conn, bool check_pending_wal);
 
+/* repmgrd status functions */
+CheckStatus get_repmgrd_status(PGconn *conn);
+
 /* miscellaneous
debugging functions */ const char *print_node_status(NodeStatus node_status); const char *print_pqping_status(PGPing ping_status); diff --git a/doc/appendix-release-notes.xml b/doc/appendix-release-notes.xml index b8f88db0..b2139ed5 100644 --- a/doc/appendix-release-notes.xml +++ b/doc/appendix-release-notes.xml @@ -69,6 +69,14 @@ This makes it clearer what &repmgr; is trying to do. + + + + repmgr node check: + option --repmgrd added to check &repmgrd; + status. + + diff --git a/doc/repmgr-node-check.xml b/doc/repmgr-node-check.xml index 55f88cea..63f152d5 100644 --- a/doc/repmgr-node-check.xml +++ b/doc/repmgr-node-check.xml @@ -125,12 +125,29 @@ is correctly configured. - - + + + repmgrd + + A separate check is available to verify whether &repmgrd; is running. + This is not included in the general output, as this does not + per se constitute a check of the node's replication status. + + + + + --repmgrd: checks whether &repmgrd; is running. + If &repmgrd; is running but paused, status 1 + (WARNING) is returned. + + + + + Additional checks diff --git a/doc/repmgrd-configuration.xml b/doc/repmgrd-configuration.xml index 31a4eea8..920ab6c0 100644 --- a/doc/repmgrd-configuration.xml +++ b/doc/repmgrd-configuration.xml @@ -1079,6 +1079,29 @@ REPMGRD_OPTS="--daemonize=false" + + + repmgrd daemon monitoring + + repmgrd + monitoring + + + monitoring + repmgrd + + + + The command repmgr service status + provides an overview of the &repmgrd; daemon status (including pause status) + on all nodes in the cluster. + + + From &repmgr; 5.3, repmgr node check --repmgrd + can be used to check the status of &repmgrd; (including pause status) + on the local node. 
+ + diff --git a/repmgr-action-node.c b/repmgr-action-node.c index e69d7425..f709ca63 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -35,6 +35,7 @@ static bool copy_file(const char *src_file, const char *dest_file); static void format_archive_dir(PQExpBufferData *archive_dir); static t_server_action parse_server_action(const char *action); +static const char *output_repmgrd_status(CheckStatus status); static void exit_optformat_error(const char *error, int errcode); @@ -52,9 +53,11 @@ static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); +static CheckStatus do_node_check_repmgrd(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_db_connection(PGconn *conn, OutputMode mode); + /* * NODE STATUS * @@ -941,6 +944,16 @@ do_node_check(void) exit(return_code); } + if (runtime_options.repmgrd == true) + { + return_code = do_node_check_repmgrd(conn, + runtime_options.output_mode, + &node_info, + NULL); + PQfinish(conn); + exit(return_code); + } + if (runtime_options.replication_config_owner == true) { return_code = do_node_check_replication_config_owner(conn, @@ -2024,7 +2037,6 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf return status; } - CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { @@ -2159,6 +2171,53 @@ 
do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_in
 	return status;
 }
 
+/*
+ * Fetch the repmgrd status with get_repmgrd_status() and report it in the
+ * requested output mode; the status is also returned to the caller.
+ * Exits with ERR_BAD_CONFIG if --csv is requested as a standalone check.
+ */
+CheckStatus
+do_node_check_repmgrd(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
+{
+	CheckStatus status = CHECK_STATUS_OK;
+
+	if (mode == OM_CSV && list_output == NULL)
+	{
+		log_error(_("--csv output not provided with --repmgrd option"));
+		PQfinish(conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	status = get_repmgrd_status(conn);
+	switch (mode)
+	{
+		case OM_OPTFORMAT:
+			printf("--repmgrd=%s\n", output_check_status(status));
+			break;
+		case OM_NAGIOS:
+			printf("REPMGRD %s: %s\n",
+				   output_check_status(status), output_repmgrd_status(status));
+			break;
+		case OM_CSV:
+		case OM_TEXT:
+			if (list_output != NULL)
+			{
+				check_status_list_set(list_output, "repmgrd", status,
+									  output_repmgrd_status(status));
+			}
+			else
+			{
+				printf("%s (%s)\n",
+					   output_check_status(status), output_repmgrd_status(status));
+			}
+			break;
+		default:
+			break;
+	}
+
+	return status;
+}
+
 /*
  * This is not included in the general list output
  */
@@ -3570,6 +3629,25 @@ copy_file(const char *src_file, const char *dest_file)
 }
 
 
+static const char *
+output_repmgrd_status(CheckStatus status)
+{
+	switch (status)
+	{
+		case CHECK_STATUS_OK:
+			return "repmgrd running";
+		case CHECK_STATUS_WARNING:
+			return "repmgrd running but paused";
+		case CHECK_STATUS_CRITICAL:
+			return "repmgrd not running";
+		case CHECK_STATUS_UNKNOWN:
+			return "repmgrd status unknown";
+	}
+
+	return "UNKNOWN";
+}
+
+
 void
 do_node_help(void)
 {
@@ -3612,6 +3690,7 @@ do_node_help(void)
 	printf(_(" --role check node has expected role\n"));
 	printf(_(" --slots check for inactive replication slots\n"));
 	printf(_(" --missing-slots check for missing replication slots\n"));
+	printf(_(" --repmgrd check if repmgrd is running\n"));
 	printf(_(" --data-directory-config check repmgr's data directory configuration\n"));
 
 	puts("");
diff --git a/repmgr-client-global.h b/repmgr-client-global.h
index fe9a3968..c16bb9ea
100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -120,6 +120,7 @@ typedef struct bool missing_slots; bool has_passfile; bool replication_connection; + bool repmgrd; bool data_directory_config; bool replication_config_owner; bool db_connection; @@ -175,7 +176,7 @@ typedef struct /* "node status" options */ \ false, \ /* "node check" options */ \ - false, false, false, false, false, false, false, false, false, false, false, false, \ + false, false, false, false, false, false, false, false, false, false, false, false, false, \ /* "node rejoin" options */ \ "", \ /* "node service" options */ \ diff --git a/repmgr-client.c b/repmgr-client.c index acea3524..fb554a73 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -549,6 +549,10 @@ main(int argc, char **argv) runtime_options.data_directory_config = true; break; + case OPT_REPMGRD: + runtime_options.repmgrd = true; + break; + case OPT_REPLICATION_CONFIG_OWNER: runtime_options.replication_config_owner = true; break; diff --git a/repmgr-client.h b/repmgr-client.h index 8488b640..8ce3451b 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -100,6 +100,7 @@ #define OPT_DB_CONNECTION 1047 #define OPT_VERIFY_BACKUP 1048 #define OPT_RECOVERY_MIN_APPLY_DELAY 1049 +#define OPT_REPMGRD 1050 /* These options are for internal use only */ #define OPT_CONFIG_ARCHIVE_DIR 2001 @@ -193,6 +194,7 @@ static struct option long_options[] = {"role", no_argument, NULL, OPT_ROLE}, {"slots", no_argument, NULL, OPT_SLOTS}, {"missing-slots", no_argument, NULL, OPT_MISSING_SLOTS}, + {"repmgrd", no_argument, NULL, OPT_REPMGRD}, {"has-passfile", no_argument, NULL, OPT_HAS_PASSFILE}, {"replication-connection", no_argument, NULL, OPT_REPL_CONN}, {"data-directory-config", no_argument, NULL, OPT_DATA_DIRECTORY_CONFIG}, diff --git a/strutil.c b/strutil.c index c9b1e462..a9f37f74 100644 --- a/strutil.c +++ b/strutil.c @@ -369,7 +369,6 @@ check_status_list_free(CheckStatusList *list) } - const char * output_check_status(CheckStatus 
status) {