From 554673e83eaa18e68ad808d04e4aa4159ffd8eba Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 15 Aug 2017 15:50:46 +0900 Subject: [PATCH] Add "repmgr node check --downstream" --- controldata.c | 10 ++-- controldata.h | 2 - dbutils.c | 47 ++++++++++++++++++ dbutils.h | 4 +- repmgr-action-node.c | 106 ++++++++++++++++++++++++++++++++++++++++- repmgr-client-global.h | 3 +- repmgr-client.c | 6 ++- repmgr-client.h | 2 + 8 files changed, 169 insertions(+), 11 deletions(-) diff --git a/controldata.c b/controldata.c index 312b0c9a..85e4be2b 100644 --- a/controldata.c +++ b/controldata.c @@ -20,7 +20,7 @@ static ControlFileInfo *get_controlfile(const char *DataDir); uint64 get_system_identifier(const char *data_directory) { - ControlFileInfo *control_file_info = T_CONTROLFILEINFO_INITIALIZER; + ControlFileInfo *control_file_info = NULL; uint64 system_identifier = UNKNOWN_SYSTEM_IDENTIFIER; control_file_info = get_controlfile(data_directory); @@ -39,7 +39,7 @@ get_system_identifier(const char *data_directory) DBState get_db_state(const char *data_directory) { - ControlFileInfo *control_file_info = T_CONTROLFILEINFO_INITIALIZER; + ControlFileInfo *control_file_info = NULL; DBState state; control_file_info = get_controlfile(data_directory); @@ -60,7 +60,7 @@ get_db_state(const char *data_directory) extern XLogRecPtr get_latest_checkpoint_location(const char *data_directory) { - ControlFileInfo *control_file_info = T_CONTROLFILEINFO_INITIALIZER; + ControlFileInfo *control_file_info = NULL; XLogRecPtr checkPoint = InvalidXLogRecPtr; control_file_info = get_controlfile(data_directory); @@ -80,7 +80,7 @@ get_latest_checkpoint_location(const char *data_directory) int get_data_checksum_version(const char *data_directory) { - ControlFileInfo *control_file_info = T_CONTROLFILEINFO_INITIALIZER; + ControlFileInfo *control_file_info = NULL; int data_checksum_version = -1; control_file_info = get_controlfile(data_directory); @@ -132,7 +132,7 @@ describe_db_state(DBState state) static ControlFileInfo * get_controlfile(const char *DataDir) { - ControlFileInfo *control_file_info = T_CONTROLFILEINFO_INITIALIZER; + ControlFileInfo *control_file_info; int fd; char ControlFilePath[MAXPGPATH] = ""; diff --git a/controldata.h b/controldata.h index 8585029f..e3ca9e21 100644 --- a/controldata.h +++ b/controldata.h @@ -18,8 +18,6 @@ typedef struct ControlFileData *control_file; } ControlFileInfo; -#define T_CONTROLFILEINFO_INITIALIZER { false, NULL } - extern DBState get_db_state(const char *data_directory); extern const char * describe_db_state(DBState state); extern int get_data_checksum_version(const char *data_directory); diff --git a/dbutils.c b/dbutils.c index 3661c3f5..085996a5 100644 --- a/dbutils.c +++ b/dbutils.c @@ -2426,6 +2426,53 @@ get_node_replication_stats(PGconn *conn, t_node_info *node_info) } +bool +is_downstream_node_attached(PGconn *conn, char *node_name) +{ + PQExpBufferData query; + PGresult *res = NULL; + int c = 0; + + initPQExpBuffer(&query); + + appendPQExpBuffer( + &query, + " SELECT COUNT(*) FROM pg_catalog.pg_stat_replication " + " WHERE application_name = '%s'", + node_name); + res = PQexec(conn, query.data); + termPQExpBuffer(&query); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + log_verbose(LOG_WARNING, _("unable to query pg_stat_replication")); + log_detail("%s", PQerrorMessage(conn)); + PQclear(res); + return false; + } + + if (PQntuples(res) != 1) + { + log_verbose(LOG_WARNING, _("unexpected number of tuples (%i) returned"), PQntuples(res)); + PQclear(res); + return false; + } + + c = atoi(PQgetvalue(res, 0, 0)); + PQclear(res); + + if (c == 0) + { + log_verbose(LOG_WARNING, _("node \"%s\" not found in \"pg_stat_replication\""), node_name); + return false; + } + + if (c > 1) + log_verbose(LOG_WARNING, _("multiple entries with \"application_name\" set to \"%s\" found in \"pg_stat_replication\""), + node_name); + + return true; +} void clear_node_info_list(NodeInfoList *nodes) diff --git a/dbutils.h b/dbutils.h index b6a29499..7fe439d4 100644 --- a/dbutils.h +++ b/dbutils.h @@ -97,6 +97,7 @@ typedef struct s_node_info /* for ad-hoc use e.g. when working with a list of nodes */ char details[MAXLEN]; bool reachable; + bool attached; /* various statistics */ int max_wal_senders; int attached_wal_receivers; @@ -127,7 +128,7 @@ typedef struct s_node_info MS_NORMAL, \ NULL, \ /* for ad-hoc use e.g. when working with a list of nodes */ \ - "", true \ + "", true, true \ /* various statistics */ \ -1, -1, -1, -1, -1 \ } @@ -437,6 +438,7 @@ XLogRecPtr get_last_wal_receive_location(PGconn *conn); bool get_replication_info(PGconn *conn, ReplInfo *replication_info); int get_replication_lag_seconds(PGconn *conn); void get_node_replication_stats(PGconn *conn, t_node_info *node_info); +bool is_downstream_node_attached(PGconn *conn, char *node_name); /* BDR functions */ void get_all_bdr_node_records(PGconn *conn, BdrNodeInfoList *node_list); diff --git a/repmgr-action-node.c b/repmgr-action-node.c index f1a0e109..f63287f9 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -32,6 +32,7 @@ static void _do_node_restore_config(void); static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_archiver(PGconn *conn, OutputMode mode, CheckStatusList *list_output); static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, CheckStatusList *list_output); +static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output); void @@ -558,6 +559,12 @@ do_node_check(void) return; } + if (runtime_options.downstream == true) + { + (void) do_node_check_downstream(conn, runtime_options.output_mode, NULL); + PQfinish(conn); + return; + } /* output general overview */ initPQExpBuffer(&output); @@ -565,7 +572,7 @@ do_node_check(void) (void) do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list); (void) do_node_check_replication_lag(conn, runtime_options.output_mode, &status_list); (void) do_node_check_archiver(conn, runtime_options.output_mode, &status_list); - + (void) do_node_check_downstream(conn, runtime_options.output_mode, &status_list); if (runtime_options.output_mode == OM_CSV) { @@ -1016,6 +1023,103 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, CheckStatusList *li return status; } +/* TODO: ensure only runs on streaming replication nodes */ +static CheckStatus +do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output) +{ + NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER; + NodeInfoListCell *cell = NULL; + int missing_nodes_count = 0; + CheckStatus status = CHECK_STATUS_OK; + ItemList missing_nodes = { NULL, NULL }; + PQExpBufferData details; + + initPQExpBuffer(&details); + + get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes); + + for (cell = downstream_nodes.head; cell; cell = cell->next) + { + if (is_downstream_node_attached(conn, cell->node_info->node_name) == false) + { + missing_nodes_count ++; + item_list_append_format(&missing_nodes, + "%s (ID: %i)", + cell->node_info->node_name, + cell->node_info->node_id); + } + } + + if (missing_nodes_count == 0) + { + if (downstream_nodes.node_count == 0) + appendPQExpBuffer( + &details, + "this node has no downstream nodes"); + else + appendPQExpBuffer( + &details, + "%i of %i downstream nodes attached", + downstream_nodes.node_count, + downstream_nodes.node_count); + } + else + { + ItemListCell *missing_cell = NULL; + bool first = true; + status = CHECK_STATUS_CRITICAL; + + appendPQExpBuffer( + &details, + "%i of %i downstream nodes not attached (missing: ", + missing_nodes_count, + downstream_nodes.node_count); + + for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next) + { + if (first == false) + appendPQExpBuffer( + &details, + ","); + else + first = false; + + if (first == false) + appendPQExpBuffer( + &details, + "%s", missing_cell->string); + } + } + + switch (mode) + { + case OM_NAGIOS: + printf("PG_DOWNSTREAM_SERVERS %s: %s\n", + output_check_status(status), + details.data); + break; + case OM_TEXT: + if (list_output != NULL) + { + check_status_list_set(list_output, + "Downstream servers", + status, + details.data); + } + else + { + printf("%s (%s)\n", + output_check_status(status), + details.data); + } + default: + break; + + } + termPQExpBuffer(&details); + + return status; +} // --action=... // --check diff --git a/repmgr-client-global.h b/repmgr-client-global.h index b7ade5c7..54aefb20 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -85,6 +85,7 @@ typedef struct /* "node check" options */ bool archiver; + bool downstream; bool replication_lag; bool role; @@ -132,7 +133,7 @@ typedef struct /* "node status" options */ \ false, \ /* "node check" options */ \ - false, false, false, \ + false, false, false, false,\ /* "node join" options */ \ "", \ /* "node service" options */ \ diff --git a/repmgr-client.c b/repmgr-client.c index 7d8af941..ac5007f2 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -426,6 +426,10 @@ main(int argc, char **argv) runtime_options.archiver = true; break; + case OPT_DOWNSTREAM: + runtime_options.downstream = true; + break; + case OPT_REPLICATION_LAG: runtime_options.replication_lag = true; break; @@ -3001,7 +3005,7 @@ init_node_record(t_node_info *node_record) if (config_file_options.replication_user[0] != '\0') { - /* Replication user explicitly provided */ + /* replication user explicitly provided */ strncpy(node_record->repluser, config_file_options.replication_user, NAMEDATALEN); } else diff --git a/repmgr-client.h b/repmgr-client.h index 5297a85e..b4df881b 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -71,6 +71,7 @@ #define OPT_CONFIG_FILES 1035 #define OPT_SIBLINGS_FOLLOW 1036 #define OPT_ROLE 1037 +#define OPT_DOWNSTREAM 1038 /* deprecated since 3.3 */ #define OPT_DATA_DIR 999 #define OPT_NO_CONNINFO_PASSWORD 998 @@ -139,6 +140,7 @@ static struct option long_options[] = /* "node check" options */ {"archiver", no_argument, NULL, OPT_ARCHIVER }, + {"downstream", no_argument, NULL, OPT_DOWNSTREAM }, {"replication-lag", no_argument, NULL, OPT_REPLICATION_LAG }, {"role", no_argument, NULL, OPT_ROLE },