From b6b31b15b2dd00b96b66f0b721a80ee393d2bd68 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Mon, 11 Sep 2017 13:48:46 +0900 Subject: [PATCH] Implement "repmgr cluster cleanup" --- README.md | 18 +++++-- dbutils.c | 101 +++++++++++++++++++++++++++++++++++++++- dbutils.h | 9 ++-- repmgr-action-cluster.c | 83 +++++++++++++++++++++++++++++---- repmgr-action-cluster.h | 1 + repmgr-client-global.h | 5 ++ repmgr-client.c | 29 +++++++++--- repmgr-client.h | 5 +- 8 files changed, 227 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 750aad24..3fac9e3a 100644 --- a/README.md +++ b/README.md @@ -1405,10 +1405,10 @@ The view `replication_status` shows the most recent state for each node, e.g.: The interval in which monitoring history is written is controlled by the configuration parameter `monitor_interval_secs`; default is 2. -As this can generate a large amount of monitoring data in the `monitoring_history` -table, it's advisable to regularly purge historical data with -`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how -many day's worth of data should be retained. *XXX not yet implemented* +As this can generate a large amount of monitoring data in the table +`repmgr.monitoring_history`, it's advisable to regularly purge historical data +using the `repmgr cluster cleanup` command; use the `-k/--keep-history` option to +specify how many days' worth of data should be retained. It's possible to use `repmgrd` to provide monitoring only for some or all nodes by setting `failover=manual` in the node's `repmgr.conf` file. In the @@ -1870,6 +1870,16 @@ The following commands are available: 3 | node3 | standby_register | t | 2017-08-17 10:28:55 | standby registration succeeded 2 | node2 | standby_register | t | 2017-08-17 10:28:53 | standby registration succeeded +* `cluster cleanup` + + Purges monitoring history from the `repmgr.monitoring_history` table to + prevent excessive table growth.
Use the `-k/--keep-history` to specify the + number of days of monitoring history to retain. This command can be used + manually or as a cronjob. + + This command requires a valid `repmgr.conf` file for the node on which it is + executed, either specified explicitly with `-f/--config-file` or located in + the current working directory; no additional arguments are required. Generating event notifications with repmgr/repmgrd diff --git a/dbutils.c b/dbutils.c index fabe3da0..00e02ecb 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1661,6 +1661,31 @@ checkpoint(PGconn *conn) return; } +/* assumes superuser connection */ +bool +vacuum_table(PGconn *primary_conn, const char *table) +{ + PQExpBufferData query; + bool success = true; + PGresult *res = NULL; + + initPQExpBuffer(&query); + + appendPQExpBuffer(&query, "VACUUM %s", table); + + res = PQexec(primary_conn, query.data); + termPQExpBuffer(&query); + + log_debug("%i", (int) PQresultStatus(res)); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + success = false; + } + + PQclear(res); + + return success; +} /* ===================== */ /* Node record functions */ @@ -3408,8 +3433,7 @@ is_server_available(const char *conninfo) /* ==================== */ void -add_monitoring_record( - PGconn *primary_conn, +add_monitoring_record(PGconn *primary_conn, PGconn *local_conn, int primary_node_id, int local_node_id, @@ -3478,6 +3502,79 @@ add_monitoring_record( } +int +get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history) +{ + PQExpBufferData query; + int record_count = -1; + PGresult *res = NULL; + + initPQExpBuffer(&query); + + appendPQExpBuffer(&query, + "SELECT COUNT(*) " + " FROM repmgr.monitoring_history " + " WHERE age(now(), last_monitor_time) >= '%d days'::interval", + keep_history); + + res = PQexec(primary_conn, query.data); + termPQExpBuffer(&query); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + log_error(_("unable to query number of monitoring records to clean up")); + 
log_detail("%s", PQerrorMessage(primary_conn)); + + PQclear(res); + PQfinish(primary_conn); + exit(ERR_DB_QUERY); + } + else + { + record_count = atoi(PQgetvalue(res, 0, 0)); + } + + PQclear(res); + + return record_count; +} + + +bool +delete_monitoring_records(PGconn *primary_conn, int keep_history) +{ + PQExpBufferData query; + bool success = true; + PGresult *res = NULL; + + initPQExpBuffer(&query); + + if (keep_history > 0) + { + appendPQExpBuffer(&query, + "DELETE FROM repmgr.monitoring_history " + " WHERE age(now(), last_monitor_time) >= '%d days'::interval ", + keep_history); + } + else + { + appendPQExpBuffer(&query, + "TRUNCATE TABLE repmgr.monitoring_history"); + } + + res = PQexec(primary_conn, query.data); + termPQExpBuffer(&query); + + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + success = false; + } + + PQclear(res); + + return success; +} + /* * node voting functions * diff --git a/dbutils.h b/dbutils.h index 4fc63478..fbd002cc 100644 --- a/dbutils.h +++ b/dbutils.h @@ -396,7 +396,7 @@ ExtensionStatus get_repmgr_extension_status(PGconn *conn); /* node management functions */ void checkpoint(PGconn *conn); - +bool vacuum_table(PGconn *conn, const char *table); /* node record functions */ @@ -456,8 +456,7 @@ bool is_server_available(const char *conninfo); /* monitoring functions */ void -add_monitoring_record( - PGconn *primary_conn, +add_monitoring_record(PGconn *primary_conn, PGconn *local_conn, int primary_node_id, int local_node_id, @@ -469,6 +468,10 @@ add_monitoring_record( long long unsigned int apply_lag_bytes ); +int get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history); +bool delete_monitoring_records(PGconn *primary_conn, int keep_history); + + /* node voting functions */ NodeVotingStatus get_voting_status(PGconn *conn); diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c index ee17b9c0..b1b6065d 100644 --- a/repmgr-action-cluster.c +++ b/repmgr-action-cluster.c @@ -1287,6 +1287,67 @@ 
cube_set_node_status(t_node_status_cube **cube, int n, int execute_node_id, int } +void +do_cluster_cleanup(void) +{ + PGconn *conn = NULL; + PGconn *primary_conn = NULL; + int entries_to_delete = 0; + + conn = establish_db_connection(config_file_options.conninfo, true); + + /* check if there is a master in this cluster */ + log_info(_("connecting to primary server")); + primary_conn = establish_primary_db_connection(conn, true); + + PQfinish(conn); + + log_debug(_("number of days of monitoring history to retain: %i"), runtime_options.keep_history); + + entries_to_delete = get_number_of_monitoring_records_to_delete(primary_conn, runtime_options.keep_history); + + if (entries_to_delete == 0) + { + log_info(_("no monitoring records to delete")); + PQfinish(primary_conn); + return; + } + + log_debug("at least %i monitoring records for deletion", + entries_to_delete); + + if (delete_monitoring_records(primary_conn, runtime_options.keep_history) == false) + { + log_error(_("unable to delete monitoring records")); + log_detail("%s", PQerrorMessage(primary_conn)); + PQfinish(primary_conn); + exit(ERR_DB_QUERY); + } + + if (vacuum_table(primary_conn, "repmgr.monitoring_history") == false) + { + /* annoying if this fails, but not fatal */ + log_warning(_("unable to vacuum table repmgr.monitoring_history\n")); + log_detail("%s", PQerrorMessage(primary_conn)); + } + + + PQfinish(primary_conn); + + if (runtime_options.keep_history > 0) + { + log_notice(_("monitoring records older than %i day(s) deleted"), + runtime_options.keep_history); + } + else + { + log_info(_("all monitoring records deleted")); + } + + return; +} + + void do_cluster_help(void) { @@ -1305,7 +1366,7 @@ do_cluster_help(void) puts(""); printf(_(" Configuration file or database connection required.\n")); puts(""); - printf(_(" --csv emit output as CSV (with a subset of fields)\n")); + printf(_(" --csv emit output as CSV (with a subset of fields)\n")); puts(""); printf(_("CLUSTER MATRIX\n")); @@ -1314,7 
+1375,7 @@ do_cluster_help(void) puts(""); printf(_(" Configuration file or database connection required.\n")); puts(""); - printf(_(" --csv emit output as CSV\n")); + printf(_(" --csv emit output as CSV\n")); puts(""); printf(_("CLUSTER CROSSCHECK\n")); @@ -1323,7 +1384,7 @@ do_cluster_help(void) puts(""); printf(_(" Configuration file or database connection required.\n")); puts(""); - printf(_(" --csv emit output as CSV\n")); + printf(_(" --csv emit output as CSV\n")); puts(""); @@ -1331,12 +1392,18 @@ do_cluster_help(void) puts(""); printf(_(" \"cluster event\" lists recent events logged in the \"repmgr.events\" table.\n")); puts(""); - printf(_(" --limit maximum number of events to display (default: %i)\n"), CLUSTER_EVENT_LIMIT); - printf(_(" --all display all events (overrides --limit)\n")); - printf(_(" --event filter specific event\n")); - printf(_(" --node-id restrict entries to node with this ID\n")); - printf(_(" --node-name restrict entries to node with this name\n")); + printf(_(" --limit maximum number of events to display (default: %i)\n"), CLUSTER_EVENT_LIMIT); + printf(_(" --all display all events (overrides --limit)\n")); + printf(_(" --event filter specific event\n")); + printf(_(" --node-id restrict entries to node with this ID\n")); + printf(_(" --node-name restrict entries to node with this name\n")); + puts(""); + printf(_("CLUSTER CLEANUP\n")); + puts(""); + printf(_(" \"cluster cleanup\" purges records from the \"repmgr.monitoring_history\" table.\n")); + puts(""); + printf(_(" -k, --keep-history=VALUE retain indicated number of days of history (default: 0)\n")); puts(""); } diff --git a/repmgr-action-cluster.h b/repmgr-action-cluster.h index 0927e86c..07a7195b 100644 --- a/repmgr-action-cluster.h +++ b/repmgr-action-cluster.h @@ -47,6 +47,7 @@ extern void do_cluster_show(void); extern void do_cluster_event(void); extern void do_cluster_crosscheck(void); extern void do_cluster_matrix(void); +extern void do_cluster_cleanup(void); extern void
do_cluster_help(void); diff --git a/repmgr-client-global.h b/repmgr-client-global.h index 17367ffa..066dbec4 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -116,6 +116,9 @@ typedef struct char event[MAXLEN]; int limit; + /* "cluster cleanup" options */ + int keep_history; + /* following options for internal use */ char config_archive_dir[MAXPGPATH]; OutputMode output_mode; @@ -155,6 +158,8 @@ typedef struct "", false, false, false, \ /* "cluster event" options */ \ false, "", CLUSTER_EVENT_LIMIT, \ + /* "cluster cleanup" options */ \ + 0, \ /* Following options for internal use */ \ "/tmp", OM_TEXT \ } diff --git a/repmgr-client.c b/repmgr-client.c index 0eddc7d2..0494ec51 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -25,6 +25,7 @@ * CLUSTER EVENT * CLUSTER CROSSCHECK * CLUSTER MATRIX + * CLUSTER CLEANUP * * NODE STATUS * NODE CHECK @@ -499,6 +500,16 @@ main(int argc, char **argv) runtime_options.all = true; break; + /*------------------------ + * "cluster cleanup" options + *------------------------ + */ + + /* -k/--keep-history */ + case 'k': + runtime_options.keep_history = repmgr_atoi(optarg, "-k/--keep-history", &cli_errors, false); + break; + /*---------------- * logging options *---------------- @@ -688,17 +699,18 @@ main(int argc, char **argv) exit_with_cli_errors(&cli_errors); } - /* + /*---------- * Determine the node type and action; following are valid: * - * { PRIMARY | MASTER } REGISTER | STANDBY {REGISTER | UNREGISTER | CLONE - * [node] | PROMOTE | FOLLOW [node] | SWITCHOVER | REWIND} | BDR { - * REGISTER | UNREGISTER } | NODE { STATUS | CHECK | REJOIN | - * ARCHIVE-CONFIG | RESTORE-CONFIG | SERVICE } | CLUSTER { CROSSCHECK | - * MATRIX | SHOW | CLEANUP | EVENT } + * { PRIMARY | MASTER } REGISTER | + * STANDBY { REGISTER | UNREGISTER | CLONE [node] | PROMOTE | FOLLOW [node] | SWITCHOVER } | + * BDR { REGISTER | UNREGISTER } | + * NODE { STATUS | CHECK | REJOIN | SERVICE } | + * CLUSTER { CROSSCHECK | MATRIX | SHOW | 
EVENT | CLEANUP } * * [node] is an optional hostname, provided instead of the -h/--host * option + * --------- */ if (optind < argc) { @@ -818,6 +830,8 @@ main(int argc, char **argv) action = CLUSTER_CROSSCHECK; else if (strcasecmp(repmgr_action, "MATRIX") == 0) action = CLUSTER_MATRIX; + else if (strcasecmp(repmgr_action, "CLEANUP") == 0) + action = CLUSTER_CLEANUP; } else { @@ -1200,6 +1214,9 @@ main(int argc, char **argv) case CLUSTER_MATRIX: do_cluster_matrix(); break; + case CLUSTER_CLEANUP: + do_cluster_cleanup(); + break; default: /* An action will have been determined by this point */ diff --git a/repmgr-client.h b/repmgr-client.h index 3158c961..f53b1efc 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -169,6 +169,9 @@ static struct option long_options[] = {"event", required_argument, NULL, OPT_EVENT}, {"limit", required_argument, NULL, OPT_LIMIT}, +/* "cluster cleanup" options */ + {"keep-history", required_argument, NULL, 'k'}, + /* Following options for internal use */ {"config-archive-dir", required_argument, NULL, OPT_CONFIG_ARCHIVE_DIR}, @@ -181,7 +184,7 @@ static struct option long_options[] = /* not yet handled */ - {"keep-history", required_argument, NULL, 'k'}, + {"mode", required_argument, NULL, 'm'}, {"check-upstream-config", no_argument, NULL, OPT_CHECK_UPSTREAM_CONFIG}, {"pg_rewind", optional_argument, NULL, OPT_PG_REWIND},