Implement "repmgr cluster cleanup"

This commit is contained in:
Ian Barwick
2017-09-11 13:48:46 +09:00
parent a9f4a027a7
commit b6b31b15b2
8 changed files with 227 additions and 24 deletions

View File

@@ -1405,10 +1405,10 @@ The view `replication_status` shows the most recent state for each node, e.g.:
The interval in which monitoring history is written is controlled by the The interval in which monitoring history is written is controlled by the
configuration parameter `monitor_interval_secs`; default is 2. configuration parameter `monitor_interval_secs`; default is 2.
As this can generate a large amount of monitoring data in the `monitoring_history` As this can generate a large amount of monitoring data in the table
table, it's advisable to regularly purge historical data with `repmgr.monitoring_history`, it's advisable to regularly purge historical data
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how using the `repmgr cluster cleanup` command; use the `-k/--keep-history` option to
many day's worth of data should be retained. *XXX not yet implemented* specify how many days' worth of data should be retained.
It's possible to use `repmgrd` to provide monitoring only for some or all It's possible to use `repmgrd` to provide monitoring only for some or all
nodes by setting `failover=manual` in the node's `repmgr.conf` file. In the nodes by setting `failover=manual` in the node's `repmgr.conf` file. In the
@@ -1870,6 +1870,16 @@ The following commands are available:
3 | node3 | standby_register | t | 2017-08-17 10:28:55 | standby registration succeeded 3 | node3 | standby_register | t | 2017-08-17 10:28:55 | standby registration succeeded
2 | node2 | standby_register | t | 2017-08-17 10:28:53 | standby registration succeeded 2 | node2 | standby_register | t | 2017-08-17 10:28:53 | standby registration succeeded
* `cluster cleanup`
Purges monitoring history from the `repmgr.monitoring_history` table to
prevent excessive table growth. Use the `-k/--keep-history` option to specify the
number of days of monitoring history to retain. This command can be used
manually or as a cronjob.
This command requires a valid `repmgr.conf` file for the node on which it is
executed, either specified explicitly with `-f/--config-file` or located in
the current working directory; no additional arguments are required.
Generating event notifications with repmgr/repmgrd Generating event notifications with repmgr/repmgrd

101
dbutils.c
View File

@@ -1661,6 +1661,31 @@ checkpoint(PGconn *conn)
return; return;
} }
/*
 * vacuum_table()
 *
 * Run a plain "VACUUM" on the named table over the supplied connection.
 *
 * The table name is placed into the statement verbatim, so callers must
 * pass a trusted identifier.
 *
 * Assumes a superuser connection.
 *
 * Returns true when the command's result status is PGRES_COMMAND_OK,
 * false for any other status.
 */
bool
vacuum_table(PGconn *primary_conn, const char *table)
{
	PQExpBufferData vacuum_sql;
	PGresult   *result = NULL;
	bool		vacuumed;

	initPQExpBuffer(&vacuum_sql);
	appendPQExpBuffer(&vacuum_sql, "VACUUM %s", table);

	result = PQexec(primary_conn, vacuum_sql.data);

	/* the statement text is not needed once it has been sent */
	termPQExpBuffer(&vacuum_sql);

	/* emits the raw numeric result status at debug level */
	log_debug("%i", (int) PQresultStatus(result));

	vacuumed = (PQresultStatus(result) == PGRES_COMMAND_OK);

	PQclear(result);

	return vacuumed;
}
/* ===================== */ /* ===================== */
/* Node record functions */ /* Node record functions */
@@ -3408,8 +3433,7 @@ is_server_available(const char *conninfo)
/* ==================== */ /* ==================== */
void void
add_monitoring_record( add_monitoring_record(PGconn *primary_conn,
PGconn *primary_conn,
PGconn *local_conn, PGconn *local_conn,
int primary_node_id, int primary_node_id,
int local_node_id, int local_node_id,
@@ -3478,6 +3502,79 @@ add_monitoring_record(
} }
/*
 * get_number_of_monitoring_records_to_delete()
 *
 * Count the rows in "repmgr.monitoring_history" whose "last_monitor_time"
 * is at least "keep_history" days in the past.
 *
 * On query failure the error is logged, the result and the connection are
 * released, and the process exits with ERR_DB_QUERY; this function therefore
 * only returns when the count was retrieved.
 *
 * Returns the count converted to int with atoi().
 */
int
get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history)
{
	PQExpBufferData count_sql;
	PGresult   *result = NULL;
	int			n_records;

	initPQExpBuffer(&count_sql);
	appendPQExpBuffer(&count_sql,
					  "SELECT COUNT(*) "
					  " FROM repmgr.monitoring_history "
					  " WHERE age(now(), last_monitor_time) >= '%d days'::interval",
					  keep_history);

	result = PQexec(primary_conn, count_sql.data);
	termPQExpBuffer(&count_sql);

	/* failure is fatal: report it, clean up and terminate */
	if (PQresultStatus(result) != PGRES_TUPLES_OK)
	{
		log_error(_("unable to query number of monitoring records to clean up"));
		log_detail("%s", PQerrorMessage(primary_conn));

		PQclear(result);
		PQfinish(primary_conn);

		exit(ERR_DB_QUERY);
	}

	/* COUNT(*) yields a single row with a single column */
	n_records = atoi(PQgetvalue(result, 0, 0));

	PQclear(result);

	return n_records;
}
/*
 * delete_monitoring_records()
 *
 * Remove rows from "repmgr.monitoring_history".
 *
 * With a positive "keep_history", only rows whose "last_monitor_time" is at
 * least that many days old are deleted. With any other value the whole
 * table is truncated.
 *
 * Returns true when the statement's result status is PGRES_COMMAND_OK,
 * false otherwise. Nothing is logged here; reporting is left to the caller.
 */
bool
delete_monitoring_records(PGconn *primary_conn, int keep_history)
{
	PQExpBufferData purge_sql;
	PGresult   *result = NULL;
	bool		purged;

	initPQExpBuffer(&purge_sql);

	if (keep_history <= 0)
	{
		/* no retention period: discard everything */
		appendPQExpBuffer(&purge_sql,
						  "TRUNCATE TABLE repmgr.monitoring_history");
	}
	else
	{
		appendPQExpBuffer(&purge_sql,
						  "DELETE FROM repmgr.monitoring_history "
						  " WHERE age(now(), last_monitor_time) >= '%d days'::interval ",
						  keep_history);
	}

	result = PQexec(primary_conn, purge_sql.data);
	termPQExpBuffer(&purge_sql);

	purged = (PQresultStatus(result) == PGRES_COMMAND_OK);

	PQclear(result);

	return purged;
}
/* /*
* node voting functions * node voting functions
* *

View File

@@ -396,7 +396,7 @@ ExtensionStatus get_repmgr_extension_status(PGconn *conn);
/* node management functions */ /* node management functions */
void checkpoint(PGconn *conn); void checkpoint(PGconn *conn);
bool vacuum_table(PGconn *conn, const char *table);
/* node record functions */ /* node record functions */
@@ -456,8 +456,7 @@ bool is_server_available(const char *conninfo);
/* monitoring functions */ /* monitoring functions */
void void
add_monitoring_record( add_monitoring_record(PGconn *primary_conn,
PGconn *primary_conn,
PGconn *local_conn, PGconn *local_conn,
int primary_node_id, int primary_node_id,
int local_node_id, int local_node_id,
@@ -469,6 +468,10 @@ add_monitoring_record(
long long unsigned int apply_lag_bytes long long unsigned int apply_lag_bytes
); );
int get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history);
bool delete_monitoring_records(PGconn *primary_conn, int keep_history);
/* node voting functions */ /* node voting functions */
NodeVotingStatus get_voting_status(PGconn *conn); NodeVotingStatus get_voting_status(PGconn *conn);

View File

@@ -1287,6 +1287,67 @@ cube_set_node_status(t_node_status_cube **cube, int n, int execute_node_id, int
} }
/*
 * do_cluster_cleanup()
 *
 * Implements "repmgr cluster cleanup": purge monitoring history from the
 * "repmgr.monitoring_history" table on the primary, then vacuum the table.
 *
 * The number of days of history to retain is taken from
 * runtime_options.keep_history (-k/--keep-history); if it is not greater
 * than zero, all monitoring records are removed.
 *
 * Exits with ERR_DB_QUERY if the records cannot be deleted. A failed VACUUM
 * is only reported as a warning.
 */
void
do_cluster_cleanup(void)
{
	PGconn	   *conn = NULL;
	PGconn	   *primary_conn = NULL;
	int			entries_to_delete = 0;

	conn = establish_db_connection(config_file_options.conninfo, true);

	/* locate and connect to the primary node of this cluster */
	log_info(_("connecting to primary server"));
	primary_conn = establish_primary_db_connection(conn, true);

	/* the initial connection is no longer needed once the primary is known */
	PQfinish(conn);

	log_debug(_("number of days of monitoring history to retain: %i"), runtime_options.keep_history);

	entries_to_delete = get_number_of_monitoring_records_to_delete(primary_conn, runtime_options.keep_history);

	if (entries_to_delete == 0)
	{
		log_info(_("no monitoring records to delete"));
		PQfinish(primary_conn);
		return;
	}

	/* "at least": the count and the delete are separate statements */
	log_debug("at least %i monitoring records for deletion",
			  entries_to_delete);

	if (delete_monitoring_records(primary_conn, runtime_options.keep_history) == false)
	{
		log_error(_("unable to delete monitoring records"));
		log_detail("%s", PQerrorMessage(primary_conn));
		PQfinish(primary_conn);
		exit(ERR_DB_QUERY);
	}

	if (vacuum_table(primary_conn, "repmgr.monitoring_history") == false)
	{
		/* annoying if this fails, but not fatal */
		log_warning(_("unable to vacuum table repmgr.monitoring_history"));
		log_detail("%s", PQerrorMessage(primary_conn));
	}

	PQfinish(primary_conn);

	if (runtime_options.keep_history > 0)
	{
		log_notice(_("monitoring records older than %i day(s) deleted"),
				   runtime_options.keep_history);
	}
	else
	{
		log_info(_("all monitoring records deleted"));
	}

	return;
}
void void
do_cluster_help(void) do_cluster_help(void)
{ {
@@ -1305,7 +1366,7 @@ do_cluster_help(void)
puts(""); puts("");
printf(_(" Configuration file or database connection required.\n")); printf(_(" Configuration file or database connection required.\n"));
puts(""); puts("");
printf(_(" --csv emit output as CSV (with a subset of fields)\n")); printf(_(" --csv emit output as CSV (with a subset of fields)\n"));
puts(""); puts("");
printf(_("CLUSTER MATRIX\n")); printf(_("CLUSTER MATRIX\n"));
@@ -1314,7 +1375,7 @@ do_cluster_help(void)
puts(""); puts("");
printf(_(" Configuration file or database connection required.\n")); printf(_(" Configuration file or database connection required.\n"));
puts(""); puts("");
printf(_(" --csv emit output as CSV\n")); printf(_(" --csv emit output as CSV\n"));
puts(""); puts("");
printf(_("CLUSTER CROSSCHECK\n")); printf(_("CLUSTER CROSSCHECK\n"));
@@ -1323,7 +1384,7 @@ do_cluster_help(void)
puts(""); puts("");
printf(_(" Configuration file or database connection required.\n")); printf(_(" Configuration file or database connection required.\n"));
puts(""); puts("");
printf(_(" --csv emit output as CSV\n")); printf(_(" --csv emit output as CSV\n"));
puts(""); puts("");
@@ -1331,12 +1392,18 @@ do_cluster_help(void)
puts(""); puts("");
printf(_(" \"cluster event\" lists recent events logged in the \"repmgr.events\" table.\n")); printf(_(" \"cluster event\" lists recent events logged in the \"repmgr.events\" table.\n"));
puts(""); puts("");
printf(_(" --limit maximum number of events to display (default: %i)\n"), CLUSTER_EVENT_LIMIT); printf(_(" --limit maximum number of events to display (default: %i)\n"), CLUSTER_EVENT_LIMIT);
printf(_(" --all display all events (overrides --limit)\n")); printf(_(" --all display all events (overrides --limit)\n"));
printf(_(" --event filter specific event\n")); printf(_(" --event filter specific event\n"));
printf(_(" --node-id restrict entries to node with this ID\n")); printf(_(" --node-id restrict entries to node with this ID\n"));
printf(_(" --node-name restrict entries to node with this name\n")); printf(_(" --node-name restrict entries to node with this name\n"));
puts("");
/* help section for "cluster cleanup", which purges monitoring history */
printf(_("CLUSTER CLEANUP\n"));
puts("");
printf(_(" \"cluster cleanup\" purges records from the \"repmgr.monitoring_history\" table.\n"));
puts("");
printf(_(" -k, --keep-history=VALUE retain indicated number of days of history (default: 0)\n"));
puts(""); puts("");
} }

View File

@@ -47,6 +47,7 @@ extern void do_cluster_show(void);
extern void do_cluster_event(void); extern void do_cluster_event(void);
extern void do_cluster_crosscheck(void); extern void do_cluster_crosscheck(void);
extern void do_cluster_matrix(void); extern void do_cluster_matrix(void);
extern void do_cluster_cleanup(void);
extern void do_cluster_help(void); extern void do_cluster_help(void);

View File

@@ -116,6 +116,9 @@ typedef struct
char event[MAXLEN]; char event[MAXLEN];
int limit; int limit;
/* "cluster cleanup" options */
int keep_history;
/* following options for internal use */ /* following options for internal use */
char config_archive_dir[MAXPGPATH]; char config_archive_dir[MAXPGPATH];
OutputMode output_mode; OutputMode output_mode;
@@ -155,6 +158,8 @@ typedef struct
"", false, false, false, \ "", false, false, false, \
/* "cluster event" options */ \ /* "cluster event" options */ \
false, "", CLUSTER_EVENT_LIMIT, \ false, "", CLUSTER_EVENT_LIMIT, \
/* "cluster cleanup" options */ \
0, \
/* Following options for internal use */ \ /* Following options for internal use */ \
"/tmp", OM_TEXT \ "/tmp", OM_TEXT \
} }

View File

@@ -25,6 +25,7 @@
* CLUSTER EVENT * CLUSTER EVENT
* CLUSTER CROSSCHECK * CLUSTER CROSSCHECK
* CLUSTER MATRIX * CLUSTER MATRIX
* CLUSTER CLEANUP
* *
* NODE STATUS * NODE STATUS
* NODE CHECK * NODE CHECK
@@ -499,6 +500,16 @@ main(int argc, char **argv)
runtime_options.all = true; runtime_options.all = true;
break; break;
/*------------------------
* "cluster cleanup" options
*------------------------
*/
/* -k/--keep-history */
case 'k':
runtime_options.keep_history = repmgr_atoi(optarg, "-k/--keep-history", &cli_errors, false);
break;
/*---------------- /*----------------
* logging options * logging options
*---------------- *----------------
@@ -688,17 +699,18 @@ main(int argc, char **argv)
exit_with_cli_errors(&cli_errors); exit_with_cli_errors(&cli_errors);
} }
/* /*----------
* Determine the node type and action; following are valid: * Determine the node type and action; following are valid:
* *
* { PRIMARY | MASTER } REGISTER | STANDBY {REGISTER | UNREGISTER | CLONE * { PRIMARY | MASTER } REGISTER |
* [node] | PROMOTE | FOLLOW [node] | SWITCHOVER | REWIND} | BDR { * STANDBY { REGISTER | UNREGISTER | CLONE [node] | PROMOTE | FOLLOW [node] | SWITCHOVER } |
* REGISTER | UNREGISTER } | NODE { STATUS | CHECK | REJOIN | * BDR { REGISTER | UNREGISTER } |
* ARCHIVE-CONFIG | RESTORE-CONFIG | SERVICE } | CLUSTER { CROSSCHECK | * NODE { STATUS | CHECK | REJOIN | SERVICE } |
* MATRIX | SHOW | CLEANUP | EVENT } * CLUSTER { CROSSCHECK | MATRIX | SHOW | EVENT | CLEANUP }
* *
* [node] is an optional hostname, provided instead of the -h/--host * [node] is an optional hostname, provided instead of the -h/--host
* option * option
* ---------
*/ */
if (optind < argc) if (optind < argc)
{ {
@@ -818,6 +830,8 @@ main(int argc, char **argv)
action = CLUSTER_CROSSCHECK; action = CLUSTER_CROSSCHECK;
else if (strcasecmp(repmgr_action, "MATRIX") == 0) else if (strcasecmp(repmgr_action, "MATRIX") == 0)
action = CLUSTER_MATRIX; action = CLUSTER_MATRIX;
else if (strcasecmp(repmgr_action, "CLEANUP") == 0)
action = CLUSTER_CLEANUP;
} }
else else
{ {
@@ -1200,6 +1214,9 @@ main(int argc, char **argv)
case CLUSTER_MATRIX: case CLUSTER_MATRIX:
do_cluster_matrix(); do_cluster_matrix();
break; break;
case CLUSTER_CLEANUP:
do_cluster_cleanup();
break;
default: default:
/* An action will have been determined by this point */ /* An action will have been determined by this point */

View File

@@ -169,6 +169,9 @@ static struct option long_options[] =
{"event", required_argument, NULL, OPT_EVENT}, {"event", required_argument, NULL, OPT_EVENT},
{"limit", required_argument, NULL, OPT_LIMIT}, {"limit", required_argument, NULL, OPT_LIMIT},
/* "cluster cleanup" options */
{"keep-history", required_argument, NULL, 'k'},
/* Following options for internal use */ /* Following options for internal use */
{"config-archive-dir", required_argument, NULL, OPT_CONFIG_ARCHIVE_DIR}, {"config-archive-dir", required_argument, NULL, OPT_CONFIG_ARCHIVE_DIR},
@@ -181,7 +184,7 @@ static struct option long_options[] =
/* not yet handled */ /* not yet handled */
{"keep-history", required_argument, NULL, 'k'},
{"mode", required_argument, NULL, 'm'}, {"mode", required_argument, NULL, 'm'},
{"check-upstream-config", no_argument, NULL, OPT_CHECK_UPSTREAM_CONFIG}, {"check-upstream-config", no_argument, NULL, OPT_CHECK_UPSTREAM_CONFIG},
{"pg_rewind", optional_argument, NULL, OPT_PG_REWIND}, {"pg_rewind", optional_argument, NULL, OPT_PG_REWIND},