Add a CLUSTER CLEANUP command to clean monitor's history,

also include a --keep-history (-k) option to indicate how many
days of history to keep
This commit is contained in:
Jaime Casanova
2012-06-13 00:39:54 -05:00
parent 7a76f1998c
commit 64fce88e99
3 changed files with 113 additions and 6 deletions

View File

@@ -825,6 +825,22 @@ and on "prime."
The servers are now again acting as primary on "prime" and standby on "standby".
Maintainance of monitor history
-------------------------------
Once you have changed roles (with a failover or to restore original roles)
you would end up with records saying that node1 is primary and other records
saying that node2 is the primary. Which could be confusing.
Also, if you don't do anything about it the monitor history will keep growing.
For both of those reasons you sometime want to make some maintainance of the
``repl_monitor`` table.
If you want to clean the history after a few days you can execute the
CLUSTER CLEANUP command in a cron. For example to keep just one day of history
you can put this in your crontab::
0 1 * * * repmgr cluster cleanup -k 1 -f ~/repmgr.conf
Configuration and command reference
===================================
@@ -953,6 +969,26 @@ its port if is different from the default one.
./repmgr standby follow
* cluster show
* Shows the role (standby/master) and connection string for all nodes configured
in the cluster or "FAILED" if the node doesn't respond. This allow us to know
which nodes are alive and which one needs attention and to have a notion of the
structure of clusters we just have access to. Example::
./repmgr cluster show
* cluster cleanup
* Cleans the monitor's history from repmgr tables. This avoids the repl_monitor table
to grow excesivelly which in turns affects repl_status view performance, also
keeps controlled the space in disk used by repmgr. This command can be used manually
or in a cron to make it periodically.
There is also a --keep-history (-k) option to indicate how many days of history we
want to keep, so the command will clean up history older than "keep-history" days. Example::
./repmgr cluster cleanup -k 2
repmgrd Daemon
--------------

View File

@@ -8,6 +8,7 @@
* Commands implemented are.
* MASTER REGISTER
* STANDBY REGISTER, STANDBY CLONE, STANDBY FOLLOW, STANDBY PROMOTE
* CLUSTER SHOW, CLUSTER CLEANUP
* WITNESS CREATE
*
* This program is free software: you can redistribute it and/or modify
@@ -48,6 +49,7 @@
#define STANDBY_FOLLOW 5
#define WITNESS_CREATE 6
#define CLUSTER_SHOW 7
#define CLUSTER_CLEANUP 8
static bool create_recovery_file(const char *data_dir);
static int test_ssh_connection(char *host, char *remote_user);
@@ -65,6 +67,7 @@ static void do_standby_promote(void);
static void do_standby_follow(void);
static void do_witness_create(void);
static void do_cluster_show(void);
static void do_cluster_cleanup(void);
static void usage(void);
static void help(const char *progname);
@@ -80,7 +83,7 @@ bool need_a_node = true;
bool require_password = false;
/* Initialization of runtime options */
t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, "" };
t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, "", 0 };
t_configuration_options options = { "", -1, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", -1 };
static char *server_mode = NULL;
@@ -100,6 +103,7 @@ main(int argc, char **argv)
{"config-file", required_argument, NULL, 'f'},
{"remote-user", required_argument, NULL, 'R'},
{"wal-keep-segments", required_argument, NULL, 'w'},
{"keep-history", required_argument, NULL, 'k'},
{"force", no_argument, NULL, 'F'},
{"ignore-rsync-warning", no_argument, NULL, 'I'},
{"verbose", no_argument, NULL, 'v'},
@@ -127,7 +131,7 @@ main(int argc, char **argv)
}
while ((c = getopt_long(argc, argv, "d:h:p:U:D:l:f:R:w:F:I:v", long_options,
while ((c = getopt_long(argc, argv, "d:h:p:U:D:l:f:R:w:k:F:I:v", long_options,
&optindex)) != -1)
{
switch (c)
@@ -162,6 +166,12 @@ main(int argc, char **argv)
if (atoi(optarg) > 0)
strncpy(runtime_options.wal_keep_segments, optarg, MAXLEN);
break;
case 'k':
if (atoi(optarg) > 0)
runtime_options.keep_history = atoi(optarg);
else
runtime_options.keep_history = 0;
break;
case 'F':
runtime_options.force = true;
break;
@@ -182,7 +192,7 @@ main(int argc, char **argv)
* MASTER REGISTER |
* STANDBY {REGISTER | CLONE [node] | PROMOTE | FOLLOW [node]} |
* WITNESS CREATE
* CLUSTER SHOW
* CLUSTER {SHOW | CLEANUP}
*
* the node part is optional, if we receive it then we shouldn't
* have received a -h option
@@ -223,6 +233,8 @@ main(int argc, char **argv)
{
if( strcasecmp(server_cmd, "SHOW") == 0)
action = CLUSTER_SHOW;
else if(strcasecmp(server_cmd, "CLEANUP") == 0)
action = CLUSTER_CLEANUP;
}
else if (strcasecmp(server_mode, "WITNESS") == 0)
if (strcasecmp(server_cmd, "CREATE") == 0)
@@ -348,6 +360,9 @@ main(int argc, char **argv)
case CLUSTER_SHOW:
do_cluster_show();
break;
case CLUSTER_CLEANUP:
do_cluster_cleanup();
break;
default:
usage();
exit(ERR_BAD_CONFIG);
@@ -402,10 +417,59 @@ do_cluster_show(void)
}
PQclear(res);
}
static void
do_cluster_cleanup(void)
{
int master_id;
PGconn *master_conn;
PGresult *res;
char sqlquery[QUERY_STR_LEN];
char node_role[MAXLEN];
int i;
/* check if there is a master in this cluster */
log_info(_("%s connecting to master database\n"), progname);
master_conn = getMasterConnection(master_conn, repmgr_schema, options.cluster_name,
&master_id, NULL);
if (!master_conn)
{
log_err(_("cluster cleanup: cannot connect to master\n"));
exit(ERR_DB_CON);
}
if (runtime_options.keep_history > 0)
{
sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_monitor "
" WHERE age(now(), last_monitor_time) >= '%d days'::interval;",
repmgr_schema, keep_history);
}
else
{
sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_monitor;", repmgr_schema);
}
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_err(_("cluster cleanup: Couldn't clean history\n%s\n"), PQerrorMessage(conn));
PQclear(res);
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
PQclear(res);
/* Let's VACUUM the table to avoid autovacuum to be launched in an unexpected hour */
sqlquery_snprintf(sqlquery, "VACUUM %s.repl_monitor;", repmgr_schema);
res = PQexec(conn, sqlquery);
/* XXX There is any need to check this VACUUM happens without problems? */
PQclear(res);
PQfinish(conn);
}
static void
do_master_register(void)
{
@@ -1642,7 +1706,7 @@ help(const char *progname)
printf(_(" %s [OPTIONS] master {register}\n"), progname);
printf(_(" %s [OPTIONS] standby {register|clone|promote|follow}\n"),
progname);
printf(_(" %s [OPTIONS] cluster show\n"), progname);
printf(_(" %s [OPTIONS] cluster {show|cleanup}\n"), progname);
printf(_("\nGeneral options:\n"));
printf(_(" --help show this help, then exit\n"));
printf(_(" --version output version information, then exit\n"));
@@ -1659,6 +1723,7 @@ help(const char *progname)
printf(_(" -R, --remote-user=USERNAME database server username for rsync\n"));
printf(_(" -w, --wal-keep-segments=VALUE minimum value for the GUC wal_keep_segments (default: 5000)\n"));
printf(_(" -I, --ignore-rsync-warning ignore rsync partial transfer warning\n"));
printf(_(" -k, --keep-history=VALUE keeps indicated number of days of history\n"));
printf(_(" -F, --force force potentially dangerous operations to happen\n"));
printf(_("\n%s performs some tasks like clone a node, promote it "), progname);
@@ -1671,6 +1736,7 @@ help(const char *progname)
printf(_("new master in the event of a failover\n"));
printf(_(" standby follow - allows the standby to re-point itself to a new master\n"));
printf(_(" cluster show - print node informations\n"));
printf(_(" cluster cleanup - cleans monitor's history\n"));
}
@@ -1935,6 +2001,9 @@ check_parameters_for_action(const int action)
case CLUSTER_SHOW:
/* allow all parameters to be supplied */
break;
case CLUSTER_CLEANUP:
/* allow all parameters to be supplied */
break;
}
return ok;

View File

@@ -64,6 +64,8 @@ typedef struct
char masterport[MAXLEN];
char localport[MAXLEN];
/* parameter used by CLUSTER CLEANUP */
int keep_history;
} t_runtime_options;
#define SLEEP_MONITOR 2