From 64fce88e99d9d3cdceebffc071450a4d2a580a25 Mon Sep 17 00:00:00 2001 From: Jaime Casanova Date: Wed, 13 Jun 2012 00:39:54 -0500 Subject: [PATCH] Add a CLUSTER CLEANUP command to clean monitor's history, also include a --keep-history (-k) option to indicate how many days of history to keep --- README.rst | 36 ++++++++++++++++++++++++ repmgr.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++---- repmgr.h | 2 ++ 3 files changed, 113 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 3be80b6c..30f32081 100644 --- a/README.rst +++ b/README.rst @@ -825,6 +825,22 @@ and on "prime." The servers are now again acting as primary on "prime" and standby on "standby". +Maintenance of monitor history +------------------------------- + +Once you have changed roles (with a failover or to restore original roles) +you would end up with records saying that node1 is primary and other records +saying that node2 is the primary, which could be confusing. +Also, if you don't do anything about it the monitor history will keep growing. +For both of those reasons you will sometimes want to do some maintenance on the +``repl_monitor`` table. + +If you want to clean the history after a few days you can execute the +CLUSTER CLEANUP command in a cron. For example to keep just one day of history +you can put this in your crontab:: + +0 1 * * * repmgr cluster cleanup -k 1 -f ~/repmgr.conf + Configuration and command reference =================================== @@ -953,6 +969,26 @@ its port if is different from the default one. ./repmgr standby follow +* cluster show + + * Shows the role (standby/master) and connection string for all nodes configured + in the cluster or "FAILED" if the node doesn't respond. This allows us to know + which nodes are alive and which ones need attention and to have a notion of the + structure of clusters we just have access to. Example:: + + ./repmgr cluster show + +* cluster cleanup + + * Cleans the monitor's history from repmgr tables. 
This keeps the repl_monitor table + from growing excessively, which in turn affects repl_status view performance, and also + keeps the disk space used by repmgr under control. This command can be run manually + or from cron to do it periodically. + There is also a --keep-history (-k) option to indicate how many days of history we + want to keep, so the command will clean up history older than "keep-history" days. Example:: + + ./repmgr cluster cleanup -k 2 + repmgrd Daemon -------------- diff --git a/repmgr.c b/repmgr.c index a65d3559..b06d2388 100644 --- a/repmgr.c +++ b/repmgr.c @@ -8,6 +8,7 @@ * Commands implemented are. * MASTER REGISTER * STANDBY REGISTER, STANDBY CLONE, STANDBY FOLLOW, STANDBY PROMOTE + * CLUSTER SHOW, CLUSTER CLEANUP * WITNESS CREATE * * This program is free software: you can redistribute it and/or modify @@ -48,6 +49,7 @@ #define STANDBY_FOLLOW 5 #define WITNESS_CREATE 6 #define CLUSTER_SHOW 7 +#define CLUSTER_CLEANUP 8 static bool create_recovery_file(const char *data_dir); static int test_ssh_connection(char *host, char *remote_user); @@ -65,6 +67,7 @@ static void do_standby_promote(void); static void do_standby_follow(void); static void do_witness_create(void); static void do_cluster_show(void); +static void do_cluster_cleanup(void); static void usage(void); static void help(const char *progname); @@ -80,7 +83,7 @@ bool need_a_node = true; bool require_password = false; /* Initialization of runtime options */ -t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, "" }; +t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, "", 0 }; t_configuration_options options = { "", -1, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", -1 }; static char *server_mode = NULL; @@ -100,6 +103,7 @@ main(int argc, char **argv) {"config-file", required_argument, NULL, 'f'}, {"remote-user", required_argument, NULL, 'R'}, {"wal-keep-segments", 
required_argument, NULL, 'w'}, + {"keep-history", required_argument, NULL, 'k'}, {"force", no_argument, NULL, 'F'}, {"ignore-rsync-warning", no_argument, NULL, 'I'}, {"verbose", no_argument, NULL, 'v'}, @@ -127,7 +131,7 @@ main(int argc, char **argv) } - while ((c = getopt_long(argc, argv, "d:h:p:U:D:l:f:R:w:F:I:v", long_options, + while ((c = getopt_long(argc, argv, "d:h:p:U:D:l:f:R:w:k:F:I:v", long_options, &optindex)) != -1) { switch (c) @@ -162,6 +166,12 @@ main(int argc, char **argv) if (atoi(optarg) > 0) strncpy(runtime_options.wal_keep_segments, optarg, MAXLEN); break; + case 'k': + if (atoi(optarg) > 0) + runtime_options.keep_history = atoi(optarg); + else + runtime_options.keep_history = 0; + break; case 'F': runtime_options.force = true; break; @@ -182,7 +192,7 @@ main(int argc, char **argv) * MASTER REGISTER | * STANDBY {REGISTER | CLONE [node] | PROMOTE | FOLLOW [node]} | * WITNESS CREATE - * CLUSTER SHOW + * CLUSTER {SHOW | CLEANUP} * * the node part is optional, if we receive it then we shouldn't * have received a -h option @@ -223,6 +233,8 @@ main(int argc, char **argv) { if( strcasecmp(server_cmd, "SHOW") == 0) action = CLUSTER_SHOW; + else if(strcasecmp(server_cmd, "CLEANUP") == 0) + action = CLUSTER_CLEANUP; } else if (strcasecmp(server_mode, "WITNESS") == 0) if (strcasecmp(server_cmd, "CREATE") == 0) @@ -348,6 +360,9 @@ main(int argc, char **argv) case CLUSTER_SHOW: do_cluster_show(); break; + case CLUSTER_CLEANUP: + do_cluster_cleanup(); + break; default: usage(); exit(ERR_BAD_CONFIG); @@ -402,10 +417,59 @@ do_cluster_show(void) } PQclear(res); - - }
+
+/* CLUSTER CLEANUP: purge repl_monitor on the master, which is looked up through a connection
+ * opened with options.conninfo. Rows newer than --keep-history days are kept (0 empties the table). */
 static void
+do_cluster_cleanup(void)
+{
+	int			master_id;
+	PGconn	   *conn;
+	PGconn	   *master_conn;
+	PGresult   *res;
+	char		sqlquery[QUERY_STR_LEN];
+
+	conn = establishDBConnection(options.conninfo, true);
+	log_info(_("%s connecting to master database\n"), progname);
+	master_conn = getMasterConnection(conn, repmgr_schema, options.cluster_name, &master_id, NULL);
+	PQfinish(conn);
+	if (!master_conn)
+	{
+		log_err(_("cluster cleanup: cannot connect to master\n"));
+		exit(ERR_DB_CON);
+	}
+
+	if (runtime_options.keep_history > 0)
+	{
+		sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_monitor "
+						  " WHERE age(now(), last_monitor_time) >= '%d days'::interval;",
+						  repmgr_schema, runtime_options.keep_history);
+	}
+	else
+	{
+		sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_monitor;", repmgr_schema);
+	}
+	res = PQexec(master_conn, sqlquery);
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	{
+		log_err(_("cluster cleanup: Couldn't clean history\n%s\n"), PQerrorMessage(master_conn));
+		PQclear(res);
+		PQfinish(master_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+	PQclear(res);
+
+	/* VACUUM now rather than leaving it to autovacuum at an unexpected hour; a failure is only logged */
+	sqlquery_snprintf(sqlquery, "VACUUM %s.repl_monitor;", repmgr_schema);
+	res = PQexec(master_conn, sqlquery);
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+		log_err(_("cluster cleanup: Couldn't vacuum history\n%s\n"), PQerrorMessage(master_conn));
+	PQclear(res);
+	PQfinish(master_conn);
+}
+ + + static void do_master_register(void) { @@ -1642,7 +1706,7 @@ help(const char *progname) printf(_(" %s [OPTIONS] master {register}\n"), progname); printf(_(" %s [OPTIONS] standby {register|clone|promote|follow}\n"), progname); - printf(_(" %s [OPTIONS] cluster show\n"), progname); + printf(_(" %s [OPTIONS] cluster {show|cleanup}\n"), progname); printf(_("\nGeneral options:\n")); printf(_(" --help show this help, then exit\n")); printf(_(" --version output version information, then exit\n")); @@ -1659,6 +1723,7 @@ help(const char *progname) printf(_(" -R, --remote-user=USERNAME database server username for rsync\n")); printf(_(" -w, --wal-keep-segments=VALUE minimum value for the GUC wal_keep_segments (default: 5000)\n")); printf(_(" -I, --ignore-rsync-warning ignore rsync partial transfer warning\n")); + printf(_(" -k, --keep-history=VALUE keeps indicated number of days of history\n")); printf(_(" -F, --force force potentially dangerous operations to happen\n")); printf(_("\n%s performs some tasks like clone a 
node, promote it "), progname); @@ -1671,6 +1736,7 @@ help(const char *progname) printf(_("new master in the event of a failover\n")); printf(_(" standby follow - allows the standby to re-point itself to a new master\n")); printf(_(" cluster show - print node informations\n")); + printf(_(" cluster cleanup - cleans monitor's history\n")); } @@ -1935,6 +2001,9 @@ check_parameters_for_action(const int action) case CLUSTER_SHOW: /* allow all parameters to be supplied */ break; + case CLUSTER_CLEANUP: + /* allow all parameters to be supplied */ + break; } return ok; diff --git a/repmgr.h b/repmgr.h index 4718af94..62cc6d0a 100644 --- a/repmgr.h +++ b/repmgr.h @@ -64,6 +64,8 @@ typedef struct char masterport[MAXLEN]; char localport[MAXLEN]; + /* parameter used by CLUSTER CLEANUP */ + int keep_history; } t_runtime_options; #define SLEEP_MONITOR 2