mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 15:16:29 +00:00
Compare commits
42 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
efd50f11ac | ||
|
|
45a39084ed | ||
|
|
94c73a016f | ||
|
|
be5cbe4ddd | ||
|
|
30d35d5b4c | ||
|
|
fa889a11ac | ||
|
|
f4087d0a32 | ||
|
|
a55d7a4bd3 | ||
|
|
5d8cf6abe0 | ||
|
|
9caa243354 | ||
|
|
6880483947 | ||
|
|
3d89fdadab | ||
|
|
6e9e4e05ae | ||
|
|
17a160e970 | ||
|
|
e0e01aa9db | ||
|
|
b09eff9f76 | ||
|
|
3c5d82b9ef | ||
|
|
257dbc4f42 | ||
|
|
2a64099163 | ||
|
|
41c05bea7b | ||
|
|
7d76d86e19 | ||
|
|
36d5b5bc24 | ||
|
|
c543402d65 | ||
|
|
d0959b953e | ||
|
|
0660bded0b | ||
|
|
209a0c64d2 | ||
|
|
fd76ec6283 | ||
|
|
7d579cf71f | ||
|
|
d790ef740b | ||
|
|
aa6633b027 | ||
|
|
c3bffce379 | ||
|
|
78aea00a6d | ||
|
|
91601204b5 | ||
|
|
c91ddc2f5e | ||
|
|
72f74dd7a7 | ||
|
|
901d07fa92 | ||
|
|
f0e609bcd4 | ||
|
|
94c9c3a5c6 | ||
|
|
3af5243bcc | ||
|
|
85bbae462a | ||
|
|
14e49d41c2 | ||
|
|
1bd8a703c8 |
2
CREDITS
2
CREDITS
@@ -10,3 +10,5 @@ Hannu Krosing <hannu@2ndQuadrant.com>
|
||||
Cédric Villemain <cedric@2ndquadrant.com>
|
||||
Charles Duffy <charles@dyfis.net>
|
||||
Daniel Farina <daniel@heroku.com>
|
||||
Marco Nenciarini <marco.nenciarini@2ndquadrant.it>
|
||||
Carlo Ascani <carlo.ascani@2ndquadrant.it>
|
||||
|
||||
25
HISTORY
25
HISTORY
@@ -1,5 +1,21 @@
|
||||
1.0.0 2010-12-05
|
||||
First public release
|
||||
1.2.0 2012-07-27
|
||||
Test ssh connection before trying to rsync (Cédric)
|
||||
Add CLUSTER SHOW command (Carlo)
|
||||
Add CLUSTER CLEANUP command (Jaime)
|
||||
Add function write_primary_conninfo (Marco)
|
||||
Teach repmgr how to get tablespace's location in different pg version (Jaime)
|
||||
Improve version message (Carlo)
|
||||
|
||||
1.1.1 2012-04-18
|
||||
Add --ignore-rsync-warning (Cédric)
|
||||
Add strnlen for compatibility with OS X (Greg)
|
||||
Improve performance of repl_status view (Jaime)
|
||||
Remove last argument from log_err (Jaime, Reported by Jeroen Dekkers)
|
||||
Complete documentation about possible error conditions (Jaime)
|
||||
Document how to clean history (Jaime)
|
||||
|
||||
1.1.0 2011-03-09
|
||||
Make options -U, -R and -p not mandatory (Jaime)
|
||||
|
||||
1.1.0b1 2011-02-24
|
||||
Fix missing "--force" option in help (Greg Smith)
|
||||
@@ -28,6 +44,5 @@
|
||||
Map old verbose flag into a useful setting for the new logger (Greg)
|
||||
Document repmgrd startup restrictions and log info about them (Greg)
|
||||
|
||||
1.1.0 2011-03-09
|
||||
Make options -U, -R and -p not mandatory (Jaime)
|
||||
|
||||
1.0.0 2010-12-05
|
||||
First public release
|
||||
|
||||
43
README.rst
43
README.rst
@@ -814,6 +814,23 @@ and on "prime."
|
||||
|
||||
The servers are now again acting as primary on "prime" and standby on "standby".
|
||||
|
||||
Maintenance of monitor history
|
||||
-------------------------------
|
||||
|
||||
Once you have changed roles (with a failover or to restore original roles)
|
||||
you would end up with records saying that node1 is primary and other records
|
||||
saying that node2 is the primary, which could be confusing.
|
||||
Also, if you don't do anything about it the monitor history will keep growing.
|
||||
For both of those reasons you will sometimes want to do some maintenance on the
|
||||
``repl_monitor`` table.
|
||||
|
||||
If you want to clean the history after a few days you can execute a
|
||||
truncate/delete (depending on whether you want to completely clean history or want to keep
|
||||
a few days of history) from cron. For example, to keep just one day of history
|
||||
you can put this in your crontab::
|
||||
|
||||
0 1 * * * psql -c "DELETE FROM repmgr_schema.repl_monitor where now() - last_monitor_time >= '1 day'::interval;" postgres
|
||||
|
||||
Configuration and command reference
|
||||
===================================
|
||||
|
||||
@@ -845,6 +862,7 @@ The output from this program looks like this::
|
||||
Usage:
|
||||
repmgr [OPTIONS] master {register}
|
||||
repmgr [OPTIONS] standby {register|clone|promote|follow}
|
||||
repmgr [OPTIONS] cluster {show|cleanup}
|
||||
|
||||
General options:
|
||||
--help show this help, then exit
|
||||
@@ -863,6 +881,8 @@ The output from this program looks like this::
|
||||
-R, --remote-user=USERNAME database server username for rsync
|
||||
-w, --wal-keep-segments=VALUE minimum value for the GUC wal_keep_segments (default: 5000)
|
||||
-F, --force force potentially dangerous operations to happen
|
||||
-I, --ignore-rsync-warning Ignore partial transfert warning
|
||||
-k, --keep-history keeps indicated number of days of history
|
||||
|
||||
repmgr performs some tasks like clone a node, promote it or making follow another node and then exits.
|
||||
COMMANDS:
|
||||
@@ -871,6 +891,8 @@ The output from this program looks like this::
|
||||
standby clone [node] - allows creation of a new standby
|
||||
standby promote - allows manual promotion of a specific standby into a new master in the event of a failover
|
||||
standby follow - allows the standby to re-point itself to a new master
|
||||
cluster show - print node informations
|
||||
cluster cleanup - cleans monitor's history
|
||||
|
||||
The ``--verbose`` option can be useful in troubleshooting issues with
|
||||
the program.
|
||||
@@ -941,6 +963,26 @@ its port if is different from the default one.
|
||||
|
||||
./repmgr standby follow
|
||||
|
||||
* cluster show
|
||||
|
||||
* Shows the role (standby/master) and connection string for all nodes configured
|
||||
in the cluster or "FAILED" if the node doesn't respond. This allows us to know
|
||||
which nodes are alive and which ones need attention, and to have a notion of the
|
||||
structure of clusters we just have access to. Example::
|
||||
|
||||
./repmgr cluster show
|
||||
|
||||
* cluster cleanup
|
||||
|
||||
* Cleans the monitor's history from repmgr tables. This avoids the repl_monitor table
|
||||
to grow excessively, which in turn affects repl_status view performance, and also
|
||||
keeps the disk space used by repmgr under control. This command can be used manually
|
||||
or from cron to run it periodically.
|
||||
There is also a --keep-history (-k) option to indicate how many days of history we
|
||||
want to keep, so the command will clean up history older than "keep-history" days. Example::
|
||||
|
||||
./repmgr cluster cleanup -k 2
|
||||
|
||||
repmgrd Daemon
|
||||
--------------
|
||||
|
||||
@@ -1023,6 +1065,7 @@ following
|
||||
* ERR_DB_QUERY 7: Error executing a database query.
|
||||
* ERR_PROMOTED 8: Exiting program because the node has been promoted to master.
|
||||
* ERR_BAD_PASSWORD 9: Password used to connect to a database was rejected.
|
||||
* ERR_STR_OVERFLOW 10: A string was larger than expected.
|
||||
|
||||
License and Contributions
|
||||
=========================
|
||||
|
||||
7
TODO
7
TODO
@@ -12,10 +12,3 @@ Known issues in repmgr
|
||||
|
||||
* After running repmgrd as a regular foreground application, hitting
|
||||
control-C causes the program to crash.
|
||||
|
||||
Planned feature improvements
|
||||
============================
|
||||
|
||||
* Before running ``pg_start_backup()``, a sanity check that there is a
|
||||
working ssh connection to the destination would help find
|
||||
configuration errors before disturbing the database.
|
||||
|
||||
@@ -202,7 +202,7 @@ get_cluster_size(PGconn *conn)
|
||||
* connection string is placed there.
|
||||
*/
|
||||
PGconn *
|
||||
getMasterConnection(PGconn *standby_conn, int id, char *cluster,
|
||||
getMasterConnection(PGconn *standby_conn, char *cluster,
|
||||
int *master_id, char *master_conninfo_out)
|
||||
{
|
||||
PGconn *master_conn = NULL;
|
||||
@@ -242,8 +242,8 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster,
|
||||
cluster);
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT * FROM %s.repl_nodes "
|
||||
" WHERE cluster = '%s' and id <> %d",
|
||||
schema_quoted, cluster, id);
|
||||
" WHERE cluster = '%s'",
|
||||
schema_quoted, cluster);
|
||||
|
||||
res1 = PQexec(standby_conn, sqlquery);
|
||||
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
|
||||
|
||||
@@ -20,6 +20,8 @@
|
||||
#ifndef _REPMGR_DBUTILS_H_
|
||||
#define _REPMGR_DBUTILS_H_
|
||||
|
||||
#include "strutil.h"
|
||||
|
||||
PGconn *establishDBConnection(const char *conninfo, const bool exit_on_error);
|
||||
PGconn *establishDBConnectionByParams(const char *keywords[],
|
||||
const char *values[],
|
||||
@@ -29,7 +31,7 @@ char *pg_version(PGconn *conn, char* major_version);
|
||||
bool guc_setted(PGconn *conn, const char *parameter, const char *op,
|
||||
const char *value);
|
||||
const char *get_cluster_size(PGconn *conn);
|
||||
PGconn *getMasterConnection(PGconn *standby_conn, int id, char *cluster,
|
||||
PGconn *getMasterConnection(PGconn *standby_conn, char *cluster,
|
||||
int *master_id, char *master_conninfo_out);
|
||||
|
||||
#endif
|
||||
|
||||
397
repmgr.c
397
repmgr.c
@@ -7,7 +7,7 @@
|
||||
*
|
||||
* Commands implemented are.
|
||||
* MASTER REGISTER, STANDBY REGISTER, STANDBY CLONE, STANDBY FOLLOW,
|
||||
* STANDBY PROMOTE
|
||||
* STANDBY PROMOTE, CLUSTER SHOW, CLUSTER CLEANUP
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -35,6 +35,7 @@
|
||||
#include "config.h"
|
||||
#include "check_dir.h"
|
||||
#include "strutil.h"
|
||||
#include "version.h"
|
||||
|
||||
#define RECOVERY_FILE "recovery.conf"
|
||||
#define RECOVERY_DONE_FILE "recovery.done"
|
||||
@@ -45,19 +46,25 @@
|
||||
#define STANDBY_CLONE 3
|
||||
#define STANDBY_PROMOTE 4
|
||||
#define STANDBY_FOLLOW 5
|
||||
#define CLUSTER_SHOW 6
|
||||
#define CLUSTER_CLEANUP 7
|
||||
|
||||
static void help(const char *progname);
|
||||
static bool create_recovery_file(const char *data_dir, char *master_conninfo);
|
||||
static int test_ssh_connection(char *host, char *remote_user);
|
||||
static int copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||
char *local_path, bool is_directory);
|
||||
static bool check_parameters_for_action(const int action);
|
||||
static void write_primary_conninfo(char* line);
|
||||
|
||||
static void do_master_register(void);
|
||||
static void do_standby_register(void);
|
||||
static void do_standby_clone(void);
|
||||
static void do_standby_promote(void);
|
||||
static void do_standby_follow(void);
|
||||
static void help(const char* progname);
|
||||
static void do_cluster_show(void);
|
||||
static void do_cluster_cleanup(void);
|
||||
|
||||
static void usage(void);
|
||||
|
||||
/* Global variables */
|
||||
@@ -71,7 +78,7 @@ bool need_a_node = true;
|
||||
bool require_password = false;
|
||||
|
||||
/* Initialization of runtime options */
|
||||
t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, "" };
|
||||
t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, "", 0 };
|
||||
t_configuration_options options = { "", -1, "", "", "" };
|
||||
|
||||
static char *server_mode = NULL;
|
||||
@@ -90,7 +97,9 @@ main(int argc, char **argv)
|
||||
{"config-file", required_argument, NULL, 'f'},
|
||||
{"remote-user", required_argument, NULL, 'R'},
|
||||
{"wal-keep-segments", required_argument, NULL, 'w'},
|
||||
{"keep-history", required_argument, NULL, 'k'},
|
||||
{"force", no_argument, NULL, 'F'},
|
||||
{"ignore-rsync-warning", no_argument, NULL, 'I'},
|
||||
{"verbose", no_argument, NULL, 'v'},
|
||||
{NULL, 0, NULL, 0}
|
||||
};
|
||||
@@ -110,13 +119,13 @@ main(int argc, char **argv)
|
||||
}
|
||||
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
|
||||
{
|
||||
printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
|
||||
printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
while ((c = getopt_long(argc, argv, "d:h:p:U:D:f:R:w:F:v", long_options,
|
||||
while ((c = getopt_long(argc, argv, "d:h:p:U:D:f:R:w:k:F:I:v", long_options,
|
||||
&optindex)) != -1)
|
||||
{
|
||||
switch (c)
|
||||
@@ -147,9 +156,18 @@ main(int argc, char **argv)
|
||||
if (atoi(optarg) > 0)
|
||||
strncpy(runtime_options.wal_keep_segments, optarg, MAXLEN);
|
||||
break;
|
||||
case 'k':
|
||||
if (atoi(optarg) > 0)
|
||||
runtime_options.keep_history = atoi(optarg);
|
||||
else
|
||||
runtime_options.keep_history = 0;
|
||||
break;
|
||||
case 'F':
|
||||
runtime_options.force = true;
|
||||
break;
|
||||
case 'I':
|
||||
runtime_options.ignore_rsync_warn = true;
|
||||
break;
|
||||
case 'v':
|
||||
runtime_options.verbose = true;
|
||||
break;
|
||||
@@ -162,7 +180,8 @@ main(int argc, char **argv)
|
||||
/*
|
||||
* Now we need to obtain the action, this comes in one of these forms:
|
||||
* MASTER REGISTER |
|
||||
* STANDBY {REGISTER | CLONE [node] | PROMOTE | FOLLOW [node]}
|
||||
* STANDBY {REGISTER | CLONE [node] | PROMOTE | FOLLOW [node]} |
|
||||
* CLUSTER {SHOW | CLEANUP}
|
||||
*
|
||||
* the node part is optional, if we receive it then we shouldn't
|
||||
* have received a -h option
|
||||
@@ -170,8 +189,8 @@ main(int argc, char **argv)
|
||||
if (optind < argc)
|
||||
{
|
||||
server_mode = argv[optind++];
|
||||
if (strcasecmp(server_mode, "STANDBY") != 0 &&
|
||||
strcasecmp(server_mode, "MASTER") != 0)
|
||||
if (strcasecmp(server_mode, "STANDBY") != 0 && strcasecmp(server_mode, "MASTER") != 0 &&
|
||||
strcasecmp(server_mode, "CLUSTER") != 0 )
|
||||
{
|
||||
usage();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
@@ -200,13 +219,21 @@ main(int argc, char **argv)
|
||||
action = STANDBY_PROMOTE;
|
||||
else if (strcasecmp(server_cmd, "FOLLOW") == 0)
|
||||
action = STANDBY_FOLLOW;
|
||||
else
|
||||
else if (strcasecmp(server_mode, "CLUSTER") == 0)
|
||||
{
|
||||
usage();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
if(strcasecmp(server_cmd, "SHOW") == 0)
|
||||
action = CLUSTER_SHOW;
|
||||
else if(strcasecmp(server_cmd, "CLEANUP") == 0)
|
||||
action = CLUSTER_CLEANUP;
|
||||
}
|
||||
}
|
||||
|
||||
if (action == NO_ACTION)
|
||||
{
|
||||
usage();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* For some actions we still can receive a last argument */
|
||||
if (action == STANDBY_CLONE)
|
||||
{
|
||||
@@ -310,6 +337,12 @@ main(int argc, char **argv)
|
||||
case STANDBY_FOLLOW:
|
||||
do_standby_follow();
|
||||
break;
|
||||
case CLUSTER_SHOW:
|
||||
do_cluster_show();
|
||||
break;
|
||||
case CLUSTER_CLEANUP:
|
||||
do_cluster_cleanup();
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
@@ -319,6 +352,114 @@ main(int argc, char **argv)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
do_cluster_show(void)
|
||||
{
|
||||
PGconn *conn;
|
||||
PGconn *node_conn = NULL;
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
char node_role[MAXLEN];
|
||||
int i;
|
||||
|
||||
/* We need to connect to check configuration */
|
||||
log_info(_("%s connecting to database\n"), progname);
|
||||
conn = establishDBConnection(options.conninfo, true);
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT conninfo FROM %s.repl_nodes;", repmgr_schema);
|
||||
log_debug("cluster show: %s\n", sqlquery);
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("Can't get nodes informations, have you regitered them?\n%s\n"), PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
printf("Role | Connection String \n");
|
||||
for (i = 0; i < PQntuples(res); i++)
|
||||
{
|
||||
node_conn = establishDBConnection(PQgetvalue(res, i, 0), false);
|
||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||
strcpy(node_role, " FAILED");
|
||||
else if (is_standby(node_conn))
|
||||
strcpy(node_role, " standby");
|
||||
else
|
||||
strcpy(node_role, "* master");
|
||||
|
||||
printf("%-10s", node_role);
|
||||
printf("| %s\n", PQgetvalue(res, i, 0));
|
||||
|
||||
PQfinish(node_conn);
|
||||
}
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
do_cluster_cleanup(void)
|
||||
{
|
||||
int master_id;
|
||||
PGconn *conn;
|
||||
PGconn *master_conn;
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
/* I need a connection to my local db to know what node is the master */
|
||||
log_info(_("%s connecting to database\n"), progname);
|
||||
conn = establishDBConnection(options.conninfo, true);
|
||||
|
||||
/* check if there is a master in this cluster */
|
||||
log_info(_("%s connecting to master database\n"), progname);
|
||||
master_conn = getMasterConnection(conn, options.cluster_name,
|
||||
&master_id, NULL);
|
||||
if (!master_conn)
|
||||
{
|
||||
log_err(_("cluster cleanup: cannot connect to master\n"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CON);
|
||||
}
|
||||
|
||||
/* I don't need a local connection anymore */
|
||||
PQfinish(conn);
|
||||
|
||||
if (runtime_options.keep_history > 0)
|
||||
{
|
||||
sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_monitor "
|
||||
" WHERE age(now(), last_monitor_time) >= '%d days'::interval;",
|
||||
repmgr_schema, runtime_options.keep_history);
|
||||
}
|
||||
else
|
||||
{
|
||||
sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_monitor;", repmgr_schema);
|
||||
}
|
||||
|
||||
log_debug("cluster cleanup: %s\n", sqlquery);
|
||||
res = PQexec(master_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("cluster cleanup: Couldn't clean history\n%s\n"), PQerrorMessage(master_conn));
|
||||
PQclear(res);
|
||||
PQfinish(master_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
PQclear(res);
|
||||
|
||||
/* Let's VACUUM the table to avoid autovacuum to be launched in an unexpected hour */
|
||||
sqlquery_snprintf(sqlquery, "VACUUM %s.repl_monitor;", repmgr_schema);
|
||||
log_debug("cluster cleanup: %s\n", sqlquery);
|
||||
res = PQexec(master_conn, sqlquery);
|
||||
|
||||
/* XXX There is any need to check this VACUUM happens without problems? */
|
||||
|
||||
PQclear(res);
|
||||
PQfinish(master_conn);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
do_master_register(void)
|
||||
{
|
||||
@@ -346,7 +487,7 @@ do_master_register(void)
|
||||
log_info(_("%s connected to master, checking its state\n"), progname);
|
||||
if (is_standby(conn))
|
||||
{
|
||||
log_err(_("%s needs master to be PostgreSQL 9.0 or better\n"), progname);
|
||||
log_err(_("Trying to register a standby node as a master\n"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
@@ -438,14 +579,13 @@ do_master_register(void)
|
||||
|
||||
/* and the view */
|
||||
sqlquery_snprintf(sqlquery, "CREATE VIEW %s.repl_status AS "
|
||||
" WITH monitor_info AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY primary_node, standby_node "
|
||||
" ORDER BY last_monitor_time desc) "
|
||||
" FROM %s.repl_monitor) "
|
||||
" SELECT primary_node, standby_node, last_monitor_time, last_wal_primary_location, "
|
||||
" last_wal_standby_location, pg_size_pretty(replication_lag) replication_lag, "
|
||||
" pg_size_pretty(apply_lag) apply_lag, age(now(), last_monitor_time) AS time_lag "
|
||||
" FROM monitor_info a "
|
||||
" WHERE row_number = 1", repmgr_schema, repmgr_schema);
|
||||
" FROM %s.repl_monitor "
|
||||
" WHERE (standby_node, last_monitor_time) IN (SELECT standby_node, MAX(last_monitor_time) "
|
||||
" FROM %s.repl_monitor GROUP BY 1)",
|
||||
repmgr_schema, repmgr_schema, repmgr_schema);
|
||||
log_debug("master register: %s\n", sqlquery);
|
||||
if (!PQexec(conn, sqlquery))
|
||||
{
|
||||
@@ -454,6 +594,19 @@ do_master_register(void)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* an index to improve performance of the view */
|
||||
sqlquery_snprintf(sqlquery, "CREATE INDEX idx_repl_status_sort "
|
||||
" ON %s.repl_monitor (last_monitor_time, standby_node) ",
|
||||
repmgr_schema);
|
||||
log_debug(_("master register: %s\n"), sqlquery);
|
||||
if (!PQexec(conn, sqlquery))
|
||||
{
|
||||
log_err(_("Cannot indexing table %s.repl_monitor: %s\n"),
|
||||
repmgr_schema, PQerrorMessage(conn));
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -461,8 +614,7 @@ do_master_register(void)
|
||||
int id;
|
||||
|
||||
/* Ensure there isn't any other master already registered */
|
||||
master_conn = getMasterConnection(conn, options.node,
|
||||
options.cluster_name, &id,NULL);
|
||||
master_conn = getMasterConnection(conn, options.cluster_name, &id,NULL);
|
||||
if (master_conn != NULL)
|
||||
{
|
||||
PQfinish(master_conn);
|
||||
@@ -581,7 +733,7 @@ do_standby_register(void)
|
||||
|
||||
/* check if there is a master in this cluster */
|
||||
log_info(_("%s connecting to master database\n"), progname);
|
||||
master_conn = getMasterConnection(conn, options.node, options.cluster_name,
|
||||
master_conn = getMasterConnection(conn, options.cluster_name,
|
||||
&master_id, NULL);
|
||||
if (!master_conn)
|
||||
{
|
||||
@@ -788,10 +940,17 @@ do_standby_clone(void)
|
||||
* Check if the tablespace locations exists and that we can write to
|
||||
* them.
|
||||
*/
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT spclocation "
|
||||
" FROM pg_tablespace "
|
||||
"WHERE spcname NOT IN ('pg_default', 'pg_global')");
|
||||
if (strcmp(master_version, "9.0") == 0 || strcmp(master_version, "9.1") == 0)
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT spclocation "
|
||||
" FROM pg_tablespace "
|
||||
"WHERE spcname NOT IN ('pg_default', 'pg_global')");
|
||||
else
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_tablespace_location(oid) spclocation "
|
||||
" FROM pg_tablespace "
|
||||
"WHERE spcname NOT IN ('pg_default', 'pg_global')");
|
||||
|
||||
log_debug("standby clone: %s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
@@ -845,6 +1004,7 @@ do_standby_clone(void)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
/* Trouble accessing directory */
|
||||
log_err(_("%s: could not access directory \"%s\": %s\n"),
|
||||
@@ -855,7 +1015,14 @@ do_standby_clone(void)
|
||||
}
|
||||
}
|
||||
|
||||
log_notice("Starting backup...\n");
|
||||
r = test_ssh_connection(runtime_options.host, runtime_options.remote_user);
|
||||
if (r != 0)
|
||||
{
|
||||
log_err(_("%s: Aborting, remote host %s is not reachable.\n"), progname, runtime_options.host);
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CON);
|
||||
}
|
||||
|
||||
/* Get the data directory full path and the configuration files location */
|
||||
sqlquery_snprintf(sqlquery,
|
||||
@@ -871,6 +1038,16 @@ do_standby_clone(void)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* We need all 4 parameters, and they can be retrieved only by superusers */
|
||||
if (PQntuples(res) != 4)
|
||||
{
|
||||
log_err("%s: STANDBY CLONE should be run by a SUPERUSER\n", progname);
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
for (i = 0; i < PQntuples(res); i++)
|
||||
{
|
||||
if (strcmp(PQgetvalue(res, i, 0), "data_directory") == 0)
|
||||
@@ -886,6 +1063,22 @@ do_standby_clone(void)
|
||||
}
|
||||
PQclear(res);
|
||||
|
||||
log_notice("Starting backup...\n");
|
||||
|
||||
/*
|
||||
* in pg 9.1 default is to wait for a sync standby to ack,
|
||||
* avoid that by turning off sync rep for this session
|
||||
*/
|
||||
sqlquery_snprintf(sqlquery, "SET synchronous_commit TO OFF");
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err("Can't set synchronous_commit: %s\n", PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/*
|
||||
* inform the master we will start a backup and get the first XLog filename
|
||||
* so we can say to the user we need those files
|
||||
@@ -963,10 +1156,17 @@ do_standby_clone(void)
|
||||
* find and appropiate rsync option but besides we could someday make all
|
||||
* these rsync happen concurrently
|
||||
*/
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT spclocation "
|
||||
" FROM pg_tablespace "
|
||||
" WHERE spcname NOT IN ('pg_default', 'pg_global')");
|
||||
if (strcmp(master_version, "9.0") == 0 || strcmp(master_version, "9.1") == 0)
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT spclocation "
|
||||
" FROM pg_tablespace "
|
||||
" WHERE spcname NOT IN ('pg_default', 'pg_global')");
|
||||
else
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_tablespace_location(oid) spclocation "
|
||||
" FROM pg_tablespace "
|
||||
" WHERE spcname NOT IN ('pg_default', 'pg_global')");
|
||||
|
||||
log_debug("standby clone: %s\n", sqlquery);
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -1022,9 +1222,6 @@ stop_backup:
|
||||
* Don't have this one exit if it fails, so that a more informative
|
||||
* error message will also appear about the backup not being stopped.
|
||||
*/
|
||||
log_info(_("%s connecting to master database to stop backup\n"), progname);
|
||||
conn=establishDBConnectionByParams(keywords,values,false);
|
||||
|
||||
log_notice("Finishing backup...\n");
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_xlogfile_name(pg_stop_backup())");
|
||||
log_debug("standby clone: %s\n", sqlquery);
|
||||
@@ -1039,8 +1236,10 @@ stop_backup:
|
||||
}
|
||||
last_wal_segment = PQgetvalue(res, 0, 0);
|
||||
|
||||
log_info(_("%s requires primary to keep WAL files %s until at least %s\n"),
|
||||
progname, first_wal_segment, last_wal_segment);
|
||||
/* don't show this message if rsync failed */
|
||||
if (r == 0 && runtime_options.verbose)
|
||||
log_info(_("%s requires primary to keep WAL files %s until at least %s\n"),
|
||||
progname, first_wal_segment, last_wal_segment);
|
||||
|
||||
/* Finished with the database connection now */
|
||||
PQclear(res);
|
||||
@@ -1121,7 +1320,7 @@ do_standby_promote(void)
|
||||
}
|
||||
|
||||
/* we also need to check if there isn't any master already */
|
||||
old_master_conn = getMasterConnection(conn, options.node, options.cluster_name,
|
||||
old_master_conn = getMasterConnection(conn, options.cluster_name,
|
||||
&old_master_id, NULL);
|
||||
if (old_master_conn != NULL)
|
||||
{
|
||||
@@ -1225,8 +1424,7 @@ do_standby_follow(void)
|
||||
|
||||
/* we also need to check if there is any master in the cluster */
|
||||
log_info(_("%s connecting to master database\n"), progname);
|
||||
master_conn = getMasterConnection(conn, options.node,
|
||||
options.cluster_name, &master_id,(char *) &master_conninfo);
|
||||
master_conn = getMasterConnection(conn, options.cluster_name, &master_id,(char *) &master_conninfo);
|
||||
if (master_conn == NULL)
|
||||
{
|
||||
PQfinish(conn);
|
||||
@@ -1272,7 +1470,7 @@ do_standby_follow(void)
|
||||
strncpy(runtime_options.masterport, PQport(master_conn), MAXLEN);
|
||||
PQfinish(master_conn);
|
||||
|
||||
log_info(_("%s Changing standby's master"),progname);
|
||||
log_info(_("%s Changing standby's master\n"),progname);
|
||||
|
||||
/* Get the data directory full path */
|
||||
sqlquery_snprintf(sqlquery, "SELECT setting "
|
||||
@@ -1322,6 +1520,7 @@ void help(const char *progname)
|
||||
printf(_(" %s [OPTIONS] master {register}\n"), progname);
|
||||
printf(_(" %s [OPTIONS] standby {register|clone|promote|follow}\n"),
|
||||
progname);
|
||||
printf(_(" %s [OPTIONS] cluster {show|cleanup}\n"), progname);
|
||||
printf(_("\nGeneral options:\n"));
|
||||
printf(_(" --help show this help, then exit\n"));
|
||||
printf(_(" --version output version information, then exit\n"));
|
||||
@@ -1337,6 +1536,8 @@ void help(const char *progname)
|
||||
printf(_(" -R, --remote-user=USERNAME database server username for rsync\n"));
|
||||
printf(_(" -w, --wal-keep-segments=VALUE minimum value for the GUC wal_keep_segments (default: 5000)\n"));
|
||||
printf(_(" -F, --force force potentially dangerous operations to happen\n"));
|
||||
printf(_(" -I, --ignore-rsync-warning Ignore partial transfert warning\n"));
|
||||
printf(_(" -k, --keep-history=VALUE keeps indicated number of days of history\n"));
|
||||
|
||||
printf(_("\n%s performs some tasks like clone a node, promote it "), progname);
|
||||
printf(_("or making follow another node and then exits.\n"));
|
||||
@@ -1347,6 +1548,8 @@ void help(const char *progname)
|
||||
printf(_(" standby promote - allows manual promotion of a specific standby into a "));
|
||||
printf(_("new master in the event of a failover\n"));
|
||||
printf(_(" standby follow - allows the standby to re-point itself to a new master\n"));
|
||||
printf(_(" cluster show - print node informations\n"));
|
||||
printf(_(" cluster cleanup - cleans monitor's history\n"));
|
||||
}
|
||||
|
||||
|
||||
@@ -1379,41 +1582,7 @@ create_recovery_file(const char *data_dir, char *master_conninfo)
|
||||
return false;
|
||||
}
|
||||
|
||||
maxlen_snprintf(line, "primary_conninfo = 'host=%s port=%s'\n", runtime_options.host,
|
||||
(runtime_options.masterport[0]) ? runtime_options.masterport : "5432");
|
||||
|
||||
/*
|
||||
* Template a password into the connection string in recovery.conf
|
||||
* if a full connection string is not already provided.
|
||||
*
|
||||
* Sometimes this is passed by the user explicitly, and otherwise
|
||||
* we try to get it into the environment.
|
||||
*
|
||||
* XXX: This is pretty dirty, at least push this up to the caller rather
|
||||
* than hitting environment variables at this level.
|
||||
*/
|
||||
if (master_conninfo == NULL)
|
||||
{
|
||||
char *password = getenv("PGPASSWORD");
|
||||
|
||||
if (password != NULL)
|
||||
{
|
||||
maxlen_snprintf(line,
|
||||
"primary_conninfo = 'host=%s port=%s password=%s'\n",
|
||||
runtime_options.host,
|
||||
(runtime_options.masterport[0]) ? runtime_options.masterport : "5432",
|
||||
password);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (require_password)
|
||||
{
|
||||
log_err(_("%s: PGPASSWORD not set, but having one is required\n"),
|
||||
progname);
|
||||
exit(ERR_BAD_PASSWORD);
|
||||
}
|
||||
}
|
||||
}
|
||||
write_primary_conninfo(line);
|
||||
|
||||
if (fputs(line, recovery_file) == EOF)
|
||||
{
|
||||
@@ -1428,6 +1597,24 @@ create_recovery_file(const char *data_dir, char *master_conninfo)
|
||||
return true;
|
||||
}
|
||||
|
||||
static int
|
||||
test_ssh_connection(char *host, char *remote_user)
|
||||
{
|
||||
char script[MAXLEN];
|
||||
int r;
|
||||
|
||||
/* Check if we have ssh connectivity to host before trying to rsync */
|
||||
if (!remote_user[0])
|
||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s /bin/true", host);
|
||||
else
|
||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s -l %s /bin/true", host, remote_user);
|
||||
|
||||
log_debug(_("command is: %s"), script);
|
||||
r = system(script);
|
||||
if (r != 0)
|
||||
log_info(_("Cannot connect to the remote host (%s)\n"), host);
|
||||
return r;
|
||||
}
|
||||
|
||||
static int
|
||||
copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||
@@ -1473,6 +1660,29 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||
|
||||
r = system(script);
|
||||
|
||||
/*
|
||||
* If we are transfering a directory (ie: data directory, tablespace directories)
|
||||
* then we can ignore some rsync warning, so if we get some of those errors we
|
||||
* treat them as 0 if we have --ignore-rsync-warning commandline option set
|
||||
* List of ignorable rsync errors:
|
||||
* 24 Partial transfer due to vanished source files
|
||||
*/
|
||||
if ((WEXITSTATUS(r) == 24) && is_directory)
|
||||
{
|
||||
if (!runtime_options.ignore_rsync_warn)
|
||||
{
|
||||
log_warning( _("\nrsync completed with return code 24 "
|
||||
"\"Partial transfer due to vanished source files\".\n"
|
||||
"This can happen because of normal operation "
|
||||
"on the master server, but it may indicate an "
|
||||
"issue during cloning. If you are certain no "
|
||||
"changes were made to the master, try cloning "
|
||||
"again using \"repmgr --force --ignore-rsync-warning\"."));
|
||||
exit(ERR_BAD_RSYNC);
|
||||
}
|
||||
else
|
||||
r = 0;
|
||||
}
|
||||
if (r != 0)
|
||||
log_err(_("Can't rsync from remote file or directory (%s:%s)\n"),
|
||||
host_string, remote_path);
|
||||
@@ -1599,7 +1809,48 @@ check_parameters_for_action(const int action)
|
||||
}
|
||||
need_a_node = false;
|
||||
break;
|
||||
case CLUSTER_SHOW:
|
||||
/* allow all parameters to be supplied */
|
||||
break;
|
||||
case CLUSTER_CLEANUP:
|
||||
/* allow all parameters to be supplied */
|
||||
break;
|
||||
}
|
||||
|
||||
return ok;
|
||||
}
|
||||
|
||||
/* This function uses global variables to determine connection settings. Special
|
||||
* usage of the PGPASSWORD variable is handled, but strongly discouraged */
|
||||
static void
|
||||
write_primary_conninfo(char* line)
|
||||
{
|
||||
char host_buf[MAXLEN] = "";
|
||||
char conn_buf[MAXLEN] = "";
|
||||
char user_buf[MAXLEN] = "";
|
||||
char password_buf[MAXLEN] = "";
|
||||
|
||||
/* Environment variable for password (UGLY, please use .pgpass!) */
|
||||
const char *password = getenv("PGPASSWORD");
|
||||
if (password != NULL) {
|
||||
maxlen_snprintf(password_buf, " password=%s", password);
|
||||
}
|
||||
else if (require_password) {
|
||||
log_err(_("%s: PGPASSWORD not set, but having one is required\n"),
|
||||
progname);
|
||||
exit(ERR_BAD_PASSWORD);
|
||||
}
|
||||
|
||||
if (runtime_options.host[0]) {
|
||||
maxlen_snprintf(host_buf, " host=%s", runtime_options.host);
|
||||
}
|
||||
|
||||
if (runtime_options.username[0]) {
|
||||
maxlen_snprintf(user_buf, " user=%s", runtime_options.username);
|
||||
}
|
||||
|
||||
maxlen_snprintf(conn_buf, "port=%s%s%s%s",
|
||||
(runtime_options.masterport[0]) ? runtime_options.masterport : "5432", host_buf, user_buf, password_buf);
|
||||
|
||||
maxlen_snprintf(line, "primary_conninfo = '%s'", conn_buf);
|
||||
}
|
||||
|
||||
3
repmgr.h
3
repmgr.h
@@ -55,9 +55,12 @@ typedef struct
|
||||
char wal_keep_segments[MAXLEN];
|
||||
bool verbose;
|
||||
bool force;
|
||||
bool ignore_rsync_warn;
|
||||
|
||||
char masterport[MAXLEN];
|
||||
|
||||
/* parameter used by CLUSTER CLEANUP */
|
||||
int keep_history;
|
||||
} t_runtime_options;
|
||||
|
||||
#endif
|
||||
|
||||
12
repmgrd.c
12
repmgrd.c
@@ -30,6 +30,7 @@
|
||||
#include "config.h"
|
||||
#include "log.h"
|
||||
#include "strutil.h"
|
||||
#include "version.h"
|
||||
|
||||
#include "libpq/pqsignal.h"
|
||||
|
||||
@@ -116,7 +117,7 @@ main(int argc, char **argv)
|
||||
}
|
||||
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
|
||||
{
|
||||
printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
|
||||
printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
}
|
||||
@@ -185,7 +186,7 @@ main(int argc, char **argv)
|
||||
/* I need the id of the primary as well as a connection to it */
|
||||
log_info(_("%s Connecting to primary for cluster '%s'\n"),
|
||||
progname, local_options.cluster_name);
|
||||
primaryConn = getMasterConnection(myLocalConn, local_options.node,
|
||||
primaryConn = getMasterConnection(myLocalConn,
|
||||
local_options.cluster_name,
|
||||
&primary_options.node,NULL);
|
||||
if (primaryConn == NULL)
|
||||
@@ -269,7 +270,7 @@ MonitorExecute(void)
|
||||
log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
|
||||
for (connection_retries = 0; connection_retries < 6; connection_retries++)
|
||||
{
|
||||
primaryConn = getMasterConnection(myLocalConn, local_options.node,
|
||||
primaryConn = getMasterConnection(myLocalConn,
|
||||
local_options.cluster_name, &primary_options.node,NULL);
|
||||
if (PQstatus(primaryConn) == CONNECTION_OK)
|
||||
{
|
||||
@@ -456,7 +457,10 @@ checkNodeConfiguration(char *conninfo)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
PQclear(res);
|
||||
else
|
||||
{
|
||||
PQclear(res);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -27,6 +27,15 @@
|
||||
|
||||
static int xvsnprintf(char *str, size_t size, const char *format, va_list ap);
|
||||
|
||||
/* Add strnlen on platforms that don't have it, like OS X */
|
||||
#ifndef strnlen
|
||||
/*
 * Bounded string length: returns the offset of the first NUL byte found in
 * the first n bytes of s, or n when no NUL byte occurs within that range.
 */
size_t
strnlen(const char *s, size_t n)
{
	const char *nul = memchr(s, '\0', n);

	if (nul == NULL)
		return n;

	return (size_t) (nul - s);
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
xvsnprintf(char *str, size_t size, const char *format, va_list ap)
|
||||
|
||||
@@ -35,4 +35,9 @@ extern int xsnprintf(char *str, size_t size, const char *format, ...);
|
||||
extern int sqlquery_snprintf(char *str, const char *format, ...);
|
||||
extern int maxlen_snprintf(char *str, const char *format, ...);
|
||||
|
||||
/* Add strnlen on platforms that don't have it, like OS X */
|
||||
#ifndef strnlen
|
||||
extern size_t strnlen(const char *s, size_t n);
|
||||
#endif
|
||||
|
||||
#endif /* _STRUTIL_H_ */
|
||||
|
||||
Reference in New Issue
Block a user