mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Compare commits
113 Commits
REL3_4_STA
...
REL3_0_STA
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7cc7a278c2 | ||
|
|
20d9f978ab | ||
|
|
70a2797b9a | ||
|
|
8f62b4c9e6 | ||
|
|
310f3f31f9 | ||
|
|
4f849de95e | ||
|
|
0de4260664 | ||
|
|
fc75084e42 | ||
|
|
cfbc9dd3c6 | ||
|
|
94579b5f2e | ||
|
|
e9a25c367a | ||
|
|
3088096318 | ||
|
|
3bbd32c73c | ||
|
|
ac17033d61 | ||
|
|
711ad0a76c | ||
|
|
ad988dccce | ||
|
|
53fe3c7e5a | ||
|
|
7a439c90d0 | ||
|
|
87e5257cb8 | ||
|
|
1f240ff9b3 | ||
|
|
9d6cff0d40 | ||
|
|
f86e251430 | ||
|
|
085b7cb8b4 | ||
|
|
5ccf89ad9b | ||
|
|
6ae5401df0 | ||
|
|
4bd8190d02 | ||
|
|
efdc2355a7 | ||
|
|
61b1f72a0e | ||
|
|
882bfd9d8e | ||
|
|
c93f717305 | ||
|
|
85be96a0be | ||
|
|
ce2d4fb86f | ||
|
|
40354e1d62 | ||
|
|
3e1655f241 | ||
|
|
8387e7f65e | ||
|
|
aa4dd155b2 | ||
|
|
a171a501ab | ||
|
|
f42f771ff4 | ||
|
|
88cfcf358e | ||
|
|
ce3594d52d | ||
|
|
f64c42a514 | ||
|
|
3072139d06 | ||
|
|
3b7185fd39 | ||
|
|
819f980e76 | ||
|
|
49316fb8fb | ||
|
|
fa4ff73b87 | ||
|
|
29842f0e0d | ||
|
|
25db1ba737 | ||
|
|
7b9f6f5352 | ||
|
|
53b8f99217 | ||
|
|
95cdaac91d | ||
|
|
e7dd0f690c | ||
|
|
e0c5bb8d31 | ||
|
|
df3e55fa35 | ||
|
|
0ee2a1e6ba | ||
|
|
df05214970 | ||
|
|
bd1314d232 | ||
|
|
745566605d | ||
|
|
807dcc1038 | ||
|
|
acc0ffa81f | ||
|
|
1725e90308 | ||
|
|
2a3fb89603 | ||
|
|
8f24167f68 | ||
|
|
6ce94778d7 | ||
|
|
3a3c6d5143 | ||
|
|
73661637e9 | ||
|
|
ae84041a4e | ||
|
|
ea01d1d30b | ||
|
|
53ed8e948c | ||
|
|
43626892d0 | ||
|
|
8870b7d7f1 | ||
|
|
72b1e57251 | ||
|
|
6054da2c25 | ||
|
|
049ea4e24f | ||
|
|
5f8185ef3a | ||
|
|
66a6c15773 | ||
|
|
919fc0fbef | ||
|
|
c7c117130b | ||
|
|
df6517f167 | ||
|
|
0bf3fb0605 | ||
|
|
c2172d79a5 | ||
|
|
709276a19c | ||
|
|
3f98e1b91b | ||
|
|
8af08ab3f4 | ||
|
|
ff038a5148 | ||
|
|
f56f70c2a6 | ||
|
|
d353fe2a9f | ||
|
|
a70a44605f | ||
|
|
d14dcb3d8b | ||
|
|
249ac7c72a | ||
|
|
9d850fc4bd | ||
|
|
42cb811a07 | ||
|
|
1e202540e3 | ||
|
|
52db03d320 | ||
|
|
60d720f0c7 | ||
|
|
34af7dec2a | ||
|
|
a59ea243c0 | ||
|
|
0c5025b3d6 | ||
|
|
42b79b9b54 | ||
|
|
2e47c6b40b | ||
|
|
6fbff4747f | ||
|
|
cc567d38c8 | ||
|
|
69c552b8e0 | ||
|
|
51967d2bd8 | ||
|
|
97be9c0cda | ||
|
|
00a28fbb1e | ||
|
|
d512bac31d | ||
|
|
fb6781775d | ||
|
|
04c751a912 | ||
|
|
2615cffecc | ||
|
|
1f838f99c2 | ||
|
|
d3f119005b | ||
|
|
db6d4d8820 |
@@ -2,7 +2,7 @@ License and Contributions
|
||||
=========================
|
||||
|
||||
`repmgr` is licensed under the GPL v3. All of its code and documentation is
|
||||
Copyright 2010-2015, 2ndQuadrant Limited. See the files COPYRIGHT and LICENSE for
|
||||
Copyright 2010-2016, 2ndQuadrant Limited. See the files COPYRIGHT and LICENSE for
|
||||
details.
|
||||
|
||||
The development of repmgr has primarily been sponsored by 2ndQuadrant customers.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
Copyright (c) 2010-2015, 2ndQuadrant Limited
|
||||
Copyright (c) 2010-2016, 2ndQuadrant Limited
|
||||
All rights reserved.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
|
||||
@@ -93,7 +93,6 @@ Create the user and database to manage replication::
|
||||
su - postgres
|
||||
createuser -s repmgr
|
||||
createdb -O repmgr repmgr
|
||||
psql -f /usr/share/postgresql/9.0/contrib/repmgr_funcs.sql repmgr
|
||||
|
||||
Restart the PostgreSQL server::
|
||||
|
||||
@@ -121,7 +120,7 @@ Log in to node2.
|
||||
Clone node1 (the current Master)::
|
||||
|
||||
su - postgres
|
||||
repmgr -d repmgr -U repmgr -h node1 standby clone
|
||||
repmgr -d repmgr -U repmgr -h node1 standby clone
|
||||
|
||||
Start the PostgreSQL server::
|
||||
|
||||
@@ -172,11 +171,13 @@ Register Master and Standby
|
||||
|
||||
Log in to node1.
|
||||
|
||||
Register the node as Master::
|
||||
Register the node as master::
|
||||
|
||||
su - postgres
|
||||
repmgr -f /etc/repmgr/repmgr.conf master register
|
||||
|
||||
This will also create the repmgr schema and functions.
|
||||
|
||||
Log in to node2. Register it as a standby::
|
||||
|
||||
su - postgres
|
||||
|
||||
23
FAQ.md
23
FAQ.md
@@ -34,6 +34,11 @@ General
|
||||
replication slots, setting a higher figure will make adding new nodes
|
||||
easier.
|
||||
|
||||
- Does `repmgr` support hash indexes?
|
||||
|
||||
No. Hash indexes and replication do not mix well and their use is
|
||||
explicitly discouraged; see:
|
||||
http://www.postgresql.org/docs/current/interactive/sql-createindex.html#AEN74175
|
||||
|
||||
`repmgr`
|
||||
--------
|
||||
@@ -96,8 +101,9 @@ General
|
||||
is intended to support running the witness server as a separate
|
||||
instance on a normal node server, rather than on its own dedicated server.
|
||||
|
||||
To specify a port for the witness server, supply the port number to
|
||||
repmgr with the `-l/--local-port` command line option.
|
||||
To specify a different port for the witness server, supply the port number
|
||||
in the `conninfo` string in `repmgr.conf`
|
||||
(repmgr 3.0.1 and earlier: use the `-l/--local-port` option)
|
||||
|
||||
- Do I need to include `shared_preload_libraries = 'repmgr_funcs'`
|
||||
in `postgresql.conf` if I'm not using `repmgrd`?
|
||||
@@ -106,6 +112,14 @@ General
|
||||
If you later decide to run `repmgrd`, you just need to add
|
||||
`shared_preload_libraries = 'repmgr_funcs'` and restart PostgreSQL.
|
||||
|
||||
- I've provided replication permission for the `repmgr` user in `pg_hba.conf`
|
||||
but `repmgr`/`repmgrd` complains it can't connect to the server... Why?
|
||||
|
||||
`repmgr`/`repmgrd` need to be able to connect to the repmgr database
|
||||
with a normal connection to query metadata. The `replication` connection
|
||||
permission is for PostgreSQL's streaming replication and doesn't
|
||||
necessarily need to be the `repmgr` user.
|
||||
|
||||
|
||||
`repmgrd`
|
||||
---------
|
||||
@@ -134,3 +148,8 @@ General
|
||||
|
||||
Note that after registering a delayed standby, `repmgrd` will only start
|
||||
once the metadata added in the master node has been replicated.
|
||||
|
||||
- How can I get `repmgrd` to rotate its logfile?
|
||||
|
||||
Configure your system's `logrotate` service to do this; see example
|
||||
in README.md
|
||||
|
||||
26
HISTORY
26
HISTORY
@@ -1,4 +1,24 @@
|
||||
3.0.2 2015-09-
|
||||
3.0.4 2016-01-
|
||||
Remove requirement for 'archive_mode' to be enabled (Ian)
|
||||
|
||||
3.0.3 2016-01-04
|
||||
Create replication slot if required before base backup is run (Abhijit)
|
||||
standby clone: when using rsync, clean up "pg_replslot" directory (Ian)
|
||||
Improve --help output (Ian)
|
||||
Improve config file parsing (Ian)
|
||||
Various logging output improvements, including explicit HINTS (Ian)
|
||||
Add --log-level to explicitly set log level on command line (Ian)
|
||||
Repurpose --verbose to display extra log output (Ian)
|
||||
Add --terse to hide hints and other non-critical output (Ian)
|
||||
Reference internal functions with explicit catalog path (Ian)
|
||||
When following a new primary, have repmgr (not repmgrd) create the new slot (Ian)
|
||||
Add /etc/repmgr.conf as a default configuration file location (Ian)
|
||||
Prevent repmgrd's -v/--verbose option expecting a parameter (Ian)
|
||||
Prevent invalid replication_lag values being written to the monitoring table (Ian)
|
||||
Improve repmgrd behaviour when monitored standby node is temporarily
|
||||
unavailable (Martín)
|
||||
|
||||
3.0.2 2015-10-02
|
||||
Improve handling of --help/--version options; and improve help output (Ian)
|
||||
Improve handling of situation where logfile can't be opened (Ian)
|
||||
Always pass -D/--pgdata option to pg_basebackup (Ian)
|
||||
@@ -12,7 +32,9 @@
|
||||
Update tablespace remapping in --rsync-only mode for 9.5 and later (Ian)
|
||||
Deprecate `-l/--local-port` option - the port can be extracted
|
||||
from the conninfo string in repmgr.conf (Ian)
|
||||
Add STANDBY UNREGISTE (Vik Fearing)
|
||||
Add STANDBY UNREGISTER (Vik Fearing)
|
||||
Don't fail with error when registering master if schema already defined (Ian)
|
||||
Fixes to whitespace handling when parsing config file (Ian)
|
||||
|
||||
3.0.1 2015-04-16
|
||||
Prevent repmgrd from looping infinitely if node was not registered (Ian)
|
||||
|
||||
2
Makefile
2
Makefile
@@ -1,6 +1,6 @@
|
||||
#
|
||||
# Makefile
|
||||
# Copyright (c) 2ndQuadrant, 2010-2015
|
||||
# Copyright (c) 2ndQuadrant, 2010-2016
|
||||
|
||||
repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o
|
||||
repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o
|
||||
|
||||
62
README.md
62
README.md
@@ -12,10 +12,13 @@ This version can use `pg_basebackup` to clone standby servers, supports
|
||||
replication slots and cascading replication, doesn't require a restart
|
||||
after promotion, and has many usability improvements.
|
||||
|
||||
Please continue to use `repmgr 2` with earlier PostgreSQL 9.x versions.
|
||||
Please continue to use `repmgr 2` with PostgreSQL 9.2 and earlier.
|
||||
For a list of changes since `repmgr 2` and instructions on upgrading to
|
||||
`repmgr 3`, see the "Upgrading from repmgr 2" section below.
|
||||
|
||||
For a list of frequently asked questions about `repmgr`, please refer
|
||||
to the file `FAQ.md`.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
@@ -111,7 +114,7 @@ like the following in `postgresql.conf`:
|
||||
# How much WAL to retain on the primary to allow a temporarily
|
||||
# disconnected standby to catch up again. The larger this is, the
|
||||
# longer the standby can be disconnected. This is needed only in
|
||||
# 9.3; in 9.4, replication slots can be used instead (see below).
|
||||
# 9.3; from 9.4, replication slots can be used instead (see below).
|
||||
|
||||
wal_keep_segments = 5000
|
||||
|
||||
@@ -141,10 +144,14 @@ running the following commands:
|
||||
createuser -s repmgr
|
||||
createdb repmgr -O repmgr
|
||||
|
||||
We recommend using the name `repmgr` for both, but you can use whatever
|
||||
name you like (and you need to set the names you chose in the `conninfo`
|
||||
string in `repmgr.conf`; see below). `repmgr` will create the schema and
|
||||
objects it needs when it connects to the server.
|
||||
We recommend using the name `repmgr` for both user and database, but you
|
||||
can use whatever name you like (and you need to set the names you chose
|
||||
in the `conninfo` string in `repmgr.conf`; see below). We also recommend
|
||||
that you set the `repmgr` user's search path to include the `repmgr` schema
|
||||
for convenience when querying the metadata tables and views.
|
||||
|
||||
The `repmgr` application will create its metadata schema in the `repmgr`
|
||||
database when the master server is registered.
|
||||
|
||||
### repmgr configuration
|
||||
|
||||
@@ -260,6 +267,20 @@ Example log output (at default log level):
|
||||
[2015-03-11 13:15:40] [INFO] reloading configuration file and updating repmgr tables
|
||||
[2015-03-11 13:15:40] [INFO] starting continuous standby node monitoring
|
||||
|
||||
Note that currently `repmgrd` does not provide logfile rotation. To ensure
|
||||
the current logfile does not grow indefinitely, configure your system's `logrotate`
|
||||
to do this. Sample configuration to rotate logfiles weekly with retention
|
||||
for up to 52 weeks and rotation forced if a file grows beyond 100Mb:
|
||||
|
||||
/var/log/postgresql/repmgr-9.4.log {
|
||||
missingok
|
||||
compress
|
||||
rotate 52
|
||||
maxsize 100M
|
||||
weekly
|
||||
create 0600 postgres postgres
|
||||
}
|
||||
|
||||
|
||||
Witness server
|
||||
--------------
|
||||
@@ -355,6 +376,7 @@ Following event types currently exist:
|
||||
standby_promote
|
||||
witness_create
|
||||
repmgrd_start
|
||||
repmgrd_monitor
|
||||
repmgrd_failover_promote
|
||||
repmgrd_failover_follow
|
||||
|
||||
@@ -585,20 +607,20 @@ and one view:
|
||||
`repmgr` or `repmgrd` will return one of the following error codes on program
|
||||
exit:
|
||||
|
||||
* SUCCESS (0) Program ran successfully.
|
||||
* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid
|
||||
* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error
|
||||
* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed
|
||||
* ERR_DB_CON (6) Error when trying to connect to a database
|
||||
* ERR_DB_QUERY (7) Error while executing a database query
|
||||
* ERR_PROMOTED (8) Exiting program because the node has been promoted to master
|
||||
* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected
|
||||
* ERR_STR_OVERFLOW (10) String overflow error
|
||||
* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only)
|
||||
* ERR_BAD_SSH (12) Error when connecting to remote host via SSH
|
||||
* ERR_SYS_FAILURE (13) Error when forking (repmgrd only)
|
||||
* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
|
||||
|
||||
* SUCCESS (0) Program ran successfully.
|
||||
* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid
|
||||
* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error
|
||||
* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed
|
||||
* ERR_DB_CON (6) Error when trying to connect to a database
|
||||
* ERR_DB_QUERY (7) Error while executing a database query
|
||||
* ERR_PROMOTED (8) Exiting program because the node has been promoted to master
|
||||
* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected
|
||||
* ERR_STR_OVERFLOW (10) String overflow error
|
||||
* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only)
|
||||
* ERR_BAD_SSH (12) Error when connecting to remote host via SSH
|
||||
* ERR_SYS_FAILURE (13) Error when forking (repmgrd only)
|
||||
* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
|
||||
* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only)
|
||||
|
||||
Support and Assistance
|
||||
----------------------
|
||||
|
||||
39
TODO
39
TODO
@@ -7,6 +7,7 @@ Known issues in repmgr
|
||||
|
||||
* PGPASSFILE may not be passed to pg_basebackup
|
||||
|
||||
|
||||
Planned feature improvements
|
||||
============================
|
||||
|
||||
@@ -37,4 +38,40 @@ Planned feature improvements
|
||||
before the primary. See github issue #80.
|
||||
|
||||
* make old master node ID available for event notification commands
|
||||
(See github issue #80).
|
||||
(See github issue #80).
|
||||
|
||||
* Have pg_basebackup use replication slots, if and when support for
|
||||
this is added; see:
|
||||
http://www.postgresql.org/message-id/555DD2B2.7020000@gmx.net
|
||||
|
||||
* use "primary/standby" terminology in place of "master/slave" for consistency
|
||||
with main PostgreSQL usage
|
||||
|
||||
* repmgr standby clone: possibility to use barman instead of performing a new base backup
|
||||
|
||||
* possibility to transform a failed master into a new standby with pg_rewind
|
||||
|
||||
* "repmgr standby switchover" to promote a standby in a controlled manner
|
||||
and convert the existing primary into a standby
|
||||
|
||||
* make repmgrd more robust
|
||||
|
||||
* repmgr: when cloning a standby using pg_basebackup and replication slots are
|
||||
requested, activate the replication slot using pg_receivexlog to negate the
|
||||
need to set `wal_keep_segments` just for the initial clone (9.4 and 9.5).
|
||||
|
||||
Usability improvements
|
||||
======================
|
||||
|
||||
* repmgr: add interrupt handler, so that if the program is interrupted
|
||||
while running a backup, an attempt can be made to execute pg_stop_backup()
|
||||
on the primary, to prevent an orphaned backup state existing.
|
||||
|
||||
* repmgr: when unregistering a node, delete any entries in the repl_monitoring
|
||||
table.
|
||||
|
||||
* repmgr: for "standby unregister", accept connection parameters for the
|
||||
primary and perform metadata updates (and slot removal) directly on
|
||||
the primary, to allow a shutdown standby to be unregistered
|
||||
(currently the standby must still be running, which means the replication
|
||||
slot can't be dropped).
|
||||
|
||||
10
check_dir.c
10
check_dir.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* check_dir.c - Directories management functions
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -320,10 +320,10 @@ _create_pg_dir(char *dir, bool force, bool for_witness)
|
||||
}
|
||||
else if (pg_dir && !force)
|
||||
{
|
||||
log_warning(_("\nThis looks like a PostgreSQL directory.\n"
|
||||
"If you are sure you want to clone here, "
|
||||
"please check there is no PostgreSQL server "
|
||||
"running and use the --force option\n"));
|
||||
log_hint(_("This looks like a PostgreSQL directory.\n"
|
||||
"If you are sure you want to clone here, "
|
||||
"please check there is no PostgreSQL server "
|
||||
"running and use the -F/--force option\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* check_dir.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
||||
437
config.c
437
config.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* config.c - Functions to parse the config file
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -26,9 +26,25 @@
|
||||
|
||||
static void parse_event_notifications_list(t_configuration_options *options, const char *arg);
|
||||
static void tablespace_list_append(t_configuration_options *options, const char *arg);
|
||||
static void exit_with_errors(ErrorList *config_errors);
|
||||
|
||||
const static char *_progname = '\0';
|
||||
static char config_file_path[MAXPGPATH];
|
||||
static bool config_file_provided = false;
|
||||
static bool config_file_found = false;
|
||||
|
||||
|
||||
void
|
||||
set_progname(const char *argv0)
|
||||
{
|
||||
_progname = get_progname(argv0);
|
||||
}
|
||||
|
||||
const char *
|
||||
progname(void)
|
||||
{
|
||||
return _progname;
|
||||
}
|
||||
|
||||
/*
|
||||
* load_config()
|
||||
@@ -40,61 +56,123 @@ static bool config_file_provided = false;
|
||||
*
|
||||
* Any configuration options changed in this function must also be changed in
|
||||
* reload_config()
|
||||
*
|
||||
* NOTE: this function is called before the logger is set up, so we need
|
||||
* to handle the verbose option ourselves; also the default log level is NOTICE,
|
||||
* so we can't use DEBUG.
|
||||
*/
|
||||
bool
|
||||
load_config(const char *config_file, t_configuration_options *options, char *argv0)
|
||||
load_config(const char *config_file, bool verbose, t_configuration_options *options, char *argv0)
|
||||
{
|
||||
struct stat config;
|
||||
/* Sanity checks */
|
||||
struct stat stat_config;
|
||||
|
||||
/*
|
||||
* If a configuration file was provided, check it exists, otherwise
|
||||
* emit an error and terminate
|
||||
* emit an error and terminate. We assume that if a user explicitly
|
||||
* provides a configuration file, they'll want to make sure it's
|
||||
* used and not fall back to any of the defaults.
|
||||
*/
|
||||
if (config_file[0])
|
||||
{
|
||||
strncpy(config_file_path, config_file, MAXPGPATH);
|
||||
canonicalize_path(config_file_path);
|
||||
|
||||
if (stat(config_file_path, &config) != 0)
|
||||
if (stat(config_file_path, &stat_config) != 0)
|
||||
{
|
||||
log_err(_("provided configuration file '%s' not found: %s\n"),
|
||||
log_err(_("provided configuration file \"%s\" not found: %s\n"),
|
||||
config_file,
|
||||
strerror(errno)
|
||||
);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("using configuration file \"%s\"\n"), config_file);
|
||||
}
|
||||
|
||||
config_file_provided = true;
|
||||
config_file_found = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If no configuration file was provided, attempt to find a default file
|
||||
* in this order:
|
||||
* - current directory
|
||||
* - /etc/repmgr.conf
|
||||
* - default sysconfdir
|
||||
*
|
||||
* here we just check for the existence of the file; parse_config()
|
||||
* will handle read errors etc.
|
||||
*/
|
||||
if (config_file_provided == false)
|
||||
{
|
||||
char my_exec_path[MAXPGPATH];
|
||||
char etc_path[MAXPGPATH];
|
||||
char sysconf_etc_path[MAXPGPATH];
|
||||
|
||||
/* First check if one is in the default sysconfdir */
|
||||
/* 1. "./repmgr.conf" */
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("looking for configuration file in current directory\n"));
|
||||
}
|
||||
|
||||
snprintf(config_file_path, MAXPGPATH, "./%s", CONFIG_FILE_NAME);
|
||||
canonicalize_path(config_file_path);
|
||||
|
||||
if (stat(config_file_path, &stat_config) == 0)
|
||||
{
|
||||
config_file_found = true;
|
||||
goto end_search;
|
||||
}
|
||||
|
||||
/* 2. "/etc/repmgr.conf" */
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("looking for configuration file in /etc\n"));
|
||||
}
|
||||
|
||||
snprintf(config_file_path, MAXPGPATH, "/etc/%s", CONFIG_FILE_NAME);
|
||||
if (stat(config_file_path, &stat_config) == 0)
|
||||
{
|
||||
config_file_found = true;
|
||||
goto end_search;
|
||||
}
|
||||
|
||||
/* 3. default sysconfdir */
|
||||
if (find_my_exec(argv0, my_exec_path) < 0)
|
||||
{
|
||||
fprintf(stderr, _("%s: could not find own program executable\n"), argv0);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
get_etc_path(my_exec_path, etc_path);
|
||||
get_etc_path(my_exec_path, sysconf_etc_path);
|
||||
|
||||
snprintf(config_file_path, MAXPGPATH, "%s/repmgr.conf", etc_path);
|
||||
|
||||
log_debug(_("Looking for configuration file in %s\n"), etc_path);
|
||||
|
||||
if (stat(config_file_path, &config) != 0)
|
||||
if (verbose == true)
|
||||
{
|
||||
/* Not found - default to ./repmgr.conf */
|
||||
strncpy(config_file_path, DEFAULT_CONFIG_FILE, MAXPGPATH);
|
||||
canonicalize_path(config_file_path);
|
||||
log_debug(_("Looking for configuration file in %s\n"), config_file_path);
|
||||
log_notice(_("looking for configuration file in %s"), sysconf_etc_path);
|
||||
}
|
||||
|
||||
snprintf(config_file_path, MAXPGPATH, "%s/%s", sysconf_etc_path, CONFIG_FILE_NAME);
|
||||
if (stat(config_file_path, &stat_config) == 0)
|
||||
{
|
||||
config_file_found = true;
|
||||
goto end_search;
|
||||
}
|
||||
|
||||
end_search:
|
||||
if (config_file_found == true)
|
||||
{
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("configuration file found at: %s\n"), config_file_path);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("no configuration file provided or found\n"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,12 +180,19 @@ load_config(const char *config_file, t_configuration_options *options, char *arg
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Parse configuration file; if any errors are encountered,
|
||||
* list them and exit.
|
||||
*
|
||||
* Ensure any default values set here are synced with repmgr.conf.sample
|
||||
* and any other documentation.
|
||||
*/
|
||||
bool
|
||||
parse_config(t_configuration_options *options)
|
||||
{
|
||||
FILE *fp;
|
||||
char *s,
|
||||
buff[MAXLINELENGTH];
|
||||
buf[MAXLINELENGTH];
|
||||
char name[MAXLEN];
|
||||
char value[MAXLEN];
|
||||
|
||||
@@ -115,36 +200,17 @@ parse_config(t_configuration_options *options)
|
||||
PQconninfoOption *conninfo_options;
|
||||
char *conninfo_errmsg = NULL;
|
||||
|
||||
fp = fopen(config_file_path, "r");
|
||||
/* Collate configuration file errors here for friendlier reporting */
|
||||
static ErrorList config_errors = { NULL, NULL };
|
||||
|
||||
/*
|
||||
* Since some commands don't require a config file at all, not having one
|
||||
* isn't necessarily a problem.
|
||||
*
|
||||
* If the user explicitly provided a configuration file and we can't
|
||||
* read it we'll raise an error.
|
||||
*
|
||||
* If no configuration file was provided, we'll try and read the default
|
||||
* file if it exists and is readable, but won't worry if it's not.
|
||||
/* Initialize configuration options with sensible defaults
|
||||
* note: the default log level is set in log.c and does not need
|
||||
* to be initialised here
|
||||
*/
|
||||
if (fp == NULL)
|
||||
{
|
||||
if (config_file_provided)
|
||||
{
|
||||
log_err(_("unable to open provided configuration file '%s'; terminating\n"), config_file_path);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
log_notice(_("no configuration file provided and default file '%s' not found - "
|
||||
"continuing with default values\n"),
|
||||
DEFAULT_CONFIG_FILE);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Initialize configuration options with sensible defaults */
|
||||
memset(options->cluster_name, 0, sizeof(options->cluster_name));
|
||||
options->node = -1;
|
||||
options->upstream_node = NO_UPSTREAM_NODE;
|
||||
options->use_replication_slots = 0;
|
||||
memset(options->conninfo, 0, sizeof(options->conninfo));
|
||||
options->failover = MANUAL_FAILOVER;
|
||||
options->priority = DEFAULT_PRIORITY;
|
||||
@@ -162,7 +228,7 @@ parse_config(t_configuration_options *options)
|
||||
|
||||
/* default to 6 reconnection attempts at intervals of 10 seconds */
|
||||
options->reconnect_attempts = 6;
|
||||
options->reconnect_intvl = 10;
|
||||
options->reconnect_interval = 10;
|
||||
|
||||
options->monitor_interval_secs = 2;
|
||||
options->retry_promote_interval_secs = 300;
|
||||
@@ -172,15 +238,45 @@ parse_config(t_configuration_options *options)
|
||||
options->tablespace_mapping.head = NULL;
|
||||
options->tablespace_mapping.tail = NULL;
|
||||
|
||||
/*
|
||||
* If no configuration file available (user didn't specify and none found
|
||||
* in the default locations), return with default values
|
||||
*/
|
||||
if (config_file_found == false)
|
||||
{
|
||||
log_notice(_("no configuration file provided and no default file found - "
|
||||
"continuing with default values\n"));
|
||||
return true;
|
||||
}
|
||||
|
||||
fp = fopen(config_file_path, "r");
|
||||
|
||||
/* Read next line */
|
||||
while ((s = fgets(buff, sizeof buff, fp)) != NULL)
|
||||
/*
|
||||
* A configuration file has been found, either provided by the user
|
||||
* or found in one of the default locations. If we can't open it,
|
||||
* fail with an error.
|
||||
*/
|
||||
if (fp == NULL)
|
||||
{
|
||||
if (config_file_provided)
|
||||
{
|
||||
log_err(_("unable to open provided configuration file \"%s\"; terminating\n"), config_file_path);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_err(_("unable to open default configuration file \"%s\"; terminating\n"), config_file_path);
|
||||
}
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* Read file */
|
||||
while ((s = fgets(buf, sizeof buf, fp)) != NULL)
|
||||
{
|
||||
bool known_parameter = true;
|
||||
|
||||
/* Parse name/value pair from line */
|
||||
parse_line(buff, name, value);
|
||||
parse_line(buf, name, value);
|
||||
|
||||
/* Skip blank lines */
|
||||
if (!strlen(name))
|
||||
@@ -194,9 +290,9 @@ parse_config(t_configuration_options *options)
|
||||
if (strcmp(name, "cluster") == 0)
|
||||
strncpy(options->cluster_name, value, MAXLEN);
|
||||
else if (strcmp(name, "node") == 0)
|
||||
options->node = atoi(value);
|
||||
options->node = repmgr_atoi(value, "node", &config_errors);
|
||||
else if (strcmp(name, "upstream_node") == 0)
|
||||
options->upstream_node = atoi(value);
|
||||
options->upstream_node = repmgr_atoi(value, "upstream_node", &config_errors);
|
||||
else if (strcmp(name, "conninfo") == 0)
|
||||
strncpy(options->conninfo, value, MAXLEN);
|
||||
else if (strcmp(name, "rsync_options") == 0)
|
||||
@@ -223,12 +319,11 @@ parse_config(t_configuration_options *options)
|
||||
}
|
||||
else
|
||||
{
|
||||
log_err(_("value for 'failover' must be 'automatic' or 'manual'\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
error_list_append(&config_errors,_("value for 'failover' must be 'automatic' or 'manual'\n"));
|
||||
}
|
||||
}
|
||||
else if (strcmp(name, "priority") == 0)
|
||||
options->priority = atoi(value);
|
||||
options->priority = repmgr_atoi(value, "priority", &config_errors);
|
||||
else if (strcmp(name, "node_name") == 0)
|
||||
strncpy(options->node_name, value, MAXLEN);
|
||||
else if (strcmp(name, "promote_command") == 0)
|
||||
@@ -236,11 +331,16 @@ parse_config(t_configuration_options *options)
|
||||
else if (strcmp(name, "follow_command") == 0)
|
||||
strncpy(options->follow_command, value, MAXLEN);
|
||||
else if (strcmp(name, "master_response_timeout") == 0)
|
||||
options->master_response_timeout = atoi(value);
|
||||
options->master_response_timeout = repmgr_atoi(value, "master_response_timeout", &config_errors);
|
||||
/* 'primary_response_timeout' as synonym for 'master_response_timeout' -
|
||||
* we'll switch terminology in a future release (3.1?)
|
||||
*/
|
||||
else if (strcmp(name, "primary_response_timeout") == 0)
|
||||
options->master_response_timeout = repmgr_atoi(value, "primary_response_timeout", &config_errors);
|
||||
else if (strcmp(name, "reconnect_attempts") == 0)
|
||||
options->reconnect_attempts = atoi(value);
|
||||
options->reconnect_attempts = repmgr_atoi(value, "reconnect_attempts", &config_errors);
|
||||
else if (strcmp(name, "reconnect_interval") == 0)
|
||||
options->reconnect_intvl = atoi(value);
|
||||
options->reconnect_interval = repmgr_atoi(value, "reconnect_interval", &config_errors);
|
||||
else if (strcmp(name, "pg_bindir") == 0)
|
||||
strncpy(options->pg_bindir, value, MAXLEN);
|
||||
else if (strcmp(name, "pg_ctl_options") == 0)
|
||||
@@ -250,11 +350,12 @@ parse_config(t_configuration_options *options)
|
||||
else if (strcmp(name, "logfile") == 0)
|
||||
strncpy(options->logfile, value, MAXLEN);
|
||||
else if (strcmp(name, "monitor_interval_secs") == 0)
|
||||
options->monitor_interval_secs = atoi(value);
|
||||
options->monitor_interval_secs = repmgr_atoi(value, "monitor_interval_secs", &config_errors);
|
||||
else if (strcmp(name, "retry_promote_interval_secs") == 0)
|
||||
options->retry_promote_interval_secs = atoi(value);
|
||||
options->retry_promote_interval_secs = repmgr_atoi(value, "retry_promote_interval_secs", &config_errors);
|
||||
else if (strcmp(name, "use_replication_slots") == 0)
|
||||
options->use_replication_slots = atoi(value);
|
||||
/* XXX we should have a dedicated boolean argument format */
|
||||
options->use_replication_slots = repmgr_atoi(value, "use_replication_slots", &config_errors);
|
||||
else if (strcmp(name, "event_notification_command") == 0)
|
||||
strncpy(options->event_notification_command, value, MAXLEN);
|
||||
else if (strcmp(name, "event_notifications") == 0)
|
||||
@@ -274,8 +375,13 @@ parse_config(t_configuration_options *options)
|
||||
* as currently e.g. an empty `node` value will be converted to '0'.
|
||||
*/
|
||||
if (known_parameter == true && !strlen(value)) {
|
||||
log_err(_("no value provided for parameter '%s'\n"), name);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
char error_message_buf[MAXLEN] = "";
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("no value provided for parameter \"%s\""),
|
||||
name);
|
||||
|
||||
error_list_append(&config_errors, error_message_buf);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -286,64 +392,49 @@ parse_config(t_configuration_options *options)
|
||||
/* The following checks are for the presence of the parameter */
|
||||
if (*options->cluster_name == '\0')
|
||||
{
|
||||
log_err(_("required parameter 'cluster' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
error_list_append(&config_errors, _("\"cluster\": parameter was not found\n"));
|
||||
}
|
||||
|
||||
if (options->node == -1)
|
||||
{
|
||||
log_err(_("required parameter 'node' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (options->node == 0)
|
||||
{
|
||||
log_err(_("'node' must be an integer greater than zero\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
error_list_append(&config_errors, _("\"node\": parameter was not found\n"));
|
||||
}
|
||||
|
||||
if (*options->node_name == '\0')
|
||||
{
|
||||
log_err(_("required parameter 'node_name' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
error_list_append(&config_errors, _("\"node_name\": parameter was not found\n"));
|
||||
}
|
||||
|
||||
if (*options->conninfo == '\0')
|
||||
{
|
||||
log_err(_("required parameter 'conninfo' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
error_list_append(&config_errors, _("\"conninfo\": parameter was not found\n"));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
/* Sanity check the provided conninfo string
|
||||
*
|
||||
* NOTE: PQconninfoParse() verifies the string format and checks for valid options
|
||||
* but does not sanity check values
|
||||
*/
|
||||
conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
|
||||
if (conninfo_options == NULL)
|
||||
{
|
||||
char error_message_buf[MAXLEN] = "";
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("\"conninfo\": %s"),
|
||||
conninfo_errmsg);
|
||||
|
||||
error_list_append(&config_errors, error_message_buf);
|
||||
}
|
||||
|
||||
PQconninfoFree(conninfo_options);
|
||||
}
|
||||
|
||||
/* Sanity check the provided conninfo string
|
||||
*
|
||||
* NOTE: this verifies the string format and checks for valid options
|
||||
* but does not sanity check values
|
||||
*/
|
||||
conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
|
||||
if (conninfo_options == NULL)
|
||||
if (config_errors.head != NULL)
|
||||
{
|
||||
log_err(_("Parameter 'conninfo' is invalid: %s"), conninfo_errmsg);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
PQconninfoFree(conninfo_options);
|
||||
|
||||
/* The following checks are for valid parameter values */
|
||||
if (options->master_response_timeout <= 0)
|
||||
{
|
||||
log_err(_("'master_response_timeout' must be greater than zero\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (options->reconnect_attempts < 0)
|
||||
{
|
||||
log_err(_("'reconnect_attempts' must be zero or greater\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (options->reconnect_intvl < 0)
|
||||
{
|
||||
log_err(_("'reconnect_interval' must be zero or greater\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit_with_errors(&config_errors);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -378,7 +469,7 @@ trim(char *s)
|
||||
}
|
||||
|
||||
void
|
||||
parse_line(char *buff, char *name, char *value)
|
||||
parse_line(char *buf, char *name, char *value)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
@@ -389,10 +480,10 @@ parse_line(char *buff, char *name, char *value)
|
||||
for (; i < MAXLEN; ++i)
|
||||
{
|
||||
|
||||
if (buff[i] == '=')
|
||||
if (buf[i] == '=')
|
||||
break;
|
||||
|
||||
switch(buff[i])
|
||||
switch(buf[i])
|
||||
{
|
||||
/* Ignore whitespace */
|
||||
case ' ':
|
||||
@@ -401,7 +492,7 @@ parse_line(char *buff, char *name, char *value)
|
||||
case '\t':
|
||||
continue;
|
||||
default:
|
||||
name[j++] = buff[i];
|
||||
name[j++] = buf[i];
|
||||
}
|
||||
}
|
||||
name[j] = '\0';
|
||||
@@ -411,9 +502,9 @@ parse_line(char *buff, char *name, char *value)
|
||||
*/
|
||||
for (; i < MAXLEN; ++i)
|
||||
{
|
||||
if (buff[i+1] == ' ')
|
||||
if (buf[i+1] == ' ')
|
||||
continue;
|
||||
if (buff[i+1] == '\t')
|
||||
if (buf[i+1] == '\t')
|
||||
continue;
|
||||
|
||||
break;
|
||||
@@ -424,12 +515,12 @@ parse_line(char *buff, char *name, char *value)
|
||||
*/
|
||||
j = 0;
|
||||
for (++i; i < MAXLEN; ++i)
|
||||
if (buff[i] == '\'')
|
||||
if (buf[i] == '\'')
|
||||
continue;
|
||||
else if (buff[i] == '#')
|
||||
else if (buf[i] == '#')
|
||||
break;
|
||||
else if (buff[i] != '\n')
|
||||
value[j++] = buff[i];
|
||||
else if (buf[i] != '\n')
|
||||
value[j++] = buf[i];
|
||||
else
|
||||
break;
|
||||
value[j] = '\0';
|
||||
@@ -491,7 +582,7 @@ reload_config(t_configuration_options *orig_options)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (new_options.reconnect_intvl < 0)
|
||||
if (new_options.reconnect_interval < 0)
|
||||
{
|
||||
log_warning(_("new value for 'reconnect_interval' must be zero or greater\n"));
|
||||
return false;
|
||||
@@ -610,10 +701,10 @@ reload_config(t_configuration_options *orig_options)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* reconnect_intvl */
|
||||
if (orig_options->reconnect_intvl != new_options.reconnect_intvl)
|
||||
/* reconnect_interval */
|
||||
if (orig_options->reconnect_interval != new_options.reconnect_interval)
|
||||
{
|
||||
orig_options->reconnect_intvl = new_options.reconnect_intvl;
|
||||
orig_options->reconnect_interval = new_options.reconnect_interval;
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
@@ -665,6 +756,96 @@ reload_config(t_configuration_options *orig_options)
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
error_list_append(ErrorList *error_list, char *error_message)
|
||||
{
|
||||
ErrorListCell *cell;
|
||||
|
||||
cell = (ErrorListCell *) pg_malloc0(sizeof(ErrorListCell));
|
||||
|
||||
if (cell == NULL)
|
||||
{
|
||||
log_err(_("unable to allocate memory; terminating.\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
cell->error_message = pg_malloc0(MAXLEN);
|
||||
strncpy(cell->error_message, error_message, MAXLEN);
|
||||
|
||||
if (error_list->tail)
|
||||
{
|
||||
error_list->tail->next = cell;
|
||||
}
|
||||
else
|
||||
{
|
||||
error_list->head = cell;
|
||||
}
|
||||
|
||||
error_list->tail = cell;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Convert provided string to an integer using strtol;
|
||||
* on error, if a callback is provided, pass the error message to that,
|
||||
* otherwise exit
|
||||
*/
|
||||
int
|
||||
repmgr_atoi(const char *value, const char *config_item, ErrorList *error_list)
|
||||
{
|
||||
char *endptr;
|
||||
long longval = 0;
|
||||
char error_message_buf[MAXLEN] = "";
|
||||
|
||||
/* It's possible that some versions of strtol() don't treat an empty
|
||||
* string as an error.
|
||||
*/
|
||||
|
||||
if (*value == '\0')
|
||||
{
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("no value provided for \"%s\""),
|
||||
config_item);
|
||||
}
|
||||
else
|
||||
{
|
||||
errno = 0;
|
||||
longval = strtol(value, &endptr, 10);
|
||||
|
||||
if (value == endptr || errno)
|
||||
{
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("\"%s\": invalid value (provided: \"%s\")"),
|
||||
config_item, value);
|
||||
}
|
||||
}
|
||||
|
||||
/* Currently there are no values which could be negative */
|
||||
if (longval < 0)
|
||||
{
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("\"%s\" must be zero or greater (provided: %s)"),
|
||||
config_item, value);
|
||||
}
|
||||
|
||||
/* Error message buffer is set */
|
||||
if (error_message_buf[0] != '\0')
|
||||
{
|
||||
if (error_list == NULL)
|
||||
{
|
||||
log_err("%s\n", error_message_buf);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
error_list_append(error_list, error_message_buf);
|
||||
}
|
||||
|
||||
return (int32) longval;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Split argument into old_dir and new_dir and append to tablespace mapping
|
||||
@@ -797,3 +978,21 @@ parse_event_notifications_list(t_configuration_options *options, const char *arg
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
exit_with_errors(ErrorList *config_errors)
|
||||
{
|
||||
ErrorListCell *cell;
|
||||
|
||||
log_err(_("%s: following errors were found in the configuration file.\n"), progname());
|
||||
|
||||
for (cell = config_errors->head; cell; cell = cell->next)
|
||||
{
|
||||
log_err("%s\n", cell->error_message);
|
||||
}
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
|
||||
25
config.h
25
config.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* config.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -24,6 +24,7 @@
|
||||
|
||||
#include "strutil.h"
|
||||
|
||||
#define CONFIG_FILE_NAME "repmgr.conf"
|
||||
|
||||
typedef struct EventNotificationListCell
|
||||
{
|
||||
@@ -67,7 +68,7 @@ typedef struct
|
||||
char ssh_options[QUERY_STR_LEN];
|
||||
int master_response_timeout;
|
||||
int reconnect_attempts;
|
||||
int reconnect_intvl;
|
||||
int reconnect_interval;
|
||||
char pg_bindir[MAXLEN];
|
||||
char pg_ctl_options[MAXLEN];
|
||||
char pg_basebackup_options[MAXLEN];
|
||||
@@ -82,11 +83,29 @@ typedef struct
|
||||
|
||||
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} }
|
||||
|
||||
/*
 * A single entry in an ErrorList: one error message plus the link to the
 * entry which follows it.
 */
typedef struct ErrorListCell
|
||||
{
|
||||
/* next entry in the list */
struct ErrorListCell *next;
|
||||
/* text of the error message */
char *error_message;
|
||||
} ErrorListCell;
|
||||
|
||||
bool load_config(const char *config_file, t_configuration_options *options, char *argv0);
|
||||
/*
 * Singly linked list of error messages, tracked by pointers to its first
 * and last entries.
 */
typedef struct ErrorList
|
||||
{
|
||||
/* first entry in the list */
ErrorListCell *head;
|
||||
/* last entry in the list */
ErrorListCell *tail;
|
||||
} ErrorList;
|
||||
|
||||
void set_progname(const char *argv0);
|
||||
const char * progname(void);
|
||||
|
||||
bool load_config(const char *config_file, bool verbose, t_configuration_options *options, char *argv0);
|
||||
bool reload_config(t_configuration_options *orig_options);
|
||||
bool parse_config(t_configuration_options *options);
|
||||
void parse_line(char *buff, char *name, char *value);
|
||||
char *trim(char *s);
|
||||
void error_list_append(ErrorList *error_list, char *error_message);
|
||||
int repmgr_atoi(const char *s,
|
||||
const char *config_item,
|
||||
ErrorList *error_list);
|
||||
|
||||
#endif
|
||||
|
||||
306
dbutils.c
306
dbutils.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* dbutils.c - Database connection/management functions
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -87,6 +87,8 @@ begin_transaction(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
|
||||
log_verbose(LOG_DEBUG, "begin_transaction()\n");
|
||||
|
||||
res = PQexec(conn, "BEGIN");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
@@ -109,6 +111,8 @@ commit_transaction(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
|
||||
log_verbose(LOG_DEBUG, "commit_transaction()\n");
|
||||
|
||||
res = PQexec(conn, "COMMIT");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
@@ -131,6 +135,8 @@ rollback_transaction(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
|
||||
log_verbose(LOG_DEBUG, "rollback_transaction()\n");
|
||||
|
||||
res = PQexec(conn, "ROLLBACK");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
@@ -158,7 +164,8 @@ check_cluster_schema(PGconn *conn)
|
||||
"SELECT 1 FROM pg_namespace WHERE nspname = '%s'",
|
||||
get_repmgr_schema());
|
||||
|
||||
log_debug(_("check_cluster_schema(): %s\n"), sqlquery);
|
||||
log_verbose(LOG_DEBUG, "check_cluster_schema(): %s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -188,17 +195,22 @@ is_standby(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
int result = 0;
|
||||
char *sqlquery = "SELECT pg_catalog.pg_is_in_recovery()";
|
||||
|
||||
res = PQexec(conn, "SELECT pg_is_in_recovery()");
|
||||
log_verbose(LOG_DEBUG, "is_standby(): %s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("Can't query server mode: %s"),
|
||||
log_err(_("Unable to query server mode: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
result = -1;
|
||||
}
|
||||
else if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0)
|
||||
{
|
||||
result = 1;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
return result;
|
||||
@@ -285,6 +297,8 @@ get_master_node_id(PGconn *conn, char *cluster)
|
||||
get_repmgr_schema_quoted(conn),
|
||||
cluster);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_master_node_id():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -341,14 +355,17 @@ guc_set(PGconn *conn, const char *parameter, const char *op,
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
int retval = 1;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings "
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT true FROM pg_settings "
|
||||
" WHERE name = '%s' AND setting %s '%s'",
|
||||
parameter, op, value);
|
||||
|
||||
log_verbose(LOG_DEBUG, "guc_set():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("GUC setting check PQexec failed: %s"),
|
||||
log_err(_("guc_set(): unable to execute query\n%s\n"),
|
||||
PQerrorMessage(conn));
|
||||
retval = -1;
|
||||
}
|
||||
@@ -379,10 +396,12 @@ guc_set_typed(PGconn *conn, const char *parameter, const char *op,
|
||||
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
||||
parameter, datatype, op, value, datatype);
|
||||
|
||||
log_verbose(LOG_DEBUG, "guc_set_typed():n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("GUC setting check PQexec failed: %s"),
|
||||
log_err(_("guc_set_typed(): unable to execute query\n%s\n"),
|
||||
PQerrorMessage(conn));
|
||||
retval = -1;
|
||||
}
|
||||
@@ -403,15 +422,16 @@ get_cluster_size(PGconn *conn, char *size)
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
sqlquery_snprintf(
|
||||
sqlquery,
|
||||
"SELECT pg_size_pretty(SUM(pg_database_size(oid))::bigint) "
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_catalog.pg_size_pretty(SUM(pg_catalog.pg_database_size(oid))::bigint) "
|
||||
" FROM pg_database ");
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_cluster_size():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("get_cluster_size(): PQexec failed: %s"),
|
||||
log_err(_("get_cluster_size(): unable to execute query\n%s\n"),
|
||||
PQerrorMessage(conn));
|
||||
|
||||
PQclear(res);
|
||||
@@ -439,7 +459,7 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
||||
" FROM pg_settings WHERE name = '%s'",
|
||||
setting);
|
||||
|
||||
log_debug(_("get_pg_setting(): %s\n"), sqlquery);
|
||||
log_verbose(LOG_DEBUG, "get_pg_setting(): %s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
@@ -461,13 +481,14 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
||||
}
|
||||
else
|
||||
{
|
||||
log_err(_("unknown parameter: %s"), PQgetvalue(res, i, 0));
|
||||
/* XXX highly unlikely this would ever happen */
|
||||
log_err(_("get_pg_setting(): unknown parameter \"%s\""), PQgetvalue(res, i, 0));
|
||||
}
|
||||
}
|
||||
|
||||
if (success == true)
|
||||
{
|
||||
log_debug(_("get_pg_setting(): returned value is '%s'\n"), output);
|
||||
log_debug(_("get_pg_setting(): returned value is \"%s\"\n"), output);
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
@@ -512,13 +533,13 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
cluster,
|
||||
node_id);
|
||||
|
||||
log_debug("get_upstream_connection(): %s\n", sqlquery);
|
||||
log_verbose(LOG_DEBUG, "get_upstream_connection():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(standby_conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("unable to get conninfo for upstream server: %s\n"),
|
||||
log_err(_("unable to get conninfo for upstream server\n%s\n"),
|
||||
PQerrorMessage(standby_conn));
|
||||
PQclear(res);
|
||||
return NULL;
|
||||
@@ -538,7 +559,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
|
||||
PQclear(res);
|
||||
|
||||
log_debug("conninfo is: '%s'\n", upstream_conninfo);
|
||||
log_verbose(LOG_DEBUG, "get_upstream_connection(): conninfo is \"%s\"\n", upstream_conninfo);
|
||||
upstream_conn = establish_db_connection(upstream_conninfo, false);
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
@@ -553,24 +574,26 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
|
||||
|
||||
/*
|
||||
* get a connection to master by reading repl_nodes, creating a connection
|
||||
* to each node (one at a time) and finding if it is a master or a standby
|
||||
* Read the node list from the local node and attempt to connect to each node
|
||||
* in turn to definitely establish if it's the cluster primary.
|
||||
*
|
||||
* NB: If master_conninfo_out may be NULL. If it is non-null, it is assumed to
|
||||
* point to allocated memory of MAXCONNINFO in length, and the master server
|
||||
* connection string is placed there.
|
||||
* The node list is returned in the order which makes it likely that the
|
||||
* current primary will be returned first, reducing the number of speculative
|
||||
* connections which need to be made to other nodes.
|
||||
*
|
||||
* If master_conninfo_out points to allocated memory of MAXCONNINFO in length,
|
||||
* the primary server's conninfo string will be copied there.
|
||||
*/
|
||||
|
||||
PGconn *
|
||||
get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
int *master_id, char *master_conninfo_out)
|
||||
{
|
||||
PGconn *master_conn = NULL;
|
||||
PGresult *res1;
|
||||
PGresult *res2;
|
||||
PGconn *remote_conn = NULL;
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
char master_conninfo_stack[MAXCONNINFO];
|
||||
char *master_conninfo = &*master_conninfo_stack;
|
||||
char remote_conninfo_stack[MAXCONNINFO];
|
||||
char *remote_conninfo = &*remote_conninfo_stack;
|
||||
|
||||
int i,
|
||||
node_id;
|
||||
@@ -581,59 +604,60 @@ get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
}
|
||||
|
||||
/* find all nodes belonging to this cluster */
|
||||
log_info(_("finding node list for cluster '%s'\n"),
|
||||
log_info(_("retrieving node list for cluster '%s'\n"),
|
||||
cluster);
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT id, conninfo "
|
||||
" FROM %s.repl_nodes "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND type != 'witness' ",
|
||||
" SELECT id, conninfo, "
|
||||
" CASE WHEN type = 'master' THEN 1 ELSE 2 END AS type_priority"
|
||||
" FROM %s.repl_nodes "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND type != 'witness' "
|
||||
"ORDER BY active DESC, type_priority, priority, id",
|
||||
get_repmgr_schema_quoted(standby_conn),
|
||||
cluster);
|
||||
|
||||
res1 = PQexec(standby_conn, sqlquery);
|
||||
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
|
||||
log_verbose(LOG_DEBUG, "get_master_connection():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(standby_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("unable to retrieve node records: %s\n"),
|
||||
PQerrorMessage(standby_conn));
|
||||
PQclear(res1);
|
||||
PQclear(res);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (i = 0; i < PQntuples(res1); i++)
|
||||
for (i = 0; i < PQntuples(res); i++)
|
||||
{
|
||||
/* initialize with the values of the current node being processed */
|
||||
node_id = atoi(PQgetvalue(res1, i, 0));
|
||||
strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO);
|
||||
log_info(_("checking role of cluster node '%i'\n"),
|
||||
node_id);
|
||||
master_conn = establish_db_connection(master_conninfo, false);
|
||||
int is_node_standby;
|
||||
|
||||
if (PQstatus(master_conn) != CONNECTION_OK)
|
||||
/* initialize with the values of the current node being processed */
|
||||
node_id = atoi(PQgetvalue(res, i, 0));
|
||||
strncpy(remote_conninfo, PQgetvalue(res, i, 1), MAXCONNINFO);
|
||||
log_verbose(LOG_INFO,
|
||||
_("checking role of cluster node '%i'\n"),
|
||||
node_id);
|
||||
remote_conn = establish_db_connection(remote_conninfo, false);
|
||||
|
||||
if (PQstatus(remote_conn) != CONNECTION_OK)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Can't use the is_standby() function here because on error that
|
||||
* function closes the connection passed and exits. This still needs
|
||||
* to close master_conn first.
|
||||
*/
|
||||
res2 = PQexec(master_conn, "SELECT pg_is_in_recovery()");
|
||||
is_node_standby = is_standby(remote_conn);
|
||||
|
||||
if (PQresultStatus(res2) != PGRES_TUPLES_OK)
|
||||
if (is_node_standby == -1)
|
||||
{
|
||||
log_err(_("unable to retrieve recovery state from this node: %s\n"),
|
||||
PQerrorMessage(master_conn));
|
||||
PQclear(res2);
|
||||
PQfinish(master_conn);
|
||||
log_err(_("unable to retrieve recovery state from node %i:\n%s\n"),
|
||||
node_id,
|
||||
PQerrorMessage(remote_conn));
|
||||
PQfinish(remote_conn);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* if false, this is the master */
|
||||
if (strcmp(PQgetvalue(res2, 0, 0), "f") == 0)
|
||||
/* if is_standby() returns 0, queried node is the master */
|
||||
if (is_node_standby == 0)
|
||||
{
|
||||
PQclear(res2);
|
||||
PQclear(res1);
|
||||
PQclear(res);
|
||||
log_debug(_("get_master_connection(): current master node is %i\n"), node_id);
|
||||
|
||||
if (master_id != NULL)
|
||||
@@ -641,14 +665,12 @@ get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
*master_id = node_id;
|
||||
}
|
||||
|
||||
return master_conn;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* if it is a standby, clear info */
|
||||
PQclear(res2);
|
||||
PQfinish(master_conn);
|
||||
return remote_conn;
|
||||
}
|
||||
|
||||
|
||||
/* if it is a standby, clear connection info and continue*/
|
||||
PQfinish(remote_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -659,7 +681,7 @@ get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
* Probably we will need to check the error to know if we need to start
|
||||
* failover procedure or just fix some situation on the standby.
|
||||
*/
|
||||
PQclear(res1);
|
||||
PQclear(res);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -687,7 +709,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
{
|
||||
if (PQconsumeInput(conn) == 0)
|
||||
{
|
||||
log_warning(_("wait_connection_availability: could not receive data from connection. %s\n"),
|
||||
log_warning(_("wait_connection_availability(): could not receive data from connection. %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
return 0;
|
||||
}
|
||||
@@ -714,7 +736,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
if (select(sock, &read_set, NULL, NULL, &tmout) == -1)
|
||||
{
|
||||
log_warning(
|
||||
_("wait_connection_availability: select() returned with error: %s"),
|
||||
_("wait_connection_availability(): select() returned with error\n%s\n"),
|
||||
strerror(errno));
|
||||
return -1;
|
||||
}
|
||||
@@ -730,7 +752,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
return 1;
|
||||
}
|
||||
|
||||
log_warning(_("wait_connection_availability: timeout reached"));
|
||||
log_warning(_("wait_connection_availability(): timeout reached"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -765,6 +787,12 @@ cancel_query(PGconn *conn, int timeout)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Return the repmgr schema as an unmodified string.
 *
 * This is useful for displaying the schema name in log messages; however,
 * for inclusion in SQL statements get_repmgr_schema_quoted() should
 * always be used.
 */
|
||||
char *
|
||||
get_repmgr_schema(void)
|
||||
{
|
||||
@@ -806,6 +834,8 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
" WHERE slot_name = '%s' ",
|
||||
slot_name);
|
||||
|
||||
log_verbose(LOG_DEBUG, "create_replication_slot():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -826,8 +856,8 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
if (strcmp(PQgetvalue(res, 0, 0), "f") == 0)
|
||||
{
|
||||
PQclear(res);
|
||||
log_debug(_("Replication slot '%s' exists but is inactive; reusing\n"),
|
||||
slot_name);
|
||||
log_debug("Replication slot '%s' exists but is inactive; reusing\n",
|
||||
slot_name);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -842,6 +872,7 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
slot_name);
|
||||
|
||||
log_debug(_("create_replication_slot(): Creating slot '%s' on primary\n"), slot_name);
|
||||
log_verbose(LOG_DEBUG, "create_replication_slot():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -857,6 +888,33 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
drop_replication_slot(PGconn *conn, char *slot_name)
|
||||
{
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
PGresult *res;
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_drop_replication_slot('%s')",
|
||||
slot_name);
|
||||
|
||||
log_verbose(LOG_DEBUG, "drop_replication_slot():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("unable to drop replication slot \"%s\":\n %s\n"),
|
||||
slot_name,
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "replication slot \"%s\" successfully dropped\n",
|
||||
slot_name);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
|
||||
@@ -865,11 +923,11 @@ start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
|
||||
PGresult *res;
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_xlogfile_name(pg_start_backup('repmgr_standby_clone_%ld', %s))",
|
||||
"SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_start_backup('repmgr_standby_clone_%ld', %s))",
|
||||
time(NULL),
|
||||
fast_checkpoint ? "TRUE" : "FALSE");
|
||||
|
||||
log_debug(_("standby clone: %s\n"), sqlquery);
|
||||
log_verbose(LOG_DEBUG, "start_backup():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -884,7 +942,7 @@ start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
|
||||
char *first_wal_seg_pq = PQgetvalue(res, 0, 0);
|
||||
size_t buf_sz = strlen(first_wal_seg_pq);
|
||||
|
||||
first_wal_segment = malloc(buf_sz + 1);
|
||||
first_wal_segment = pg_malloc0(buf_sz + 1);
|
||||
xsnprintf(first_wal_segment, buf_sz + 1, "%s", first_wal_seg_pq);
|
||||
}
|
||||
|
||||
@@ -900,7 +958,7 @@ stop_backup(PGconn *conn, char *last_wal_segment)
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
PGresult *res;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_xlogfile_name(pg_stop_backup())");
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_stop_backup())");
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -915,7 +973,7 @@ stop_backup(PGconn *conn, char *last_wal_segment)
|
||||
char *last_wal_seg_pq = PQgetvalue(res, 0, 0);
|
||||
size_t buf_sz = strlen(last_wal_seg_pq);
|
||||
|
||||
last_wal_segment = malloc(buf_sz + 1);
|
||||
last_wal_segment = pg_malloc0(buf_sz + 1);
|
||||
xsnprintf(last_wal_segment, buf_sz + 1, "%s", last_wal_seg_pq);
|
||||
}
|
||||
|
||||
@@ -936,6 +994,8 @@ set_config_bool(PGconn *conn, const char *config_param, bool state)
|
||||
config_param,
|
||||
state ? "TRUE" : "FALSE");
|
||||
|
||||
log_verbose(LOG_DEBUG, "set_config_bool():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
@@ -967,11 +1027,13 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
int i;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_nodes", get_repmgr_schema_quoted(witnessconn));
|
||||
log_debug("copy_configuration: %s\n", sqlquery);
|
||||
|
||||
log_verbose(LOG_DEBUG, "copy_configuration():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(witnessconn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
fprintf(stderr, "Cannot clean node details in the witness, %s\n",
|
||||
log_err(_("Unable to truncate witness servers's repl_nodes table:\n%s\n"),
|
||||
PQerrorMessage(witnessconn));
|
||||
return false;
|
||||
}
|
||||
@@ -979,10 +1041,13 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name FROM %s.repl_nodes",
|
||||
get_repmgr_schema_quoted(masterconn));
|
||||
|
||||
log_verbose(LOG_DEBUG, "copy_configuration():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(masterconn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
fprintf(stderr, "Can't get configuration from master: %s\n",
|
||||
log_err("Unable to retrieve node records from master:\n%s\n",
|
||||
PQerrorMessage(masterconn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
@@ -991,9 +1056,11 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
for (i = 0; i < PQntuples(res); i++)
|
||||
{
|
||||
bool node_record_created;
|
||||
char *witness = PQgetvalue(res, i, 4);
|
||||
|
||||
log_debug(_("copy_configuration(): %s\n"), witness);
|
||||
log_verbose(LOG_DEBUG,
|
||||
"copy_configuration(): writing node record for node %s (id: %s)\n",
|
||||
PQgetvalue(res, i, 4),
|
||||
PQgetvalue(res, i, 0));
|
||||
|
||||
node_record_created = create_node_record(witnessconn,
|
||||
"copy_configuration",
|
||||
@@ -1013,7 +1080,9 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
|
||||
if (node_record_created == false)
|
||||
{
|
||||
fprintf(stderr, "Unable to copy node record to witness database: %s\n",
|
||||
PQclear(res);
|
||||
|
||||
log_err("Unable to copy node record to witness database\n%s\n",
|
||||
PQerrorMessage(witnessconn));
|
||||
return false;
|
||||
}
|
||||
@@ -1069,6 +1138,7 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
|
||||
maxlen_snprintf(slot_name_buf, "%s", "NULL");
|
||||
}
|
||||
|
||||
/* XXX convert to placeholder query */
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"INSERT INTO %s.repl_nodes "
|
||||
" (id, type, upstream_node_id, cluster, "
|
||||
@@ -1084,16 +1154,18 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
|
||||
slot_name_buf,
|
||||
priority);
|
||||
|
||||
log_verbose(LOG_DEBUG, "create_node_record(): %s\n", sqlquery);
|
||||
|
||||
if (action != NULL)
|
||||
{
|
||||
log_debug(_("%s: %s\n"), action, sqlquery);
|
||||
log_verbose(LOG_DEBUG, "create_node_record(): action is \"%s\"\n", action);
|
||||
}
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_warning(_("Unable to create node record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
log_err(_("Unable to create node record\n%s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
@@ -1115,15 +1187,18 @@ delete_node_record(PGconn *conn, int node, char *action)
|
||||
" WHERE id = %d",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
node);
|
||||
|
||||
log_verbose(LOG_DEBUG, "delete_node_record(): %s\n", sqlquery);
|
||||
|
||||
if (action != NULL)
|
||||
{
|
||||
log_debug(_("%s: %s\n"), action, sqlquery);
|
||||
log_verbose(LOG_DEBUG, "create_node_record(): action is \"%s\"\n", action);
|
||||
}
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_warning(_("Unable to delete node record: %s\n"),
|
||||
log_err(_("Unable to delete node record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
@@ -1195,6 +1270,8 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
" RETURNING event_timestamp ",
|
||||
get_repmgr_schema_quoted(conn));
|
||||
|
||||
log_verbose(LOG_DEBUG, "create_event_record():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexecParams(conn,
|
||||
sqlquery,
|
||||
4,
|
||||
@@ -1206,7 +1283,6 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
|
||||
log_warning(_("Unable to create event record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
|
||||
@@ -1217,7 +1293,7 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
{
|
||||
/* Store timestamp to send to the notification command */
|
||||
strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
log_debug(_("Event timestamp is: %s\n"), event_timestamp);
|
||||
log_verbose(LOG_DEBUG, "create_event_record(): Event timestamp is \"%s\"\n", event_timestamp);
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
@@ -1337,12 +1413,13 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
|
||||
*dst_ptr = '\0';
|
||||
|
||||
log_debug(_("Executing: %s\n"), parsed_command);
|
||||
log_debug("create_event_record(): executing\n%s\n", parsed_command);
|
||||
|
||||
r = system(parsed_command);
|
||||
if (r != 0)
|
||||
{
|
||||
log_warning(_("Unable to execute event notification command\n"));
|
||||
log_info(_("Parsed event notification command was:\n%s\n"), parsed_command);
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
@@ -1350,6 +1427,50 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
return success;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update node record following change of status
|
||||
* (e.g. inactive primary converted to standby)
|
||||
*/
|
||||
bool
|
||||
update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active)
|
||||
{
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
" UPDATE %s.repl_nodes "
|
||||
" SET type = '%s', "
|
||||
" upstream_node_id = %i, "
|
||||
" active = %s "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND id = %i ",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
type,
|
||||
upstream_node_id,
|
||||
active ? "TRUE" : "FALSE",
|
||||
cluster_name,
|
||||
this_node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "update_node_record_status():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to update node record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id)
|
||||
{
|
||||
@@ -1367,6 +1488,9 @@ update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_
|
||||
new_upstream_node_id,
|
||||
cluster_name,
|
||||
this_node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "update_node_record_set_upstream():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
@@ -1398,7 +1522,7 @@ get_node_record(PGconn *conn, char *cluster, int node_id)
|
||||
cluster,
|
||||
node_id);
|
||||
|
||||
log_debug("get_node_record(): %s\n", sqlquery);
|
||||
log_verbose(LOG_DEBUG, "get_node_record():\n%s\n", sqlquery);
|
||||
|
||||
return PQexec(conn, sqlquery);
|
||||
}
|
||||
|
||||
45
dbutils.h
45
dbutils.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* dbutils.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -20,10 +20,51 @@
|
||||
#ifndef _REPMGR_DBUTILS_H_
|
||||
#define _REPMGR_DBUTILS_H_
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
|
||||
#include "config.h"
|
||||
#include "strutil.h"
|
||||
|
||||
|
||||
/* Server type of a node; UNKNOWN is the zero value */
typedef enum
{
	UNKNOWN = 0,
	MASTER = 1,
	STANDBY = 2,
	WITNESS = 3
} t_server_type;
|
||||
|
||||
/*
|
||||
* Struct to store node information
|
||||
*/
|
||||
typedef struct s_node_info
|
||||
{
|
||||
int node_id;
|
||||
int upstream_node_id;
|
||||
t_server_type type;
|
||||
char name[MAXLEN];
|
||||
char conninfo_str[MAXLEN];
|
||||
char slot_name[MAXLEN];
|
||||
int priority;
|
||||
bool active;
|
||||
bool is_ready;
|
||||
bool is_visible;
|
||||
XLogRecPtr xlog_location;
|
||||
} t_node_info;
|
||||
|
||||
|
||||
/*
 * Positional initializer for t_node_info; each value is annotated with
 * the struct member it is assigned to.
 */
#define T_NODE_INFO_INITIALIZER { \
	NODE_NOT_FOUND,		/* node_id */ \
	NO_UPSTREAM_NODE,	/* upstream_node_id */ \
	UNKNOWN,			/* type */ \
	"",					/* name */ \
	"",					/* conninfo_str */ \
	"",					/* slot_name */ \
	DEFAULT_PRIORITY,	/* priority */ \
	true,				/* active */ \
	false,				/* is_ready */ \
	false,				/* is_visible */ \
	InvalidXLogRecPtr	/* xlog_location */ \
}
|
||||
|
||||
PGconn *establish_db_connection(const char *conninfo,
|
||||
const bool exit_on_error);
|
||||
@@ -58,6 +99,7 @@ bool cancel_query(PGconn *conn, int timeout);
|
||||
char *get_repmgr_schema(void);
|
||||
char *get_repmgr_schema_quoted(PGconn *conn);
|
||||
bool create_replication_slot(PGconn *conn, char *slot_name);
|
||||
bool drop_replication_slot(PGconn *conn, char *slot_name);
|
||||
|
||||
bool start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint);
|
||||
bool stop_backup(PGconn *conn, char *last_wal_segment);
|
||||
@@ -66,6 +108,7 @@ bool copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_
|
||||
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name);
|
||||
bool delete_node_record(PGconn *conn, int node, char *action);
|
||||
bool create_event_record(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details);
|
||||
bool update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active);
|
||||
bool update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id);
|
||||
PGresult * get_node_record(PGconn *conn, char *cluster, int node_id);
|
||||
|
||||
|
||||
2
debian/repmgr.repmgrd.init
vendored
2
debian/repmgr.repmgrd.init
vendored
@@ -59,7 +59,7 @@ do_stop()
|
||||
# 0 if daemon has been stopped
|
||||
# 1 if daemon was already stopped
|
||||
# other if daemon could not be stopped or a failure occurred
|
||||
start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 --pidfile $REPMGRD_PIDFILE --exec $REPMGRD_BIN
|
||||
start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 --pidfile $REPMGRD_PIDFILE --name "$(basename $REPMGRD_BIN)"
|
||||
}
|
||||
|
||||
case "$1" in
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* errcode.h
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -36,5 +36,6 @@
|
||||
#define ERR_SYS_FAILURE 13
|
||||
#define ERR_BAD_BASEBACKUP 14
|
||||
#define ERR_INTERNAL 15
|
||||
#define ERR_MONITORING_FAIL 16
|
||||
|
||||
#endif /* _ERRCODE_H_ */
|
||||
|
||||
130
log.c
130
log.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* log.c - Logging methods
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This module is a set of methods for logging (currently only syslog)
|
||||
*
|
||||
@@ -39,13 +39,37 @@
|
||||
|
||||
/* #define REPMGR_DEBUG */
|
||||
|
||||
static int detect_log_facility(const char *facility);
|
||||
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap);
|
||||
|
||||
int log_type = REPMGR_STDERR;
|
||||
int log_level = LOG_NOTICE;
|
||||
int last_log_level = LOG_NOTICE;
|
||||
int verbose_logging = false;
|
||||
int terse_logging = false;
|
||||
|
||||
/*
 * Variadic entry point for stderr logging: gathers the printf-style
 * arguments following "fmt" and hands them to _stderr_log_with_level().
 */
void
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
{
	va_list		args;

	va_start(args, fmt);
	_stderr_log_with_level(level_name, level, fmt, args);
	va_end(args);
}
|
||||
|
||||
static void
|
||||
_stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap)
|
||||
{
|
||||
time_t t;
|
||||
struct tm *tm;
|
||||
char buff[100];
|
||||
va_list ap;
|
||||
|
||||
/*
|
||||
* Store the requested level so that if there's a subsequent
|
||||
* log_hint(), we can suppress that if appropriate.
|
||||
*/
|
||||
last_log_level = level;
|
||||
|
||||
if (log_level >= level)
|
||||
{
|
||||
@@ -54,24 +78,74 @@ stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
|
||||
strftime(buff, 100, "[%Y-%m-%d %H:%M:%S]", tm);
|
||||
fprintf(stderr, "%s [%s] ", buff, level_name);
|
||||
|
||||
va_start(ap, fmt);
|
||||
vfprintf(stderr, fmt, ap);
|
||||
va_end(ap);
|
||||
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
log_hint(const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
static int detect_log_level(const char *level);
|
||||
static int detect_log_facility(const char *facility);
|
||||
if (terse_logging == false)
|
||||
{
|
||||
va_start(ap, fmt);
|
||||
_stderr_log_with_level("HINT", last_log_level, fmt, ap);
|
||||
va_end(ap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
log_verbose(int level, const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, fmt);
|
||||
|
||||
if (verbose_logging == true)
|
||||
{
|
||||
switch(level)
|
||||
{
|
||||
case LOG_EMERG:
|
||||
_stderr_log_with_level("EMERG", level, fmt, ap);
|
||||
break;
|
||||
case LOG_ALERT:
|
||||
_stderr_log_with_level("ALERT", level, fmt, ap);
|
||||
break;
|
||||
case LOG_CRIT:
|
||||
_stderr_log_with_level("CRIT", level, fmt, ap);
|
||||
break;
|
||||
case LOG_ERR:
|
||||
_stderr_log_with_level("ERR", level, fmt, ap);
|
||||
break;
|
||||
case LOG_WARNING:
|
||||
_stderr_log_with_level("WARNING", level, fmt, ap);
|
||||
break;
|
||||
case LOG_NOTICE:
|
||||
_stderr_log_with_level("NOTICE", level, fmt, ap);
|
||||
break;
|
||||
case LOG_INFO:
|
||||
_stderr_log_with_level("INFO", level, fmt, ap);
|
||||
break;
|
||||
case LOG_DEBUG:
|
||||
_stderr_log_with_level("DEBUG", level, fmt, ap);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
int log_type = REPMGR_STDERR;
|
||||
int log_level = LOG_NOTICE;
|
||||
|
||||
bool
|
||||
logger_init(t_configuration_options * opts, const char *ident, const char *level, const char *facility)
|
||||
logger_init(t_configuration_options * opts, const char *ident)
|
||||
{
|
||||
char *level = opts->loglevel;
|
||||
char *facility = opts->logfacility;
|
||||
|
||||
int l;
|
||||
int f;
|
||||
|
||||
@@ -95,10 +169,10 @@ logger_init(t_configuration_options * opts, const char *ident, const char *level
|
||||
printf("Assigned level for logger: %d\n", l);
|
||||
#endif
|
||||
|
||||
if (l > 0)
|
||||
if (l >= 0)
|
||||
log_level = l;
|
||||
else
|
||||
stderr_log_warning(_("Cannot detect log level %s (use any of DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG)\n"), level);
|
||||
stderr_log_warning(_("Invalid log level \"%s\" (available values: DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG)\n"), level);
|
||||
}
|
||||
|
||||
if (facility && *facility)
|
||||
@@ -174,9 +248,9 @@ logger_init(t_configuration_options * opts, const char *ident, const char *level
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
logger_shutdown(void)
|
||||
{
|
||||
@@ -189,17 +263,32 @@ logger_shutdown(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* Set a minimum logging level. Intended for command line verbosity
|
||||
* options, which might increase requested logging over what's specified
|
||||
* in the regular configuration file.
|
||||
* Indicate whether extra-verbose logging is required. This will
|
||||
* generate a lot of output, particularly debug logging, and should
|
||||
* not be permanently enabled in production.
|
||||
*
|
||||
* NOTE: in previous repmgr versions, this option forced the log
|
||||
* level to INFO.
|
||||
*/
|
||||
void
|
||||
logger_min_verbose(int minimum)
|
||||
logger_set_verbose(void)
|
||||
{
|
||||
if (log_level < minimum)
|
||||
log_level = minimum;
|
||||
verbose_logging = true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Indicate whether some non-critical log messages can be omitted.
|
||||
* Currently this includes warnings about irrelevant command line
|
||||
* options and hints.
|
||||
*/
|
||||
|
||||
void logger_set_terse(void)
|
||||
{
|
||||
terse_logging = true;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
detect_log_level(const char *level)
|
||||
{
|
||||
@@ -220,17 +309,16 @@ detect_log_level(const char *level)
|
||||
if (!strcmp(level, "EMERG"))
|
||||
return LOG_EMERG;
|
||||
|
||||
return 0;
|
||||
return -1;
|
||||
}
|
||||
|
||||
int
|
||||
static int
|
||||
detect_log_facility(const char *facility)
|
||||
{
|
||||
int local = 0;
|
||||
|
||||
if (!strncmp(facility, "LOCAL", 5) && strlen(facility) == 6)
|
||||
{
|
||||
|
||||
local = atoi(&facility[5]);
|
||||
|
||||
switch (local)
|
||||
|
||||
14
log.h
14
log.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* log.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -112,13 +112,19 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||
#endif
|
||||
|
||||
|
||||
int detect_log_level(const char *level);
|
||||
|
||||
/* Logger initialisation and shutdown */
|
||||
|
||||
bool logger_init(t_configuration_options * opts, const char *ident);
|
||||
|
||||
bool logger_shutdown(void);
|
||||
|
||||
bool logger_init(t_configuration_options * opts, const char *ident,
|
||||
const char *level, const char *facility);
|
||||
void logger_set_verbose(void);
|
||||
void logger_set_terse(void);
|
||||
|
||||
void logger_min_verbose(int minimum);
|
||||
void log_hint(const char *fmt, ...);
|
||||
void log_verbose(int level, const char *fmt, ...);
|
||||
|
||||
extern int log_type;
|
||||
extern int log_level;
|
||||
|
||||
@@ -16,11 +16,15 @@ cluster=example_cluster
|
||||
# Node ID and name
|
||||
# (Note: we recommend to avoid naming nodes after their initial
|
||||
# replication function, as this will cause confusion when e.g.
|
||||
# "standby2" is promoted to master)
|
||||
node=2
|
||||
node_name=node2
|
||||
# "standby2" is promoted to primary)
|
||||
node=2 # a unique integer
|
||||
node_name=node2 # an arbitrary (but unique) string; we recommend using
|
||||
# the server's hostname or another identifier unambiguously
|
||||
# associated with the server to avoid confusion
|
||||
|
||||
# Database connection information
|
||||
# Database connection information as a conninfo string
|
||||
# This must be accessible to all servers in the cluster; for details see:
|
||||
# http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING
|
||||
conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
|
||||
|
||||
# Optional configuration items
|
||||
@@ -32,7 +36,7 @@ conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
|
||||
# when using cascading replication and a standby is to be connected to an
|
||||
# upstream standby, specify that node's ID with 'upstream_node'. The node
|
||||
# must exist before the new standby can be registered. If a standby is
|
||||
# to connect directly to a master node, this parameter is not required.
|
||||
# to connect directly to a primary node, this parameter is not required.
|
||||
#
|
||||
# upstream_node=1
|
||||
|
||||
@@ -40,7 +44,9 @@ conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
|
||||
# (default: 0)
|
||||
#
|
||||
# use_replication_slots=0
|
||||
|
||||
#
|
||||
# NOTE: 'max_replication_slots' should be configured for at least the
|
||||
# number of standbys which will connect to the primary.
|
||||
|
||||
# Logging and monitoring settings
|
||||
# -------------------------------
|
||||
@@ -110,28 +116,29 @@ logfacility=STDERR
|
||||
#
|
||||
# These settings are only applied when repmgrd is running.
|
||||
|
||||
# How many seconds we wait for master response before declaring master failure
|
||||
# Number of seconds to wait for a response from the primary server before
|
||||
# deciding it has failed
|
||||
|
||||
master_response_timeout=60
|
||||
|
||||
# How many time we try to reconnect to master before starting failover procedure
|
||||
# Number of times to try and reconnect to the primary before starting
|
||||
# the failover procedure
|
||||
reconnect_attempts=6
|
||||
reconnect_interval=10
|
||||
|
||||
# Autofailover options
|
||||
failover=automatic # one of 'automatic', 'manual'
|
||||
priority=100 # a value of zero or less prevents the node being promoted to master
|
||||
priority=100 # a value of zero or less prevents the node being promoted to primary
|
||||
promote_command='repmgr standby promote -f /path/to/repmgr.conf'
|
||||
follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'
|
||||
|
||||
# monitoring interval; default is 2s
|
||||
# monitoring interval in seconds; default is 2
|
||||
#
|
||||
# monitor_interval_secs=2
|
||||
|
||||
# change wait time for master; before we bail out and exit when the master
|
||||
# change wait time for primary; before we bail out and exit when the primary
|
||||
# disappears, we wait 'reconnect_attempts' * 'retry_promote_interval_secs'
|
||||
# seconds; by default this would be half an hour, as 'retry_promote_interval_secs'
|
||||
# default value is 300
|
||||
#
|
||||
# retry_promote_interval_secs=300
|
||||
|
||||
|
||||
|
||||
27
repmgr.h
27
repmgr.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -36,7 +36,6 @@
|
||||
#define MAXFILENAME 1024
|
||||
#define ERRBUFF_SIZE 512
|
||||
|
||||
#define DEFAULT_CONFIG_FILE "./repmgr.conf"
|
||||
#define DEFAULT_WAL_KEEP_SEGMENTS "5000"
|
||||
#define DEFAULT_DEST_DIR "."
|
||||
#define DEFAULT_MASTER_PORT "5432"
|
||||
@@ -49,14 +48,7 @@
|
||||
#define AUTOMATIC_FAILOVER 1
|
||||
#define NODE_NOT_FOUND -1
|
||||
#define NO_UPSTREAM_NODE -1
|
||||
|
||||
|
||||
typedef enum {
|
||||
UNKNOWN = 0,
|
||||
MASTER,
|
||||
STANDBY,
|
||||
WITNESS
|
||||
} t_server_type;
|
||||
#define UNKNOWN_NODE_ID -1
|
||||
|
||||
|
||||
|
||||
@@ -73,6 +65,7 @@ typedef struct
|
||||
char superuser[MAXLEN];
|
||||
char wal_keep_segments[MAXLEN];
|
||||
bool verbose;
|
||||
bool terse;
|
||||
bool force;
|
||||
bool wait_for_master;
|
||||
bool ignore_rsync_warn;
|
||||
@@ -82,6 +75,7 @@ typedef struct
|
||||
bool ignore_external_config_files;
|
||||
char masterport[MAXLEN];
|
||||
char localport[MAXLEN];
|
||||
char loglevel[MAXLEN];
|
||||
|
||||
/* parameter used by CLUSTER CLEANUP */
|
||||
int keep_history;
|
||||
@@ -91,20 +85,9 @@ typedef struct
|
||||
char recovery_min_apply_delay[MAXLEN];
|
||||
} t_runtime_options;
|
||||
|
||||
#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, "", "", 0, "", "" }
|
||||
#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, "", "", "", 0, "", "" }
|
||||
|
||||
extern char repmgr_schema[MAXLEN];
|
||||
|
||||
typedef struct ErrorListCell
|
||||
{
|
||||
struct ErrorListCell *next;
|
||||
char *error_message;
|
||||
} ErrorListCell;
|
||||
|
||||
typedef struct ErrorList
|
||||
{
|
||||
ErrorListCell *head;
|
||||
ErrorListCell *tail;
|
||||
} ErrorList;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* repmgr.sql
|
||||
*
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
310
repmgrd.c
310
repmgrd.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgrd.c - Replication manager daemon
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This module connects to the nodes of a replication cluster and monitors
|
||||
* how far are they from master
|
||||
@@ -41,22 +41,6 @@
|
||||
#include "access/xlogdefs.h"
|
||||
#include "pqexpbuffer.h"
|
||||
|
||||
/*
|
||||
* Struct to store node information
|
||||
*/
|
||||
typedef struct s_node_info
|
||||
{
|
||||
int node_id;
|
||||
int upstream_node_id;
|
||||
char conninfo_str[MAXLEN];
|
||||
XLogRecPtr xlog_location;
|
||||
t_server_type type;
|
||||
bool is_ready;
|
||||
bool is_visible;
|
||||
char slot_name[MAXLEN];
|
||||
bool active;
|
||||
} t_node_info;
|
||||
|
||||
|
||||
|
||||
/* Local info */
|
||||
@@ -68,9 +52,7 @@ t_configuration_options master_options;
|
||||
|
||||
PGconn *master_conn = NULL;
|
||||
|
||||
const char *progname;
|
||||
|
||||
char *config_file = DEFAULT_CONFIG_FILE;
|
||||
char *config_file = "";
|
||||
bool verbose = false;
|
||||
bool monitoring_history = false;
|
||||
t_node_info node_info;
|
||||
@@ -81,7 +63,7 @@ char *pid_file = NULL;
|
||||
|
||||
t_configuration_options config = T_CONFIGURATION_OPTIONS_INITIALIZER;
|
||||
|
||||
static void help(const char *progname);
|
||||
static void help(void);
|
||||
static void usage(void);
|
||||
static void check_cluster_configuration(PGconn *conn);
|
||||
static void check_node_configuration(void);
|
||||
@@ -89,7 +71,7 @@ static void check_node_configuration(void);
|
||||
static void standby_monitor(void);
|
||||
static void witness_monitor(void);
|
||||
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
|
||||
static bool set_local_node_failed(void);
|
||||
static bool set_local_node_status(void);
|
||||
|
||||
static void update_shared_memory(char *last_wal_standby_applied);
|
||||
static void update_registration(void);
|
||||
@@ -158,9 +140,10 @@ main(int argc, char **argv)
|
||||
FILE *fd;
|
||||
|
||||
int server_version_num = 0;
|
||||
progname = get_progname(argv[0]);
|
||||
|
||||
while ((c = getopt_long(argc, argv, "?Vf:v:mdp:", long_options, &optindex)) != -1)
|
||||
set_progname(argv[0]);
|
||||
|
||||
while ((c = getopt_long(argc, argv, "?Vf:vmdp:", long_options, &optindex)) != -1)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
@@ -180,10 +163,10 @@ main(int argc, char **argv)
|
||||
pid_file = optarg;
|
||||
break;
|
||||
case '?':
|
||||
help(progname);
|
||||
help();
|
||||
exit(SUCCESS);
|
||||
case 'V':
|
||||
printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
|
||||
printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
|
||||
exit(SUCCESS);
|
||||
default:
|
||||
usage();
|
||||
@@ -200,7 +183,7 @@ main(int argc, char **argv)
|
||||
* which case we'll need to refactor parse_config() not to abort,
|
||||
* and return the error message.
|
||||
*/
|
||||
load_config(config_file, &local_options, argv[0]);
|
||||
load_config(config_file, verbose, &local_options, argv[0]);
|
||||
|
||||
if (daemonize)
|
||||
{
|
||||
@@ -230,10 +213,9 @@ main(int argc, char **argv)
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
logger_init(&local_options, progname, local_options.loglevel,
|
||||
local_options.logfacility);
|
||||
logger_init(&local_options, progname());
|
||||
if (verbose)
|
||||
logger_min_verbose(LOG_INFO);
|
||||
logger_set_verbose();
|
||||
|
||||
if (log_type == REPMGR_SYSLOG)
|
||||
{
|
||||
@@ -247,6 +229,7 @@ main(int argc, char **argv)
|
||||
}
|
||||
|
||||
/* Initialise the repmgr schema name */
|
||||
/* XXX check this handles quoting properly */
|
||||
maxlen_snprintf(repmgr_schema, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX,
|
||||
local_options.cluster_name);
|
||||
|
||||
@@ -264,7 +247,7 @@ main(int argc, char **argv)
|
||||
if (server_version_num > 0)
|
||||
{
|
||||
log_err(_("%s requires PostgreSQL %s or later\n"),
|
||||
progname,
|
||||
progname(),
|
||||
MIN_SUPPORTED_VERSION) ;
|
||||
}
|
||||
else
|
||||
@@ -282,7 +265,7 @@ main(int argc, char **argv)
|
||||
if (node_info.node_id == NODE_NOT_FOUND)
|
||||
{
|
||||
log_err(_("No metadata record found for this node - terminating\n"));
|
||||
log_notice(_("HINT: was this node registered with 'repmgr (master|standby) register'?\n"));
|
||||
log_hint(_("Check that 'repmgr (master|standby) register' was executed for this node\n"));
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -407,7 +390,7 @@ main(int argc, char **argv)
|
||||
|
||||
appendPQExpBuffer(&errmsg,
|
||||
_("unable to connect to master node '%s'"),
|
||||
local_options.cluster_name);
|
||||
master_options.node_name);
|
||||
|
||||
log_err("%s\n", errmsg.data);
|
||||
|
||||
@@ -457,7 +440,7 @@ main(int argc, char **argv)
|
||||
|
||||
do
|
||||
{
|
||||
log_debug("standby check loop...\n");
|
||||
log_verbose(LOG_DEBUG, "standby check loop...\n");
|
||||
|
||||
if (node_info.type == WITNESS)
|
||||
{
|
||||
@@ -467,6 +450,7 @@ main(int argc, char **argv)
|
||||
{
|
||||
standby_monitor();
|
||||
}
|
||||
|
||||
sleep(local_options.monitor_interval_secs);
|
||||
|
||||
if (got_SIGHUP)
|
||||
@@ -558,10 +542,10 @@ witness_monitor(void)
|
||||
{
|
||||
log_warning(
|
||||
_("unable to determine a valid master server; waiting %i seconds to retry...\n"),
|
||||
local_options.reconnect_intvl
|
||||
local_options.reconnect_interval
|
||||
);
|
||||
PQfinish(master_conn);
|
||||
sleep(local_options.reconnect_intvl);
|
||||
sleep(local_options.reconnect_interval);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -674,6 +658,7 @@ standby_monitor(void)
|
||||
char last_wal_standby_received[MAXLEN];
|
||||
char last_wal_standby_applied[MAXLEN];
|
||||
char last_wal_standby_applied_timestamp[MAXLEN];
|
||||
bool last_wal_standby_received_gte_replayed;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
XLogRecPtr lsn_master;
|
||||
@@ -701,23 +686,16 @@ standby_monitor(void)
|
||||
{
|
||||
PQExpBufferData errmsg;
|
||||
|
||||
set_local_node_failed();
|
||||
set_local_node_status();
|
||||
|
||||
initPQExpBuffer(&errmsg);
|
||||
|
||||
appendPQExpBuffer(&errmsg,
|
||||
_("failed to connect to local node, node marked as failed and terminating!"));
|
||||
_("failed to connect to local node, node marked as failed!"));
|
||||
|
||||
log_err("%s\n", errmsg.data);
|
||||
|
||||
create_event_record(master_conn,
|
||||
&local_options,
|
||||
local_options.node,
|
||||
"repmgrd_shutdown",
|
||||
false,
|
||||
errmsg.data);
|
||||
|
||||
terminate(ERR_DB_CON);
|
||||
goto continue_monitoring_standby;
|
||||
}
|
||||
|
||||
upstream_conn = get_upstream_connection(my_local_conn,
|
||||
@@ -738,7 +716,7 @@ standby_monitor(void)
|
||||
check_connection(&upstream_conn, type, upstream_conninfo);
|
||||
/*
|
||||
* This takes up to local_options.reconnect_attempts *
|
||||
* local_options.reconnect_intvl seconds
|
||||
* local_options.reconnect_interval seconds
|
||||
*/
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
@@ -846,6 +824,7 @@ standby_monitor(void)
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
|
||||
continue_monitoring_standby:
|
||||
/* Check if we still are a standby, we could have been promoted */
|
||||
do
|
||||
{
|
||||
@@ -861,10 +840,13 @@ standby_monitor(void)
|
||||
* will require manual resolution as there's no way of determining
|
||||
* which master is the correct one.
|
||||
*
|
||||
* We should log a message so the user knows of the situation at hand.
|
||||
*
|
||||
* XXX check if the original master is still active and display a
|
||||
* warning
|
||||
*/
|
||||
log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
|
||||
log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
|
||||
log_err(_("Check your cluster and manually fix any anomaly.\n"));
|
||||
terminate(1);
|
||||
break;
|
||||
|
||||
@@ -874,8 +856,11 @@ standby_monitor(void)
|
||||
|
||||
if (!check_connection(&my_local_conn, "standby", NULL))
|
||||
{
|
||||
set_local_node_failed();
|
||||
terminate(0);
|
||||
set_local_node_status();
|
||||
/*
|
||||
* Let's continue checking, and if the postgres server on the
|
||||
* standby comes back up, we will activate it again
|
||||
*/
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -884,14 +869,20 @@ standby_monitor(void)
|
||||
|
||||
if (did_retry)
|
||||
{
|
||||
log_info(_("standby connection recovered!\n"));
|
||||
/*
|
||||
* There's a possible situation where the standby went down for some reason
|
||||
* (maintenance for example) and is now up and maybe connected once again to
|
||||
* the stream. If we set the local standby node as failed and it's now running
|
||||
* and receiving replication data, we should activate it again.
|
||||
*/
|
||||
set_local_node_status();
|
||||
log_info(_("standby connection recovered!\n"));
|
||||
}
|
||||
|
||||
/* Fast path for the case where no history is requested */
|
||||
if (!monitoring_history)
|
||||
return;
|
||||
|
||||
|
||||
/*
|
||||
* If original master has gone away we'll need to get the new one
|
||||
* from the upstream node to write monitoring information
|
||||
@@ -953,7 +944,8 @@ standby_monitor(void)
|
||||
/* Get local xlog info */
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
|
||||
"pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp() ");
|
||||
"pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), "
|
||||
"pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()");
|
||||
|
||||
res = PQexec(my_local_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -968,10 +960,30 @@ standby_monitor(void)
|
||||
strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN);
|
||||
strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN);
|
||||
strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
||||
last_wal_standby_received_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
|
||||
? true
|
||||
: false;
|
||||
|
||||
PQclear(res);
|
||||
|
||||
/*
|
||||
* In the unusual event of a standby becoming disconnected from the primary,
|
||||
* while this repmgrd remains connected to the primary, subtracting
|
||||
* "lsn_standby_applied" from "lsn_standby_received" and coercing to
|
||||
* (long long unsigned int) will result in a meaningless, very large
|
||||
* value which will overflow a BIGINT column and spew error messages into the
|
||||
* PostgreSQL log. In the absence of a better strategy, skip attempting
|
||||
* to insert a monitoring record.
|
||||
*/
|
||||
if (last_wal_standby_received_gte_replayed == false)
|
||||
{
|
||||
log_verbose(LOG_WARNING,
|
||||
"Invalid replication_lag value calculated - is this standby connected to its upstream?\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Get master xlog info */
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location()");
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()");
|
||||
|
||||
res = PQexec(master_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -998,10 +1010,10 @@ standby_monitor(void)
|
||||
" last_monitor_time, last_apply_time, "
|
||||
" last_wal_primary_location, last_wal_standby_location, "
|
||||
" replication_lag, apply_lag ) "
|
||||
" VALUES(%d, %d, "
|
||||
" '%s'::TIMESTAMP WITH TIME ZONE, '%s'::TIMESTAMP WITH TIME ZONE, "
|
||||
" '%s', '%s', "
|
||||
" %llu, %llu) ",
|
||||
" VALUES(%d, %d, "
|
||||
" '%s'::TIMESTAMP WITH TIME ZONE, '%s'::TIMESTAMP WITH TIME ZONE, "
|
||||
" '%s', '%s', "
|
||||
" %llu, %llu) ",
|
||||
get_repmgr_schema_quoted(master_conn),
|
||||
master_options.node, local_options.node,
|
||||
monitor_standby_timestamp, last_wal_standby_applied_timestamp,
|
||||
@@ -1013,7 +1025,8 @@ standby_monitor(void)
|
||||
* Execute the query asynchronously, but don't check for a result. We will
|
||||
* check the result next time we pause for a monitor step.
|
||||
*/
|
||||
log_debug("standby_monitor: %s\n", sqlquery);
|
||||
log_verbose(LOG_DEBUG, "standby_monitor:() %s\n", sqlquery);
|
||||
|
||||
if (PQsendQuery(master_conn, sqlquery) == 0)
|
||||
log_warning(_("query could not be sent to master. %s\n"),
|
||||
PQerrorMessage(master_conn));
|
||||
@@ -1055,10 +1068,10 @@ do_master_failover(void)
|
||||
t_node_info nodes[FAILOVER_NODES_MAX_CHECK];
|
||||
|
||||
/* Store details of the failed node here */
|
||||
t_node_info failed_master = {-1, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
|
||||
t_node_info failed_master = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
/* Store details of the best candidate for promotion to master here */
|
||||
t_node_info best_candidate = {-1, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
|
||||
t_node_info best_candidate = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
/* get a list of standby nodes, including myself */
|
||||
sprintf(sqlquery,
|
||||
@@ -1187,12 +1200,13 @@ do_master_failover(void)
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
res = PQexec(node_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_info(_("unable to retrieve node's last standby location: %s\n"),
|
||||
PQerrorMessage(node_conn));
|
||||
|
||||
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||
PQclear(res);
|
||||
PQfinish(node_conn);
|
||||
@@ -1218,7 +1232,7 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
/* last we get info about this node, and update shared memory */
|
||||
sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||
sprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
res = PQexec(my_local_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -1284,7 +1298,7 @@ do_master_failover(void)
|
||||
res = PQexec(node_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not"
|
||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not "
|
||||
"be considered as new master and exit.\n"),
|
||||
PQerrorMessage(node_conn));
|
||||
PQclear(res);
|
||||
@@ -1336,6 +1350,9 @@ do_master_failover(void)
|
||||
PQclear(res);
|
||||
|
||||
/* If position is 0/0, keep checking */
|
||||
/* XXX we should add a timeout here to prevent infinite looping
|
||||
* if the other node's repmgrd is not up
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1413,8 +1430,7 @@ do_master_failover(void)
|
||||
/* wait */
|
||||
sleep(5);
|
||||
|
||||
if (verbose)
|
||||
log_info(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||
|
||||
log_debug(_("promote command is: \"%s\"\n"),
|
||||
local_options.promote_command);
|
||||
@@ -1463,10 +1479,8 @@ do_master_failover(void)
|
||||
/* wait */
|
||||
sleep(10);
|
||||
|
||||
if (verbose)
|
||||
log_info(_("node %d is the best candidate to be the new master, we should follow it...\n"),
|
||||
best_candidate.node_id);
|
||||
log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);
|
||||
log_info(_("node %d is the best candidate for new master, attempting to follow...\n"),
|
||||
best_candidate.node_id);
|
||||
|
||||
/*
|
||||
* The new master may take some time to be promoted. The follow command
|
||||
@@ -1477,57 +1491,23 @@ do_master_failover(void)
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
/*
|
||||
* If 9.4 or later, and replication slots in use, we'll need to create a
|
||||
* slot on the new master
|
||||
*/
|
||||
new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);
|
||||
|
||||
if (local_options.use_replication_slots)
|
||||
{
|
||||
if (create_replication_slot(new_master_conn, node_info.slot_name) == false)
|
||||
{
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to create slot '%s' on the master node: %s"),
|
||||
node_info.slot_name,
|
||||
PQerrorMessage(new_master_conn));
|
||||
|
||||
log_err("%s\n", event_details.data);
|
||||
|
||||
create_event_record(new_master_conn,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(new_master_conn);
|
||||
terminate(ERR_DB_QUERY);
|
||||
}
|
||||
}
|
||||
log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
||||
|
||||
r = system(local_options.follow_command);
|
||||
if (r != 0)
|
||||
{
|
||||
log_err(_("follow command failed. You could check and try it manually.\n"));
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* and reconnect to the local database */
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
|
||||
/* update node information to reflect new status */
|
||||
if (update_node_record_set_upstream(new_master_conn, local_options.cluster_name, node_info.node_id, best_candidate.node_id) == false)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to update node record for node %i (following new upstream node %i)"),
|
||||
node_info.node_id,
|
||||
best_candidate.node_id);
|
||||
_("Unable to execute follow command:\n %s"),
|
||||
local_options.follow_command);
|
||||
|
||||
log_err("%s\n", event_details.data);
|
||||
|
||||
create_event_record(new_master_conn,
|
||||
/* It won't be possible to write to the event notification
|
||||
* table but we should be able to generate an external notification
|
||||
* if required.
|
||||
*/
|
||||
create_event_record(NULL,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1537,13 +1517,20 @@ do_master_failover(void)
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* and reconnect to the local database */
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
|
||||
/* update internal record for this node */
|
||||
new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);
|
||||
|
||||
node_info = get_node_info(new_master_conn, local_options.cluster_name, local_options.node);
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Node %i now following new upstream node %i"),
|
||||
node_info.node_id,
|
||||
best_candidate.node_id);
|
||||
|
||||
log_info("%s\n", event_details.data);
|
||||
|
||||
create_event_record(new_master_conn,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
@@ -1570,6 +1557,8 @@ do_master_failover(void)
|
||||
* It might be worth providing a selection of reconnection strategies
|
||||
* as different behaviour might be desirable in different situations;
|
||||
* or maybe the option not to reconnect might be required?
|
||||
*
|
||||
* XXX check this handles replication slots gracefully
|
||||
*/
|
||||
static bool
|
||||
do_upstream_standby_failover(t_node_info upstream_node)
|
||||
@@ -1578,6 +1567,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
int upstream_node_id = node_info.upstream_node_id;
|
||||
int r;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
log_debug(_("do_upstream_standby_failover(): performing failover for node %i\n"),
|
||||
node_info.node_id);
|
||||
@@ -1647,26 +1637,65 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
sleep(local_options.reconnect_intvl);
|
||||
sleep(local_options.reconnect_interval);
|
||||
}
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = NULL;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
/* Follow new upstream */
|
||||
r = system(local_options.follow_command);
|
||||
if (r != 0)
|
||||
{
|
||||
log_err(_("follow command failed. You could check and try it manually.\n"));
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to execute follow command:\n %s"),
|
||||
local_options.follow_command);
|
||||
|
||||
log_err("%s\n", event_details.data);
|
||||
|
||||
/* It won't be possible to write to the event notification
|
||||
* table but we should be able to generate an external notification
|
||||
* if required.
|
||||
*/
|
||||
create_event_record(NULL,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (update_node_record_set_upstream(master_conn, local_options.cluster_name, node_info.node_id, upstream_node_id) == false)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to set node %i's new upstream ID to %i"),
|
||||
node_info.node_id,
|
||||
upstream_node_id);
|
||||
create_event_record(NULL,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Node %i is now following upstream node %i"),
|
||||
node_info.node_id,
|
||||
upstream_node_id);
|
||||
|
||||
create_event_record(NULL,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
|
||||
return true;
|
||||
@@ -1681,7 +1710,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
|
||||
/*
|
||||
* Check if the node is still available; if after
|
||||
* local_options.reconnect_attempts * local_options.reconnect_intvl
|
||||
* local_options.reconnect_attempts * local_options.reconnect_interval
|
||||
* seconds of retries we cannot reconnect, return false
|
||||
*/
|
||||
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
||||
@@ -1699,9 +1728,9 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
{
|
||||
log_warning(_("connection to %s has been lost, trying to recover... %i seconds before failover decision\n"),
|
||||
type,
|
||||
(local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
|
||||
/* wait local_options.reconnect_intvl seconds between retries */
|
||||
sleep(local_options.reconnect_intvl);
|
||||
(local_options.reconnect_interval * (local_options.reconnect_attempts - connection_retries)));
|
||||
/* wait local_options.reconnect_interval seconds between retries */
|
||||
sleep(local_options.reconnect_interval);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1728,7 +1757,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
|
||||
|
||||
/*
|
||||
* set_local_node_failed()
|
||||
* set_local_node_status()
|
||||
*
|
||||
* If failure of the local node is detected, attempt to connect
|
||||
* to the current master server (as stored in the global variable
|
||||
@@ -1736,16 +1765,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
*/
|
||||
|
||||
static bool
|
||||
set_local_node_failed(void)
|
||||
set_local_node_status(void)
|
||||
{
|
||||
PGresult *res;
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
int active_master_node_id = NODE_NOT_FOUND;
|
||||
int active_master_node_id = NODE_NOT_FOUND;
|
||||
char master_conninfo[MAXLEN];
|
||||
|
||||
if (!check_connection(&master_conn, "master", NULL))
|
||||
{
|
||||
log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
|
||||
log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1799,17 +1828,16 @@ set_local_node_failed(void)
|
||||
|
||||
|
||||
/*
|
||||
* Attempt to set own record as inactive
|
||||
* Attempt to set the active record to the correct value.
|
||||
* First
|
||||
*/
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"UPDATE %s.repl_nodes "
|
||||
" SET active = FALSE "
|
||||
" WHERE id = %i ",
|
||||
get_repmgr_schema_quoted(master_conn),
|
||||
node_info.node_id);
|
||||
|
||||
res = PQexec(master_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
if (!update_node_record_status(master_conn,
|
||||
local_options.cluster_name,
|
||||
node_info.node_id,
|
||||
"standby",
|
||||
node_info.upstream_node_id,
|
||||
is_standby(my_local_conn)==1))
|
||||
{
|
||||
log_err(_("unable to set local node %i as inactive on master: %s\n"),
|
||||
node_info.node_id,
|
||||
@@ -1834,7 +1862,7 @@ check_cluster_configuration(PGconn *conn)
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT oid FROM pg_class "
|
||||
" WHERE oid = '%s.repl_nodes'::regclass ",
|
||||
get_repmgr_schema());
|
||||
get_repmgr_schema_quoted(master_conn));
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -1961,18 +1989,18 @@ lsn_to_xlogrecptr(char *lsn, bool *format_ok)
|
||||
void
|
||||
usage(void)
|
||||
{
|
||||
log_err(_("%s: Replicator manager daemon \n"), progname);
|
||||
log_err(_("Try \"%s --help\" for more information.\n"), progname);
|
||||
log_err(_("%s: Replicator manager daemon \n"), progname());
|
||||
log_err(_("Try \"%s --help\" for more information.\n"), progname());
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
help(const char *progname)
|
||||
help(void)
|
||||
{
|
||||
printf(_("%s: replication management daemon for PostgreSQL\n"), progname);
|
||||
printf(_("%s: replication management daemon for PostgreSQL\n"), progname());
|
||||
printf(_("\n"));
|
||||
printf(_("Usage:\n"));
|
||||
printf(_(" %s [OPTIONS]\n"), progname);
|
||||
printf(_(" %s [OPTIONS]\n"), progname());
|
||||
printf(_("\n"));
|
||||
printf(_("Options:\n"));
|
||||
printf(_(" -?, --help show this help, then exit\n"));
|
||||
@@ -1983,7 +2011,7 @@ help(const char *progname)
|
||||
printf(_(" -d, --daemonize detach process from foreground\n"));
|
||||
printf(_(" -p, --pid-file=PATH write a PID file\n"));
|
||||
printf(_("\n"));
|
||||
printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname);
|
||||
printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname());
|
||||
}
|
||||
|
||||
|
||||
@@ -2021,7 +2049,7 @@ terminate(int retval)
|
||||
unlink(pid_file);
|
||||
}
|
||||
|
||||
log_info(_("%s terminating...\n"), progname);
|
||||
log_info(_("%s terminating...\n"), progname());
|
||||
|
||||
exit(retval);
|
||||
}
|
||||
@@ -2228,7 +2256,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
|
||||
{
|
||||
PGresult *res;
|
||||
|
||||
t_node_info node_info = { NODE_NOT_FOUND, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
|
||||
t_node_info node_info = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
res = get_node_record(conn, cluster, node_id);
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#
|
||||
# Makefile
|
||||
#
|
||||
# Copyright (c) 2ndQuadrant, 2010-2015
|
||||
# Copyright (c) 2ndQuadrant, 2010-2016
|
||||
#
|
||||
|
||||
MODULE_big = repmgr_funcs
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr_function.sql
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* uninstall_repmgr_funcs.sql
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* strutil.c
|
||||
*
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* strutil.h
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* uninstall_repmgr.sql
|
||||
*
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
Reference in New Issue
Block a user