mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 16:46:28 +00:00
Compare commits
2 Commits
REL3_0_STA
...
v3.0.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bf4de71523 | ||
|
|
770e6f758c |
@@ -1,29 +0,0 @@
|
||||
License and Contributions
|
||||
=========================
|
||||
|
||||
`repmgr` is licensed under the GPL v3. All of its code and documentation is
|
||||
Copyright 2010-2016, 2ndQuadrant Limited. See the files COPYRIGHT and LICENSE for
|
||||
details.
|
||||
|
||||
The development of repmgr has primarily been sponsored by 2ndQuadrant customers.
|
||||
|
||||
Additional work has been sponsored by the 4CaaST project for cloud computing,
|
||||
which has received funding from the European Union's Seventh Framework Programme
|
||||
(FP7/2007-2013) under grant agreement 258862.
|
||||
|
||||
Contributions to `repmgr` are welcome, and will be listed in the file `CREDITS`.
|
||||
2ndQuadrant Limited requires that any contributions provide a copyright
|
||||
assignment and a disclaimer of any work-for-hire ownership claims from the
|
||||
employer of the developer. This lets us make sure that all of the repmgr
|
||||
distribution remains free code. Please contact info@2ndQuadrant.com for a
|
||||
copy of the relevant Copyright Assignment Form.
|
||||
|
||||
Code style
|
||||
----------
|
||||
|
||||
Code in repmgr is formatted to a consistent style using the following command:
|
||||
|
||||
astyle --style=ansi --indent=tab --suffix=none *.c *.h
|
||||
|
||||
Contributors should reformat their code similarly before submitting code to
|
||||
the project, in order to minimize merge conflicts with other work.
|
||||
@@ -1,4 +1,4 @@
|
||||
Copyright (c) 2010-2016, 2ndQuadrant Limited
|
||||
Copyright (c) 2010-2015, 2ndQuadrant Limited
|
||||
All rights reserved.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
|
||||
11
FAILOVER.rst
11
FAILOVER.rst
@@ -93,6 +93,7 @@ Create the user and database to manage replication::
|
||||
su - postgres
|
||||
createuser -s repmgr
|
||||
createdb -O repmgr repmgr
|
||||
psql -f /usr/share/postgresql/9.0/contrib/repmgr_funcs.sql repmgr
|
||||
|
||||
Restart the PostgreSQL server::
|
||||
|
||||
@@ -171,13 +172,11 @@ Register Master and Standby
|
||||
|
||||
Log in to node1.
|
||||
|
||||
Register the node as master::
|
||||
Register the node as Master::
|
||||
|
||||
su - postgres
|
||||
repmgr -f /etc/repmgr/repmgr.conf master register
|
||||
|
||||
This will also create the repmgr schema and functions.
|
||||
|
||||
Log in to node2. Register it as a standby::
|
||||
|
||||
su - postgres
|
||||
@@ -204,12 +203,6 @@ repmgr will also ask for the superuser password on the witness database so
|
||||
it can reconnect when needed (the command line option --initdb-no-pwprompt
|
||||
will set up a password-less superuser).
|
||||
|
||||
By default the witness server will listen on port 5499; this value can be
|
||||
overridden by explicitly providing the port number in the conninfo string
|
||||
in repmgr.conf. (Note that it is also possible to specify the port number
|
||||
with the -l/--local-port option, however this option is now deprecated and
|
||||
will be overridden by a port setting in the conninfo string).
|
||||
|
||||
Start the repmgrd daemons
|
||||
-------------------------
|
||||
|
||||
|
||||
23
FAQ.md
23
FAQ.md
@@ -34,11 +34,6 @@ General
|
||||
replication slots, setting a higher figure will make adding new nodes
|
||||
easier.
|
||||
|
||||
- Does `repmgr` support hash indexes?
|
||||
|
||||
No. Hash indexes and replication do not mix well and their use is
|
||||
explicitly discouraged; see:
|
||||
http://www.postgresql.org/docs/current/interactive/sql-createindex.html#AEN74175
|
||||
|
||||
`repmgr`
|
||||
--------
|
||||
@@ -101,9 +96,8 @@ General
|
||||
is intended to support running the witness server as a separate
|
||||
instance on a normal node server, rather than on its own dedicated server.
|
||||
|
||||
To specify a different port for the witness server, supply the port number
|
||||
in the `conninfo` string in `repmgr.conf`
|
||||
(repmgr 3.0.1 and earlier: use the `-l/--local-port` option)
|
||||
To specify a port for the witness server, supply the port number to
|
||||
repmgr with the `-l/--local-port` command line option.
|
||||
|
||||
- Do I need to include `shared_preload_libraries = 'repmgr_funcs'`
|
||||
in `postgresql.conf` if I'm not using `repmgrd`?
|
||||
@@ -112,14 +106,6 @@ General
|
||||
If you later decide to run `repmgrd`, you just need to add
|
||||
`shared_preload_libraries = 'repmgr_funcs'` and restart PostgreSQL.
|
||||
|
||||
- I've provided replication permission for the `repmgr` user in `pg_hba.conf`
|
||||
but `repmgr`/`repmgrd` complains it can't connect to the server... Why?
|
||||
|
||||
`repmgr`/`repmgrd` need to be able to connect to the repmgr database
|
||||
with a normal connection to query metadata. The `replication` connection
|
||||
permission is for PostgreSQL's streaming replication and doesn't
|
||||
necessarily need to be the `repmgr` user.
|
||||
|
||||
|
||||
`repmgrd`
|
||||
---------
|
||||
@@ -148,8 +134,3 @@ General
|
||||
|
||||
Note that after registering a delayed standby, `repmgrd` will only start
|
||||
once the metadata added in the master node has been replicated.
|
||||
|
||||
- How can I get `repmgrd` to rotate its logfile?
|
||||
|
||||
Configure your system's `logrotate` service to do this; see example
|
||||
in README.md
|
||||
|
||||
38
HISTORY
38
HISTORY
@@ -1,41 +1,3 @@
|
||||
3.0.4 2016-01-
|
||||
Remove requirement for 'archive_mode' to be enabled (Ian)
|
||||
|
||||
3.0.3 2016-01-04
|
||||
Create replication slot if required before base backup is run (Abhijit)
|
||||
standby clone: when using rsync, clean up "pg_replslot" directory (Ian)
|
||||
Improve --help output (Ian)
|
||||
Improve config file parsing (Ian)
|
||||
Various logging output improvements, including explicit HINTS (Ian)
|
||||
Add --log-level to explicitly set log level on command line (Ian)
|
||||
Repurpose --verbose to display extra log output (Ian)
|
||||
Add --terse to hide hints and other non-critical output (Ian)
|
||||
Reference internal functions with explicit catalog path (Ian)
|
||||
When following a new primary, have repmgr (not repmgrd) create the new slot (Ian)
|
||||
Add /etc/repmgr.conf as a default configuration file location (Ian)
|
||||
Prevent repmgrd's -v/--verbose option expecting a parameter (Ian)
|
||||
Prevent invalid replication_lag values being written to the monitoring table (Ian)
|
||||
Improve repmgrd behaviour when monitored standby node is temporarily
|
||||
unavailable (Martín)
|
||||
|
||||
3.0.2 2015-10-02
|
||||
Improve handling of --help/--version options; and improve help output (Ian)
|
||||
Improve handling of situation where logfile can't be opened (Ian)
|
||||
Always pass -D/--pgdata option to pg_basebackup (Ian)
|
||||
Bugfix: standby clone --force does not empty pg_xlog (Gianni)
|
||||
Bugfix: autofailover with reconnect_attempts > 1 (Gianni)
|
||||
Bugfix: ignore comments after values (soxwellfb)
|
||||
Bugfix: handle string values in 'node' parameter correctly (Gregory Duchatelet)
|
||||
Allow repmgr to be compiled with a newer libpq (Marco)
|
||||
Bugfix: call update_node_record_set_upstream() for STANDBY FOLLOW (Tomas)
|
||||
Update `repmgr --help` output (per Github report from renard)
|
||||
Update tablespace remapping in --rsync-only mode for 9.5 and later (Ian)
|
||||
Deprecate `-l/--local-port` option - the port can be extracted
|
||||
from the conninfo string in repmgr.conf (Ian)
|
||||
Add STANDBY UNREGISTER (Vik Fearing)
|
||||
Don't fail with error when registering master if schema already defined (Ian)
|
||||
Fixes to whitespace handling when parsing config file (Ian)
|
||||
|
||||
3.0.1 2015-04-16
|
||||
Prevent repmgrd from looping infinitely if node was not registered (Ian)
|
||||
When promoting a standby, have repmgr (not repmgrd) handle metadata updates (Ian)
|
||||
|
||||
2
Makefile
2
Makefile
@@ -1,6 +1,6 @@
|
||||
#
|
||||
# Makefile
|
||||
# Copyright (c) 2ndQuadrant, 2010-2016
|
||||
# Copyright (c) 2ndQuadrant, 2010-2015
|
||||
|
||||
repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o
|
||||
repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o
|
||||
|
||||
38
PACKAGES.md
38
PACKAGES.md
@@ -4,10 +4,10 @@ Packaging
|
||||
Notes on RedHat Linux, Fedora, and CentOS Builds
|
||||
------------------------------------------------
|
||||
|
||||
The RPM packages of PostgreSQL put `pg_config` into the `postgresql-devel`
|
||||
The RPM packages of PostgreSQL put ``pg_config`` into the ``postgresql-devel``
|
||||
package, not the main server one. And if you have a RPM install of PostgreSQL
|
||||
9.0, the entire PostgreSQL binary directory will not be in your PATH by default
|
||||
either. Individual utilities are made available via the `alternatives`
|
||||
either. Individual utilities are made available via the ``alternatives``
|
||||
mechanism, but not all commands will be wrapped that way. The files installed
|
||||
by repmgr will certainly not be in the default PATH for the postgres user
|
||||
on such a system. They will instead be in /usr/pgsql-9.0/bin/ on this
|
||||
@@ -15,33 +15,31 @@ type of system.
|
||||
|
||||
When building repmgr against a RPM packaged build, you may discover that some
|
||||
development packages are needed as well. The following build errors can
|
||||
occur:
|
||||
occur::
|
||||
|
||||
/usr/bin/ld: cannot find -lxslt
|
||||
/usr/bin/ld: cannot find -lpam
|
||||
|
||||
Install the following packages to correct those:
|
||||
|
||||
Install the following packages to correct those::
|
||||
|
||||
yum install libxslt-devel
|
||||
yum install pam-devel
|
||||
|
||||
If building repmgr as a regular user, then doing the install into the system
|
||||
directories using sudo, the syntax is hard. `pg_config` won't be in root's
|
||||
path either. The following recipe should work:
|
||||
directories using sudo, the syntax is hard. ``pg_config`` won't be in root's
|
||||
path either. The following recipe should work::
|
||||
|
||||
sudo PATH="/usr/pgsql-9.0/bin:$PATH" make USE_PGXS=1 install
|
||||
|
||||
|
||||
Issues with 32 and 64 bit RPMs
|
||||
------------------------------
|
||||
|
||||
If when building, you receive a series of errors of this form:
|
||||
If when building, you receive a series of errors of this form::
|
||||
|
||||
/usr/bin/ld: skipping incompatible /usr/pgsql-9.0/lib/libpq.so when searching for -lpq
|
||||
|
||||
This is likely because you have both the 32 and 64 bit versions of the
|
||||
`postgresql90-devel` package installed. You can check that like this:
|
||||
``postgresql90-devel`` package installed. You can check that like this::
|
||||
|
||||
rpm -qa --queryformat '%{NAME}\t%{ARCH}\n' | grep postgresql90-devel
|
||||
|
||||
@@ -49,8 +47,7 @@ And if two packages appear, one for i386 and one for x86_64, that's not supposed
|
||||
to be allowed.
|
||||
|
||||
This can happen when using the PGDG repo to install that package;
|
||||
here is an example session demonstrating the problem case appearing:
|
||||
|
||||
here is an example session demonstrating the problem case appearing::
|
||||
|
||||
# yum install postgresql-devel
|
||||
..
|
||||
@@ -70,21 +67,20 @@ here is an example sessions demonstrating the problem case appearing:
|
||||
postgresql90-devel i386 9.0.2-2PGDG.rhel5 pgdg90 1.5 M
|
||||
postgresql90-devel x86_64 9.0.2-2PGDG.rhel5 pgdg90 1.6 M
|
||||
|
||||
|
||||
Note how both the i386 and x86_64 platform architectures are selected for
|
||||
installation. Your main PostgreSQL package will only be compatible with one of
|
||||
those, and if the repmgr build finds the wrong postgresql90-devel these
|
||||
"skipping incompatible" messages appear.
|
||||
|
||||
In this case, you can temporarily remove both packages, then just install the
|
||||
correct one for your architecture. Example:
|
||||
correct one for your architecture. Example::
|
||||
|
||||
rpm -e postgresql90-devel --allmatches
|
||||
yum install postgresql90-devel-9.0.2-2PGDG.rhel5.x86_64
|
||||
|
||||
Instead just deleting the package from the wrong platform might not leave behind
|
||||
the correct files, due to the way in which these accidentally happen to interact.
|
||||
If you already tried to build repmgr before doing this, you'll need to do:
|
||||
If you already tried to build repmgr before doing this, you'll need to do::
|
||||
|
||||
make USE_PGXS=1 clean
|
||||
|
||||
@@ -93,17 +89,17 @@ to get rid of leftover files from the wrong architecture.
|
||||
Notes on Ubuntu, Debian or other Debian-based Builds
|
||||
----------------------------------------------------
|
||||
|
||||
The Debian packages of PostgreSQL put `pg_config` into the development package
|
||||
called `postgresql-server-dev-$version`.
|
||||
The Debian packages of PostgreSQL put ``pg_config`` into the development package
|
||||
called ``postgresql-server-dev-$version``.
|
||||
|
||||
When building repmgr against a Debian packages build, you may discover that some
|
||||
development packages are needed as well. You will need the following development
|
||||
packages installed:
|
||||
packages installed::
|
||||
|
||||
sudo apt-get install libxslt-dev libxml2-dev libpam-dev libedit-dev
|
||||
|
||||
If you're using Debian packages for PostgreSQL and are building repmgr with the
|
||||
USE_PGXS option you also need to install the corresponding development package:
|
||||
USE_PGXS option you also need to install the corresponding development package::
|
||||
|
||||
sudo apt-get install postgresql-server-dev-9.0
|
||||
|
||||
@@ -114,12 +110,12 @@ multiple installed versions of PostgreSQL on the same system through a wrapper
|
||||
called pg_wrapper and repmgr is not (yet) known to this wrapper.
|
||||
|
||||
You can solve this in many different ways, the most Debian like is to make an
|
||||
alternate for repmgr and repmgrd:
|
||||
alternate for repmgr and repmgrd::
|
||||
|
||||
sudo update-alternatives --install /usr/bin/repmgr repmgr /usr/lib/postgresql/9.0/bin/repmgr 10
|
||||
sudo update-alternatives --install /usr/bin/repmgrd repmgrd /usr/lib/postgresql/9.0/bin/repmgrd 10
|
||||
|
||||
You can also make a deb package of repmgr using:
|
||||
You can also make a deb package of repmgr using::
|
||||
|
||||
make USE_PGXS=1 deb
|
||||
|
||||
|
||||
@@ -21,8 +21,7 @@ Master setup
|
||||
CREATE DATABASE repmgr_db OWNER repmgr_usr;
|
||||
```
|
||||
|
||||
- configure `postgresql.conf` for replication (see README.md for sample
|
||||
settings)
|
||||
- configure `postgresql.conf` for replication (see above)
|
||||
|
||||
- update `pg_hba.conf`, e.g.:
|
||||
|
||||
@@ -112,7 +111,7 @@ created in the `repl_nodes` table should look something like this:
|
||||
|
||||
repmgr_db=# SELECT * from repmgr_test.repl_nodes;
|
||||
id | type | upstream_node_id | cluster | name | conninfo | slot_name | priority | active
|
||||
----+---------+------------------+---------+-------+----------------------------------------------------+-----------+----------+--------
|
||||
1 | primary | | test | node1 | host=repmgr_node1 user=repmgr_usr dbname=repmgr_db | | 0 | t
|
||||
2 | standby | 1 | test | node2 | host=repmgr_node2 user=repmgr_usr dbname=repmgr_db | | 0 | t
|
||||
----+---------+------------------+---------+-------+-------------------------------------------------+-----------+----------+--------
|
||||
1 | primary | | test | node1 | host=localhost user=repmgr_usr dbname=repmgr_db | | 0 | t
|
||||
2 | standby | 1 | test | node2 | host=localhost user=repmgr_usr dbname=repmgr_db | | 0 | t
|
||||
(2 rows)
|
||||
|
||||
81
README.md
81
README.md
@@ -7,18 +7,15 @@ hot-standby capabilities with tools to set up standby servers, monitor
|
||||
replication, and perform administrative tasks such as failover or manual
|
||||
switchover operations.
|
||||
|
||||
This document covers `repmgr 3`, which supports PostgreSQL 9.3 and later.
|
||||
This document covers `repmgr 3`, which supports PostgreSQL 9.4 and 9.3.
|
||||
This version can use `pg_basebackup` to clone standby servers, supports
|
||||
replication slots and cascading replication, doesn't require a restart
|
||||
after promotion, and has many usability improvements.
|
||||
|
||||
Please continue to use `repmgr 2` with PostgreSQL 9.2 and earlier.
|
||||
Please continue to use `repmgr 2` with earlier PostgreSQL 9.x versions.
|
||||
For a list of changes since `repmgr 2` and instructions on upgrading to
|
||||
`repmgr 3`, see the "Upgrading from repmgr 2" section below.
|
||||
|
||||
For a list of frequently asked questions about `repmgr`, please refer
|
||||
to the file `FAQ.md`.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
@@ -56,7 +53,7 @@ on any UNIX-like system which PostgreSQL itself supports.
|
||||
|
||||
All nodes must be running the same major version of PostgreSQL, and we
|
||||
recommend that they also run the same minor version. This version of
|
||||
`repmgr` (v3) supports PostgreSQL 9.3 and later.
|
||||
`repmgr` (v3) supports PostgreSQL 9.3 and 9.4.
|
||||
|
||||
Earlier versions of `repmgr` needed password-less SSH access between
|
||||
nodes in order to clone standby servers using `rsync`. `repmgr 3` can
|
||||
@@ -101,8 +98,8 @@ for details.
|
||||
|
||||
### PostgreSQL configuration
|
||||
|
||||
The primary server needs to be configured for replication with settings
|
||||
like the following in `postgresql.conf`:
|
||||
The primary server needs to be configured for replication with the
|
||||
following settings in `postgresql.conf`:
|
||||
|
||||
# Allow read-only queries on standby servers. The number of WAL
|
||||
# senders should be larger than the number of standby servers.
|
||||
@@ -114,7 +111,7 @@ like the following in `postgresql.conf`:
|
||||
# How much WAL to retain on the primary to allow a temporarily
|
||||
# disconnected standby to catch up again. The larger this is, the
|
||||
# longer the standby can be disconnected. This is needed only in
|
||||
# 9.3; from 9.4, replication slots can be used instead (see below).
|
||||
# 9.3; in 9.4, replication slots can be used instead (see below).
|
||||
|
||||
wal_keep_segments = 5000
|
||||
|
||||
@@ -124,18 +121,13 @@ like the following in `postgresql.conf`:
|
||||
archive_mode = on
|
||||
archive_command = 'cd .'
|
||||
|
||||
# If you plan to use repmgrd, ensure that shared_preload_libraries
|
||||
# is configured to load 'repmgr_funcs'
|
||||
|
||||
shared_preload_libraries = 'repmgr_funcs'
|
||||
# You can also set additional replication parameters here, such as
|
||||
# hot_standby_feedback or synchronous_standby_names.
|
||||
|
||||
PostgreSQL 9.4 makes it possible to use replication slots, which means
|
||||
the value of `wal_keep_segments` need no longer be set. See section
|
||||
"Replication slots" below for more details.
|
||||
|
||||
With PostgreSQL 9.3, `repmgr` expects `wal_keep_segments` to be set to
|
||||
at least 5000 (= 80GB of WAL) by default, though this can be overridden
|
||||
with the `-w N` argument.
|
||||
the value of wal_keep_segments need no longer be set. With 9.3, `repmgr`
|
||||
expects it to be set to at least 5000 (= 80GB of WAL) by default, though
|
||||
this can be overridden with the `-w N` argument.
|
||||
|
||||
A dedicated PostgreSQL superuser account and a database in which to
|
||||
store monitoring and replication data are required. Create them by
|
||||
@@ -144,14 +136,10 @@ running the following commands:
|
||||
createuser -s repmgr
|
||||
createdb repmgr -O repmgr
|
||||
|
||||
We recommend using the name `repmgr` for both user and database, but you
|
||||
can use whatever name you like (and you need to set the names you chose
|
||||
in the `conninfo` string in `repmgr.conf`; see below). We also recommend
|
||||
that you set the `repmgr` user's search path to include the `repmgr` schema
|
||||
for convenience when querying the metadata tables and views.
|
||||
|
||||
The `repmgr` application will create its metadata schema in the `repmgr`
|
||||
database when the master server is registered.
|
||||
We recommend using the name `repmgr` for both, but you can use whatever
|
||||
name you like (and you need to set the names you chose in the `conninfo`
|
||||
string in `repmgr.conf`; see below). `repmgr` will create the schema and
|
||||
objects it needs when it connects to the server.
|
||||
|
||||
### repmgr configuration
|
||||
|
||||
@@ -235,7 +223,7 @@ The node can then be restarted.
|
||||
The node will then need to be re-registered with `repmgr`; again
|
||||
the `--force` option is required to update the existing record:
|
||||
|
||||
repmgr -f /etc/repmgr/repmgr.conf \
|
||||
repmgr -f /etc/repmgr/repmgr.conf
|
||||
--force \
|
||||
standby register
|
||||
|
||||
@@ -267,20 +255,6 @@ Example log output (at default log level):
|
||||
[2015-03-11 13:15:40] [INFO] reloading configuration file and updating repmgr tables
|
||||
[2015-03-11 13:15:40] [INFO] starting continuous standby node monitoring
|
||||
|
||||
Note that currently `repmgrd` does not provide logfile rotation. To ensure
|
||||
the current logfile does not grow indefinitely, configure your system's `logrotate`
|
||||
to do this. Sample configuration to rotate logfiles weekly with retention
|
||||
for up to 52 weeks and rotation forced if a file grows beyond 100Mb:
|
||||
|
||||
/var/log/postgresql/repmgr-9.4.log {
|
||||
missingok
|
||||
compress
|
||||
rotate 52
|
||||
maxsize 100M
|
||||
weekly
|
||||
create 0600 postgres postgres
|
||||
}
|
||||
|
||||
|
||||
Witness server
|
||||
--------------
|
||||
@@ -371,12 +345,10 @@ Following event types currently exist:
|
||||
|
||||
master_register
|
||||
standby_register
|
||||
standby_unregister
|
||||
standby_clone
|
||||
standby_promote
|
||||
witness_create
|
||||
repmgrd_start
|
||||
repmgrd_monitor
|
||||
repmgrd_failover_promote
|
||||
repmgrd_failover_follow
|
||||
|
||||
@@ -426,18 +398,6 @@ stored in the `repl_nodes` table.
|
||||
Note that `repmgr` will fail with an error if this option is specified when
|
||||
working with PostgreSQL 9.3.
|
||||
|
||||
Be aware that when initially cloning a standby, you will need to ensure
|
||||
that all required WAL files remain available while the cloning is taking
|
||||
place. If using the default `pg_basebackup` method, we recommend setting
|
||||
`pg_basebackup`'s `--xlog-method` parameter to `stream` like this:
|
||||
|
||||
pg_basebackup_options='--xlog-method=stream'
|
||||
|
||||
See the `pg_basebackup` documentation [*] for details. Otherwise you'll need
|
||||
to set `wal_keep_segments` to an appropriately high value.
|
||||
|
||||
[*] http://www.postgresql.org/docs/current/static/app-pgbasebackup.html
|
||||
|
||||
Further reading:
|
||||
* http://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION-SLOTS
|
||||
* http://blog.2ndquadrant.com/postgresql-9-4-slots/
|
||||
@@ -475,19 +435,12 @@ its port if is different from the default one.
|
||||
Registers a master in a cluster. This command needs to be executed before any
|
||||
standby nodes are registered.
|
||||
|
||||
`primary register` can be used as an alias for `master register`.
|
||||
|
||||
* `standby register`
|
||||
|
||||
Registers a standby with `repmgr`. This command needs to be executed to enable
|
||||
promote/follow operations and to allow `repmgrd` to work with the node.
|
||||
An existing standby can be registered using this command.
|
||||
|
||||
* `standby unregister`
|
||||
|
||||
Unregisters a standby with `repmgr`. This command does not affect the actual
|
||||
replication.
|
||||
|
||||
* `standby clone [node to be cloned]`
|
||||
|
||||
Clones a new standby node from the data directory of the master (or
|
||||
@@ -620,7 +573,7 @@ exit:
|
||||
* ERR_BAD_SSH (12) Error when connecting to remote host via SSH
|
||||
* ERR_SYS_FAILURE (13) Error when forking (repmgrd only)
|
||||
* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
|
||||
* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only)
|
||||
|
||||
|
||||
Support and Assistance
|
||||
----------------------
|
||||
|
||||
@@ -12,7 +12,7 @@ REPMGRD_ENABLED=no
|
||||
#REPMGRD_USER=postgres
|
||||
|
||||
# repmgrd binary
|
||||
#REPMGRD_BIN=/usr/bin/repmgrd
|
||||
#REPMGRD_BIN=/usr/bin/repmgr
|
||||
|
||||
# pid file
|
||||
#REPMGRD_PIDFILE=/var/lib/pgsql/repmgr/repmgrd.pid
|
||||
|
||||
13
SSH-RSYNC.md
13
SSH-RSYNC.md
@@ -1,13 +1,12 @@
|
||||
Set up trusted copy between postgres accounts
|
||||
---------------------------------------------
|
||||
|
||||
If you need to use `rsync` to clone standby servers, the `postgres` account
|
||||
on your primary and standby servers must be each able to access the other
|
||||
If you need to use rsync to clone standby servers, the postgres account
|
||||
on your master and standby servers must be each able to access the other
|
||||
using SSH without a password.
|
||||
|
||||
First generate an ssh key, using an empty passphrase, and copy the resulting
|
||||
keys and a matching authorization file to a privileged user account on the other
|
||||
system:
|
||||
First generate a ssh key, using an empty passphrase, and copy the resulting
|
||||
keys and a maching authorization file to a privledged user on the other system::
|
||||
|
||||
[postgres@node1]$ ssh-keygen -t rsa
|
||||
Generating public/private rsa key pair.
|
||||
@@ -23,8 +22,8 @@ system:
|
||||
[postgres@node1]$ cd ~/.ssh
|
||||
[postgres@node1]$ scp id_rsa.pub id_rsa authorized_keys user@node2:
|
||||
|
||||
Login as a user on the other system, and install the files into the `postgres`
|
||||
user's account:
|
||||
Login as a user on the other system, and install the files into the postgres
|
||||
user's account::
|
||||
|
||||
[user@node2 ~]$ sudo chown postgres.postgres authorized_keys id_rsa.pub id_rsa
|
||||
[user@node2 ~]$ sudo mkdir -p ~postgres/.ssh
|
||||
|
||||
56
TODO
56
TODO
@@ -7,14 +7,9 @@ Known issues in repmgr
|
||||
|
||||
* PGPASSFILE may not be passed to pg_basebackup
|
||||
|
||||
|
||||
Planned feature improvements
|
||||
============================
|
||||
|
||||
* Use 'primary' instead of 'master' in documentation and log output
|
||||
for consistency with PostgreSQL documentation. See also commit
|
||||
870b0a53b627eeb9aca1fc14cbafe25b5beafe12.
|
||||
|
||||
* A better check which standby did receive most of the data
|
||||
|
||||
* Make the fact that a standby may be delayed a factor in the voting
|
||||
@@ -27,51 +22,6 @@ Planned feature improvements
|
||||
* Use pg_basebackup for the data directory, and ALSO rsync for the
|
||||
configuration files.
|
||||
|
||||
* If no configuration file supplied, search in sensible default locations
|
||||
(currently: current directory and `pg_config --sysconfdir`); if
|
||||
possible this should include the location provided by the package,
|
||||
if installed.
|
||||
|
||||
* repmgrd: if connection to the upstream node fails on startup, optionally
|
||||
retry for a certain period before giving up; this will cover cases when
|
||||
e.g. primary and standby are both starting up, and the standby comes up
|
||||
before the primary. See github issue #80.
|
||||
|
||||
* make old master node ID available for event notification commands
|
||||
(See github issue #80).
|
||||
|
||||
* Have pg_basebackup use replication slots, if and when support for
|
||||
this is added; see:
|
||||
http://www.postgresql.org/message-id/555DD2B2.7020000@gmx.net
|
||||
|
||||
* use "primary/standby" terminology in place of "master/slave" for consistency
|
||||
with main PostgreSQL usage
|
||||
|
||||
* repmgr standby clone: possibility to use barman instead of performing a new base backup
|
||||
|
||||
* possibility to transform a failed master into a new standby with pg_rewind
|
||||
|
||||
* "repmgr standby switchover" to promote a standby in a controlled manner
|
||||
and convert the existing primary into a standby
|
||||
|
||||
* make repmgrd more robust
|
||||
|
||||
* repmgr: when cloning a standby using pg_basebackup and replication slots are
|
||||
requested, activate the replication slot using pg_receivexlog to negate the
|
||||
need to set `wal_keep_segments` just for the initial clone (9.4 and 9.5).
|
||||
|
||||
Usability improvements
|
||||
======================
|
||||
|
||||
* repmgr: add interrupt handler, so that if the program is interrupted
|
||||
while running a backup, an attempt can be made to execute pg_stop_backup()
|
||||
on the primary, to prevent an orphaned backup state existing.
|
||||
|
||||
* repmgr: when unregistering a node, delete any entries in the repl_monitoring
|
||||
table.
|
||||
|
||||
* repmgr: for "standby unregister", accept connection parameters for the
|
||||
primary and perform metadata updates (and slot removal) directly on
|
||||
the primary, to allow a shutdown standby to be unregistered
|
||||
(currently the standby must still be running, which means the replication
|
||||
slot can't be dropped).
|
||||
* Use pg_basebackup -X s
|
||||
NOTE: this can be used by including `-X s` in the configuration parameter
|
||||
`pg_basebackup_options`
|
||||
59
check_dir.c
59
check_dir.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* check_dir.c - Directories management functions
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -23,19 +23,14 @@
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ftw.h>
|
||||
|
||||
/* NB: postgres_fe must be included BEFORE check_dir */
|
||||
#include <libpq-fe.h>
|
||||
#include <postgres_fe.h>
|
||||
|
||||
#include "postgres_fe.h"
|
||||
#include "check_dir.h"
|
||||
|
||||
#include "strutil.h"
|
||||
#include "log.h"
|
||||
|
||||
static bool _create_pg_dir(char *dir, bool force, bool for_witness);
|
||||
static int unlink_dir_callback(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf);
|
||||
|
||||
/*
|
||||
* make sure the directory either doesn't exist or is empty
|
||||
* we use this function to check the new data directory and
|
||||
@@ -248,19 +243,6 @@ is_pg_dir(char *dir)
|
||||
|
||||
bool
|
||||
create_pg_dir(char *dir, bool force)
|
||||
{
|
||||
return _create_pg_dir(dir, force, false);
|
||||
}
|
||||
|
||||
bool
|
||||
create_witness_pg_dir(char *dir, bool force)
|
||||
{
|
||||
return _create_pg_dir(dir, force, true);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
_create_pg_dir(char *dir, bool force, bool for_witness)
|
||||
{
|
||||
bool pg_dir = false;
|
||||
|
||||
@@ -297,33 +279,21 @@ _create_pg_dir(char *dir, bool force, bool for_witness)
|
||||
|
||||
pg_dir = is_pg_dir(dir);
|
||||
|
||||
|
||||
/*
|
||||
* we use force to reduce the time needed to restore a node which
|
||||
* turn async after a failover or anything else
|
||||
*/
|
||||
if (pg_dir && force)
|
||||
{
|
||||
|
||||
/*
|
||||
* The witness server does not store any data other than a copy of the
|
||||
* repmgr metadata, so in --force mode we can simply overwrite the
|
||||
* directory.
|
||||
*
|
||||
* For non-witness servers, we'll leave the data in place, both to reduce
|
||||
* the risk of unintentional data loss and to make it possible for the
|
||||
* data directory to be brought up-to-date with rsync.
|
||||
*/
|
||||
if (for_witness)
|
||||
{
|
||||
log_notice(_("deleting existing data directory \"%s\"\n"), dir);
|
||||
nftw(dir, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
||||
}
|
||||
/* Let it continue */
|
||||
break;
|
||||
}
|
||||
else if (pg_dir && !force)
|
||||
{
|
||||
log_hint(_("This looks like a PostgreSQL directory.\n"
|
||||
log_warning(_("\nThis looks like a PostgreSQL directory.\n"
|
||||
"If you are sure you want to clone here, "
|
||||
"please check there is no PostgreSQL server "
|
||||
"running and use the -F/--force option\n"));
|
||||
"running and use the --force option\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -336,14 +306,3 @@ _create_pg_dir(char *dir, bool force, bool for_witness)
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static int
|
||||
unlink_dir_callback(const char *fpath, const struct stat *sb, int typeflag, struct FTW *ftwbuf)
|
||||
{
|
||||
int rv = remove(fpath);
|
||||
|
||||
if (rv)
|
||||
perror(fpath);
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* check_dir.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -26,6 +26,5 @@ bool create_dir(char *dir);
|
||||
bool set_dir_permissions(char *dir);
|
||||
bool is_pg_dir(char *dir);
|
||||
bool create_pg_dir(char *dir, bool force);
|
||||
bool create_witness_pg_dir(char *dir, bool force);
|
||||
|
||||
#endif
|
||||
|
||||
478
config.c
478
config.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* config.c - Functions to parse the config file
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -26,28 +26,10 @@
|
||||
|
||||
static void parse_event_notifications_list(t_configuration_options *options, const char *arg);
|
||||
static void tablespace_list_append(t_configuration_options *options, const char *arg);
|
||||
static void exit_with_errors(ErrorList *config_errors);
|
||||
|
||||
const static char *_progname = '\0';
|
||||
static char config_file_path[MAXPGPATH];
|
||||
static bool config_file_provided = false;
|
||||
static bool config_file_found = false;
|
||||
|
||||
|
||||
void
|
||||
set_progname(const char *argv0)
|
||||
{
|
||||
_progname = get_progname(argv0);
|
||||
}
|
||||
|
||||
const char *
|
||||
progname(void)
|
||||
{
|
||||
return _progname;
|
||||
}
|
||||
|
||||
/*
|
||||
* load_config()
|
||||
* parse_config()
|
||||
*
|
||||
* Set default options and overwrite with values from provided configuration
|
||||
* file.
|
||||
@@ -56,161 +38,83 @@ progname(void)
|
||||
*
|
||||
* Any configuration options changed in this function must also be changed in
|
||||
* reload_config()
|
||||
*
|
||||
* NOTE: this function is called before the logger is set up, so we need
|
||||
* to handle the verbose option ourselves; also the default log level is NOTICE,
|
||||
* so we can't use DEBUG.
|
||||
*/
|
||||
bool
|
||||
load_config(const char *config_file, bool verbose, t_configuration_options *options, char *argv0)
|
||||
parse_config(const char *config_file, t_configuration_options *options)
|
||||
{
|
||||
struct stat stat_config;
|
||||
char *s,
|
||||
buff[MAXLINELENGTH];
|
||||
char config_file_buf[MAXLEN];
|
||||
char name[MAXLEN];
|
||||
char value[MAXLEN];
|
||||
bool config_file_provided = false;
|
||||
FILE *fp;
|
||||
|
||||
/* Sanity checks */
|
||||
|
||||
/*
|
||||
* If a configuration file was provided, check it exists, otherwise
|
||||
* emit an error and terminate. We assume that if a user explicitly
|
||||
* provides a configuration file, they'll want to make sure it's
|
||||
* used and not fall back to any of the defaults.
|
||||
* emit an error
|
||||
*/
|
||||
if (config_file[0])
|
||||
{
|
||||
strncpy(config_file_path, config_file, MAXPGPATH);
|
||||
canonicalize_path(config_file_path);
|
||||
struct stat config;
|
||||
|
||||
if (stat(config_file_path, &stat_config) != 0)
|
||||
strncpy(config_file_buf, config_file, MAXLEN);
|
||||
canonicalize_path(config_file_buf);
|
||||
|
||||
if(stat(config_file_buf, &config) != 0)
|
||||
{
|
||||
log_err(_("provided configuration file \"%s\" not found: %s\n"),
|
||||
log_err(_("provided configuration file '%s' not found: %s\n"),
|
||||
config_file,
|
||||
strerror(errno)
|
||||
);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("using configuration file \"%s\"\n"), config_file);
|
||||
}
|
||||
|
||||
config_file_provided = true;
|
||||
config_file_found = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If no configuration file was provided, attempt to find a default file
|
||||
* in this order:
|
||||
* - current directory
|
||||
* - /etc/repmgr.conf
|
||||
* - default sysconfdir
|
||||
*
|
||||
* here we just check for the existence of the file; parse_config()
|
||||
* will handle read errors etc.
|
||||
* If no configuration file was provided, set to a default file
|
||||
* which `parse_config()` will attempt to read if it exists
|
||||
*/
|
||||
if (config_file_provided == false)
|
||||
{
|
||||
char my_exec_path[MAXPGPATH];
|
||||
char sysconf_etc_path[MAXPGPATH];
|
||||
|
||||
/* 1. "./repmgr.conf" */
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("looking for configuration file in current directory\n"));
|
||||
}
|
||||
|
||||
snprintf(config_file_path, MAXPGPATH, "./%s", CONFIG_FILE_NAME);
|
||||
canonicalize_path(config_file_path);
|
||||
|
||||
if (stat(config_file_path, &stat_config) == 0)
|
||||
{
|
||||
config_file_found = true;
|
||||
goto end_search;
|
||||
}
|
||||
|
||||
/* 2. "/etc/repmgr.conf" */
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("looking for configuration file in /etc\n"));
|
||||
}
|
||||
|
||||
snprintf(config_file_path, MAXPGPATH, "/etc/%s", CONFIG_FILE_NAME);
|
||||
if (stat(config_file_path, &stat_config) == 0)
|
||||
{
|
||||
config_file_found = true;
|
||||
goto end_search;
|
||||
}
|
||||
|
||||
/* 3. default sysconfdir */
|
||||
if (find_my_exec(argv0, my_exec_path) < 0)
|
||||
{
|
||||
fprintf(stderr, _("%s: could not find own program executable\n"), argv0);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
get_etc_path(my_exec_path, sysconf_etc_path);
|
||||
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("looking for configuration file in %s"), sysconf_etc_path);
|
||||
}
|
||||
|
||||
snprintf(config_file_path, MAXPGPATH, "%s/%s", sysconf_etc_path, CONFIG_FILE_NAME);
|
||||
if (stat(config_file_path, &stat_config) == 0)
|
||||
{
|
||||
config_file_found = true;
|
||||
goto end_search;
|
||||
}
|
||||
|
||||
end_search:
|
||||
if (config_file_found == true)
|
||||
{
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("configuration file found at: %s\n"), config_file_path);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (verbose == true)
|
||||
{
|
||||
log_notice(_("no configuration file provided or found\n"));
|
||||
}
|
||||
}
|
||||
strncpy(config_file_buf, DEFAULT_CONFIG_FILE, MAXLEN);
|
||||
}
|
||||
|
||||
return parse_config(options);
|
||||
}
|
||||
|
||||
fp = fopen(config_file_buf, "r");
|
||||
|
||||
/*
|
||||
* Parse configuration file; if any errors are encountered,
|
||||
* list them and exit.
|
||||
* Since some commands don't require a config file at all, not having one
|
||||
* isn't necessarily a problem.
|
||||
*
|
||||
* Ensure any default values set here are synced with repmgr.conf.sample
|
||||
* and any other documentation.
|
||||
* If the user explictly provided a configuration file and we can't
|
||||
* read it we'll raise an error.
|
||||
*
|
||||
* If no configuration file was provided, we'll try and read the default\
|
||||
* file if it exists and is readable, but won't worry if it's not.
|
||||
*/
|
||||
bool
|
||||
parse_config(t_configuration_options *options)
|
||||
if (fp == NULL)
|
||||
{
|
||||
FILE *fp;
|
||||
char *s,
|
||||
buf[MAXLINELENGTH];
|
||||
char name[MAXLEN];
|
||||
char value[MAXLEN];
|
||||
if(config_file_provided)
|
||||
{
|
||||
log_err(_("unable to open provided configuration file '%s'; terminating\n"), config_file_buf);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* For sanity-checking provided conninfo string */
|
||||
PQconninfoOption *conninfo_options;
|
||||
char *conninfo_errmsg = NULL;
|
||||
log_notice(_("no configuration file provided and default file '%s' not found - "
|
||||
"continuing with default values\n"),
|
||||
DEFAULT_CONFIG_FILE);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Collate configuration file errors here for friendlier reporting */
|
||||
static ErrorList config_errors = { NULL, NULL };
|
||||
|
||||
/* Initialize configuration options with sensible defaults
|
||||
* note: the default log level is set in log.c and does not need
|
||||
* to be initialised here
|
||||
*/
|
||||
/* Initialize configuration options with sensible defaults */
|
||||
memset(options->cluster_name, 0, sizeof(options->cluster_name));
|
||||
options->node = -1;
|
||||
options->upstream_node = NO_UPSTREAM_NODE;
|
||||
options->use_replication_slots = 0;
|
||||
memset(options->conninfo, 0, sizeof(options->conninfo));
|
||||
options->failover = MANUAL_FAILOVER;
|
||||
options->priority = DEFAULT_PRIORITY;
|
||||
@@ -228,7 +132,7 @@ parse_config(t_configuration_options *options)
|
||||
|
||||
/* default to 6 reconnection attempts at intervals of 10 seconds */
|
||||
options->reconnect_attempts = 6;
|
||||
options->reconnect_interval = 10;
|
||||
options->reconnect_intvl = 10;
|
||||
|
||||
options->monitor_interval_secs = 2;
|
||||
options->retry_promote_interval_secs = 300;
|
||||
@@ -238,61 +142,27 @@ parse_config(t_configuration_options *options)
|
||||
options->tablespace_mapping.head = NULL;
|
||||
options->tablespace_mapping.tail = NULL;
|
||||
|
||||
/*
|
||||
* If no configuration file available (user didn't specify and none found
|
||||
* in the default locations), return with default values
|
||||
*/
|
||||
if (config_file_found == false)
|
||||
{
|
||||
log_notice(_("no configuration file provided and no default file found - "
|
||||
"continuing with default values\n"));
|
||||
return true;
|
||||
}
|
||||
|
||||
fp = fopen(config_file_path, "r");
|
||||
|
||||
/*
|
||||
* A configuration file has been found, either provided by the user
|
||||
* or found in one of the default locations. If we can't open it,
|
||||
* fail with an error.
|
||||
*/
|
||||
if (fp == NULL)
|
||||
{
|
||||
if (config_file_provided)
|
||||
{
|
||||
log_err(_("unable to open provided configuration file \"%s\"; terminating\n"), config_file_path);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_err(_("unable to open default configuration file \"%s\"; terminating\n"), config_file_path);
|
||||
}
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* Read file */
|
||||
while ((s = fgets(buf, sizeof buf, fp)) != NULL)
|
||||
/* Read next line */
|
||||
while ((s = fgets(buff, sizeof buff, fp)) != NULL)
|
||||
{
|
||||
bool known_parameter = true;
|
||||
|
||||
/* Skip blank lines and comments */
|
||||
if (buff[0] == '\n' || buff[0] == '#')
|
||||
continue;
|
||||
|
||||
/* Parse name/value pair from line */
|
||||
parse_line(buf, name, value);
|
||||
|
||||
/* Skip blank lines */
|
||||
if (!strlen(name))
|
||||
continue;
|
||||
|
||||
/* Skip comments */
|
||||
if (name[0] == '#')
|
||||
continue;
|
||||
parse_line(buff, name, value);
|
||||
|
||||
/* Copy into correct entry in parameters struct */
|
||||
if (strcmp(name, "cluster") == 0)
|
||||
strncpy(options->cluster_name, value, MAXLEN);
|
||||
else if (strcmp(name, "node") == 0)
|
||||
options->node = repmgr_atoi(value, "node", &config_errors);
|
||||
options->node = atoi(value);
|
||||
else if (strcmp(name, "upstream_node") == 0)
|
||||
options->upstream_node = repmgr_atoi(value, "upstream_node", &config_errors);
|
||||
options->upstream_node = atoi(value);
|
||||
else if (strcmp(name, "conninfo") == 0)
|
||||
strncpy(options->conninfo, value, MAXLEN);
|
||||
else if (strcmp(name, "rsync_options") == 0)
|
||||
@@ -319,11 +189,12 @@ parse_config(t_configuration_options *options)
|
||||
}
|
||||
else
|
||||
{
|
||||
error_list_append(&config_errors,_("value for 'failover' must be 'automatic' or 'manual'\n"));
|
||||
log_err(_("value for 'failover' must be 'automatic' or 'manual'\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else if (strcmp(name, "priority") == 0)
|
||||
options->priority = repmgr_atoi(value, "priority", &config_errors);
|
||||
options->priority = atoi(value);
|
||||
else if (strcmp(name, "node_name") == 0)
|
||||
strncpy(options->node_name, value, MAXLEN);
|
||||
else if (strcmp(name, "promote_command") == 0)
|
||||
@@ -331,16 +202,11 @@ parse_config(t_configuration_options *options)
|
||||
else if (strcmp(name, "follow_command") == 0)
|
||||
strncpy(options->follow_command, value, MAXLEN);
|
||||
else if (strcmp(name, "master_response_timeout") == 0)
|
||||
options->master_response_timeout = repmgr_atoi(value, "master_response_timeout", &config_errors);
|
||||
/* 'primary_response_timeout' as synonym for 'master_response_timeout' -
|
||||
* we'll switch terminology in a future release (3.1?)
|
||||
*/
|
||||
else if (strcmp(name, "primary_response_timeout") == 0)
|
||||
options->master_response_timeout = repmgr_atoi(value, "primary_response_timeout", &config_errors);
|
||||
options->master_response_timeout = atoi(value);
|
||||
else if (strcmp(name, "reconnect_attempts") == 0)
|
||||
options->reconnect_attempts = repmgr_atoi(value, "reconnect_attempts", &config_errors);
|
||||
options->reconnect_attempts = atoi(value);
|
||||
else if (strcmp(name, "reconnect_interval") == 0)
|
||||
options->reconnect_interval = repmgr_atoi(value, "reconnect_interval", &config_errors);
|
||||
options->reconnect_intvl = atoi(value);
|
||||
else if (strcmp(name, "pg_bindir") == 0)
|
||||
strncpy(options->pg_bindir, value, MAXLEN);
|
||||
else if (strcmp(name, "pg_ctl_options") == 0)
|
||||
@@ -350,12 +216,11 @@ parse_config(t_configuration_options *options)
|
||||
else if (strcmp(name, "logfile") == 0)
|
||||
strncpy(options->logfile, value, MAXLEN);
|
||||
else if (strcmp(name, "monitor_interval_secs") == 0)
|
||||
options->monitor_interval_secs = repmgr_atoi(value, "monitor_interval_secs", &config_errors);
|
||||
options->monitor_interval_secs = atoi(value);
|
||||
else if (strcmp(name, "retry_promote_interval_secs") == 0)
|
||||
options->retry_promote_interval_secs = repmgr_atoi(value, "retry_promote_interval_secs", &config_errors);
|
||||
options->retry_promote_interval_secs = atoi(value);
|
||||
else if (strcmp(name, "use_replication_slots") == 0)
|
||||
/* XXX we should have a dedicated boolean argument format */
|
||||
options->use_replication_slots = repmgr_atoi(value, "use_replication_slots", &config_errors);
|
||||
options->use_replication_slots = atoi(value);
|
||||
else if (strcmp(name, "event_notification_command") == 0)
|
||||
strncpy(options->event_notification_command, value, MAXLEN);
|
||||
else if (strcmp(name, "event_notifications") == 0)
|
||||
@@ -375,13 +240,8 @@ parse_config(t_configuration_options *options)
|
||||
* as currently e.g. an empty `node` value will be converted to '0'.
|
||||
*/
|
||||
if(known_parameter == true && !strlen(value)) {
|
||||
char error_message_buf[MAXLEN] = "";
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("no value provided for parameter \"%s\""),
|
||||
name);
|
||||
|
||||
error_list_append(&config_errors, error_message_buf);
|
||||
log_err(_("no value provided for parameter '%s'\n"), name);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -392,49 +252,45 @@ parse_config(t_configuration_options *options)
|
||||
/* The following checks are for the presence of the parameter */
|
||||
if (*options->cluster_name == '\0')
|
||||
{
|
||||
error_list_append(&config_errors, _("\"cluster\": parameter was not found\n"));
|
||||
log_err(_("required parameter 'cluster' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (options->node == -1)
|
||||
{
|
||||
error_list_append(&config_errors, _("\"node\": parameter was not found\n"));
|
||||
log_err(_("required parameter 'node' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (*options->node_name == '\0')
|
||||
{
|
||||
error_list_append(&config_errors, _("\"node_name\": parameter was not found\n"));
|
||||
log_err(_("required parameter 'node_name' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (*options->conninfo == '\0')
|
||||
{
|
||||
error_list_append(&config_errors, _("\"conninfo\": parameter was not found\n"));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
/* Sanity check the provided conninfo string
|
||||
*
|
||||
* NOTE: PQconninfoParse() verifies the string format and checks for valid options
|
||||
* but does not sanity check values
|
||||
*/
|
||||
conninfo_options = PQconninfoParse(options->conninfo, &conninfo_errmsg);
|
||||
if (conninfo_options == NULL)
|
||||
{
|
||||
char error_message_buf[MAXLEN] = "";
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("\"conninfo\": %s"),
|
||||
conninfo_errmsg);
|
||||
|
||||
error_list_append(&config_errors, error_message_buf);
|
||||
log_err(_("required parameter 'conninfo' was not found\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
PQconninfoFree(conninfo_options);
|
||||
/* The following checks are for valid parameter values */
|
||||
if (options->master_response_timeout <= 0)
|
||||
{
|
||||
log_err(_("'master_response_timeout' must be greater than zero\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (config_errors.head != NULL)
|
||||
if (options->reconnect_attempts < 0)
|
||||
{
|
||||
exit_with_errors(&config_errors);
|
||||
log_err(_("'reconnect_attempts' must be zero or greater\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (options->reconnect_intvl < 0)
|
||||
{
|
||||
log_err(_("'reconnect_interval' must be zero or greater\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -469,58 +325,32 @@ trim(char *s)
|
||||
}
|
||||
|
||||
void
|
||||
parse_line(char *buf, char *name, char *value)
|
||||
parse_line(char *buff, char *name, char *value)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
|
||||
/*
|
||||
* Extract parameter name, if present
|
||||
* first we find the name of the parameter
|
||||
*/
|
||||
for (; i < MAXLEN; ++i)
|
||||
{
|
||||
|
||||
if (buf[i] == '=')
|
||||
if (buff[i] != '=')
|
||||
name[j++] = buff[i];
|
||||
else
|
||||
break;
|
||||
|
||||
switch(buf[i])
|
||||
{
|
||||
/* Ignore whitespace */
|
||||
case ' ':
|
||||
case '\n':
|
||||
case '\r':
|
||||
case '\t':
|
||||
continue;
|
||||
default:
|
||||
name[j++] = buf[i];
|
||||
}
|
||||
}
|
||||
name[j] = '\0';
|
||||
|
||||
/*
|
||||
* Ignore any whitespace following the '=' sign
|
||||
*/
|
||||
for (; i < MAXLEN; ++i)
|
||||
{
|
||||
if (buf[i+1] == ' ')
|
||||
continue;
|
||||
if (buf[i+1] == '\t')
|
||||
continue;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract parameter value
|
||||
* Now the value
|
||||
*/
|
||||
j = 0;
|
||||
for (++i; i < MAXLEN; ++i)
|
||||
if (buf[i] == '\'')
|
||||
if (buff[i] == '\'')
|
||||
continue;
|
||||
else if (buf[i] == '#')
|
||||
break;
|
||||
else if (buf[i] != '\n')
|
||||
value[j++] = buf[i];
|
||||
else if (buff[i] != '\n')
|
||||
value[j++] = buff[i];
|
||||
else
|
||||
break;
|
||||
value[j] = '\0';
|
||||
@@ -528,7 +358,7 @@ parse_line(char *buf, char *name, char *value)
|
||||
}
|
||||
|
||||
bool
|
||||
reload_config(t_configuration_options *orig_options)
|
||||
reload_config(char *config_file, t_configuration_options * orig_options)
|
||||
{
|
||||
PGconn *conn;
|
||||
t_configuration_options new_options;
|
||||
@@ -539,7 +369,7 @@ reload_config(t_configuration_options *orig_options)
|
||||
*/
|
||||
log_info(_("reloading configuration file and updating repmgr tables\n"));
|
||||
|
||||
parse_config(&new_options);
|
||||
parse_config(config_file, &new_options);
|
||||
if (new_options.node == -1)
|
||||
{
|
||||
log_warning(_("unable to parse new configuration, retaining current configuration\n"));
|
||||
@@ -582,7 +412,7 @@ reload_config(t_configuration_options *orig_options)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (new_options.reconnect_interval < 0)
|
||||
if (new_options.reconnect_intvl < 0)
|
||||
{
|
||||
log_warning(_("new value for 'reconnect_interval' must be zero or greater\n"));
|
||||
return false;
|
||||
@@ -701,10 +531,10 @@ reload_config(t_configuration_options *orig_options)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* reconnect_interval */
|
||||
if (orig_options->reconnect_interval != new_options.reconnect_interval)
|
||||
/* reconnect_intvl */
|
||||
if(orig_options->reconnect_intvl != new_options.reconnect_intvl)
|
||||
{
|
||||
orig_options->reconnect_interval = new_options.reconnect_interval;
|
||||
orig_options->reconnect_intvl = new_options.reconnect_intvl;
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
@@ -756,96 +586,6 @@ reload_config(t_configuration_options *orig_options)
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
error_list_append(ErrorList *error_list, char *error_message)
|
||||
{
|
||||
ErrorListCell *cell;
|
||||
|
||||
cell = (ErrorListCell *) pg_malloc0(sizeof(ErrorListCell));
|
||||
|
||||
if (cell == NULL)
|
||||
{
|
||||
log_err(_("unable to allocate memory; terminating.\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
cell->error_message = pg_malloc0(MAXLEN);
|
||||
strncpy(cell->error_message, error_message, MAXLEN);
|
||||
|
||||
if (error_list->tail)
|
||||
{
|
||||
error_list->tail->next = cell;
|
||||
}
|
||||
else
|
||||
{
|
||||
error_list->head = cell;
|
||||
}
|
||||
|
||||
error_list->tail = cell;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Convert provided string to an integer using strtol;
|
||||
* on error, if a callback is provided, pass the error message to that,
|
||||
* otherwise exit
|
||||
*/
|
||||
int
|
||||
repmgr_atoi(const char *value, const char *config_item, ErrorList *error_list)
|
||||
{
|
||||
char *endptr;
|
||||
long longval = 0;
|
||||
char error_message_buf[MAXLEN] = "";
|
||||
|
||||
/* It's possible that some versions of strtol() don't treat an empty
|
||||
* string as an error.
|
||||
*/
|
||||
|
||||
if (*value == '\0')
|
||||
{
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("no value provided for \"%s\""),
|
||||
config_item);
|
||||
}
|
||||
else
|
||||
{
|
||||
errno = 0;
|
||||
longval = strtol(value, &endptr, 10);
|
||||
|
||||
if (value == endptr || errno)
|
||||
{
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("\"%s\": invalid value (provided: \"%s\")"),
|
||||
config_item, value);
|
||||
}
|
||||
}
|
||||
|
||||
/* Currently there are no values which could be negative */
|
||||
if (longval < 0)
|
||||
{
|
||||
snprintf(error_message_buf,
|
||||
MAXLEN,
|
||||
_("\"%s\" must be zero or greater (provided: %s)"),
|
||||
config_item, value);
|
||||
}
|
||||
|
||||
/* Error message buffer is set */
|
||||
if (error_message_buf[0] != '\0')
|
||||
{
|
||||
if (error_list == NULL)
|
||||
{
|
||||
log_err("%s\n", error_message_buf);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
error_list_append(error_list, error_message_buf);
|
||||
}
|
||||
|
||||
return (int32) longval;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Split argument into old_dir and new_dir and append to tablespace mapping
|
||||
@@ -978,21 +718,3 @@ parse_event_notifications_list(t_configuration_options *options, const char *arg
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void
|
||||
exit_with_errors(ErrorList *config_errors)
|
||||
{
|
||||
ErrorListCell *cell;
|
||||
|
||||
log_err(_("%s: following errors were found in the configuration file.\n"), progname());
|
||||
|
||||
for (cell = config_errors->head; cell; cell = cell->next)
|
||||
{
|
||||
log_err("%s\n", cell->error_message);
|
||||
}
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
|
||||
28
config.h
28
config.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* config.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -24,7 +24,6 @@
|
||||
|
||||
#include "strutil.h"
|
||||
|
||||
#define CONFIG_FILE_NAME "repmgr.conf"
|
||||
|
||||
typedef struct EventNotificationListCell
|
||||
{
|
||||
@@ -68,7 +67,7 @@ typedef struct
|
||||
char ssh_options[QUERY_STR_LEN];
|
||||
int master_response_timeout;
|
||||
int reconnect_attempts;
|
||||
int reconnect_interval;
|
||||
int reconnect_intvl;
|
||||
char pg_bindir[MAXLEN];
|
||||
char pg_ctl_options[MAXLEN];
|
||||
char pg_basebackup_options[MAXLEN];
|
||||
@@ -83,29 +82,10 @@ typedef struct
|
||||
|
||||
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} }
|
||||
|
||||
typedef struct ErrorListCell
|
||||
{
|
||||
struct ErrorListCell *next;
|
||||
char *error_message;
|
||||
} ErrorListCell;
|
||||
|
||||
typedef struct ErrorList
|
||||
{
|
||||
ErrorListCell *head;
|
||||
ErrorListCell *tail;
|
||||
} ErrorList;
|
||||
|
||||
void set_progname(const char *argv0);
|
||||
const char * progname(void);
|
||||
|
||||
bool load_config(const char *config_file, bool verbose, t_configuration_options *options, char *argv0);
|
||||
bool reload_config(t_configuration_options *orig_options);
|
||||
bool parse_config(t_configuration_options *options);
|
||||
bool parse_config(const char *config_file, t_configuration_options *options);
|
||||
void parse_line(char *buff, char *name, char *value);
|
||||
char *trim(char *s);
|
||||
void error_list_append(ErrorList *error_list, char *error_message);
|
||||
int repmgr_atoi(const char *s,
|
||||
const char *config_item,
|
||||
ErrorList *error_list);
|
||||
bool reload_config(char *config_file, t_configuration_options *orig_options);
|
||||
|
||||
#endif
|
||||
|
||||
417
dbutils.c
417
dbutils.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* dbutils.c - Database connection/management functions
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -82,78 +82,6 @@ establish_db_connection_by_params(const char *keywords[], const char *values[],
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
begin_transaction(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
|
||||
log_verbose(LOG_DEBUG, "begin_transaction()\n");
|
||||
|
||||
res = PQexec(conn, "BEGIN");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to begin transaction: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
commit_transaction(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
|
||||
log_verbose(LOG_DEBUG, "commit_transaction()\n");
|
||||
|
||||
res = PQexec(conn, "COMMIT");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to commit transaction: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
rollback_transaction(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
|
||||
log_verbose(LOG_DEBUG, "rollback_transaction()\n");
|
||||
|
||||
res = PQexec(conn, "ROLLBACK");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to rollback transaction: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
check_cluster_schema(PGconn *conn)
|
||||
{
|
||||
@@ -164,8 +92,7 @@ check_cluster_schema(PGconn *conn)
|
||||
"SELECT 1 FROM pg_namespace WHERE nspname = '%s'",
|
||||
get_repmgr_schema());
|
||||
|
||||
log_verbose(LOG_DEBUG, "check_cluster_schema(): %s\n", sqlquery);
|
||||
|
||||
log_debug(_("check_cluster_schema(): %s\n"), sqlquery);
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -195,22 +122,17 @@ is_standby(PGconn *conn)
|
||||
{
|
||||
PGresult *res;
|
||||
int result = 0;
|
||||
char *sqlquery = "SELECT pg_catalog.pg_is_in_recovery()";
|
||||
|
||||
log_verbose(LOG_DEBUG, "is_standby(): %s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
res = PQexec(conn, "SELECT pg_is_in_recovery()");
|
||||
|
||||
if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("Unable to query server mode: %s\n"),
|
||||
log_err(_("Can't query server mode: %s"),
|
||||
PQerrorMessage(conn));
|
||||
result = -1;
|
||||
}
|
||||
else if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0)
|
||||
{
|
||||
result = 1;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
return result;
|
||||
@@ -297,8 +219,6 @@ get_master_node_id(PGconn *conn, char *cluster)
|
||||
get_repmgr_schema_quoted(conn),
|
||||
cluster);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_master_node_id():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -355,17 +275,14 @@ guc_set(PGconn *conn, const char *parameter, const char *op,
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
int retval = 1;
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT true FROM pg_settings "
|
||||
sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings "
|
||||
" WHERE name = '%s' AND setting %s '%s'",
|
||||
parameter, op, value);
|
||||
|
||||
log_verbose(LOG_DEBUG, "guc_set():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("guc_set(): unable to execute query\n%s\n"),
|
||||
log_err(_("GUC setting check PQexec failed: %s"),
|
||||
PQerrorMessage(conn));
|
||||
retval = -1;
|
||||
}
|
||||
@@ -396,12 +313,10 @@ guc_set_typed(PGconn *conn, const char *parameter, const char *op,
|
||||
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
||||
parameter, datatype, op, value, datatype);
|
||||
|
||||
log_verbose(LOG_DEBUG, "guc_set_typed():n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("guc_set_typed(): unable to execute query\n%s\n"),
|
||||
log_err(_("GUC setting check PQexec failed: %s"),
|
||||
PQerrorMessage(conn));
|
||||
retval = -1;
|
||||
}
|
||||
@@ -422,16 +337,15 @@ get_cluster_size(PGconn *conn, char *size)
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_catalog.pg_size_pretty(SUM(pg_catalog.pg_database_size(oid))::bigint) "
|
||||
sqlquery_snprintf(
|
||||
sqlquery,
|
||||
"SELECT pg_size_pretty(SUM(pg_database_size(oid))::bigint) "
|
||||
" FROM pg_database ");
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_cluster_size():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("get_cluster_size(): unable to execute query\n%s\n"),
|
||||
log_err(_("get_cluster_size(): PQexec failed: %s"),
|
||||
PQerrorMessage(conn));
|
||||
|
||||
PQclear(res);
|
||||
@@ -459,7 +373,7 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
||||
" FROM pg_settings WHERE name = '%s'",
|
||||
setting);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_pg_setting(): %s\n", sqlquery);
|
||||
log_debug(_("get_pg_setting(): %s\n"), sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
@@ -481,14 +395,13 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
|
||||
}
|
||||
else
|
||||
{
|
||||
/* XXX highly unlikely this would ever happen */
|
||||
log_err(_("get_pg_setting(): unknown parameter \"%s\""), PQgetvalue(res, i, 0));
|
||||
log_err(_("unknown parameter: %s"), PQgetvalue(res, i, 0));
|
||||
}
|
||||
}
|
||||
|
||||
if(success == true)
|
||||
{
|
||||
log_debug(_("get_pg_setting(): returned value is \"%s\"\n"), output);
|
||||
log_debug(_("get_pg_setting(): returned value is '%s'\n"), output);
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
@@ -533,13 +446,13 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
cluster,
|
||||
node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_upstream_connection():\n%s\n", sqlquery);
|
||||
log_debug("get_upstream_connection(): %s\n", sqlquery);
|
||||
|
||||
res = PQexec(standby_conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("unable to get conninfo for upstream server\n%s\n"),
|
||||
log_err(_("unable to get conninfo for upstream server: %s\n"),
|
||||
PQerrorMessage(standby_conn));
|
||||
PQclear(res);
|
||||
return NULL;
|
||||
@@ -559,7 +472,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
|
||||
PQclear(res);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_upstream_connection(): conninfo is \"%s\"\n", upstream_conninfo);
|
||||
log_debug("conninfo is: '%s'\n", upstream_conninfo);
|
||||
upstream_conn = establish_db_connection(upstream_conninfo, false);
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
@@ -574,26 +487,24 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
|
||||
|
||||
/*
|
||||
* Read the node list from the local node and attempt to connect to each node
|
||||
* in turn to definitely establish if it's the cluster primary.
|
||||
* get a connection to master by reading repl_nodes, creating a connection
|
||||
* to each node (one at a time) and finding if it is a master or a standby
|
||||
*
|
||||
* The node list is returned in the order which makes it likely that the
|
||||
* current primary will be returned first, reducing the number of speculative
|
||||
* connections which need to be made to other nodes.
|
||||
*
|
||||
* If master_conninfo_out points to allocated memory of MAXCONNINFO in length,
|
||||
* the primary server's conninfo string will be copied there.
|
||||
* NB: If master_conninfo_out may be NULL. If it is non-null, it is assumed to
|
||||
* point to allocated memory of MAXCONNINFO in length, and the master server
|
||||
* connection string is placed there.
|
||||
*/
|
||||
|
||||
PGconn *
|
||||
get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
int *master_id, char *master_conninfo_out)
|
||||
{
|
||||
PGconn *remote_conn = NULL;
|
||||
PGresult *res;
|
||||
PGconn *master_conn = NULL;
|
||||
PGresult *res1;
|
||||
PGresult *res2;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
char remote_conninfo_stack[MAXCONNINFO];
|
||||
char *remote_conninfo = &*remote_conninfo_stack;
|
||||
char master_conninfo_stack[MAXCONNINFO];
|
||||
char *master_conninfo = &*master_conninfo_stack;
|
||||
|
||||
int i,
|
||||
node_id;
|
||||
@@ -604,60 +515,59 @@ get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
}
|
||||
|
||||
/* find all nodes belonging to this cluster */
|
||||
log_info(_("retrieving node list for cluster '%s'\n"),
|
||||
log_info(_("finding node list for cluster '%s'\n"),
|
||||
cluster);
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
" SELECT id, conninfo, "
|
||||
" CASE WHEN type = 'master' THEN 1 ELSE 2 END AS type_priority"
|
||||
"SELECT id, conninfo "
|
||||
" FROM %s.repl_nodes "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND type != 'witness' "
|
||||
"ORDER BY active DESC, type_priority, priority, id",
|
||||
" AND type != 'witness' ",
|
||||
get_repmgr_schema_quoted(standby_conn),
|
||||
cluster);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_master_connection():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(standby_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
res1 = PQexec(standby_conn, sqlquery);
|
||||
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("unable to retrieve node records: %s\n"),
|
||||
PQerrorMessage(standby_conn));
|
||||
PQclear(res);
|
||||
PQclear(res1);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (i = 0; i < PQntuples(res); i++)
|
||||
for (i = 0; i < PQntuples(res1); i++)
|
||||
{
|
||||
int is_node_standby;
|
||||
|
||||
/* initialize with the values of the current node being processed */
|
||||
node_id = atoi(PQgetvalue(res, i, 0));
|
||||
strncpy(remote_conninfo, PQgetvalue(res, i, 1), MAXCONNINFO);
|
||||
log_verbose(LOG_INFO,
|
||||
_("checking role of cluster node '%i'\n"),
|
||||
node_id = atoi(PQgetvalue(res1, i, 0));
|
||||
strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO);
|
||||
log_info(_("checking role of cluster node '%i'\n"),
|
||||
node_id);
|
||||
remote_conn = establish_db_connection(remote_conninfo, false);
|
||||
master_conn = establish_db_connection(master_conninfo, false);
|
||||
|
||||
if (PQstatus(remote_conn) != CONNECTION_OK)
|
||||
if (PQstatus(master_conn) != CONNECTION_OK)
|
||||
continue;
|
||||
|
||||
is_node_standby = is_standby(remote_conn);
|
||||
/*
|
||||
* Can't use the is_standby() function here because on error that
|
||||
* function closes the connection passed and exits. This still needs
|
||||
* to close master_conn first.
|
||||
*/
|
||||
res2 = PQexec(master_conn, "SELECT pg_is_in_recovery()");
|
||||
|
||||
if (is_node_standby == -1)
|
||||
if (PQresultStatus(res2) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("unable to retrieve recovery state from node %i:\n%s\n"),
|
||||
node_id,
|
||||
PQerrorMessage(remote_conn));
|
||||
PQfinish(remote_conn);
|
||||
log_err(_("unable to retrieve recovery state from this node: %s\n"),
|
||||
PQerrorMessage(master_conn));
|
||||
PQclear(res2);
|
||||
PQfinish(master_conn);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* if is_standby() returns 0, queried node is the master */
|
||||
if (is_node_standby == 0)
|
||||
/* if false, this is the master */
|
||||
if (strcmp(PQgetvalue(res2, 0, 0), "f") == 0)
|
||||
{
|
||||
PQclear(res);
|
||||
PQclear(res2);
|
||||
PQclear(res1);
|
||||
log_debug(_("get_master_connection(): current master node is %i\n"), node_id);
|
||||
|
||||
if(master_id != NULL)
|
||||
@@ -665,12 +575,14 @@ get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
*master_id = node_id;
|
||||
}
|
||||
|
||||
return remote_conn;
|
||||
return master_conn;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* if it is a standby, clear info */
|
||||
PQclear(res2);
|
||||
PQfinish(master_conn);
|
||||
}
|
||||
|
||||
|
||||
/* if it is a standby, clear connection info and continue*/
|
||||
PQfinish(remote_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -681,7 +593,7 @@ get_master_connection(PGconn *standby_conn, char *cluster,
|
||||
* Probably we will need to check the error to know if we need to start
|
||||
* failover procedure or just fix some situation on the standby.
|
||||
*/
|
||||
PQclear(res);
|
||||
PQclear(res1);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -709,7 +621,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
{
|
||||
if (PQconsumeInput(conn) == 0)
|
||||
{
|
||||
log_warning(_("wait_connection_availability(): could not receive data from connection. %s\n"),
|
||||
log_warning(_("wait_connection_availability: could not receive data from connection. %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
return 0;
|
||||
}
|
||||
@@ -736,7 +648,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
if (select(sock, &read_set, NULL, NULL, &tmout) == -1)
|
||||
{
|
||||
log_warning(
|
||||
_("wait_connection_availability(): select() returned with error\n%s\n"),
|
||||
_("wait_connection_availability: select() returned with error: %s"),
|
||||
strerror(errno));
|
||||
return -1;
|
||||
}
|
||||
@@ -752,7 +664,7 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
return 1;
|
||||
}
|
||||
|
||||
log_warning(_("wait_connection_availability(): timeout reached"));
|
||||
log_warning(_("wait_connection_availability: timeout reached"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -787,12 +699,6 @@ cancel_query(PGconn *conn, int timeout)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/* Return the repmgr schema as an unmodified string
|
||||
* This is useful for displaying the schema name in log messages,
|
||||
* however inclusion in SQL statements, get_repmgr_schema_quoted() should
|
||||
* always be used.
|
||||
*/
|
||||
char *
|
||||
get_repmgr_schema(void)
|
||||
{
|
||||
@@ -834,8 +740,6 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
" WHERE slot_name = '%s' ",
|
||||
slot_name);
|
||||
|
||||
log_verbose(LOG_DEBUG, "create_replication_slot():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -856,7 +760,7 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
if(strcmp(PQgetvalue(res, 0, 0), "f") == 0)
|
||||
{
|
||||
PQclear(res);
|
||||
log_debug("Replication slot '%s' exists but is inactive; reusing\n",
|
||||
log_debug(_("Replication slot '%s' exists but is inactive; reusing\n"),
|
||||
slot_name);
|
||||
|
||||
return true;
|
||||
@@ -872,7 +776,6 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
slot_name);
|
||||
|
||||
log_debug(_("create_replication_slot(): Creating slot '%s' on primary\n"), slot_name);
|
||||
log_verbose(LOG_DEBUG, "create_replication_slot():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -888,33 +791,6 @@ create_replication_slot(PGconn *conn, char *slot_name)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
drop_replication_slot(PGconn *conn, char *slot_name)
|
||||
{
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
PGresult *res;
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_drop_replication_slot('%s')",
|
||||
slot_name);
|
||||
|
||||
log_verbose(LOG_DEBUG, "drop_replication_slot():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("unable to drop replication slot \"%s\":\n %s\n"),
|
||||
slot_name,
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "replication slot \"%s\" successfully dropped\n",
|
||||
slot_name);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
|
||||
@@ -923,11 +799,11 @@ start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
|
||||
PGresult *res;
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_start_backup('repmgr_standby_clone_%ld', %s))",
|
||||
"SELECT pg_xlogfile_name(pg_start_backup('repmgr_standby_clone_%ld', %s))",
|
||||
time(NULL),
|
||||
fast_checkpoint ? "TRUE" : "FALSE");
|
||||
|
||||
log_verbose(LOG_DEBUG, "start_backup():\n%s\n", sqlquery);
|
||||
log_debug(_("standby clone: %s\n"), sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -942,7 +818,7 @@ start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint)
|
||||
char *first_wal_seg_pq = PQgetvalue(res, 0, 0);
|
||||
size_t buf_sz = strlen(first_wal_seg_pq);
|
||||
|
||||
first_wal_segment = pg_malloc0(buf_sz + 1);
|
||||
first_wal_segment = malloc(buf_sz + 1);
|
||||
xsnprintf(first_wal_segment, buf_sz + 1, "%s", first_wal_seg_pq);
|
||||
}
|
||||
|
||||
@@ -958,7 +834,7 @@ stop_backup(PGconn *conn, char *last_wal_segment)
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
PGresult *res;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_xlogfile_name(pg_catalog.pg_stop_backup())");
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_xlogfile_name(pg_stop_backup())");
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -973,7 +849,7 @@ stop_backup(PGconn *conn, char *last_wal_segment)
|
||||
char *last_wal_seg_pq = PQgetvalue(res, 0, 0);
|
||||
size_t buf_sz = strlen(last_wal_seg_pq);
|
||||
|
||||
last_wal_segment = pg_malloc0(buf_sz + 1);
|
||||
last_wal_segment = malloc(buf_sz + 1);
|
||||
xsnprintf(last_wal_segment, buf_sz + 1, "%s", last_wal_seg_pq);
|
||||
}
|
||||
|
||||
@@ -994,8 +870,6 @@ set_config_bool(PGconn *conn, const char *config_param, bool state)
|
||||
config_param,
|
||||
state ? "TRUE" : "FALSE");
|
||||
|
||||
log_verbose(LOG_DEBUG, "set_config_bool():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
@@ -1027,13 +901,11 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
int i;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_nodes", get_repmgr_schema_quoted(witnessconn));
|
||||
|
||||
log_verbose(LOG_DEBUG, "copy_configuration():\n%s\n", sqlquery);
|
||||
|
||||
log_debug("copy_configuration: %s\n", sqlquery);
|
||||
res = PQexec(witnessconn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to truncate witness servers's repl_nodes table:\n%s\n"),
|
||||
fprintf(stderr, "Cannot clean node details in the witness, %s\n",
|
||||
PQerrorMessage(witnessconn));
|
||||
return false;
|
||||
}
|
||||
@@ -1041,13 +913,10 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name FROM %s.repl_nodes",
|
||||
get_repmgr_schema_quoted(masterconn));
|
||||
|
||||
log_verbose(LOG_DEBUG, "copy_configuration():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(masterconn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err("Unable to retrieve node records from master:\n%s\n",
|
||||
fprintf(stderr, "Can't get configuration from master: %s\n",
|
||||
PQerrorMessage(masterconn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
@@ -1056,11 +925,9 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
for (i = 0; i < PQntuples(res); i++)
|
||||
{
|
||||
bool node_record_created;
|
||||
char *witness = PQgetvalue(res, i, 4);
|
||||
|
||||
log_verbose(LOG_DEBUG,
|
||||
"copy_configuration(): writing node record for node %s (id: %s)\n",
|
||||
PQgetvalue(res, i, 4),
|
||||
PQgetvalue(res, i, 0));
|
||||
log_debug(_("copy_configuration(): %s\n"), witness);
|
||||
|
||||
node_record_created = create_node_record(witnessconn,
|
||||
"copy_configuration",
|
||||
@@ -1080,9 +947,7 @@ copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name)
|
||||
|
||||
if (node_record_created == false)
|
||||
{
|
||||
PQclear(res);
|
||||
|
||||
log_err("Unable to copy node record to witness database\n%s\n",
|
||||
fprintf(stderr, "Unable to copy node record to witness database: %s\n",
|
||||
PQerrorMessage(witnessconn));
|
||||
return false;
|
||||
}
|
||||
@@ -1138,7 +1003,6 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
|
||||
maxlen_snprintf(slot_name_buf, "%s", "NULL");
|
||||
}
|
||||
|
||||
/* XXX convert to placeholder query */
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"INSERT INTO %s.repl_nodes "
|
||||
" (id, type, upstream_node_id, cluster, "
|
||||
@@ -1154,17 +1018,15 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
|
||||
slot_name_buf,
|
||||
priority);
|
||||
|
||||
log_verbose(LOG_DEBUG, "create_node_record(): %s\n", sqlquery);
|
||||
|
||||
if(action != NULL)
|
||||
{
|
||||
log_verbose(LOG_DEBUG, "create_node_record(): action is \"%s\"\n", action);
|
||||
log_debug(_("%s: %s\n"), action, sqlquery);
|
||||
}
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to create node record\n%s\n"),
|
||||
log_warning(_("Unable to create node record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
@@ -1187,18 +1049,15 @@ delete_node_record(PGconn *conn, int node, char *action)
|
||||
" WHERE id = %d",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
node);
|
||||
|
||||
log_verbose(LOG_DEBUG, "delete_node_record(): %s\n", sqlquery);
|
||||
|
||||
if(action != NULL)
|
||||
{
|
||||
log_verbose(LOG_DEBUG, "create_node_record(): action is \"%s\"\n", action);
|
||||
log_debug(_("%s: %s\n"), action, sqlquery);
|
||||
}
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to delete node record: %s\n"),
|
||||
log_warning(_("Unable to delete node record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
return false;
|
||||
@@ -1221,8 +1080,8 @@ delete_node_record(PGconn *conn, int node, char *action)
|
||||
*
|
||||
* Note this function may be called with `conn` set to NULL in cases where
|
||||
* the master node is not available and it's therefore not possible to write
|
||||
* an event record. In this case, if `event_notification_command` is set, a
|
||||
* user-defined notification to be generated; if not, this function will have
|
||||
* an event record. In this case, if `event_notification_command` is set a user-
|
||||
* defined notification to be generated; if not, this function will have
|
||||
* no effect.
|
||||
*/
|
||||
|
||||
@@ -1235,7 +1094,7 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
bool success = true;
|
||||
struct tm ts;
|
||||
|
||||
/* Only attempt to write a record if a connection handle was provided.
|
||||
/* Only attempt to write a record if a connection handle was provided/
|
||||
Also check that the repmgr schema has been properly intialised - if
|
||||
not it means no configuration file was provided, which can happen with
|
||||
e.g. `repmgr standby clone`, and we won't know which schema to write to.
|
||||
@@ -1270,8 +1129,6 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
" RETURNING event_timestamp ",
|
||||
get_repmgr_schema_quoted(conn));
|
||||
|
||||
log_verbose(LOG_DEBUG, "create_event_record():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexecParams(conn,
|
||||
sqlquery,
|
||||
4,
|
||||
@@ -1283,6 +1140,7 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
|
||||
log_warning(_("Unable to create event record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
|
||||
@@ -1293,7 +1151,7 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
{
|
||||
/* Store timestamp to send to the notification command */
|
||||
strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
log_verbose(LOG_DEBUG, "create_event_record(): Event timestamp is \"%s\"\n", event_timestamp);
|
||||
log_debug(_("Event timestamp is: %s\n"), event_timestamp);
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
@@ -1413,116 +1271,15 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
|
||||
|
||||
*dst_ptr = '\0';
|
||||
|
||||
log_debug("create_event_record(): executing\n%s\n", parsed_command);
|
||||
log_debug(_("Executing: %s\n"), parsed_command);
|
||||
|
||||
r = system(parsed_command);
|
||||
if (r != 0)
|
||||
{
|
||||
log_warning(_("Unable to execute event notification command\n"));
|
||||
log_info(_("Parsed event notification command was:\n%s\n"), parsed_command);
|
||||
success = false;
|
||||
}
|
||||
}
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
/*
|
||||
* Update node record following change of status
|
||||
* (e.g. inactive primary converted to standby)
|
||||
*/
|
||||
bool
|
||||
update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active)
|
||||
{
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
" UPDATE %s.repl_nodes "
|
||||
" SET type = '%s', "
|
||||
" upstream_node_id = %i, "
|
||||
" active = %s "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND id = %i ",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
type,
|
||||
upstream_node_id,
|
||||
active ? "TRUE" : "FALSE",
|
||||
cluster_name,
|
||||
this_node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "update_node_record_status():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to update node record: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id)
|
||||
{
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
log_debug(_("update_node_record_set_upstream(): Updating node %i's upstream node to %i\n"), this_node_id, new_upstream_node_id);
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
" UPDATE %s.repl_nodes "
|
||||
" SET upstream_node_id = %i "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND id = %i ",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
new_upstream_node_id,
|
||||
cluster_name,
|
||||
this_node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "update_node_record_set_upstream():\n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to set new upstream node id: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
PGresult *
|
||||
get_node_record(PGconn *conn, char *cluster, int node_id)
|
||||
{
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
sprintf(sqlquery,
|
||||
"SELECT id, upstream_node_id, conninfo, type, slot_name, active "
|
||||
" FROM %s.repl_nodes "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND id = %i",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
cluster,
|
||||
node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_node_record():\n%s\n", sqlquery);
|
||||
|
||||
return PQexec(conn, sqlquery);
|
||||
}
|
||||
|
||||
51
dbutils.h
51
dbutils.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* dbutils.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -20,60 +20,16 @@
|
||||
#ifndef _REPMGR_DBUTILS_H_
|
||||
#define _REPMGR_DBUTILS_H_
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
|
||||
#include "config.h"
|
||||
#include "strutil.h"
|
||||
|
||||
|
||||
typedef enum {
|
||||
UNKNOWN = 0,
|
||||
MASTER,
|
||||
STANDBY,
|
||||
WITNESS
|
||||
} t_server_type;
|
||||
|
||||
/*
|
||||
* Struct to store node information
|
||||
*/
|
||||
typedef struct s_node_info
|
||||
{
|
||||
int node_id;
|
||||
int upstream_node_id;
|
||||
t_server_type type;
|
||||
char name[MAXLEN];
|
||||
char conninfo_str[MAXLEN];
|
||||
char slot_name[MAXLEN];
|
||||
int priority;
|
||||
bool active;
|
||||
bool is_ready;
|
||||
bool is_visible;
|
||||
XLogRecPtr xlog_location;
|
||||
} t_node_info;
|
||||
|
||||
|
||||
#define T_NODE_INFO_INITIALIZER { \
|
||||
NODE_NOT_FOUND, \
|
||||
NO_UPSTREAM_NODE, \
|
||||
UNKNOWN, \
|
||||
"", \
|
||||
"", \
|
||||
"", \
|
||||
DEFAULT_PRIORITY, \
|
||||
true, \
|
||||
false, \
|
||||
false, \
|
||||
InvalidXLogRecPtr \
|
||||
}
|
||||
|
||||
PGconn *establish_db_connection(const char *conninfo,
|
||||
const bool exit_on_error);
|
||||
PGconn *establish_db_connection_by_params(const char *keywords[],
|
||||
const char *values[],
|
||||
const bool exit_on_error);
|
||||
bool begin_transaction(PGconn *conn);
|
||||
bool commit_transaction(PGconn *conn);
|
||||
bool rollback_transaction(PGconn *conn);
|
||||
bool check_cluster_schema(PGconn *conn);
|
||||
int is_standby(PGconn *conn);
|
||||
bool is_pgup(PGconn *conn, int timeout);
|
||||
@@ -99,7 +55,6 @@ bool cancel_query(PGconn *conn, int timeout);
|
||||
char *get_repmgr_schema(void);
|
||||
char *get_repmgr_schema_quoted(PGconn *conn);
|
||||
bool create_replication_slot(PGconn *conn, char *slot_name);
|
||||
bool drop_replication_slot(PGconn *conn, char *slot_name);
|
||||
|
||||
bool start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint);
|
||||
bool stop_backup(PGconn *conn, char *last_wal_segment);
|
||||
@@ -108,8 +63,6 @@ bool copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_
|
||||
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name);
|
||||
bool delete_node_record(PGconn *conn, int node, char *action);
|
||||
bool create_event_record(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details);
|
||||
bool update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active);
|
||||
bool update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id);
|
||||
PGresult * get_node_record(PGconn *conn, char *cluster, int node_id);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
4
debian/repmgr.repmgrd.default
vendored
4
debian/repmgr.repmgrd.default
vendored
@@ -12,7 +12,7 @@ REPMGRD_ENABLED=no
|
||||
#REPMGRD_USER=postgres
|
||||
|
||||
# repmgrd binary
|
||||
#REPMGRD_BIN=/usr/bin/repmgrd
|
||||
#REPMGR_BIN=/usr/bin/repmgr
|
||||
|
||||
# pid file
|
||||
#REPMGRD_PIDFILE=/var/run/repmgrd.pid
|
||||
#REPMGR_PIDFILE=/var/run/repmgrd.pid
|
||||
|
||||
2
debian/repmgr.repmgrd.init
vendored
2
debian/repmgr.repmgrd.init
vendored
@@ -59,7 +59,7 @@ do_stop()
|
||||
# 0 if daemon has been stopped
|
||||
# 1 if daemon was already stopped
|
||||
# other if daemon could not be stopped or a failure occurred
|
||||
start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 --pidfile $REPMGRD_PIDFILE --name "$(basename $REPMGRD_BIN)"
|
||||
start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 --pidfile $REPMGRD_PIDFILE --exec $REPMGRD_BIN
|
||||
}
|
||||
|
||||
case "$1" in
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* errcode.h
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -35,7 +35,5 @@
|
||||
#define ERR_BAD_SSH 12
|
||||
#define ERR_SYS_FAILURE 13
|
||||
#define ERR_BAD_BASEBACKUP 14
|
||||
#define ERR_INTERNAL 15
|
||||
#define ERR_MONITORING_FAIL 16
|
||||
|
||||
#endif /* _ERRCODE_H_ */
|
||||
|
||||
154
log.c
154
log.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* log.c - Logging methods
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This module is a set of methods for logging (currently only syslog)
|
||||
*
|
||||
@@ -39,37 +39,13 @@
|
||||
|
||||
/* #define REPMGR_DEBUG */
|
||||
|
||||
static int detect_log_facility(const char *facility);
|
||||
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap);
|
||||
|
||||
int log_type = REPMGR_STDERR;
|
||||
int log_level = LOG_NOTICE;
|
||||
int last_log_level = LOG_NOTICE;
|
||||
int verbose_logging = false;
|
||||
int terse_logging = false;
|
||||
|
||||
void
|
||||
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
|
||||
va_start(arglist, fmt);
|
||||
_stderr_log_with_level(level_name, level, fmt, arglist);
|
||||
va_end(arglist);
|
||||
}
|
||||
|
||||
static void
|
||||
_stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap)
|
||||
{
|
||||
time_t t;
|
||||
struct tm *tm;
|
||||
char buff[100];
|
||||
|
||||
/*
|
||||
* Store the requested level so that if there's a subsequent
|
||||
* log_hint(), we can suppress that if appropriate.
|
||||
*/
|
||||
last_log_level = level;
|
||||
va_list ap;
|
||||
|
||||
if (log_level >= level)
|
||||
{
|
||||
@@ -78,74 +54,24 @@ _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_li
|
||||
strftime(buff, 100, "[%Y-%m-%d %H:%M:%S]", tm);
|
||||
fprintf(stderr, "%s [%s] ", buff, level_name);
|
||||
|
||||
va_start(ap, fmt);
|
||||
vfprintf(stderr, fmt, ap);
|
||||
va_end(ap);
|
||||
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
log_hint(const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
if (terse_logging == false)
|
||||
{
|
||||
va_start(ap, fmt);
|
||||
_stderr_log_with_level("HINT", last_log_level, fmt, ap);
|
||||
va_end(ap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
log_verbose(int level, const char *fmt, ...)
|
||||
{
|
||||
va_list ap;
|
||||
|
||||
va_start(ap, fmt);
|
||||
|
||||
if (verbose_logging == true)
|
||||
{
|
||||
switch(level)
|
||||
{
|
||||
case LOG_EMERG:
|
||||
_stderr_log_with_level("EMERG", level, fmt, ap);
|
||||
break;
|
||||
case LOG_ALERT:
|
||||
_stderr_log_with_level("ALERT", level, fmt, ap);
|
||||
break;
|
||||
case LOG_CRIT:
|
||||
_stderr_log_with_level("CRIT", level, fmt, ap);
|
||||
break;
|
||||
case LOG_ERR:
|
||||
_stderr_log_with_level("ERR", level, fmt, ap);
|
||||
break;
|
||||
case LOG_WARNING:
|
||||
_stderr_log_with_level("WARNING", level, fmt, ap);
|
||||
break;
|
||||
case LOG_NOTICE:
|
||||
_stderr_log_with_level("NOTICE", level, fmt, ap);
|
||||
break;
|
||||
case LOG_INFO:
|
||||
_stderr_log_with_level("INFO", level, fmt, ap);
|
||||
break;
|
||||
case LOG_DEBUG:
|
||||
_stderr_log_with_level("DEBUG", level, fmt, ap);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
va_end(ap);
|
||||
}
|
||||
static int detect_log_level(const char *level);
|
||||
static int detect_log_facility(const char *facility);
|
||||
|
||||
int log_type = REPMGR_STDERR;
|
||||
int log_level = LOG_NOTICE;
|
||||
|
||||
bool
|
||||
logger_init(t_configuration_options * opts, const char *ident)
|
||||
logger_init(t_configuration_options * opts, const char *ident, const char *level, const char *facility)
|
||||
{
|
||||
char *level = opts->loglevel;
|
||||
char *facility = opts->logfacility;
|
||||
|
||||
int l;
|
||||
int f;
|
||||
|
||||
@@ -169,10 +95,10 @@ logger_init(t_configuration_options * opts, const char *ident)
|
||||
printf("Assigned level for logger: %d\n", l);
|
||||
#endif
|
||||
|
||||
if (l >= 0)
|
||||
if (l > 0)
|
||||
log_level = l;
|
||||
else
|
||||
stderr_log_warning(_("Invalid log level \"%s\" (available values: DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG)\n"), level);
|
||||
stderr_log_warning(_("Cannot detect log level %s (use any of DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG)\n"), level);
|
||||
}
|
||||
|
||||
if (facility && *facility)
|
||||
@@ -218,38 +144,18 @@ logger_init(t_configuration_options * opts, const char *ident)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
/* Check if we can write to the specified file before redirecting
|
||||
* stderr - if freopen() fails, stderr output will vanish into
|
||||
* the ether and the user won't know what's going on.
|
||||
*/
|
||||
|
||||
fd = fopen(opts->logfile, "a");
|
||||
if (fd == NULL)
|
||||
{
|
||||
stderr_log_err(_("Unable to open specified logfile '%s' for writing: %s\n"), opts->logfile, strerror(errno));
|
||||
stderr_log_err(_("Terminating\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
fclose(fd);
|
||||
|
||||
stderr_log_notice(_("Redirecting logging output to '%s'\n"), opts->logfile);
|
||||
fd = freopen(opts->logfile, "a", stderr);
|
||||
|
||||
/* It's possible freopen() may still fail due to e.g. a race condition;
|
||||
as it's not feasible to restore stderr after a failed freopen(),
|
||||
we'll write to stdout as a last resort.
|
||||
*/
|
||||
if (fd == NULL)
|
||||
{
|
||||
printf(_("Unable to open specified logfile %s for writing: %s\n"), opts->logfile, strerror(errno));
|
||||
printf(_("Terminating\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
fprintf(stderr, "error reopening stderr to '%s': %s",
|
||||
opts->logfile, strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
bool
|
||||
logger_shutdown(void)
|
||||
@@ -263,32 +169,17 @@ logger_shutdown(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* Indicate whether extra-verbose logging is required. This will
|
||||
* generate a lot of output, particularly debug logging, and should
|
||||
* not be permanently enabled in production.
|
||||
*
|
||||
* NOTE: in previous repmgr versions, this option forced the log
|
||||
* level to INFO.
|
||||
* Set a minimum logging level. Intended for command line verbosity
|
||||
* options, which might increase requested logging over what's specified
|
||||
* in the regular configuration file.
|
||||
*/
|
||||
void
|
||||
logger_set_verbose(void)
|
||||
logger_min_verbose(int minimum)
|
||||
{
|
||||
verbose_logging = true;
|
||||
if (log_level < minimum)
|
||||
log_level = minimum;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Indicate whether some non-critical log messages can be omitted.
|
||||
* Currently this includes warnings about irrelevant command line
|
||||
* options and hints.
|
||||
*/
|
||||
|
||||
void logger_set_terse(void)
|
||||
{
|
||||
terse_logging = true;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
detect_log_level(const char *level)
|
||||
{
|
||||
@@ -309,16 +200,17 @@ detect_log_level(const char *level)
|
||||
if (!strcmp(level, "EMERG"))
|
||||
return LOG_EMERG;
|
||||
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
int
|
||||
detect_log_facility(const char *facility)
|
||||
{
|
||||
int local = 0;
|
||||
|
||||
if (!strncmp(facility, "LOCAL", 5) && strlen(facility) == 6)
|
||||
{
|
||||
|
||||
local = atoi(&facility[5]);
|
||||
|
||||
switch (local)
|
||||
|
||||
14
log.h
14
log.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* log.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -112,19 +112,13 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||
#endif
|
||||
|
||||
|
||||
int detect_log_level(const char *level);
|
||||
|
||||
/* Logger initialisation and shutdown */
|
||||
|
||||
bool logger_init(t_configuration_options * opts, const char *ident);
|
||||
|
||||
bool logger_shutdown(void);
|
||||
|
||||
void logger_set_verbose(void);
|
||||
void logger_set_terse(void);
|
||||
bool logger_init(t_configuration_options * opts, const char *ident,
|
||||
const char *level, const char *facility);
|
||||
|
||||
void log_hint(const char *fmt, ...);
|
||||
void log_verbose(int level, const char *fmt, ...);
|
||||
void logger_min_verbose(int minimum);
|
||||
|
||||
extern int log_type;
|
||||
extern int log_level;
|
||||
|
||||
@@ -16,15 +16,11 @@ cluster=example_cluster
|
||||
# Node ID and name
|
||||
# (Note: we recommend to avoid naming nodes after their initial
|
||||
# replication function, as this will cause confusion when e.g.
|
||||
# "standby2" is promoted to primary)
|
||||
node=2 # a unique integer
|
||||
node_name=node2 # an arbitrary (but unique) string; we recommend using
|
||||
# the server's hostname or another identifier unambiguously
|
||||
# associated with the server to avoid confusion
|
||||
# "standby2" is promoted to master)
|
||||
node=2
|
||||
node_name=node2
|
||||
|
||||
# Database connection information as a conninfo string
|
||||
# This must be accessible to all servers in the cluster; for details see:
|
||||
# http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING
|
||||
# Database connection information
|
||||
conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
|
||||
|
||||
# Optional configuration items
|
||||
@@ -36,7 +32,7 @@ conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
|
||||
# when using cascading replication and a standby is to be connected to an
|
||||
# upstream standby, specify that node's ID with 'upstream_node'. The node
|
||||
# must exist before the new standby can be registered. If a standby is
|
||||
# to connect directly to a primary node, this parameter is not required.
|
||||
# to connect directly to a master node, this parameter is not required.
|
||||
#
|
||||
# upstream_node=1
|
||||
|
||||
@@ -44,9 +40,7 @@ conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr'
|
||||
# (default: 0)
|
||||
#
|
||||
# use_replication_slots=0
|
||||
#
|
||||
# NOTE: 'max_replication_slots' should be configured for at least the
|
||||
# number of standbys which will connect to the primary.
|
||||
|
||||
|
||||
# Logging and monitoring settings
|
||||
# -------------------------------
|
||||
@@ -116,29 +110,28 @@ logfacility=STDERR
|
||||
#
|
||||
# These settings are only applied when repmgrd is running.
|
||||
|
||||
# Number of seconds to wait for a response from the primary server before
|
||||
# deciding it has failed
|
||||
|
||||
# How many seconds we wait for master response before declaring master failure
|
||||
master_response_timeout=60
|
||||
|
||||
# Number of times to try and reconnect to the primary before starting
|
||||
# the failover procedure
|
||||
# How many times we try to reconnect to master before starting failover procedure
|
||||
reconnect_attempts=6
|
||||
reconnect_interval=10
|
||||
|
||||
# Autofailover options
|
||||
failover=automatic # one of 'automatic', 'manual'
|
||||
priority=100 # a value of zero or less prevents the node being promoted to primary
|
||||
priority=100 # a value of zero or less prevents the node being promoted to master
|
||||
promote_command='repmgr standby promote -f /path/to/repmgr.conf'
|
||||
follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'
|
||||
|
||||
# monitoring interval in seconds; default is 2
|
||||
# monitoring interval; default is 2s
|
||||
#
|
||||
# monitor_interval_secs=2
|
||||
|
||||
# change wait time for primary; before we bail out and exit when the primary
|
||||
# change wait time for master; before we bail out and exit when the master
|
||||
# disappears, we wait 'reconnect_attempts' * 'retry_promote_interval_secs'
|
||||
# seconds; by default this would be half an hour, as 'retry_promote_interval_secs'
|
||||
# default value is 300)
|
||||
#
|
||||
# retry_promote_interval_secs=300
|
||||
|
||||
|
||||
|
||||
35
repmgr.h
35
repmgr.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -20,9 +20,11 @@
|
||||
#ifndef _REPMGR_H_
|
||||
#define _REPMGR_H_
|
||||
|
||||
#include <libpq-fe.h>
|
||||
#include <postgres_fe.h>
|
||||
#include <getopt_long.h>
|
||||
#include "postgres_fe.h"
|
||||
#include "libpq-fe.h"
|
||||
|
||||
|
||||
#include "getopt_long.h"
|
||||
|
||||
#include "strutil.h"
|
||||
#include "dbutils.h"
|
||||
@@ -36,6 +38,7 @@
|
||||
#define MAXFILENAME 1024
|
||||
#define ERRBUFF_SIZE 512
|
||||
|
||||
#define DEFAULT_CONFIG_FILE "./repmgr.conf"
|
||||
#define DEFAULT_WAL_KEEP_SEGMENTS "5000"
|
||||
#define DEFAULT_DEST_DIR "."
|
||||
#define DEFAULT_MASTER_PORT "5432"
|
||||
@@ -48,7 +51,14 @@
|
||||
#define AUTOMATIC_FAILOVER 1
|
||||
#define NODE_NOT_FOUND -1
|
||||
#define NO_UPSTREAM_NODE -1
|
||||
#define UNKNOWN_NODE_ID -1
|
||||
|
||||
|
||||
typedef enum {
|
||||
UNKNOWN = 0,
|
||||
MASTER,
|
||||
STANDBY,
|
||||
WITNESS
|
||||
} t_server_type;
|
||||
|
||||
|
||||
|
||||
@@ -65,7 +75,6 @@ typedef struct
|
||||
char superuser[MAXLEN];
|
||||
char wal_keep_segments[MAXLEN];
|
||||
bool verbose;
|
||||
bool terse;
|
||||
bool force;
|
||||
bool wait_for_master;
|
||||
bool ignore_rsync_warn;
|
||||
@@ -75,7 +84,6 @@ typedef struct
|
||||
bool ignore_external_config_files;
|
||||
char masterport[MAXLEN];
|
||||
char localport[MAXLEN];
|
||||
char loglevel[MAXLEN];
|
||||
|
||||
/* parameter used by CLUSTER CLEANUP */
|
||||
int keep_history;
|
||||
@@ -85,9 +93,20 @@ typedef struct
|
||||
char recovery_min_apply_delay[MAXLEN];
|
||||
} t_runtime_options;
|
||||
|
||||
#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, "", "", "", 0, "", "" }
|
||||
#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, "", "", 0, "", "" }
|
||||
|
||||
extern char repmgr_schema[MAXLEN];
|
||||
|
||||
typedef struct ErrorListCell
|
||||
{
|
||||
struct ErrorListCell *next;
|
||||
char *error_message;
|
||||
} ErrorListCell;
|
||||
|
||||
typedef struct ErrorList
|
||||
{
|
||||
ErrorListCell *head;
|
||||
ErrorListCell *tail;
|
||||
} ErrorList;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* repmgr.sql
|
||||
*
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
426
repmgrd.c
426
repmgrd.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgrd.c - Replication manager daemon
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This module connects to the nodes of a replication cluster and monitors
|
||||
* how far they are from master
|
||||
@@ -41,6 +41,22 @@
|
||||
#include "access/xlogdefs.h"
|
||||
#include "pqexpbuffer.h"
|
||||
|
||||
/*
|
||||
* Struct to store node information
|
||||
*/
|
||||
typedef struct s_node_info
|
||||
{
|
||||
int node_id;
|
||||
int upstream_node_id;
|
||||
char conninfo_str[MAXLEN];
|
||||
XLogRecPtr xlog_location;
|
||||
t_server_type type;
|
||||
bool is_ready;
|
||||
bool is_visible;
|
||||
char slot_name[MAXLEN];
|
||||
bool active;
|
||||
} t_node_info;
|
||||
|
||||
|
||||
|
||||
/* Local info */
|
||||
@@ -52,7 +68,9 @@ t_configuration_options master_options;
|
||||
|
||||
PGconn *master_conn = NULL;
|
||||
|
||||
char *config_file = "";
|
||||
const char *progname;
|
||||
|
||||
char *config_file = DEFAULT_CONFIG_FILE;
|
||||
bool verbose = false;
|
||||
bool monitoring_history = false;
|
||||
t_node_info node_info;
|
||||
@@ -63,15 +81,17 @@ char *pid_file = NULL;
|
||||
|
||||
t_configuration_options config = T_CONFIGURATION_OPTIONS_INITIALIZER;
|
||||
|
||||
static void help(void);
|
||||
static void help(const char *progname);
|
||||
static void usage(void);
|
||||
static void check_cluster_configuration(PGconn *conn);
|
||||
static void check_node_configuration(void);
|
||||
|
||||
static void standby_monitor(void);
|
||||
static void witness_monitor(void);
|
||||
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
|
||||
static bool set_local_node_status(void);
|
||||
static bool check_connection(PGconn *conn, const char *type);
|
||||
static bool set_local_node_failed(void);
|
||||
|
||||
static bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
|
||||
|
||||
static void update_shared_memory(char *last_wal_standby_applied);
|
||||
static void update_registration(void);
|
||||
@@ -127,8 +147,6 @@ main(int argc, char **argv)
|
||||
{"monitoring-history", no_argument, NULL, 'm'},
|
||||
{"daemonize", no_argument, NULL, 'd'},
|
||||
{"pid-file", required_argument, NULL, 'p'},
|
||||
{"help", no_argument, NULL, '?'},
|
||||
{"version", no_argument, NULL, 'V'},
|
||||
{NULL, 0, NULL, 0}
|
||||
};
|
||||
|
||||
@@ -140,10 +158,23 @@ main(int argc, char **argv)
|
||||
FILE *fd;
|
||||
|
||||
int server_version_num = 0;
|
||||
progname = get_progname(argv[0]);
|
||||
|
||||
set_progname(argv[0]);
|
||||
if (argc > 1)
|
||||
{
|
||||
if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
|
||||
{
|
||||
help(progname);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
|
||||
{
|
||||
printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
while ((c = getopt_long(argc, argv, "?Vf:vmdp:", long_options, &optindex)) != -1)
|
||||
while ((c = getopt_long(argc, argv, "f:v:mdp:", long_options, &optindex)) != -1)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
@@ -162,12 +193,6 @@ main(int argc, char **argv)
|
||||
case 'p':
|
||||
pid_file = optarg;
|
||||
break;
|
||||
case '?':
|
||||
help();
|
||||
exit(SUCCESS);
|
||||
case 'V':
|
||||
printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
|
||||
exit(SUCCESS);
|
||||
default:
|
||||
usage();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
@@ -183,7 +208,7 @@ main(int argc, char **argv)
|
||||
* which case we'll need to refactor parse_config() not to abort,
|
||||
* and return the error message.
|
||||
*/
|
||||
load_config(config_file, verbose, &local_options, argv[0]);
|
||||
parse_config(config_file, &local_options);
|
||||
|
||||
if (daemonize)
|
||||
{
|
||||
@@ -213,9 +238,10 @@ main(int argc, char **argv)
|
||||
strerror(errno));
|
||||
}
|
||||
|
||||
logger_init(&local_options, progname());
|
||||
logger_init(&local_options, progname, local_options.loglevel,
|
||||
local_options.logfacility);
|
||||
if (verbose)
|
||||
logger_set_verbose();
|
||||
logger_min_verbose(LOG_INFO);
|
||||
|
||||
if (log_type == REPMGR_SYSLOG)
|
||||
{
|
||||
@@ -229,7 +255,6 @@ main(int argc, char **argv)
|
||||
}
|
||||
|
||||
/* Initialise the repmgr schema name */
|
||||
/* XXX check this handles quoting properly */
|
||||
maxlen_snprintf(repmgr_schema, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX,
|
||||
local_options.cluster_name);
|
||||
|
||||
@@ -247,7 +272,7 @@ main(int argc, char **argv)
|
||||
if (server_version_num > 0)
|
||||
{
|
||||
log_err(_("%s requires PostgreSQL %s or later\n"),
|
||||
progname(),
|
||||
progname,
|
||||
MIN_SUPPORTED_VERSION) ;
|
||||
}
|
||||
else
|
||||
@@ -265,7 +290,7 @@ main(int argc, char **argv)
|
||||
if(node_info.node_id == NODE_NOT_FOUND)
|
||||
{
|
||||
log_err(_("No metadata record found for this node - terminating\n"));
|
||||
log_hint(_("Check that 'repmgr (master|standby) register' was executed for this node\n"));
|
||||
log_notice(_("HINT: was this node registered with 'repmgr (master|standby) register'?\n"));
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -294,7 +319,7 @@ main(int argc, char **argv)
|
||||
check_cluster_configuration(my_local_conn);
|
||||
check_node_configuration();
|
||||
|
||||
if (reload_config(&local_options))
|
||||
if (reload_config(config_file, &local_options))
|
||||
{
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
@@ -328,7 +353,7 @@ main(int argc, char **argv)
|
||||
*/
|
||||
do
|
||||
{
|
||||
if (check_connection(&master_conn, "master", NULL))
|
||||
if (check_connection(master_conn, "master"))
|
||||
{
|
||||
sleep(local_options.monitor_interval_secs);
|
||||
}
|
||||
@@ -343,10 +368,10 @@ main(int argc, char **argv)
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
/*
|
||||
* if we can reload the configuration file, then could need to change
|
||||
* if we can reload, then could need to change
|
||||
* my_local_conn
|
||||
*/
|
||||
if (reload_config(&local_options))
|
||||
if (reload_config(config_file, &local_options))
|
||||
{
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
@@ -390,7 +415,7 @@ main(int argc, char **argv)
|
||||
|
||||
appendPQExpBuffer(&errmsg,
|
||||
_("unable to connect to master node '%s'"),
|
||||
master_options.node_name);
|
||||
local_options.cluster_name);
|
||||
|
||||
log_err("%s\n", errmsg.data);
|
||||
|
||||
@@ -407,7 +432,7 @@ main(int argc, char **argv)
|
||||
check_cluster_configuration(my_local_conn);
|
||||
check_node_configuration();
|
||||
|
||||
if (reload_config(&local_options))
|
||||
if (reload_config(config_file, &local_options))
|
||||
{
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
@@ -440,7 +465,7 @@ main(int argc, char **argv)
|
||||
|
||||
do
|
||||
{
|
||||
log_verbose(LOG_DEBUG, "standby check loop...\n");
|
||||
log_debug("standby check loop...\n");
|
||||
|
||||
if (node_info.type == WITNESS)
|
||||
{
|
||||
@@ -450,7 +475,6 @@ main(int argc, char **argv)
|
||||
{
|
||||
standby_monitor();
|
||||
}
|
||||
|
||||
sleep(local_options.monitor_interval_secs);
|
||||
|
||||
if (got_SIGHUP)
|
||||
@@ -459,7 +483,7 @@ main(int argc, char **argv)
|
||||
* if we can reload, then could need to change
|
||||
* my_local_conn
|
||||
*/
|
||||
if (reload_config(&local_options))
|
||||
if (reload_config(config_file, &local_options))
|
||||
{
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
@@ -512,7 +536,7 @@ witness_monitor(void)
|
||||
* of a missing master and promotion of a standby by that standby's
|
||||
* repmgrd, so we'll loop for a while before giving up.
|
||||
*/
|
||||
connection_ok = check_connection(&master_conn, "master", NULL);
|
||||
connection_ok = check_connection(master_conn, "master");
|
||||
|
||||
if(connection_ok == false)
|
||||
{
|
||||
@@ -542,10 +566,10 @@ witness_monitor(void)
|
||||
{
|
||||
log_warning(
|
||||
_("unable to determine a valid master server; waiting %i seconds to retry...\n"),
|
||||
local_options.reconnect_interval
|
||||
local_options.reconnect_intvl
|
||||
);
|
||||
PQfinish(master_conn);
|
||||
sleep(local_options.reconnect_interval);
|
||||
sleep(local_options.reconnect_intvl);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -658,7 +682,6 @@ standby_monitor(void)
|
||||
char last_wal_standby_received[MAXLEN];
|
||||
char last_wal_standby_applied[MAXLEN];
|
||||
char last_wal_standby_applied_timestamp[MAXLEN];
|
||||
bool last_wal_standby_received_gte_replayed;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
XLogRecPtr lsn_master;
|
||||
@@ -670,7 +693,6 @@ standby_monitor(void)
|
||||
bool did_retry = false;
|
||||
|
||||
PGconn *upstream_conn;
|
||||
char upstream_conninfo[MAXCONNINFO];
|
||||
int upstream_node_id;
|
||||
t_node_info upstream_node;
|
||||
|
||||
@@ -682,26 +704,33 @@ standby_monitor(void)
|
||||
* no point in doing much else anyway
|
||||
*/
|
||||
|
||||
if (!check_connection(&my_local_conn, "standby", NULL))
|
||||
if (!check_connection(my_local_conn, "standby"))
|
||||
{
|
||||
PQExpBufferData errmsg;
|
||||
|
||||
set_local_node_status();
|
||||
set_local_node_failed();
|
||||
|
||||
initPQExpBuffer(&errmsg);
|
||||
|
||||
appendPQExpBuffer(&errmsg,
|
||||
_("failed to connect to local node, node marked as failed!"));
|
||||
_("failed to connect to local node, node marked as failed and terminating!"));
|
||||
|
||||
log_err("%s\n", errmsg.data);
|
||||
|
||||
goto continue_monitoring_standby;
|
||||
create_event_record(master_conn,
|
||||
&local_options,
|
||||
local_options.node,
|
||||
"repmgrd_shutdown",
|
||||
false,
|
||||
errmsg.data);
|
||||
|
||||
terminate(ERR_DB_CON);
|
||||
}
|
||||
|
||||
upstream_conn = get_upstream_connection(my_local_conn,
|
||||
local_options.cluster_name,
|
||||
local_options.node,
|
||||
&upstream_node_id, upstream_conninfo);
|
||||
&upstream_node_id, NULL);
|
||||
|
||||
type = upstream_node_id == master_options.node
|
||||
? "master"
|
||||
@@ -713,12 +742,12 @@ standby_monitor(void)
|
||||
* we cannot reconnect, try to get a new upstream node.
|
||||
*/
|
||||
|
||||
check_connection(&upstream_conn, type, upstream_conninfo);
|
||||
/*
|
||||
* This takes up to local_options.reconnect_attempts *
|
||||
* local_options.reconnect_interval seconds
|
||||
check_connection(upstream_conn, type); /* this takes up to
|
||||
* local_options.reconnect_attempts
|
||||
* local_options.reconnect_intvl seconds
|
||||
*/
|
||||
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
{
|
||||
PQfinish(upstream_conn);
|
||||
@@ -824,7 +853,6 @@ standby_monitor(void)
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
|
||||
continue_monitoring_standby:
|
||||
/* Check if we still are a standby, we could have been promoted */
|
||||
do
|
||||
{
|
||||
@@ -840,13 +868,10 @@ standby_monitor(void)
|
||||
* will require manual resolution as there's no way of determining
|
||||
* which master is the correct one.
|
||||
*
|
||||
* We should log a message so the user knows of the situation at hand.
|
||||
*
|
||||
* XXX check if the original master is still active and display a
|
||||
* warning
|
||||
*/
|
||||
log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
|
||||
log_err(_("Check your cluster and manually fix any anomaly.\n"));
|
||||
log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
|
||||
terminate(1);
|
||||
break;
|
||||
|
||||
@@ -854,13 +879,10 @@ standby_monitor(void)
|
||||
log_err(_("standby node has disappeared, trying to reconnect...\n"));
|
||||
did_retry = true;
|
||||
|
||||
if (!check_connection(&my_local_conn, "standby", NULL))
|
||||
if (!check_connection(my_local_conn, "standby"))
|
||||
{
|
||||
set_local_node_status();
|
||||
/*
|
||||
* Let's continue checking, and if the postgres server on the
|
||||
* standby comes back up, we will activate it again
|
||||
*/
|
||||
set_local_node_failed();
|
||||
terminate(0);
|
||||
}
|
||||
|
||||
break;
|
||||
@@ -869,13 +891,6 @@ standby_monitor(void)
|
||||
|
||||
if (did_retry)
|
||||
{
|
||||
/*
|
||||
* There's a possible situation where the standby went down for some reason
|
||||
* (maintenance for example) and is now up and maybe connected once again to
|
||||
* the stream. If we set the local standby node as failed and it's now running
|
||||
* and receiving replication data, we should activate it again.
|
||||
*/
|
||||
set_local_node_status();
|
||||
log_info(_("standby connection recovered!\n"));
|
||||
}
|
||||
|
||||
@@ -883,6 +898,7 @@ standby_monitor(void)
|
||||
if (!monitoring_history)
|
||||
return;
|
||||
|
||||
|
||||
/*
|
||||
* If original master has gone away we'll need to get the new one
|
||||
* from the upstream node to write monitoring information
|
||||
@@ -928,9 +944,8 @@ standby_monitor(void)
|
||||
master_conn = get_master_connection(my_local_conn,
|
||||
local_options.cluster_name,
|
||||
&master_options.node, NULL);
|
||||
|
||||
}
|
||||
if (PQstatus(master_conn) != CONNECTION_OK)
|
||||
PQreset(master_conn);
|
||||
|
||||
/*
|
||||
* Cancel any query that is still being executed, so i can insert the
|
||||
@@ -944,8 +959,7 @@ standby_monitor(void)
|
||||
/* Get local xlog info */
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
|
||||
"pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), "
|
||||
"pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()");
|
||||
"pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp() ");
|
||||
|
||||
res = PQexec(my_local_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -960,30 +974,10 @@ standby_monitor(void)
|
||||
strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN);
|
||||
strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN);
|
||||
strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
||||
last_wal_standby_received_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
|
||||
? true
|
||||
: false;
|
||||
|
||||
PQclear(res);
|
||||
|
||||
/*
|
||||
* In the unusual event of a standby becoming disconnected from the primary,
|
||||
* while this repmgrd remains connected to the primary, subtracting
|
||||
* "lsn_standby_applied" from "lsn_standby_received" and coercing to
|
||||
* (long long unsigned int) will result in a meaningless, very large
|
||||
* value which will overflow a BIGINT column and spew error messages into the
|
||||
* PostgreSQL log. In the absence of a better strategy, skip attempting
|
||||
* to insert a monitoring record.
|
||||
*/
|
||||
if (last_wal_standby_received_gte_replayed == false)
|
||||
{
|
||||
log_verbose(LOG_WARNING,
|
||||
"Invalid replication_lag value calculated - is this standby connected to its upstream?\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Get master xlog info */
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()");
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location()");
|
||||
|
||||
res = PQexec(master_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -1025,8 +1019,7 @@ standby_monitor(void)
|
||||
* Execute the query asynchronously, but don't check for a result. We will
|
||||
* check the result next time we pause for a monitor step.
|
||||
*/
|
||||
log_verbose(LOG_DEBUG, "standby_monitor:() %s\n", sqlquery);
|
||||
|
||||
log_debug("standby_monitor: %s\n", sqlquery);
|
||||
if (PQsendQuery(master_conn, sqlquery) == 0)
|
||||
log_warning(_("query could not be sent to master. %s\n"),
|
||||
PQerrorMessage(master_conn));
|
||||
@@ -1068,10 +1061,10 @@ do_master_failover(void)
|
||||
t_node_info nodes[FAILOVER_NODES_MAX_CHECK];
|
||||
|
||||
/* Store details of the failed node here */
|
||||
t_node_info failed_master = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info failed_master = {-1, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
|
||||
|
||||
/* Store details of the best candidate for promotion to master here */
|
||||
t_node_info best_candidate = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info best_candidate = {-1, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
|
||||
|
||||
/* get a list of standby nodes, including myself */
|
||||
sprintf(sqlquery,
|
||||
@@ -1159,8 +1152,8 @@ do_master_failover(void)
|
||||
total_nodes, visible_nodes);
|
||||
|
||||
/*
|
||||
* Am I on the group that should keep alive? If I see less than half of
|
||||
* total_nodes then I should do nothing
|
||||
* am i on the group that should keep alive? if i see less than half of
|
||||
* total_nodes then i should do nothing
|
||||
*/
|
||||
if (visible_nodes < (total_nodes / 2.0))
|
||||
{
|
||||
@@ -1200,13 +1193,12 @@ do_master_failover(void)
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||
res = PQexec(node_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_info(_("unable to retrieve node's last standby location: %s\n"),
|
||||
PQerrorMessage(node_conn));
|
||||
|
||||
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||
PQclear(res);
|
||||
PQfinish(node_conn);
|
||||
@@ -1232,7 +1224,7 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
/* last we get info about this node, and update shared memory */
|
||||
sprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||
res = PQexec(my_local_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -1350,9 +1342,6 @@ do_master_failover(void)
|
||||
PQclear(res);
|
||||
|
||||
/* If position is 0/0, keep checking */
|
||||
/* XXX we should add a timeout here to prevent infinite looping
|
||||
* if the other node's repmgrd is not up
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1430,7 +1419,8 @@ do_master_failover(void)
|
||||
/* wait */
|
||||
sleep(5);
|
||||
|
||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||
if (verbose)
|
||||
log_info(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||
|
||||
log_debug(_("promote command is: \"%s\"\n"),
|
||||
local_options.promote_command);
|
||||
@@ -1479,8 +1469,10 @@ do_master_failover(void)
|
||||
/* wait */
|
||||
sleep(10);
|
||||
|
||||
log_info(_("node %d is the best candidate for new master, attempting to follow...\n"),
|
||||
if (verbose)
|
||||
log_info(_("node %d is the best candidate to be the new master, we should follow it...\n"),
|
||||
best_candidate.node_id);
|
||||
log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);
|
||||
|
||||
/*
|
||||
* The new master may take some time to be promoted. The follow command
|
||||
@@ -1491,23 +1483,57 @@ do_master_failover(void)
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
/*
|
||||
* If 9.4 or later, and replication slots in use, we'll need to create a
|
||||
* slot on the new master
|
||||
*/
|
||||
new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);
|
||||
|
||||
log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
||||
if(local_options.use_replication_slots)
|
||||
{
|
||||
if(create_replication_slot(new_master_conn, node_info.slot_name) == false)
|
||||
{
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to create slot '%s' on the master node: %s"),
|
||||
node_info.slot_name,
|
||||
PQerrorMessage(new_master_conn));
|
||||
|
||||
log_err("%s\n", event_details.data);
|
||||
|
||||
create_event_record(new_master_conn,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(new_master_conn);
|
||||
terminate(ERR_DB_QUERY);
|
||||
}
|
||||
}
|
||||
|
||||
r = system(local_options.follow_command);
|
||||
if (r != 0)
|
||||
{
|
||||
log_err(_("follow command failed. You could check and try it manually.\n"));
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* and reconnect to the local database */
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
|
||||
/* update node information to reflect new status */
|
||||
if(update_node_record_set_upstream(new_master_conn, node_info.node_id, best_candidate.node_id) == false)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to execute follow command:\n %s"),
|
||||
local_options.follow_command);
|
||||
_("Unable to update node record for node %i (following new upstream node %i)"),
|
||||
node_info.node_id,
|
||||
best_candidate.node_id);
|
||||
|
||||
log_err("%s\n", event_details.data);
|
||||
|
||||
/* It won't be possible to write to the event notification
|
||||
* table but we should be able to generate an external notification
|
||||
* if required.
|
||||
*/
|
||||
create_event_record(NULL,
|
||||
create_event_record(new_master_conn,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1517,20 +1543,13 @@ do_master_failover(void)
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* and reconnect to the local database */
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
|
||||
/* update internal record for this node*/
|
||||
new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);
|
||||
|
||||
node_info = get_node_info(new_master_conn, local_options.cluster_name, local_options.node);
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Node %i now following new upstream node %i"),
|
||||
node_info.node_id,
|
||||
best_candidate.node_id);
|
||||
|
||||
log_info("%s\n", event_details.data);
|
||||
|
||||
create_event_record(new_master_conn,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
@@ -1557,8 +1576,6 @@ do_master_failover(void)
|
||||
* It might be worth providing a selection of reconnection strategies
|
||||
* as different behaviour might be desirable in different situations;
|
||||
* or maybe the option not to reconnect might be required?
|
||||
*
|
||||
* XXX check this handles replication slots gracefully
|
||||
*/
|
||||
static bool
|
||||
do_upstream_standby_failover(t_node_info upstream_node)
|
||||
@@ -1567,7 +1584,6 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
int upstream_node_id = node_info.upstream_node_id;
|
||||
int r;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
log_debug(_("do_upstream_standby_failover(): performing failover for node %i\n"),
|
||||
node_info.node_id);
|
||||
@@ -1576,7 +1592,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
* Verify that we can still talk to the cluster master even though
|
||||
* node upstream is not available
|
||||
*/
|
||||
if (!check_connection(&master_conn, "master", NULL))
|
||||
if (!check_connection(master_conn, "master"))
|
||||
{
|
||||
log_err(_("do_upstream_standby_failover(): Unable to connect to last known master node\n"));
|
||||
return false;
|
||||
@@ -1637,65 +1653,26 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
sleep(local_options.reconnect_interval);
|
||||
sleep(local_options.reconnect_intvl);
|
||||
}
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = NULL;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
/* Follow new upstream */
|
||||
r = system(local_options.follow_command);
|
||||
if (r != 0)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to execute follow command:\n %s"),
|
||||
local_options.follow_command);
|
||||
|
||||
log_err("%s\n", event_details.data);
|
||||
|
||||
/* It won't be possible to write to the event notification
|
||||
* table but we should be able to generate an external notification
|
||||
* if required.
|
||||
*/
|
||||
create_event_record(NULL,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
log_err(_("follow command failed. You could check and try it manually.\n"));
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (update_node_record_set_upstream(master_conn, local_options.cluster_name, node_info.node_id, upstream_node_id) == false)
|
||||
if(update_node_record_set_upstream(master_conn, node_info.node_id, upstream_node_id) == false)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to set node %i's new upstream ID to %i"),
|
||||
node_info.node_id,
|
||||
upstream_node_id);
|
||||
create_event_record(NULL,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Node %i is now following upstream node %i"),
|
||||
node_info.node_id,
|
||||
upstream_node_id);
|
||||
|
||||
create_event_record(NULL,
|
||||
&local_options,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
|
||||
return true;
|
||||
@@ -1704,33 +1681,24 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
|
||||
|
||||
static bool
|
||||
check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
check_connection(PGconn *conn, const char *type)
|
||||
{
|
||||
int connection_retries;
|
||||
|
||||
/*
|
||||
* Check if the node is still available if after
|
||||
* local_options.reconnect_attempts * local_options.reconnect_interval
|
||||
* local_options.reconnect_attempts * local_options.reconnect_intvl
|
||||
* seconds of retries we cannot reconnect return false
|
||||
*/
|
||||
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
||||
{
|
||||
if (*conn == NULL)
|
||||
{
|
||||
if (conninfo == NULL)
|
||||
{
|
||||
log_err("INTERNAL ERROR: *conn == NULL && conninfo == NULL");
|
||||
terminate(ERR_INTERNAL);
|
||||
}
|
||||
*conn = establish_db_connection(conninfo, false);
|
||||
}
|
||||
if (!is_pgup(*conn, local_options.master_response_timeout))
|
||||
if (!is_pgup(conn, local_options.master_response_timeout))
|
||||
{
|
||||
log_warning(_("connection to %s has been lost, trying to recover... %i seconds before failover decision\n"),
|
||||
type,
|
||||
(local_options.reconnect_interval * (local_options.reconnect_attempts - connection_retries)));
|
||||
/* wait local_options.reconnect_interval seconds between retries */
|
||||
sleep(local_options.reconnect_interval);
|
||||
(local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
|
||||
/* wait local_options.reconnect_intvl seconds between retries */
|
||||
sleep(local_options.reconnect_intvl);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1742,9 +1710,9 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_pgup(*conn, local_options.master_response_timeout))
|
||||
if (!is_pgup(conn, local_options.master_response_timeout))
|
||||
{
|
||||
log_err(_("unable to reconnect to %s (timeout %i seconds)...\n"),
|
||||
log_err(_("unable to reconnect to %s after %i seconds...\n"),
|
||||
type,
|
||||
local_options.master_response_timeout
|
||||
);
|
||||
@@ -1757,7 +1725,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
|
||||
|
||||
/*
|
||||
* set_local_node_status()
|
||||
* set_local_node_failed()
|
||||
*
|
||||
* If failure of the local node is detected, attempt to connect
|
||||
* to the current master server (as stored in the global variable
|
||||
@@ -1765,16 +1733,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
*/
|
||||
|
||||
static bool
|
||||
set_local_node_status(void)
|
||||
set_local_node_failed(void)
|
||||
{
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
int active_master_node_id = NODE_NOT_FOUND;
|
||||
char master_conninfo[MAXLEN];
|
||||
|
||||
if (!check_connection(&master_conn, "master", NULL))
|
||||
if (!check_connection(master_conn, "master"))
|
||||
{
|
||||
log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
|
||||
log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1828,16 +1796,17 @@ set_local_node_status(void)
|
||||
|
||||
|
||||
/*
|
||||
* Attempt to set the active record to the correct value.
|
||||
* First
|
||||
* Attempt to set own record as inactive
|
||||
*/
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"UPDATE %s.repl_nodes "
|
||||
" SET active = FALSE "
|
||||
" WHERE id = %i ",
|
||||
get_repmgr_schema_quoted(master_conn),
|
||||
node_info.node_id);
|
||||
|
||||
if (!update_node_record_status(master_conn,
|
||||
local_options.cluster_name,
|
||||
node_info.node_id,
|
||||
"standby",
|
||||
node_info.upstream_node_id,
|
||||
is_standby(my_local_conn)==1))
|
||||
res = PQexec(master_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("unable to set local node %i as inactive on master: %s\n"),
|
||||
node_info.node_id,
|
||||
@@ -1862,7 +1831,7 @@ check_cluster_configuration(PGconn *conn)
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"SELECT oid FROM pg_class "
|
||||
" WHERE oid = '%s.repl_nodes'::regclass ",
|
||||
get_repmgr_schema_quoted(master_conn));
|
||||
get_repmgr_schema());
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
@@ -1989,29 +1958,25 @@ lsn_to_xlogrecptr(char *lsn, bool *format_ok)
|
||||
void
|
||||
usage(void)
|
||||
{
|
||||
log_err(_("%s: Replicator manager daemon \n"), progname());
|
||||
log_err(_("Try \"%s --help\" for more information.\n"), progname());
|
||||
log_err(_("%s: Replicator manager daemon \n"), progname);
|
||||
log_err(_("Try \"%s --help\" for more information.\n"), progname);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
help(void)
|
||||
help(const char *progname)
|
||||
{
|
||||
printf(_("%s: replication management daemon for PostgreSQL\n"), progname());
|
||||
printf(_("\n"));
|
||||
printf(_("Usage:\n"));
|
||||
printf(_(" %s [OPTIONS]\n"), progname());
|
||||
printf(_("\n"));
|
||||
printf(_("Options:\n"));
|
||||
printf(_(" -?, --help show this help, then exit\n"));
|
||||
printf(_(" -V, --version output version information, then exit\n"));
|
||||
printf(_("Usage: %s [OPTIONS]\n"), progname);
|
||||
printf(_("Replicator manager daemon for PostgreSQL.\n"));
|
||||
printf(_("\nOptions:\n"));
|
||||
printf(_(" --help show this help, then exit\n"));
|
||||
printf(_(" --version output version information, then exit\n"));
|
||||
printf(_(" -v, --verbose output verbose activity information\n"));
|
||||
printf(_(" -m, --monitoring-history track advance or lag of the replication in every standby in repl_monitor\n"));
|
||||
printf(_(" -f, --config-file=PATH path to the configuration file\n"));
|
||||
printf(_(" -d, --daemonize detach process from foreground\n"));
|
||||
printf(_(" -p, --pid-file=PATH write a PID file\n"));
|
||||
printf(_("\n"));
|
||||
printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname());
|
||||
printf(_("\n%s monitors a cluster of servers.\n"), progname);
|
||||
}
|
||||
|
||||
|
||||
@@ -2049,7 +2014,7 @@ terminate(int retval)
|
||||
unlink(pid_file);
|
||||
}
|
||||
|
||||
log_info(_("%s terminating...\n"), progname());
|
||||
log_info(_("%s terminating...\n"), progname);
|
||||
|
||||
exit(retval);
|
||||
}
|
||||
@@ -2254,12 +2219,23 @@ check_and_create_pid_file(const char *pid_file)
|
||||
t_node_info
|
||||
get_node_info(PGconn *conn, char *cluster, int node_id)
|
||||
{
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
PGresult *res;
|
||||
|
||||
t_node_info node_info = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info node_info = { NODE_NOT_FOUND, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
|
||||
|
||||
res = get_node_record(conn, cluster, node_id);
|
||||
sprintf(sqlquery,
|
||||
"SELECT id, upstream_node_id, conninfo, type, slot_name, active "
|
||||
" FROM %s.repl_nodes "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND id = %i",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
local_options.cluster_name,
|
||||
node_id);
|
||||
|
||||
log_debug("get_node_info(): %s\n", sqlquery);
|
||||
|
||||
res = PQexec(my_local_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
PQExpBufferData errmsg;
|
||||
@@ -2323,3 +2299,37 @@ parse_node_type(const char *type)
|
||||
|
||||
return UNKNOWN;
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id)
|
||||
{
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
log_debug(_("update_node_record_set_upstream(): Updating node %i's upstream node to %i\n"), this_node_id, new_upstream_node_id);
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
" UPDATE %s.repl_nodes "
|
||||
" SET upstream_node_id = %i "
|
||||
" WHERE cluster = '%s' "
|
||||
" AND id = %i ",
|
||||
get_repmgr_schema_quoted(conn),
|
||||
new_upstream_node_id,
|
||||
local_options.cluster_name,
|
||||
this_node_id);
|
||||
res = PQexec(conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_err(_("Unable to set new upstream node id: %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#
|
||||
# Makefile
|
||||
#
|
||||
# Copyright (c) 2ndQuadrant, 2010-2016
|
||||
# Copyright (c) 2ndQuadrant, 2010-2015
|
||||
#
|
||||
|
||||
MODULE_big = repmgr_funcs
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr_function.sql
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* uninstall_repmgr_funcs.sql
|
||||
* Copyright (c) 2ndQuadrant, 2010-2016
|
||||
* Copyright (c) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* strutil.c
|
||||
*
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* strutil.h
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* uninstall_repmgr.sql
|
||||
*
|
||||
* Copyright (C) 2ndQuadrant, 2010-2016
|
||||
* Copyright (C) 2ndQuadrant, 2010-2015
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
Reference in New Issue
Block a user