Compare commits

...

130 Commits

Author SHA1 Message Date
Ian Barwick
afdaf9be66 _create_event(): log event and node ID for debugging 2018-06-11 15:20:01 +09:00
Ian Barwick
8067924c3e repmgr: consolidate code in "standby switchover"
Commit 41274f5525 left us with two if statements
in sequence with exactly the same condition, so consolidate both into a single
statement. Clarify code comments while we're at it.
2018-06-11 15:14:40 +09:00
Ian Barwick
e94a6eefde repmgr: cluster check commands - non-zero exit code if node(s) unavailable
Return ERR_CLUSTER_CHECK if one or more nodes were not reachable.

Implements GitHub #447.
2018-06-11 12:41:19 +09:00
Ian Barwick
69d7b6f7eb doc: 4.0.6 release notes 2018-06-07 17:14:50 +09:00
Ian Barwick
8ec3b2a536 Bump version
4.0.6
2018-06-07 15:08:48 +09:00
Ian Barwick
68a9745e7e standby follow: check node has connected to new primary
After restarting the standby, poll pg_stat_replication on the upstream
until the standby connects, and exit with an error if it doesn't by the
timeout defined in "standby_follow_timeout".

Implements GitHub #444.
2018-06-07 14:41:05 +09:00
Ian Barwick
20ce53e2d2 doc: update release notes 2018-06-07 12:48:54 +09:00
Ian Barwick
638a119c85 standby follow: add hint about using "node rejoin"
If "repmgr standby follow" is executed on a node which isn't running,
point out "repmgr node rejoin" should probably be used instead.
2018-06-07 11:02:32 +09:00
Ian Barwick
053863cdd0 doc: fix typos 2018-06-07 10:40:30 +09:00
Ian Barwick
009cc0480c witness_register: check for existing node with same name 2018-06-07 10:04:26 +09:00
Ian Barwick
63bdc19132 repmgrd: ensure local node is counted as quorum member
Rename "standby_nodes" to "sibling_nodes" to make it clearer in the
code what total is actually provided by the struct.

Addresses GitHub #439.
2018-06-01 17:19:40 +09:00
Ian Barwick
fbd389d0b3 doc: fix typo 2018-06-01 13:07:19 +09:00
Ian Barwick
4aef4ea11e standby clone: improve external configuration file copying
If --copy-external-config-files was provided, check that we can copy
the files *before* cloning the standby, and abort if an error is
encountered. This will give the user the opportunity to fix any issues
before running the entire (and potentially lengthy) clone.

Previously errors were logged but no action taken, and the final
message indicated the clone operation was successful.

Addresses GitHub #443.
2018-06-01 13:00:07 +09:00
Ian Barwick
0ffaff75df repmgrd: ensure degraded monitoring timeout works on standby
Parameter "degraded_monitoring_timeout" was not being acted on when
monitoring a streaming replication standby.

Addresses GitHub #439.
2018-05-31 17:53:31 +09:00
Ian Barwick
c54bb73fb2 If --dry-run specified, ensure minimum log level is INFO
When executed with --dry-run, repmgr outputs detail about what would
happen using log level INFO. If the log_level is configured to
NOTICE or higher, it's possible some or all of the --dry-run output
might not be displayed.

Addresses GitHub #441.
2018-05-31 15:30:26 +09:00
Ian Barwick
28ea2e48de node rejoin: avoid outputting empty DETAIL message 2018-05-31 15:10:51 +09:00
Ian Barwick
41274f5525 node rejoin: improve handling of --config-file parameter
Fixes bug when parsing --config-file values (GitHub #442).

Also improves handling in --dry-run mode, as some checks for the
provided files were being skipped if --dry-run supplied, even though
they are intended to work with --dry-run.
2018-05-31 11:44:31 +09:00
Ian Barwick
edceb32ccb standby clone: --recovery-conf-only expects the standby to be registered
Note this in the documentation, and add a HINT about registering it
if the standby record is not available.

Related to GitHub #438.
2018-05-29 11:54:38 +09:00
Ian Barwick
3dba8336e9 standby clone: don't assume existence of "user" in upstream conninfo
Usually a separate user (typically "repmgr") is set up specifically to manage
the repmgr metadata, however there's no compelling requirement to do this, and
it's possible the database owner (usually: "postgres") will be used, in which
case it's possible the username will be left out of the conninfo string.

Addresses GitHub #437.
2018-05-24 15:51:41 +09:00
Ian Barwick
97d0cee259 "config_file" is MAXPGPATH, not MAXLEN
The two values are the same anyway, so change is more for consistency.
2018-05-22 17:19:55 +09:00
Martín Marqués
2dfe1d18e9 Fix typo in a code comment 2018-05-19 12:29:04 -03:00
Ian Barwick
55bb93bd3f "standby clone": log actual connection string used to connect to upstream
Useful for diagnostic purposes.
2018-05-10 11:58:48 +09:00
Ian Barwick
4c49954cd4 Fix check for -d/--dbname parameter
Not a bug per-se, just meant some unnecessary processing was done on
an empty string.

Per note from petere.
2018-05-10 11:57:02 +09:00
Ian Barwick
a880b6ce16 Include "arpa/inet.h" in dbutils.c
Needed for htonl() on FreeBSD.
2018-05-10 11:25:52 +09:00
Ian Barwick
c51a2283dd Minor documentation fixes 2018-05-10 10:27:25 +09:00
Ian Barwick
717828e73e doc: update 2ndQuadrant repository information
Canonical link for each repository should not include any directories.
2018-05-03 17:21:29 +09:00
Ian Barwick
c7477d7a9c doc: update repository information 2018-05-03 15:22:33 +09:00
Ian Barwick
1db8d3904f doc: update package installation information
Document the new public 2ndQuadrant apt repository
2018-05-03 15:07:26 +09:00
Ian Barwick
362f478d55 doc: update package installation information
Document the new, public 2ndQuadrant RPM repository.
2018-05-03 14:12:29 +09:00
Ian Barwick
cb1bf892e6 Finalize 4.0.5 release 2018-05-01 11:26:30 +09:00
Ian Barwick
b1b5fe1193 doc: add notes about package compatibility
We need to emphasise that the repmgr packages are only compatible
with packages based on the PGDG filesystem layout; 3rd party vendor
packages often put application and data directories elsewhere.
See e.g. GitHub #427.
2018-05-01 11:08:59 +09:00
Ian Barwick
af0e141859 doc: update FAQ location 2018-05-01 10:27:59 +09:00
Ian Barwick
580c1a9170 doc: update HISTORY and add 4.0.5 release notes 2018-05-01 10:13:44 +09:00
Ian Barwick
b624fc7efa Bump version
4.0.5
2018-05-01 09:21:32 +09:00
Ian Barwick
67ccd4dcb3 repmgrd: don't explicitly close connections on shutdown 2018-04-30 15:13:30 +09:00
Ian Barwick
6de3a5a997 Fix parsing of "archive_ready_critical" configuration file parameter.
Per report in GitHub #426.
2018-04-28 06:59:20 +09:00
Ian Barwick
f86e89ba45 repmgrd: notify sibling nodes to follow new primary after pg_ctl timeout
If "pg_ctl promote" fails due to a timeout, but the promotion itself succeeds,
have repmgrd on the new primary explicitly notify any sibling nodes to
follow it.

Previously the sibling nodes would wait "primary_notification_timeout" seconds
before attempting to discover the new primary.

This (and preceding commit eac80ae) address GitHub #425.
2018-04-27 11:59:00 +09:00
Ian Barwick
a6d0ba07ed repmgrd: handle pg_ctl timeout
It's possible "pg_ctl promote" will timeout, causing "repmgr standby
follow" to return with an error; however the promotion itself will usually
succeed, so detect this case and handle accordingly.
2018-04-26 19:23:26 +09:00
Ian Barwick
b553a70ad5 repmgrd: always close the connection if the pointer is not NULL 2018-04-25 14:08:17 +09:00
Ian Barwick
3364f8bdf0 Add configuration file parameter "config_directory"
This enables explicit provision of an external configuration file
directory, which if set will be passed to "pg_ctl" as the -D
parameter. Otherwise "pg_ctl" will default to using the data directory,
which will cause some operations to fail if the configuration files
are not present there.

Note this is implemented primarily for feature completeness and for
development/testing purposes. Users who have installed "repmgr" from
a package should not rely on "pg_ctl" to stop/start/restart PostgreSQL,
instead they should set the appropriate "service_..._command" for their
operating system. For more details see:

    https://repmgr.org/docs/4.0/configuration-service-commands.html

Note: in a future release, the presence of "config_directory" in repmgr.conf
will be used to implicitly set "--copy-external-config-files=samepath" when
cloning a standby; this is a behaviour change so will be implemented in the
next major release (repmgr 4.1).

Implements GitHub #424.
2018-04-25 11:57:27 +09:00
Ian Barwick
242fa287b4 repmgrd: catch corner case in standby connection handle check
If repmgrd marks the local node as unavailable, and it was actually
restarting but a failover event occurred before the next local node
check, failover will continue with the stale connection handle.

Add a final local node check just before starting the failover
process, so repmgrd can reconnect if it wasn't able to before.
2018-04-24 21:55:36 +09:00
Ian Barwick
fa908432c8 Minor doc and log output tweaks 2018-04-24 21:08:31 +09:00
Ian Barwick
afa942fef6 repmgrd: prevent standby connection handle from going stale
If monitoring history not in use, there's no activity on the standby's
connection handle, so if e.g. the standby is restarted, PQstatus()
never returns CONNECTION_BAD and repmgrd never notices the connection
is stale. Therefore execute a throw-away statement at "monitor_interval_secs".
2018-04-23 23:51:03 +09:00
Ian Barwick
94cfc66b04 doc: minor clarification 2018-04-20 12:23:04 +09:00
Ian Barwick
87eae9a50f doc: additional details about repmgrd usage in Debian/Ubuntu 2018-04-20 12:04:15 +09:00
Ian Barwick
82a37f4865 doc: add Debian package details 2018-04-20 10:57:19 +09:00
Ian Barwick
a38f727b7d doc: Improve CentOS package-related documentation 2018-04-20 10:31:42 +09:00
Ian Barwick
e6df936c1b doc: link to service command configuration from switchover section 2018-04-19 17:09:10 +09:00
Ian Barwick
91ca997d40 doc: improve configuration documentation
With special attention to setting service commands, and extra special
mention of "pg_ctlcluster" for Debian/Ubuntu users.
2018-04-19 16:49:26 +09:00
Ian Barwick
65c90a2a64 doc: update CentOS package documentation 2018-04-19 14:27:17 +09:00
Ian Barwick
90cba78f52 repmgrd: tweak event notifications on standby failure
The event notification was only being created if there was a valid
primary connection; it should be created in any case, so an event
notification script can be executed.
2018-04-17 10:27:25 +09:00
Ian Barwick
f8908d7e31 Bump version
4.0.5dev
2018-04-13 10:18:04 +09:00
Ian Barwick
478bbcccbf Add "dbname=replication" to all replication connection strings
Previously repmgr was attempting to make replication connections
with "dbname" set to the repmgr database name. While this works
if e.g. the repmgr user also has replication permissions, it will
fail if a dedicated replication user is specified, who only has
permission to access the virtual "replication" database.

Change this to use "dbname=replication" if the replication connection
user is different to the normal repmgr database user.

(We could just always set it to "replication", but that might break
existing installations e.g. where a .pgpass file is in use and there's
no "replication" entry for the normal repmgr database user).

Addresses GitHub #421.
2018-04-12 16:10:02 +09:00
Ian Barwick
a03d41de28 doc: mention --recovery-conf-only introduced in repmgr 4.0.4
Per GitHub #419.
2018-04-12 13:13:11 +09:00
Ian Barwick
f1e527adcb doc: various updates related to "standby clone" operations. 2018-04-12 13:08:05 +09:00
Ian Barwick
09e597dcdd Fix superuser password handling
When establishing a superuser connection, the connection parameters
were being copied from the existing (non-superuser) connection, which
in some circumstances can lead to that user's password being
included in the copied parameter list. The password parameter, if set, will
now always be removed, which will cause libpq to retrieve the correct
one from the .pgpass file.

Addresses GitHub #400.
2018-04-12 12:50:17 +09:00
Ian Barwick
94a7f0c719 Don't issue a CHECKPOINT after promoting a standby.
Issuing a CHECKPOINT immediately after promoting a standby may impact
performance. Commit 239a548e9d ensures
one is only issued when required, i.e. during a switchover when
pg_rewind will be executed.

This reverts commit a2068768ab.
2018-04-09 14:39:47 +09:00
Ian Barwick
6ac42f1593 "standby register": add sanity check when --upstream-node-id not supplied
If --upstream-node-id was not supplied to "repmgr standby register",
repmgr defaults to the primary node as upstream node. If the local node is
available, we now double-check that it's attached to the primary,
in case the lack of --upstream-node-id was an accidental omission.

This check is only made when the local node is available.

This behaviour can be overridden with -F/--force (though it's hard to
imagine a scenario where that would be useful).

Addresses GitHub #395.
2018-04-05 17:40:05 +09:00
Ian Barwick
94b72382e5 doc: minor FAQ tweaks 2018-04-05 17:10:52 +09:00
Ian Barwick
18c12f58a4 doc: add a section about repmgrd and service commands etc. 2018-04-05 11:47:35 +09:00
Ian Barwick
cf3fa18085 doc: miscellaneous FAQ updates
- clarify pg_rewind item
 - add note about what's included in recovery.conf
2018-04-04 10:08:04 +09:00
Ian Barwick
a5281d93dc Add TODO for pg_rewind changes coming in PostgreSQL 11 2018-04-03 21:57:50 +09:00
Ian Barwick
0d73d3c2b5 Enable provision of "archive_cleanup_command" in recovery.conf
If "archive_cleanup_command" is defined in "repmgr.conf", a corresponding
entry will be made in the node's "recovery.conf" file after cloning a
standby.

Note that we recommend using PgBarman to manage WAL archives, but are
providing this facility to help repmgr to be integrated in existing environments.

Implements GitHub #416.
2018-04-03 14:11:24 +09:00
Ian Barwick
23c99304a6 "node rejoin": actively check for node to rejoin cluster
Previously repmgr was relying on whatever command was configured to
start PostgreSQL to determine whether the node being rejoined had
started correctly. However it's preferable to actively poll the upstream
to confirm it has restarted and actually attached as a standby before
confirming success of the "node rejoin" action.

This can be overridden with the -W/--no-wait option.

(Note that for consistency with other PostgreSQL utilities, the
short form of the --wait option is now "-w"; this is currently
only used in "repmgr standby follow".)

Also update "repmgr node rejoin" documentation with a list of supported
options, and add some useful index entries for "pg_rewind".

Implements GitHub #415.
2018-04-03 10:36:13 +09:00
Ian Barwick
1ab16bc6c2 doc: fix option description for "repmgr primary register" 2018-04-03 10:10:05 +09:00
Ian Barwick
7f1f04636d Refactor pg_control parsing
The "data_checksum_version" field is located towards the end of the ControlFileData struct,
meaning its position varies between versions. Previously this wasn't a problem
as it was only required for operations involving 9.5 and later, and its position
within the control file has not changed between the current release and current
HEAD.

However, in order to support pg_rewind in 9.3 and 9.4, which both have changes in
the control file format, we'll need version-specific parsing. This will also make
it easier to deal with any future changes to the control file format.
2018-04-02 20:55:10 +09:00
Ian Barwick
6a1797cadd Enable pg_rewind to be used with PostgreSQL 9.3/9.4
pg_rewind is not part of the core distribution for those, but we
provided support in repmgr 3.3 so should extend it to repmgr 4.

Note that there is no check in place whether the pg_rewind binary
exists, so it's up to the user to ensure it's present.

Addresses GitHub #413.
2018-04-02 20:55:04 +09:00
Ian Barwick
94d26dbe9f Always set "connect_timeout" when pinging a PostgreSQL instance
Insert "connect_timeout=2" into the connection parameters, if not
explicitly set by the user. This will prevent excessive wait time
for the host operating system to report a connection timeout.
2018-04-02 09:31:42 +09:00
Ian Barwick
ae655eb4fd Add TODO list
This file will collate various requests and ideas for future development.
In particular it will reference requests which come in via the GitHub issue
tracker, so we can acknowledge and close off the request and not have an
open unresolved issue hanging around.
2018-03-30 14:18:51 +09:00
Ian Barwick
65371489c6 repmgrd: handle failover with two nodes in the primary location
If two nodes were in the primary location, and at least one node in
another location, the non-failed node in the primary location was not
recognising itself as a promotion candidate.

Addresses GitHub #407.
2018-03-30 12:17:34 +09:00
Ian Barwick
28c7737dc0 Log pg_control access errors as WARNINGs rather than DEBUG
This will make it easier to diagnose issues, possibly with an incorrect
"data_directory" setting in "repmgr.conf".
2018-03-30 11:24:44 +09:00
Ian Barwick
505d72d19c "standby switchover": force checkpoint if pg_rewind requested.
Addresses issue described in GitHub #378.

PostgreSQL itself doesn't issue a checkpoint after promotion to ensure
the newly promoted server is available as quickly as possible, so we'll
only execute an explicit CHECKPOINT when it's actually required, i.e.
when pg_rewind will be executed. This is required as pg_rewind uses
the timeline reported in the pg_control file to compare with the
server to be rewound, and the pg_control timeline is only updated after
the first checkpoint, so there is an interval where pg_rewind will
erroneously assume both servers are on the same timeline and take no action.
2018-03-30 09:12:25 +09:00
Ian Barwick
b292ac61f8 "standby switchover": update hint 2018-03-30 09:12:21 +09:00
Ian Barwick
293d66bf71 Fix minimum accepted value for "degraded_monitoring_timeout"
Should be -1, the default.

Addresses GitHub #411.
2018-03-30 09:12:17 +09:00
Ian Barwick
3e1f0ec168 repmgr: move demoted primary check to the final step during switchover
This will give the demoted primary more time to start up as a standby,
during which "standby follow" can be executed on sibling nodes, if
specified.
2018-03-27 16:41:13 +09:00
Ian Barwick
6f9a1f975e repmgr: poll demoted primary after restart during switchover
During a switchover operation, once the demoted primary has been restarted
as a standby, repmgr attempts to reconnect to verify its status and drop
any redundant replication slots. However it's possible the standby may still
be in the startup phase, so poll for "standby_reconnect_timeout" seconds
before giving up.

Addresses GitHub #408.
2018-03-27 15:58:18 +09:00
Ian Barwick
deea4f69f7 Fix "repmgr cluster crosscheck" output
Addresses GitHub #398.
2018-03-27 10:28:27 +09:00
Ian Barwick
37e53108a2 Consolidate connection closure calls 2018-03-27 08:52:23 +09:00
Ian Barwick
96cf06204c doc: add note about remote command execution
When executing a command on a remote server, repmgr expects the remote binary
to be in the same location as the local binary. It's reasonable to assume
repmgr will be deployed in a unified environment; if not, the onus is on the
user to ensure repmgr can find the remote binary, e.g. by creating appropriate
symlinks.

Addresses query in GitHub #406.
2018-03-27 08:47:56 +09:00
Ian Barwick
381e22c2c7 Misc tweaks to witness code 2018-03-26 20:59:38 +09:00
Ian Barwick
7e2af17783 repmgrd: tweak log notices when marking a standby as failed
Announce what we're going to do (set the node record inactive) *before*
performing the action. Makes reading the log slightly easier.
2018-03-23 13:27:37 +08:00
Ian Barwick
b4272853e7 Add event "repmgrd_failover_aborted" 2018-03-23 10:44:00 +08:00
Ian Barwick
562b6ddfc2 Add error code ERR_FOLLOW_FAIL 2018-03-23 10:34:19 +08:00
Ian Barwick
a15e5c9d52 Tidy up queries in dbutils.c
- standardize formatting
- prefix various internal function calls with "pg_catalog.", to
  mitigate possible risks from CVE-2018-1058
2018-03-23 10:33:28 +08:00
Ian Barwick
d9cc09cee4 repmgrd: fix typo 2018-03-21 12:36:51 +09:00
Ian Barwick
c4f6abe951 Update HISTORY 2018-03-21 06:51:56 +09:00
Martín Marqués
e454fb77d3 While reviewing 7cb6e5af8d before merging
I noticed that besides the result cleanup added, there was still a missing
spot inside the if condition.

Adding the PQclear that was missing.
2018-03-21 06:51:50 +09:00
Andrzej Nowicki
b76e5852d3 One more memory leak fixed 2018-03-21 06:51:43 +09:00
Andrzej Nowicki
0674364ffd Clear node list to avoid memory leak, fixes #402 2018-03-21 06:51:37 +09:00
Ian Barwick
b2eb9b8525 Correctly handle error message pointer when parsing strings.
When parsing conninfo strings, ensure the error message pointer is
actually returned to the caller.

Not a critical issue, just meant the contents of the error message
were not being displayed.
2018-03-10 14:28:10 +09:00
Ian Barwick
71c5d10a8c doc: update 4.0.4 release date 2018-03-09 20:07:16 +09:00
Ian Barwick
1476b21cd4 doc: update release notes
Add note about requiring 4.0.3 or later on all nodes when performing
a switchover from a node running 4.0.3 or later.

Per report in GitHub #388.
2018-03-09 09:46:58 +09:00
Ian Barwick
b17993abdb doc: update "repmgr primary unregister" description
As noted by GitHub user yonj1e in GitHub #396.
2018-03-08 15:01:25 +09:00
Ian Barwick
8f68344f9a doc: update FAQ
Additional clarification for "repmgr standby clone --recovery-conf-only"
2018-03-08 10:04:30 +09:00
Ian Barwick
125ac6c297 doc: update FAQ
Add entry about upgrading PostgreSQL
2018-03-08 10:04:30 +09:00
Ian Barwick
955860923f Fix parsing of -k/--keep-history option
GitHub #394.
2018-03-07 19:14:18 +09:00
Ian Barwick
50626f90cc Add 4.0.4 release notes 2018-03-07 14:17:04 +09:00
Ian Barwick
9aea5b8aa7 repmgrd: fix failover handling in "manual" mode
Regression was introduced in commit c7a585c555
2018-03-06 22:35:51 +09:00
Ian Barwick
ed1bcb159e repmgrd: remove duplicate local record check in BDR mode 2018-03-06 12:31:07 +09:00
Ian Barwick
9c72c0d66e Add event "repmgrd_shutdown"
Implements GitHub #393
2018-03-06 10:59:54 +09:00
Emre Hasegeli
0ddc226c2a Add witness options to the main help
GitHub #392
2018-03-06 10:57:33 +09:00
Ian Barwick
93830cad61 Fix directory creation when cloning from Barman 2018-03-05 19:31:53 +09:00
Ian Barwick
bca1660d5e Improve repmgrd logging in BDR mode
Also ensure interval status log line is shown as intended
2018-03-05 15:05:40 +09:00
Ian Barwick
5a52917421 repmgrd: add debug log output for "monitor_interval_secs" sleep in all modes 2018-03-05 14:23:58 +09:00
Emre Hasegeli
70752d7d4a Add missing options to the main help 2018-03-05 09:52:04 +09:00
Ian Barwick
c29d1efc37 "standby clone": improve replication user selection
Use the upstream node's replication user when checking the replication
connection.
2018-03-02 16:21:32 +09:00
Ian Barwick
6fbbe2a97a "standby clone": fix --superuser handling
get_superuser_connection() was erroneously using the local node record
to connect to as a superuser, which works when registering the primary
but obviously not when cloning a standby.

Addresses GitHub #380.
2018-03-02 14:49:17 +09:00
Ian Barwick
ce42d6827e Update HISTORY 2018-03-01 15:51:09 +09:00
Ian Barwick
98384559a6 "standby clone": remove restriction on replication slots in Barman mode
While it's preferable to avoid standby replication slots if Barman is in
use, there's no technical reason to prevent this.

Implements GitHub #379.
2018-03-01 15:47:28 +09:00
Ian Barwick
4a1477343b repmgr: escape "restore_command" in generated recovery.conf 2018-03-01 10:39:04 +09:00
Ian Barwick
d2b9d20393 "standby clone": fix primary_conninfo when --upstream-conninfo provided 2018-03-01 09:18:40 +09:00
Ian Barwick
fe594c95ad repmgrd: retry standby connection after cascading standby failover 2018-02-28 21:15:11 +09:00
Ian Barwick
60e63feaca repmgrd: add configuration file parameter "standby_reconnect_timeout"
This is used for determining a timeout when reconnecting to the standby
after executing the "follow_command". This will normally not need to be
set explicitly, but may be useful in cases where the standby's startup
phase can last longer than usual.
2018-02-28 18:56:33 +09:00
Ian Barwick
ae4d0f2622 repmgrd: fix main monitoring loop for witness server
Missing "break" was breaking it when following a new primary.
2018-02-28 16:30:14 +09:00
Ian Barwick
5e8b41e221 repmgrd: retry standby connection after "follow_command" executed
It's possible that the standby is still starting up after the "follow_command"
completes, so poll for a while until we get a connection.
2018-02-28 15:35:47 +09:00
Ian Barwick
c7a585c555 repmgrd: improve log output
- emit explicit startup NOTICE
- emit NOTICE when falling back to degraded monitoring on a primary node
- improve log message and event notification details when monitoring
  a former primary which has been reconnected as a standby
2018-02-28 12:35:13 +09:00
Ian Barwick
a27dd8c49c doc: document "primary_follow_timeout" configuration file parameter. 2018-02-27 10:09:40 +09:00
Ian Barwick
9365bf3474 "standby promote": make timeout values configurable
This introduces following new configuration file parameters, which
were previously hard-coded values:

 - promote_check_timeout
 - promote_check_interval

Implements GitHub #387.
2018-02-27 10:04:58 +09:00
Ian Barwick
e8ae0831fe doc: add <options> section for various commands 2018-02-26 16:54:54 +09:00
Ian Barwick
518866eba5 "node status": improve replication slot warnings
Addresses GitHub #385
2018-02-23 11:06:47 +09:00
Ian Barwick
ed0330c334 "standby clone": document --recovery-conf-only option 2018-02-23 10:54:42 +09:00
Ian Barwick
1f021dc9fa "standby clone --recovery-conf-only": display generated file with --dry-run
Refactor the original code which generates "recovery.conf" to place the
output into a buffer, which can either be output as "recovery.conf"
or copied to a buffer specified by the caller.
2018-02-23 10:16:47 +09:00
Ian Barwick
425839d764 Fix typo in function name 2018-02-22 15:48:41 +09:00
Ian Barwick
3a764f678a "standby clone": add --recovery-conf-only option
This will generate "recovery.conf" for an existing standby.

Typical use-case is a standby cloned manually from an external data
source (e.g. Barman), where "recovery.conf" needs to be created
(and if required a replication slot).

The --dry-run option will check the pre-requisites but not actually
create "recovery.conf" or a replication slot.

This requires that the upstream node is running, a replication connection
can be made and if required a replication slot can be created.

Implements GitHub #382.
2018-02-22 15:47:19 +09:00
Ian Barwick
829cf5cca4 repmgrd: improve detection of status change from primary to standby
If repmgrd is running in degraded mode on a primary which has been stopped,
then manually been brought back online as a standby (e.g. by creating
recovery.conf and starting the server), ensure it not only detects the
change but automatically updates the node record so it can resume
monitoring the node as a standby.

Previously, repmgrd was looping waiting for the record to be updated
(as is done transparently when executing "repmgr node rejoin") but
if the record was not updated within the timeout period (e.g. by
"repmgr standby register") it would fail to resume monitoring as a
standby.

It seems reasonable to have repmgrd automatically update the node record,
as this will restore failover capability as quickly as possible. If this
is not desired, then the onus is on the user to shut down repmgrd while
making the desired changes.
2018-02-22 11:35:47 +09:00
Ian Barwick
14420d83fa "node rejoin": ensure --dry-run is honoured
Addresses GitHub #383.
2018-02-20 15:28:39 +09:00
Ian Barwick
a80e22f0ed Bump version
4.0.4
2018-02-16 12:19:31 +09:00
Ian Barwick
832993bfbc doc: update 4.0.3 release notes 2018-02-16 12:15:10 +09:00
Ian Barwick
f1ea5e62df doc: update release notes 2018-02-15 14:42:29 +09:00
Ian Barwick
b47448d0e5 Replace remaining instances of strcpy() with strncpy()
Also use strncmp() to match.
2018-02-15 13:17:06 +09:00
57 changed files with 4853 additions and 1188 deletions

4
FAQ.md
View File

@@ -1,9 +1,7 @@
FAQ - Frequently Asked Questions about repmgr FAQ - Frequently Asked Questions about repmgr
============================================= =============================================
The repmgr 4 FAQ is located here: The repmgr 4 FAQ is located here: [repmgr FAQ (Frequently Asked Questions)](https://repmgr.org/docs/4.0/appendix-faq.html "repmgr FAQ")
https://repmgr.org/docs/appendix-faq.html
The repmgr 3.x FAQ can be found here: The repmgr 3.x FAQ can be found here:

64
HISTORY
View File

@@ -1,4 +1,65 @@
4.0.3 2018-02- 4.0.6 2018-06-14
repmgr: (witness register) prevent registration of a witness server with the
same name as an existing node (Ian)
repmgr: (standby follow) check node has actually connected to new primary
before reporting success; GitHub #444 (Ian)
repmgr: (standby clone) improve handling of external configuration file copying,
including consideration in --dry-run check; GitHub #443 (Ian)
repmgr: (standby clone) don't require presence of "user" parameter in
conninfo string; GitHub #437 (Ian)
repmgr: (standby clone) improve documentation of --recovery-conf-only
mode; GitHub #438 (Ian)
repmgr: (node rejoin) fix bug when parsing --config-files parameter;
GitHub #442 (Ian)
repmgr: when using --dry-run, force log level to INFO to ensure output
will always be displayed; GitHub #441 (Ian)
repmgr: (cluster matrix/crosscheck) return non-zero exit code if node
connection issues detected; GitHub #447 (Ian)
repmgrd: ensure local node is counted as quorum member; GitHub #439 (Ian)
4.0.5 2018-05-02
repmgr: poll demoted primary after restart as a standby during a
switchover operation; GitHub #408 (Ian)
repmgr: add configuration parameter "config_directory"; GitHub #424 (Ian)
repmgr: add "dbname=replication" to all replication connection strings;
GitHub #421 (Ian)
repmgr: add sanity check if --upstream-node-id not supplied when executing
"standby register"; GitHub #395 (Ian)
repmgr: enable provision of "archive_cleanup_command" in recovery.conf;
GitHub #416 (Ian)
repmgr: actively check for node to rejoin cluster; GitHub #415 (Ian)
repmgr: enable pg_rewind to be used with PostgreSQL 9.3/9.4; GitHub #413 (Ian)
repmgr: fix minimum accepted value for "degraded_monitoring_timeout";
GitHub #411 (Ian)
repmgr: fix superuser password handling; GitHub #400 (Ian)
repmgr: fix parsing of "archive_ready_critical" configuration file
parameter; GitHub #426 (Ian)
repmgr: fix display of conninfo parsing error messages (Ian)
repmgr: fix "repmgr cluster crosscheck" output; GitHub #389 (Ian)
repmgrd: prevent standby connection handle from going stale (Ian)
repmgrd: fix memory leaks in witness code; GitHub #402 (AndrzejNowicki, Martín)
repmgrd: handle "pg_ctl promote" timeout; GitHub #425 (Ian)
repmgrd: handle failover situation with only two nodes in the primary
location, and at least one node in another location; GitHub #407 (Ian)
repmgrd: set "connect_timeout=2" when pinging a server (Ian)
4.0.4 2018-03-09
repmgr: add "standby clone --recovery-conf-only" option; GitHub #382 (Ian)
repmgr: make "standby promote" timeout values configurable; GitHub #387 (Ian)
repmgr: improve replication slot warnings generated by "node status";
GitHub #385 (Ian)
repmgr: remove restriction on replication slots when cloning from
a Barman server; GitHub #379 (Ian)
repmgr: ensure "node rejoin" honours "--dry-run" option; GitHub #383 (Ian)
repmgr: fix --superuser handling when cloning a standby; GitHub #380 (Ian)
repmgr: update various help options; GitHub #391, #392 (hasegeli)
repmgrd: add event "repmgrd_shutdown"; GitHub #393 (Ian)
repmgrd: improve detection of status change from primary to standby (Ian)
repmgrd: improve log output in various situations (Ian)
repmgrd: improve reconnection to the local node after a failover (Ian)
repmgrd: ensure witness server connects to new primary after a failover (Ian)
4.0.3 2018-02-15
repmgr: improve switchover handling when "pg_ctl" used to control the repmgr: improve switchover handling when "pg_ctl" used to control the
server and logging output is not explicitly redirected (Ian) server and logging output is not explicitly redirected (Ian)
repmgr: improve switchover log messages and exit code when old primary could repmgr: improve switchover log messages and exit code when old primary could
@@ -17,6 +78,7 @@
repmgr: fix upstream node display in "repmgr node status"; GitHub #363 (fanf2) repmgr: fix upstream node display in "repmgr node status"; GitHub #363 (fanf2)
repmgr: improve/clarify documentation and update --help output for repmgr: improve/clarify documentation and update --help output for
"primary unregister"; GitHub #373 (Ian) "primary unregister"; GitHub #373 (Ian)
repmgr: allow replication slots when Barman is configured; GitHub #379 (Ian)
repmgr: fix parsing of "pg_basebackup_options"; GitHub #376 (Ian) repmgr: fix parsing of "pg_basebackup_options"; GitHub #376 (Ian)
repmgr: ensure "pg_subtrans" directory is created when cloning a standby in repmgr: ensure "pg_subtrans" directory is created when cloning a standby in
Barman mode (Ian) Barman mode (Ian)

20
TODO.md Normal file
View File

@@ -0,0 +1,20 @@
TODO
====
This file contains a list of improvements which are desirable and/or have
been requested, and which we aim to address/implement when time and resources
permit.
It is *not* a roadmap and there's no guarantee of any item being implemented
within any given timeframe.
Enable suspension of repmgrd failover
-------------------------------------
When performing maintenance, e.g. a switchover, it's necessary to stop all
repmgrd nodes to prevent unintended failover; this is obviously inconvenient.
We'll need to implement some way of notifying each repmgrd to suspend automatic
failover until further notice.
Requested in GitHub #410 ( https://github.com/2ndQuadrant/repmgr/issues/410 )

View File

@@ -288,6 +288,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
memset(options->node_name, 0, sizeof(options->node_name)); memset(options->node_name, 0, sizeof(options->node_name));
memset(options->conninfo, 0, sizeof(options->conninfo)); memset(options->conninfo, 0, sizeof(options->conninfo));
memset(options->data_directory, 0, sizeof(options->data_directory)); memset(options->data_directory, 0, sizeof(options->data_directory));
memset(options->config_directory, 0, sizeof(options->data_directory));
memset(options->pg_bindir, 0, sizeof(options->pg_bindir)); memset(options->pg_bindir, 0, sizeof(options->pg_bindir));
options->replication_type = REPLICATION_TYPE_PHYSICAL; options->replication_type = REPLICATION_TYPE_PHYSICAL;
@@ -303,7 +304,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->log_status_interval = DEFAULT_LOG_STATUS_INTERVAL; options->log_status_interval = DEFAULT_LOG_STATUS_INTERVAL;
/*----------------------- /*-----------------------
* standby action settings * standby clone settings
*------------------------ *------------------------
*/ */
options->use_replication_slots = false; options->use_replication_slots = false;
@@ -314,9 +315,24 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->tablespace_mapping.tail = NULL; options->tablespace_mapping.tail = NULL;
memset(options->recovery_min_apply_delay, 0, sizeof(options->recovery_min_apply_delay)); memset(options->recovery_min_apply_delay, 0, sizeof(options->recovery_min_apply_delay));
options->recovery_min_apply_delay_provided = false; options->recovery_min_apply_delay_provided = false;
memset(options->archive_cleanup_command, 0, sizeof(options->archive_cleanup_command));
options->use_primary_conninfo_password = false; options->use_primary_conninfo_password = false;
memset(options->passfile, 0, sizeof(options->passfile)); memset(options->passfile, 0, sizeof(options->passfile));
/*-------------------------
* standby promote settings
*-------------------------
*/
options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;
/*------------------------
* standby follow settings
*------------------------
*/
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
/*----------------- /*-----------------
* repmgrd settings * repmgrd settings
*----------------- *-----------------
@@ -336,7 +352,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->degraded_monitoring_timeout = -1; options->degraded_monitoring_timeout = -1;
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT; options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT; options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT; options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
/*------------- /*-------------
* witness settings * witness settings
@@ -455,6 +471,9 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
strncpy(options->conninfo, value, MAXLEN); strncpy(options->conninfo, value, MAXLEN);
else if (strcmp(name, "data_directory") == 0) else if (strcmp(name, "data_directory") == 0)
strncpy(options->data_directory, value, MAXPGPATH); strncpy(options->data_directory, value, MAXPGPATH);
else if (strcmp(name, "config_directory") == 0)
strncpy(options->config_directory, value, MAXPGPATH);
else if (strcmp(name, "replication_user") == 0) else if (strcmp(name, "replication_user") == 0)
{ {
if (strlen(value) < NAMEDATALEN) if (strlen(value) < NAMEDATALEN)
@@ -500,15 +519,30 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
parse_time_unit_parameter(name, value, options->recovery_min_apply_delay, error_list); parse_time_unit_parameter(name, value, options->recovery_min_apply_delay, error_list);
options->recovery_min_apply_delay_provided = true; options->recovery_min_apply_delay_provided = true;
} }
else if (strcmp(name, "archive_cleanup_command") == 0)
strncpy(options->archive_cleanup_command, value, MAXLEN);
else if (strcmp(name, "use_primary_conninfo_password") == 0) else if (strcmp(name, "use_primary_conninfo_password") == 0)
options->use_primary_conninfo_password = parse_bool(value, name, error_list); options->use_primary_conninfo_password = parse_bool(value, name, error_list);
else if (strcmp(name, "passfile") == 0) else if (strcmp(name, "passfile") == 0)
strncpy(options->passfile, value, sizeof(options->passfile)); strncpy(options->passfile, value, sizeof(options->passfile));
/* standby promote settings */
else if (strcmp(name, "promote_check_timeout") == 0)
options->promote_check_timeout = repmgr_atoi(value, name, error_list, 1);
else if (strcmp(name, "promote_check_interval") == 0)
options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);
/* standby follow settings */
else if (strcmp(name, "primary_follow_timeout") == 0)
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "standby_follow_timeout") == 0)
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
/* node check settings */ /* node check settings */
else if (strcmp(name, "archive_ready_warning") == 0) else if (strcmp(name, "archive_ready_warning") == 0)
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1); options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
else if (strcmp(name, "archive_ready_critcial") == 0) else if (strcmp(name, "archive_ready_critical") == 0)
options->archive_ready_critical = repmgr_atoi(value, name, error_list, 1); options->archive_ready_critical = repmgr_atoi(value, name, error_list, 1);
else if (strcmp(name, "replication_lag_warning") == 0) else if (strcmp(name, "replication_lag_warning") == 0)
options->replication_lag_warning = repmgr_atoi(value, name, error_list, 1); options->replication_lag_warning = repmgr_atoi(value, name, error_list, 1);
@@ -549,13 +583,13 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
else if (strcmp(name, "monitoring_history") == 0) else if (strcmp(name, "monitoring_history") == 0)
options->monitoring_history = parse_bool(value, name, error_list); options->monitoring_history = parse_bool(value, name, error_list);
else if (strcmp(name, "degraded_monitoring_timeout") == 0) else if (strcmp(name, "degraded_monitoring_timeout") == 0)
options->degraded_monitoring_timeout = repmgr_atoi(value, name, error_list, 1); options->degraded_monitoring_timeout = repmgr_atoi(value, name, error_list, -1);
else if (strcmp(name, "async_query_timeout") == 0) else if (strcmp(name, "async_query_timeout") == 0)
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0); options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "primary_notification_timeout") == 0) else if (strcmp(name, "primary_notification_timeout") == 0)
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0); options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "primary_follow_timeout") == 0) else if (strcmp(name, "standby_reconnect_timeout") == 0)
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0); options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
/* witness settings */ /* witness settings */
else if (strcmp(name, "witness_sync_interval") == 0) else if (strcmp(name, "witness_sync_interval") == 0)
@@ -1028,7 +1062,7 @@ reload_config(t_configuration_options *orig_options)
return false; return false;
} }
if (strcmp(new_options.node_name, orig_options->node_name) != 0) if (strncmp(new_options.node_name, orig_options->node_name, MAXLEN) != 0)
{ {
log_warning(_("\"node_name\" cannot be changed, keeping current configuration")); log_warning(_("\"node_name\" cannot be changed, keeping current configuration"));
return false; return false;
@@ -1072,7 +1106,7 @@ reload_config(t_configuration_options *orig_options)
} }
/* conninfo */ /* conninfo */
if (strcmp(orig_options->conninfo, new_options.conninfo) != 0) if (strncmp(orig_options->conninfo, new_options.conninfo, MAXLEN) != 0)
{ {
/* Test conninfo string works */ /* Test conninfo string works */
conn = establish_db_connection(new_options.conninfo, false); conn = establish_db_connection(new_options.conninfo, false);
@@ -1099,7 +1133,7 @@ reload_config(t_configuration_options *orig_options)
} }
/* event_notification_command */ /* event_notification_command */
if (strcmp(orig_options->event_notification_command, new_options.event_notification_command) != 0) if (strncmp(orig_options->event_notification_command, new_options.event_notification_command, MAXLEN) != 0)
{ {
strncpy(orig_options->event_notification_command, new_options.event_notification_command, MAXLEN); strncpy(orig_options->event_notification_command, new_options.event_notification_command, MAXLEN);
log_info(_("\"event_notification_command\" is now \"%s\""), new_options.event_notification_command); log_info(_("\"event_notification_command\" is now \"%s\""), new_options.event_notification_command);
@@ -1108,7 +1142,7 @@ reload_config(t_configuration_options *orig_options)
} }
/* event_notifications */ /* event_notifications */
if (strcmp(orig_options->event_notifications_orig, new_options.event_notifications_orig) != 0) if (strncmp(orig_options->event_notifications_orig, new_options.event_notifications_orig, MAXLEN) != 0)
{ {
strncpy(orig_options->event_notifications_orig, new_options.event_notifications_orig, MAXLEN); strncpy(orig_options->event_notifications_orig, new_options.event_notifications_orig, MAXLEN);
log_info(_("\"event_notifications\" is now \"%s\""), new_options.event_notifications_orig); log_info(_("\"event_notifications\" is now \"%s\""), new_options.event_notifications_orig);
@@ -1128,7 +1162,7 @@ reload_config(t_configuration_options *orig_options)
} }
/* follow_command */ /* follow_command */
if (strcmp(orig_options->follow_command, new_options.follow_command) != 0) if (strncmp(orig_options->follow_command, new_options.follow_command, MAXLEN) != 0)
{ {
strncpy(orig_options->follow_command, new_options.follow_command, MAXLEN); strncpy(orig_options->follow_command, new_options.follow_command, MAXLEN);
log_info(_("\"follow_command\" is now \"%s\""), new_options.follow_command); log_info(_("\"follow_command\" is now \"%s\""), new_options.follow_command);
@@ -1165,7 +1199,7 @@ reload_config(t_configuration_options *orig_options)
/* promote_command */ /* promote_command */
if (strcmp(orig_options->promote_command, new_options.promote_command) != 0) if (strncmp(orig_options->promote_command, new_options.promote_command, MAXLEN) != 0)
{ {
strncpy(orig_options->promote_command, new_options.promote_command, MAXLEN); strncpy(orig_options->promote_command, new_options.promote_command, MAXLEN);
log_info(_("\"promote_command\" is now \"%s\""), new_options.promote_command); log_info(_("\"promote_command\" is now \"%s\""), new_options.promote_command);
@@ -1205,18 +1239,18 @@ reload_config(t_configuration_options *orig_options)
*/ */
/* log_facility */ /* log_facility */
if (strcmp(orig_options->log_facility, new_options.log_facility) != 0) if (strncmp(orig_options->log_facility, new_options.log_facility, MAXLEN) != 0)
{ {
strcpy(orig_options->log_facility, new_options.log_facility); strncpy(orig_options->log_facility, new_options.log_facility, MAXLEN);
log_info(_("\"log_facility\" is now \"%s\""), new_options.log_facility); log_info(_("\"log_facility\" is now \"%s\""), new_options.log_facility);
log_config_changed = true; log_config_changed = true;
} }
/* log_file */ /* log_file */
if (strcmp(orig_options->log_file, new_options.log_file) != 0) if (strncmp(orig_options->log_file, new_options.log_file, MAXLEN) != 0)
{ {
strcpy(orig_options->log_file, new_options.log_file); strncpy(orig_options->log_file, new_options.log_file, MAXLEN);
log_info(_("\"log_file\" is now \"%s\""), new_options.log_file); log_info(_("\"log_file\" is now \"%s\""), new_options.log_file);
log_config_changed = true; log_config_changed = true;
@@ -1224,9 +1258,9 @@ reload_config(t_configuration_options *orig_options)
/* log_level */ /* log_level */
if (strcmp(orig_options->log_level, new_options.log_level) != 0) if (strncmp(orig_options->log_level, new_options.log_level, MAXLEN) != 0)
{ {
strcpy(orig_options->log_level, new_options.log_level); strncpy(orig_options->log_level, new_options.log_level, MAXLEN);
log_info(_("\"log_level\" is now \"%s\""), new_options.log_level); log_info(_("\"log_level\" is now \"%s\""), new_options.log_level);
log_config_changed = true; log_config_changed = true;

View File

@@ -73,6 +73,7 @@ typedef struct
char conninfo[MAXLEN]; char conninfo[MAXLEN];
char replication_user[NAMEDATALEN]; char replication_user[NAMEDATALEN];
char data_directory[MAXPGPATH]; char data_directory[MAXPGPATH];
char config_directory[MAXPGPATH];
char pg_bindir[MAXPGPATH]; char pg_bindir[MAXPGPATH];
int replication_type; int replication_type;
@@ -82,16 +83,25 @@ typedef struct
char log_file[MAXLEN]; char log_file[MAXLEN];
int log_status_interval; int log_status_interval;
/* standby action settings */ /* standby clone settings */
bool use_replication_slots; bool use_replication_slots;
char pg_basebackup_options[MAXLEN]; char pg_basebackup_options[MAXLEN];
char restore_command[MAXLEN]; char restore_command[MAXLEN];
TablespaceList tablespace_mapping; TablespaceList tablespace_mapping;
char recovery_min_apply_delay[MAXLEN]; char recovery_min_apply_delay[MAXLEN];
bool recovery_min_apply_delay_provided; bool recovery_min_apply_delay_provided;
char archive_cleanup_command[MAXLEN];
bool use_primary_conninfo_password; bool use_primary_conninfo_password;
char passfile[MAXPGPATH]; char passfile[MAXPGPATH];
/* standby promote settings */
int promote_check_timeout;
int promote_check_interval;
/* standby follow settings */
int primary_follow_timeout;
int standby_follow_timeout;
/* node check settings */ /* node check settings */
int archive_ready_warning; int archive_ready_warning;
int archive_ready_critical; int archive_ready_critical;
@@ -114,7 +124,7 @@ typedef struct
int degraded_monitoring_timeout; int degraded_monitoring_timeout;
int async_query_timeout; int async_query_timeout;
int primary_notification_timeout; int primary_notification_timeout;
int primary_follow_timeout; int standby_reconnect_timeout;
/* BDR settings */ /* BDR settings */
bool bdr_local_monitoring_only; bool bdr_local_monitoring_only;
@@ -153,11 +163,16 @@ typedef struct
#define T_CONFIGURATION_OPTIONS_INITIALIZER { \ #define T_CONFIGURATION_OPTIONS_INITIALIZER { \
/* node information */ \ /* node information */ \
UNKNOWN_NODE_ID, "", "", "", "", "", REPLICATION_TYPE_PHYSICAL, \ UNKNOWN_NODE_ID, "", "", "", "", "", "", REPLICATION_TYPE_PHYSICAL, \
/* log settings */ \ /* log settings */ \
"", "", "", DEFAULT_LOG_STATUS_INTERVAL, \ "", "", "", DEFAULT_LOG_STATUS_INTERVAL, \
/* standby action settings */ \ /* standby clone settings */ \
false, "", "", { NULL, NULL }, "", false, false, "", \ false, "", "", { NULL, NULL }, "", false, "", false, "", \
/* standby promote settings */ \
DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
/* standby follow settings */ \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
/* node check settings */ \ /* node check settings */ \
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \ DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \ DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
@@ -171,7 +186,7 @@ typedef struct
false, -1, \ false, -1, \
DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \ DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
/* BDR settings */ \ /* BDR settings */ \
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
/* service settings */ \ /* service settings */ \

18
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for repmgr 4.0.3. # Generated by GNU Autoconf 2.69 for repmgr 4.0.5.
# #
# Report bugs to <pgsql-bugs@postgresql.org>. # Report bugs to <pgsql-bugs@postgresql.org>.
# #
@@ -582,8 +582,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='repmgr' PACKAGE_NAME='repmgr'
PACKAGE_TARNAME='repmgr' PACKAGE_TARNAME='repmgr'
PACKAGE_VERSION='4.0.3' PACKAGE_VERSION='4.0.5'
PACKAGE_STRING='repmgr 4.0.3' PACKAGE_STRING='repmgr 4.0.5'
PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org' PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org'
PACKAGE_URL='https://2ndquadrant.com/en/resources/repmgr/' PACKAGE_URL='https://2ndquadrant.com/en/resources/repmgr/'
@@ -1178,7 +1178,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures repmgr 4.0.3 to adapt to many kinds of systems. \`configure' configures repmgr 4.0.5 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1239,7 +1239,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of repmgr 4.0.3:";; short | recursive ) echo "Configuration of repmgr 4.0.5:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1313,7 +1313,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
repmgr configure 4.0.3 repmgr configure 4.0.5
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -1332,7 +1332,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by repmgr $as_me 4.0.3, which was It was created by repmgr $as_me 4.0.5, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2359,7 +2359,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by repmgr $as_me 4.0.3, which was This file was extended by repmgr $as_me 4.0.5, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -2422,7 +2422,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
repmgr config.status 4.0.3 repmgr config.status 4.0.5
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([repmgr], [4.0.3], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/]) AC_INIT([repmgr], [4.0.6], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.]) AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.])

View File

@@ -37,13 +37,8 @@ get_system_identifier(const char *data_directory)
uint64 system_identifier = UNKNOWN_SYSTEM_IDENTIFIER; uint64 system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
control_file_info = get_controlfile(data_directory); control_file_info = get_controlfile(data_directory);
system_identifier = control_file_info->system_identifier;
if (control_file_info->control_file_processed == true)
system_identifier = control_file_info->control_file->system_identifier;
else
system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
pfree(control_file_info->control_file);
pfree(control_file_info); pfree(control_file_info);
return system_identifier; return system_identifier;
@@ -57,13 +52,8 @@ get_db_state(const char *data_directory)
control_file_info = get_controlfile(data_directory); control_file_info = get_controlfile(data_directory);
if (control_file_info->control_file_processed == true) state = control_file_info->state;
state = control_file_info->control_file->state;
else
/* if we were unable to parse the control file, assume DB is shut down */
state = DB_SHUTDOWNED;
pfree(control_file_info->control_file);
pfree(control_file_info); pfree(control_file_info);
return state; return state;
@@ -78,12 +68,8 @@ get_latest_checkpoint_location(const char *data_directory)
control_file_info = get_controlfile(data_directory); control_file_info = get_controlfile(data_directory);
if (control_file_info->control_file_processed == false) checkPoint = control_file_info->checkPoint;
return InvalidXLogRecPtr;
checkPoint = control_file_info->control_file->checkPoint;
pfree(control_file_info->control_file);
pfree(control_file_info); pfree(control_file_info);
return checkPoint; return checkPoint;
@@ -98,16 +84,8 @@ get_data_checksum_version(const char *data_directory)
control_file_info = get_controlfile(data_directory); control_file_info = get_controlfile(data_directory);
if (control_file_info->control_file_processed == false) data_checksum_version = (int) control_file_info->data_checksum_version;
{
data_checksum_version = -1;
}
else
{
data_checksum_version = (int) control_file_info->control_file->data_checksum_version;
}
pfree(control_file_info->control_file);
pfree(control_file_info); pfree(control_file_info);
return data_checksum_version; return data_checksum_version;
@@ -139,33 +117,109 @@ describe_db_state(DBState state)
/* /*
* we maintain our own version of get_controlfile() as we need cross-version * We maintain our own version of get_controlfile() as we need cross-version
* compatibility, and also don't care if the file isn't readable. * compatibility, and also don't care if the file isn't readable.
*/ */
static ControlFileInfo * static ControlFileInfo *
get_controlfile(const char *DataDir) get_controlfile(const char *DataDir)
{ {
ControlFileInfo *control_file_info; ControlFileInfo *control_file_info;
int fd; FILE *fp = NULL;
int fd, ret, version_num;
char PgVersionPath[MAXPGPATH] = "";
char ControlFilePath[MAXPGPATH] = ""; char ControlFilePath[MAXPGPATH] = "";
char file_version_string[64] = "";
long file_major, file_minor;
char *endptr = NULL;
void *ControlFileDataPtr = NULL;
int expected_size = 0;
control_file_info = palloc0(sizeof(ControlFileInfo)); control_file_info = palloc0(sizeof(ControlFileInfo));
/* set default values */
control_file_info->control_file_processed = false; control_file_info->control_file_processed = false;
control_file_info->control_file = palloc0(sizeof(ControlFileData)); control_file_info->system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
control_file_info->state = DB_SHUTDOWNED;
control_file_info->checkPoint = InvalidXLogRecPtr;
control_file_info->data_checksum_version = -1;
/*
* Read PG_VERSION, as we'll need to determine which struct to read
* the control file contents into
*/
snprintf(PgVersionPath, MAXPGPATH, "%s/PG_VERSION", DataDir);
fp = fopen(PgVersionPath, "r");
if (fp == NULL)
{
log_warning(_("could not open file \"%s\" for reading"),
PgVersionPath);
log_detail("%s", strerror(errno));
return control_file_info;
}
file_version_string[0] = '\0';
ret = fscanf(fp, "%63s", file_version_string);
fclose(fp);
if (ret != 1 || endptr == file_version_string)
{
log_warning(_("unable to determine major version number from PG_VERSION"));
return control_file_info;
}
file_major = strtol(file_version_string, &endptr, 10);
file_minor = 0;
if (*endptr == '.')
file_minor = strtol(endptr + 1, NULL, 10);
version_num = ((int) file_major * 10000) + ((int) file_minor * 100);
if (version_num < 90300)
{
log_warning(_("Data directory appears to be initialised for %s"), file_version_string);
return control_file_info;
}
snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir); snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
if ((fd = open(ControlFilePath, O_RDONLY | PG_BINARY, 0)) == -1) if ((fd = open(ControlFilePath, O_RDONLY | PG_BINARY, 0)) == -1)
{ {
log_debug("could not open file \"%s\" for reading: %s", log_warning(_("could not open file \"%s\" for reading"),
ControlFilePath, strerror(errno)); ControlFilePath);
log_detail("%s", strerror(errno));
return control_file_info; return control_file_info;
} }
if (read(fd, control_file_info->control_file, sizeof(ControlFileData)) != sizeof(ControlFileData))
if (version_num >= 90500)
{ {
log_debug("could not read file \"%s\": %s", expected_size = sizeof(ControlFileData95);
ControlFilePath, strerror(errno)); ControlFileDataPtr = palloc0(expected_size);
}
else if (version_num >= 90400)
{
expected_size = sizeof(ControlFileData94);
ControlFileDataPtr = palloc0(expected_size);
}
else if (version_num >= 90300)
{
expected_size = sizeof(ControlFileData93);
ControlFileDataPtr = palloc0(expected_size);
}
if (read(fd, ControlFileDataPtr, expected_size) != expected_size)
{
log_warning(_("could not read file \"%s\""),
ControlFilePath);
log_detail("%s", strerror(errno));
return control_file_info; return control_file_info;
} }
@@ -173,6 +227,33 @@ get_controlfile(const char *DataDir)
control_file_info->control_file_processed = true; control_file_info->control_file_processed = true;
if (version_num >= 90500)
{
ControlFileData95 *ptr = (struct ControlFileData95 *)ControlFileDataPtr;
control_file_info->system_identifier = ptr->system_identifier;
control_file_info->state = ptr->state;
control_file_info->checkPoint = ptr->checkPoint;
control_file_info->data_checksum_version = ptr->data_checksum_version;
}
else if (version_num >= 90400)
{
ControlFileData94 *ptr = (struct ControlFileData94 *)ControlFileDataPtr;
control_file_info->system_identifier = ptr->system_identifier;
control_file_info->state = ptr->state;
control_file_info->checkPoint = ptr->checkPoint;
control_file_info->data_checksum_version = ptr->data_checksum_version;
}
else if (version_num >= 90300)
{
ControlFileData93 *ptr = (struct ControlFileData93 *)ControlFileDataPtr;
control_file_info->system_identifier = ptr->system_identifier;
control_file_info->state = ptr->state;
control_file_info->checkPoint = ptr->checkPoint;
control_file_info->data_checksum_version = ptr->data_checksum_version;
}
pfree(ControlFileDataPtr);
/* /*
* We don't check the CRC here as we're potentially checking a pg_control * We don't check the CRC here as we're potentially checking a pg_control
* file from a different PostgreSQL version to the one repmgr was compiled * file from a different PostgreSQL version to the one repmgr was compiled

View File

@@ -12,12 +12,261 @@
#include "postgres_fe.h" #include "postgres_fe.h"
#include "catalog/pg_control.h" #include "catalog/pg_control.h"
/*
* A simplified representation of pg_control containing only those fields
* required by repmgr.
*/
typedef struct typedef struct
{ {
bool control_file_processed; bool control_file_processed;
ControlFileData *control_file; uint64 system_identifier;
DBState state;
XLogRecPtr checkPoint;
uint32 data_checksum_version;
} ControlFileInfo; } ControlFileInfo;
/*
 * Checkpoint record as laid out in PostgreSQL 9.3 and 9.4 (the layout is
 * the same for both releases).
 *
 * Embedded in ControlFileData93 as "checkPointCopy". The control file is
 * read from disk as raw bytes into the version-specific struct, so the
 * order and types of the fields here determine the offsets of everything
 * that follows; do not reorder, add or remove members.
 */
typedef struct CheckPoint93
{
XLogRecPtr redo; /* next RecPtr available when we began to
* create CheckPoint (i.e. REDO start point) */
TimeLineID ThisTimeLineID; /* current TLI */
TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new
* timeline (equals ThisTimeLineID otherwise) */
bool fullPageWrites; /* current full_page_writes */
uint32 nextXidEpoch; /* higher-order bits of nextXid */
TransactionId nextXid; /* next free XID */
Oid nextOid; /* next free OID */
MultiXactId nextMulti; /* next free MultiXactId */
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
Oid oldestXidDB; /* database with minimum datfrozenxid */
MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */
Oid oldestMultiDB; /* database with minimum datminmxid */
pg_time_t time; /* time stamp of checkpoint */
TransactionId oldestActiveXid;
} CheckPoint93;
/*
 * Checkpoint record as laid out in PostgreSQL 9.5, 9.6, 10 and HEAD (the
 * layout is the same for all of these).
 *
 * Differs from CheckPoint93 only by the two commit-timestamp XID fields
 * (oldestCommitTsXid, newestCommitTsXid) placed before oldestActiveXid.
 *
 * NOTE(review): presumably embedded in ControlFileData95, which is filled
 * by reading the raw control file contents - confirm against that struct's
 * definition. Member order and types must therefore mirror the server's
 * own definition; do not reorder, add or remove members.
 */
typedef struct CheckPoint95
{
XLogRecPtr redo; /* next RecPtr available when we began to
* create CheckPoint (i.e. REDO start point) */
TimeLineID ThisTimeLineID; /* current TLI */
TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new
* timeline (equals ThisTimeLineID otherwise) */
bool fullPageWrites; /* current full_page_writes */
uint32 nextXidEpoch; /* higher-order bits of nextXid */
TransactionId nextXid; /* next free XID */
Oid nextOid; /* next free OID */
MultiXactId nextMulti; /* next free MultiXactId */
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
Oid oldestXidDB; /* database with minimum datfrozenxid */
MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */
Oid oldestMultiDB; /* database with minimum datminmxid */
pg_time_t time; /* time stamp of checkpoint */
TransactionId oldestCommitTsXid; /* oldest Xid with valid commit
* timestamp */
TransactionId newestCommitTsXid; /* newest Xid with valid commit
* timestamp */
TransactionId oldestActiveXid;
} CheckPoint95;
/*
 * pg_control layout for PostgreSQL 9.3 (going by the struct name); the
 * embedded checkpoint copy uses the CheckPoint93 layout.
 *
 * NOTE(review): presumably this is overlaid on the raw contents of a
 * pg_control file, in which case field order and types must not be altered -
 * confirm against the code which reads pg_control.
 */
typedef struct ControlFileData93
{
uint64 system_identifier;
uint32 pg_control_version; /* PG_CONTROL_VERSION */
uint32 catalog_version_no; /* see catversion.h */
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint93 checkPointCopy; /* copy of last check point record */
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
XLogRecPtr minRecoveryPoint;
TimeLineID minRecoveryPointTLI;
XLogRecPtr backupStartPoint;
XLogRecPtr backupEndPoint;
bool backupEndRequired;
/* configuration values recorded in pg_control */
int wal_level;
int MaxConnections;
int max_prepared_xacts;
int max_locks_per_xact;
uint32 maxAlign; /* alignment requirement for tuples */
double floatFormat; /* constant 1234567.0 */
uint32 blcksz; /* data block size for this DB */
uint32 relseg_size; /* blocks per segment of large relation */
uint32 xlog_blcksz; /* block size within WAL files */
uint32 xlog_seg_size; /* size of each WAL segment */
uint32 nameDataLen; /* catalog name field width */
uint32 indexMaxKeys; /* max number of columns in an index */
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
/* flag indicating internal format of timestamp, interval, time */
bool enableIntTimes; /* int64 storage enabled? */
/* flags indicating pass-by-value status of various types */
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */
/* Are data pages protected by checksums? Zero if no checksum version */
uint32 data_checksum_version;
} ControlFileData93;
/*
 * pg_control layout for PostgreSQL 9.4 (going by the struct name); the
 * embedded checkpoint copy still uses the CheckPoint93 layout.
 *
 * Following fields added since 9.3 (i.e. not present in ControlFileData93):
 *
 * bool wal_log_hints;
 * int max_worker_processes;
 * uint32 loblksize;
 *
 * NOTE(review): presumably this is overlaid on the raw contents of a
 * pg_control file, in which case field order and types must not be altered -
 * confirm against the code which reads pg_control.
 */
typedef struct ControlFileData94
{
uint64 system_identifier;
uint32 pg_control_version; /* PG_CONTROL_VERSION */
uint32 catalog_version_no; /* see catversion.h */
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint93 checkPointCopy; /* copy of last check point record */
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
XLogRecPtr minRecoveryPoint;
TimeLineID minRecoveryPointTLI;
XLogRecPtr backupStartPoint;
XLogRecPtr backupEndPoint;
bool backupEndRequired;
int wal_level;
bool wal_log_hints; /* added since 9.3 */
int MaxConnections;
int max_worker_processes; /* added since 9.3 */
int max_prepared_xacts;
int max_locks_per_xact;
uint32 maxAlign; /* alignment requirement for tuples */
double floatFormat; /* constant 1234567.0 */
uint32 blcksz; /* data block size for this DB */
uint32 relseg_size; /* blocks per segment of large relation */
uint32 xlog_blcksz; /* block size within WAL files */
uint32 xlog_seg_size; /* size of each WAL segment */
uint32 nameDataLen; /* catalog name field width */
uint32 indexMaxKeys; /* max number of columns in an index */
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
uint32 loblksize; /* chunk size in pg_largeobject */
bool enableIntTimes; /* int64 storage enabled? */
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */
/* Are data pages protected by checksums? Zero if no checksum version */
uint32 data_checksum_version;
} ControlFileData94;
/*
 * pg_control layout for PostgreSQL 9.5 and later.
 *
 * Following field added since 9.4:
 *
 * bool track_commit_timestamp;
 *
 * Additionally, "checkPointCopy" uses the CheckPoint95 layout rather than
 * CheckPoint93.
 *
 * Unchanged in 9.6
 *
 * In 10, following field appended *after* "data_checksum_version":
 *
 * char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN];
 *
 * (but we don't care about that)
 *
 * NOTE(review): presumably this is overlaid on the raw contents of a
 * pg_control file, in which case field order and types must not be altered -
 * confirm against the code which reads pg_control.
 */
typedef struct ControlFileData95
{
uint64 system_identifier;
uint32 pg_control_version; /* PG_CONTROL_VERSION */
uint32 catalog_version_no; /* see catversion.h */
DBState state; /* see enum above */
pg_time_t time; /* time stamp of last pg_control update */
XLogRecPtr checkPoint; /* last check point record ptr */
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
CheckPoint95 checkPointCopy; /* copy of last check point record */
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
XLogRecPtr minRecoveryPoint;
TimeLineID minRecoveryPointTLI;
XLogRecPtr backupStartPoint;
XLogRecPtr backupEndPoint;
bool backupEndRequired;
int wal_level;
bool wal_log_hints;
int MaxConnections;
int max_worker_processes;
int max_prepared_xacts;
int max_locks_per_xact;
bool track_commit_timestamp; /* added since 9.4 */
uint32 maxAlign; /* alignment requirement for tuples */
double floatFormat; /* constant 1234567.0 */
uint32 blcksz; /* data block size for this DB */
uint32 relseg_size; /* blocks per segment of large relation */
uint32 xlog_blcksz; /* block size within WAL files */
uint32 xlog_seg_size; /* size of each WAL segment */
uint32 nameDataLen; /* catalog name field width */
uint32 indexMaxKeys; /* max number of columns in an index */
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
uint32 loblksize; /* chunk size in pg_largeobject */
bool enableIntTimes; /* int64 storage enabled? */
bool float4ByVal; /* float4 pass-by-value? */
bool float8ByVal; /* float8, int8, etc pass-by-value? */
/* Are data pages protected by checksums? Zero if no checksum version */
uint32 data_checksum_version;
} ControlFileData95;
extern DBState get_db_state(const char *data_directory); extern DBState get_db_state(const char *data_directory);
extern const char *describe_db_state(DBState state); extern const char *describe_db_state(DBState state);
extern int get_data_checksum_version(const char *data_directory); extern int get_data_checksum_version(const char *data_directory);

369
dbutils.c
View File

@@ -23,6 +23,7 @@
#include <sys/time.h> #include <sys/time.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <dirent.h> #include <dirent.h>
#include <arpa/inet.h>
#include "repmgr.h" #include "repmgr.h"
#include "dbutils.h" #include "dbutils.h"
@@ -124,7 +125,7 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
initialize_conninfo_params(&conninfo_params, false); initialize_conninfo_params(&conninfo_params, false);
parse_success = parse_conninfo_string(conninfo, &conninfo_params, errmsg, false); parse_success = parse_conninfo_string(conninfo, &conninfo_params, &errmsg, false);
if (parse_success == false) if (parse_success == false)
{ {
@@ -219,8 +220,7 @@ establish_db_connection_quiet(const char *conninfo)
} }
PGconn PGconn *
*
establish_primary_db_connection(PGconn *conn, establish_primary_db_connection(PGconn *conn,
const bool exit_on_error) const bool exit_on_error)
{ {
@@ -237,36 +237,6 @@ establish_primary_db_connection(PGconn *conn,
} }
/*
 * Establish a connection using the provided conninfo string, but with the
 * "user" parameter replaced by the supplied user name.
 *
 * Returns the connection handle produced by
 * establish_db_connection_by_params(), or NULL if the conninfo string
 * could not be parsed.
 *
 * NOTE(review): "exit_on_error" is accepted but not consulted; "false" is
 * always passed on to establish_db_connection_by_params().
 */
PGconn *
establish_db_connection_as_user(const char *conninfo,
								const char *user,
								const bool exit_on_error)
{
	t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
	char	   *errmsg = NULL;

	initialize_conninfo_params(&conninfo_params, false);

	if (parse_conninfo_string(conninfo, &conninfo_params, errmsg, true) == false)
	{
		/*
		 * "errmsg" is handed to parse_conninfo_string() by value, so the
		 * local pointer cannot have been updated by the callee; never pass
		 * a NULL pointer to "%s".
		 */
		log_error(_("unable to parse provided conninfo string:\n %s"),
				  errmsg != NULL ? errmsg : "(no further details available)");
		return NULL;
	}

	/* override whichever user was specified in the conninfo string */
	param_set(&conninfo_params, "user", user);

	return establish_db_connection_by_params(&conninfo_params, false);
}
PGconn * PGconn *
establish_db_connection_by_params(t_conninfo_param_list *param_list, establish_db_connection_by_params(t_conninfo_param_list *param_list,
const bool exit_on_error) const bool exit_on_error)
@@ -342,6 +312,18 @@ is_superuser_connection(PGconn *conn, t_connection_user *userinfo)
} }
/*
 * Close the connection referenced by "conn" (if there is one) and reset the
 * caller's handle to NULL so it cannot be reused after being finished.
 */
void
close_connection(PGconn **conn)
{
	if (*conn != NULL)
	{
		PQfinish(*conn);
		*conn = NULL;
	}
}
/* =============================== */ /* =============================== */
/* conninfo manipulation functions */ /* conninfo manipulation functions */
/* =============================== */ /* =============================== */
@@ -389,6 +371,37 @@ get_conninfo_value(const char *conninfo, const char *keyword, char *output)
} }
/*
 * Get the default conninfo value for the provided parameter, and copy
 * it to the 'output' buffer.
 *
 * 'maxlen' is the size of 'output' in bytes; the value is truncated if
 * it does not fit and is always NUL-terminated.
 *
 * Returns true on success, or false on failure (invalid arguments, the
 * list of defaults could not be retrieved, the provided keyword was not
 * found, or the keyword has no default value). 'output' is left untouched
 * on failure.
 */
bool
get_conninfo_default_value(const char *param, char *output, int maxlen)
{
	PQconninfoOption *defs = NULL;
	PQconninfoOption *def = NULL;
	bool		found = false;

	if (param == NULL || output == NULL || maxlen <= 0)
		return false;

	/* PQconndefaults() returns NULL if memory could not be allocated */
	defs = PQconndefaults();

	if (defs == NULL)
		return false;

	for (def = defs; def->keyword != NULL; def++)
	{
		/*
		 * Compare the complete keyword; bounding the comparison by "maxlen"
		 * (the size of the output buffer) would turn this into a prefix
		 * match whenever the buffer is shorter than the keyword.
		 */
		if (strcmp(def->keyword, param) != 0)
			continue;

		/* "val" is NULL when no default is available for the option */
		if (def->val != NULL)
		{
			strncpy(output, def->val, maxlen - 1);
			output[maxlen - 1] = '\0';
			found = true;
		}

		/* each keyword appears only once, no need to look further */
		break;
	}

	PQconninfoFree(defs);

	return found;
}
void void
initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults) initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults)
{ {
@@ -572,7 +585,7 @@ param_get(t_conninfo_param_list *param_list, const char *param)
/* /*
* Parse a conninfo string into a t_conninfo_param_list * Parse a conninfo string into a t_conninfo_param_list
* *
* See conn_to_param_list() to do the same for a PQconn * See conn_to_param_list() to do the same for a PGconn
* *
* "ignore_local_params": ignores those parameters specific * "ignore_local_params": ignores those parameters specific
* to a local installation, i.e. when parsing an upstream * to a local installation, i.e. when parsing an upstream
@@ -580,12 +593,12 @@ param_get(t_conninfo_param_list *param_list, const char *param)
* don't copy that node's values * don't copy that node's values
*/ */
bool bool
parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char *errmsg, bool ignore_local_params) parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char **errmsg, bool ignore_local_params)
{ {
PQconninfoOption *connOptions = NULL; PQconninfoOption *connOptions = NULL;
PQconninfoOption *option = NULL; PQconninfoOption *option = NULL;
connOptions = PQconninfoParse(conninfo_str, &errmsg); connOptions = PQconninfoParse(conninfo_str, errmsg);
if (connOptions == NULL) if (connOptions == NULL)
return false; return false;
@@ -616,10 +629,19 @@ parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_lis
return true; return true;
} }
/* /*
* Parse a PQconn into a t_conninfo_param_list * Parse a PGconn into a t_conninfo_param_list
* *
* See parse_conninfo_string() to do the same for a conninfo string * See parse_conninfo_string() to do the same for a conninfo string
*
* NOTE: the current use case for this is to take an active connection,
* replace the existing username (typically replacing it with the superuser
* or replication user name), and make a new connection as that user.
* If the "password" field is set, it will cause any connection made with
* these parameters to fail (unless of course the password happens to be the
* same). Therefore we remove the password altogether, and rely on it being
* available via .pgpass.
*/ */
void void
conn_to_param_list(PGconn *conn, t_conninfo_param_list *param_list) conn_to_param_list(PGconn *conn, t_conninfo_param_list *param_list)
@@ -635,6 +657,10 @@ conn_to_param_list(PGconn *conn, t_conninfo_param_list *param_list)
(option->val != NULL && option->val[0] == '\0')) (option->val != NULL && option->val[0] == '\0'))
continue; continue;
/* Ignore "password" */
if (strcmp(option->keyword, "password") == 0)
continue;
param_set(param_list, option->keyword, option->val); param_set(param_list, option->keyword, option->val);
} }
@@ -1015,7 +1041,7 @@ get_cluster_size(PGconn *conn, char *size)
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"SELECT pg_catalog.pg_size_pretty(SUM(pg_catalog.pg_database_size(oid))::bigint) " "SELECT pg_catalog.pg_size_pretty(pg_catalog.sum(pg_catalog.pg_database_size(oid))::bigint) "
" FROM pg_catalog.pg_database "); " FROM pg_catalog.pg_database ");
log_verbose(LOG_DEBUG, "get_cluster_size():\n%s", query.data); log_verbose(LOG_DEBUG, "get_cluster_size():\n%s", query.data);
@@ -1060,7 +1086,7 @@ get_server_version(PGconn *conn, char *server_version)
} }
if (server_version != NULL) if (server_version != NULL)
strcpy(server_version, PQgetvalue(res, 0, 1)); strncpy(server_version, PQgetvalue(res, 0, 1), MAXVERSIONSTR);
server_version_num = atoi(PQgetvalue(res, 0, 0)); server_version_num = atoi(PQgetvalue(res, 0, 0));
@@ -1358,67 +1384,6 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
} }
/*
 * Determine whether pg_rewind can be used.
 *
 * Requires PostgreSQL 9.5 or later, "full_page_writes" not set to "off",
 * and either "wal_log_hints" set to "on" or a non-zero data checksum
 * version reported for the supplied data directory.
 *
 * Returns true if all conditions are met; otherwise returns false with a
 * description of each failed condition appended to "reason" (multiple
 * descriptions are separated by "; ").
 */
bool
can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason)
{
	bool		rewind_usable = true;

	if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
		server_version_num = get_server_version(conn, NULL);

	/* nothing else is worth checking if the server is too old */
	if (server_version_num < 90500)
	{
		appendPQExpBuffer(reason,
						  _("pg_rewind available from PostgreSQL 9.5"));
		return false;
	}

	/* first possible entry in "reason", so no separator is ever needed */
	if (guc_set(conn, "full_page_writes", "=", "off"))
	{
		appendPQExpBuffer(reason,
						  _("\"full_page_writes\" must be set to \"on\""));
		rewind_usable = false;
	}

	/*
	 * Without "wal_log_hints" we depend on data checksums being enabled. The
	 * local pg_control file is consulted, as the value will be the same
	 * throughout the cluster and this saves a round-trip to the demotion
	 * candidate.
	 */
	if (!guc_set(conn, "wal_log_hints", "=", "on"))
	{
		int			checksum_version = get_data_checksum_version(data_directory);

		if (checksum_version < 0)
		{
			if (!rewind_usable)
				appendPQExpBuffer(reason, "; ");

			appendPQExpBuffer(reason,
							  _("\"wal_log_hints\" is set to \"off\" but unable to determine checksum version"));
			rewind_usable = false;
		}
		else if (checksum_version == 0)
		{
			if (!rewind_usable)
				appendPQExpBuffer(reason, "; ");

			appendPQExpBuffer(reason,
							  _("\"wal_log_hints\" is set to \"off\" and checksums are disabled"));
			rewind_usable = false;
		}
	}

	return rewind_usable;
}
int int
get_ready_archive_files(PGconn *conn, const char *data_directory) get_ready_archive_files(PGconn *conn, const char *data_directory)
{ {
@@ -1800,10 +1765,10 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row)
strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN); strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN);
node_info->priority = atoi(PQgetvalue(res, row, 8)); node_info->priority = atoi(PQgetvalue(res, row, 8));
node_info->active = atobool(PQgetvalue(res, row, 9)); node_info->active = atobool(PQgetvalue(res, row, 9));
strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXLEN); strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXPGPATH);
/* This won't normally be set */ /* This won't normally be set */
strncpy(node_info->upstream_node_name, PQgetvalue(res, row, 10), MAXLEN); strncpy(node_info->upstream_node_name, PQgetvalue(res, row, 11), MAXLEN);
/* Set remaining struct fields with default values */ /* Set remaining struct fields with default values */
node_info->node_status = NODE_STATUS_UNKNOWN; node_info->node_status = NODE_STATUS_UNKNOWN;
@@ -1887,6 +1852,36 @@ get_node_record(PGconn *conn, int node_id, t_node_info *node_info)
} }
/*
 * Retrieve the record for the node with the given ID, including the name of
 * its upstream node (obtained via a LEFT JOIN on repmgr.nodes, so the
 * upstream name column is NULL if there is no matching upstream record).
 *
 * The query is executed by _get_node_record(), whose status is returned
 * unchanged.
 */
RecordStatus
get_node_record_with_upstream(PGconn *conn, int node_id, t_node_info *node_info)
{
	PQExpBufferData query;
	RecordStatus result;

	initPQExpBuffer(&query);

	appendPQExpBuffer(&query,
					  " SELECT n.node_id, n.type, n.upstream_node_id, n.node_name, n.conninfo, n.repluser, "
					  " n.slot_name, n.location, n.priority, n.active, n.config_file, un.node_name AS upstream_node_name "
					  " FROM repmgr.nodes n "
					  " LEFT JOIN repmgr.nodes un "
					  " ON un.node_id = n.upstream_node_id"
					  " WHERE n.node_id = %i",
					  node_id);

	/* label debug output with this function's name, not get_node_record() */
	log_verbose(LOG_DEBUG, "get_node_record_with_upstream():\n %s", query.data);

	result = _get_node_record(conn, query.data, node_info);
	termPQExpBuffer(&query);

	if (result == RECORD_NOT_FOUND)
	{
		log_verbose(LOG_DEBUG, "get_node_record_with_upstream(): no record found for node %i", node_id);
	}

	return result;
}
RecordStatus RecordStatus
get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_info) get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_info)
{ {
@@ -2140,7 +2135,7 @@ get_all_node_records_with_upstream(PGconn *conn, NodeInfoList *node_list)
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" SELECT n.node_id, n.type, n.upstream_node_id, n.node_name, n.conninfo, n.repluser, " " SELECT n.node_id, n.type, n.upstream_node_id, n.node_name, n.conninfo, n.repluser, "
" n.slot_name, n.location, n.priority, n.active, un.node_name AS upstream_node_name " " n.slot_name, n.location, n.priority, n.active, n.config_file, un.node_name AS upstream_node_name "
" FROM repmgr.nodes n " " FROM repmgr.nodes n "
" LEFT JOIN repmgr.nodes un " " LEFT JOIN repmgr.nodes un "
" ON un.node_id = n.upstream_node_id" " ON un.node_id = n.upstream_node_id"
@@ -2170,7 +2165,7 @@ get_all_node_records_with_upstream(PGconn *conn, NodeInfoList *node_list)
bool bool
get_downsteam_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoList *node_list) get_downstream_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoList *node_list)
{ {
PQExpBufferData query; PQExpBufferData query;
PGresult *res = NULL; PGresult *res = NULL;
@@ -2181,9 +2176,10 @@ get_downsteam_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoLi
" SELECT " REPMGR_NODES_COLUMNS " SELECT " REPMGR_NODES_COLUMNS
" FROM repmgr.nodes n " " FROM repmgr.nodes n "
"LEFT JOIN pg_catalog.pg_replication_slots rs " "LEFT JOIN pg_catalog.pg_replication_slots rs "
" ON rs.slot_name = n.node_name " " ON rs.slot_name = n.slot_name "
" WHERE rs.slot_name IS NULL " " WHERE n.slot_name IS NOT NULL"
" AND n.node_id != %i ", " AND rs.slot_name IS NULL "
" AND n.upstream_node_id = %i ",
this_node_id); this_node_id);
log_verbose(LOG_DEBUG, "get_all_node_records_with_missing_slot():\n%s", query.data); log_verbose(LOG_DEBUG, "get_all_node_records_with_missing_slot():\n%s", query.data);
@@ -2347,8 +2343,7 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active)
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer( appendPQExpBuffer(&query,
&query,
"UPDATE repmgr.nodes SET active = %s " "UPDATE repmgr.nodes SET active = %s "
" WHERE node_id = %i", " WHERE node_id = %i",
active == true ? "TRUE" : "FALSE", active == true ? "TRUE" : "FALSE",
@@ -2373,6 +2368,40 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active)
} }
/*
 * Mark the record of the specified node as an active standby, i.e. set
 * "type" to 'standby' and "active" to TRUE.
 *
 * Returns true if the UPDATE command succeeded, false (after logging the
 * connection's error message) otherwise.
 */
bool
update_node_record_set_active_standby(PGconn *conn, int this_node_id)
{
	PQExpBufferData sql;
	PGresult   *update_res = NULL;
	bool		updated = true;

	initPQExpBuffer(&sql);

	appendPQExpBuffer(&sql,
					  "UPDATE repmgr.nodes "
					  " SET type = 'standby', "
					  " active = TRUE "
					  " WHERE node_id = %i",
					  this_node_id);

	log_verbose(LOG_DEBUG, "update_node_record_set_active_standby():\n %s", sql.data);

	update_res = PQexec(conn, sql.data);

	/* the query text is no longer needed once it has been sent */
	termPQExpBuffer(&sql);

	if (PQresultStatus(update_res) != PGRES_COMMAND_OK)
	{
		log_error(_("unable to update node record:\n %s"),
				  PQerrorMessage(conn));
		updated = false;
	}

	PQclear(update_res);

	return updated;
}
bool bool
update_node_record_set_primary(PGconn *conn, int this_node_id) update_node_record_set_primary(PGconn *conn, int this_node_id)
{ {
@@ -2390,7 +2419,9 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
" UPDATE repmgr.nodes " " UPDATE repmgr.nodes "
" SET active = FALSE " " SET active = FALSE "
" WHERE type = 'primary' " " WHERE type = 'primary' "
" AND active IS TRUE "); " AND active IS TRUE "
" AND node_id != %i ",
this_node_id);
res = PQexec(conn, query.data); res = PQexec(conn, query.data);
termPQExpBuffer(&query); termPQExpBuffer(&query);
@@ -2412,7 +2443,8 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" UPDATE repmgr.nodes" " UPDATE repmgr.nodes"
" SET type = 'primary', " " SET type = 'primary', "
" upstream_node_id = NULL " " upstream_node_id = NULL, "
" active = TRUE "
" WHERE node_id = %i ", " WHERE node_id = %i ",
this_node_id); this_node_id);
@@ -2575,9 +2607,11 @@ witness_copy_node_records(PGconn *primary_conn, PGconn *witness_conn)
log_error(_("unable to defer constraints:\n %s"), log_error(_("unable to defer constraints:\n %s"),
PQerrorMessage(witness_conn)); PQerrorMessage(witness_conn));
rollback_transaction(witness_conn); rollback_transaction(witness_conn);
PQclear(res);
return false; return false;
} }
PQclear(res);
/* truncate existing records */ /* truncate existing records */
@@ -2598,6 +2632,8 @@ witness_copy_node_records(PGconn *primary_conn, PGconn *witness_conn)
/* and done */ /* and done */
commit_transaction(witness_conn); commit_transaction(witness_conn);
clear_node_info_list(&nodes);
return true; return true;
} }
@@ -2612,7 +2648,7 @@ delete_node_record(PGconn *conn, int node)
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"DELETE FROM repmgr.nodes " "DELETE FROM repmgr.nodes "
" WHERE node_id = %d", " WHERE node_id = %i",
node); node);
log_verbose(LOG_DEBUG, "delete_node_record():\n %s", query.data); log_verbose(LOG_DEBUG, "delete_node_record():\n %s", query.data);
@@ -2682,6 +2718,7 @@ update_node_record_slot_name(PGconn *primary_conn, int node_id, char *slot_name)
return true; return true;
} }
void void
get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *node_info) get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *node_info)
{ {
@@ -2696,14 +2733,14 @@ get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *no
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" SELECT current_setting('max_wal_senders')::INT AS max_wal_senders, " " SELECT pg_catalog.current_setting('max_wal_senders')::INT AS max_wal_senders, "
" (SELECT COUNT(*) FROM pg_catalog.pg_stat_replication) AS attached_wal_receivers, "); " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_stat_replication) AS attached_wal_receivers, ");
/* no replication slots in PostgreSQL 9.3 */ /* no replication slots in PostgreSQL 9.3 */
if (server_version_num < 90400) if (server_version_num < 90400)
{ {
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" 0 AS max_replication_slots, " " 0 AS max_replication_slots, "
" 0 AS total_replication_slots, " " 0 AS total_replication_slots, "
" 0 AS active_replication_slots, " " 0 AS active_replication_slots, "
" 0 AS inactive_replication_slots, "); " 0 AS inactive_replication_slots, ");
@@ -2712,16 +2749,16 @@ get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *no
{ {
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" current_setting('max_replication_slots')::INT AS max_replication_slots, " " current_setting('max_replication_slots')::INT AS max_replication_slots, "
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, " " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, "
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE) AS active_replication_slots, " " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE) AS active_replication_slots, "
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE) AS inactive_replication_slots, "); " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE) AS inactive_replication_slots, ");
} }
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" pg_catalog.pg_is_in_recovery() AS in_recovery"); " pg_catalog.pg_is_in_recovery() AS in_recovery");
log_verbose(LOG_DEBUG, "get_node_replication_stats():\n%s", query.data);
res = PQexec(conn, query.data); res = PQexec(conn, query.data);
termPQExpBuffer(&query); termPQExpBuffer(&query);
@@ -2758,7 +2795,7 @@ is_downstream_node_attached(PGconn *conn, char *node_name)
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" SELECT COUNT(*) FROM pg_catalog.pg_stat_replication " " SELECT pg_catalog.count(*) FROM pg_catalog.pg_stat_replication "
" WHERE application_name = '%s'", " WHERE application_name = '%s'",
node_name); node_name);
res = PQexec(conn, query.data); res = PQexec(conn, query.data);
@@ -2848,21 +2885,21 @@ get_datadir_configuration_files(PGconn *conn, KeyValueList *list)
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"WITH files AS ( " "WITH files AS ( "
" WITH dd AS ( " " WITH dd AS ( "
" SELECT setting " " SELECT setting "
" FROM pg_catalog.pg_settings " " FROM pg_catalog.pg_settings "
" WHERE name = 'data_directory') " " WHERE name = 'data_directory') "
" SELECT distinct(sourcefile) AS config_file" " SELECT distinct(sourcefile) AS config_file"
" FROM dd, pg_catalog.pg_settings ps " " FROM dd, pg_catalog.pg_settings ps "
" WHERE ps.sourcefile IS NOT NULL " " WHERE ps.sourcefile IS NOT NULL "
" AND ps.sourcefile ~ ('^' || dd.setting) " " AND ps.sourcefile ~ ('^' || dd.setting) "
" UNION " " UNION "
" SELECT ps.setting AS config_file" " SELECT ps.setting AS config_file"
" FROM dd, pg_catalog.pg_settings ps " " FROM dd, pg_catalog.pg_settings ps "
" WHERE ps.name IN ( 'config_file', 'hba_file', 'ident_file') " " WHERE ps.name IN ('config_file', 'hba_file', 'ident_file') "
" AND ps.setting ~ ('^' || dd.setting) " " AND ps.setting ~ ('^' || dd.setting) "
") " ") "
" SELECT config_file, " " SELECT config_file, "
" regexp_replace(config_file, '^.*\\/','') AS filename " " pg_catalog.regexp_replace(config_file, '^.*\\/','') AS filename "
" FROM files " " FROM files "
"ORDER BY config_file"); "ORDER BY config_file");
@@ -2955,7 +2992,7 @@ get_configuration_file_locations(PGconn *conn, t_configfile_list *list)
" WHERE name = 'data_directory' " " WHERE name = 'data_directory' "
" ) " " ) "
" SELECT ps.setting, " " SELECT ps.setting, "
" regexp_replace(setting, '^.*\\/', '') AS filename, " " pg_catalog.regexp_replace(setting, '^.*\\/', '') AS filename, "
" ps.setting ~ ('^' || dd.data_directory) AS in_data_dir " " ps.setting ~ ('^' || dd.data_directory) AS in_data_dir "
" FROM dd, pg_catalog.pg_settings ps " " FROM dd, pg_catalog.pg_settings ps "
" WHERE ps.name IN ('hba_file', 'ident_file') " " WHERE ps.name IN ('hba_file', 'ident_file') "
@@ -3105,6 +3142,8 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
char event_timestamp[MAXLEN] = ""; char event_timestamp[MAXLEN] = "";
bool success = true; bool success = true;
log_verbose(LOG_DEBUG, "_create_event(): event is \"%s\" for node %i", event, node_id);
/* /*
* Only attempt to write a record if a connection handle was provided. * Only attempt to write a record if a connection handle was provided.
* Also check that the repmgr schema has been properly initialised - if * Also check that the repmgr schema has been properly initialised - if
@@ -3372,7 +3411,7 @@ get_event_records(PGconn *conn, int node_id, const char *node_name, const char *
/* LEFT JOIN used here as a node record may have been removed */ /* LEFT JOIN used here as a node record may have been removed */
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" SELECT e.node_id, n.node_name, e.event, e.successful, " " SELECT e.node_id, n.node_name, e.event, e.successful, "
" TO_CHAR(e.event_timestamp, 'YYYY-MM-DD HH24:MI:SS') AS timestamp, " " pg_catalog.to_char(e.event_timestamp, 'YYYY-MM-DD HH24:MI:SS') AS timestamp, "
" e.details " " e.details "
" FROM repmgr.events e " " FROM repmgr.events e "
"LEFT JOIN repmgr.nodes n ON e.node_id = n.node_id "); "LEFT JOIN repmgr.nodes n ON e.node_id = n.node_id ");
@@ -3461,6 +3500,9 @@ create_replication_slot(PGconn *conn, char *slot_name, int server_version_num, P
PGresult *res = NULL; PGresult *res = NULL;
t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER; t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER;
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
server_version_num = get_server_version(conn, NULL);
/* /*
* Check whether slot exists already; if it exists and is active, that * Check whether slot exists already; if it exists and is active, that
* means another active standby is using it, which creates an error * means another active standby is using it, which creates an error
@@ -3622,7 +3664,7 @@ get_free_replication_slots(PGconn *conn)
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" SELECT pg_catalog.current_setting('max_replication_slots')::INT - " " SELECT pg_catalog.current_setting('max_replication_slots')::INT - "
" COUNT(*) AS free_slots" " pg_catalog.count(*) AS free_slots"
" FROM pg_catalog.pg_replication_slots"); " FROM pg_catalog.pg_replication_slots");
res = PQexec(conn, query.data); res = PQexec(conn, query.data);
@@ -3817,6 +3859,45 @@ is_server_available(const char *conninfo)
} }
/*
 * Ping the server described by the provided parameter list.
 *
 * Returns true only if the ping status is PQPING_OK.
 */
bool
is_server_available_params(t_conninfo_param_list *param_list)
{
	const char **ping_keywords = (const char **) param_list->keywords;
	const char **ping_values = (const char **) param_list->values;
	PGPing		ping_result = PQpingParams(ping_keywords, ping_values, false);

	/* deparsing the param_list adds overhead, so only do it if needed */
	if (log_level == LOG_DEBUG)
	{
		char	   *deparsed = param_list_to_string(param_list);

		log_verbose(LOG_DEBUG, "ping status for %s is %i", deparsed, (int)ping_result);
		pfree(deparsed);
	}

	return ping_result == PQPING_OK;
}
/*
 * Execute a trivial throw-away query so that a connection handle does not
 * go stale. The outcome is only reported in the debug log; nothing is
 * returned to the caller.
 */
void
connection_ping(PGconn *conn)
{
	PGresult   *ping_res = PQexec(conn, "SELECT TRUE");

	log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(ping_res)));

	PQclear(ping_res);
}
/* ==================== */ /* ==================== */
/* monitoring functions */ /* monitoring functions */
/* ==================== */ /* ==================== */
@@ -3901,9 +3982,9 @@ get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_histor
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"SELECT COUNT(*) " "SELECT pg_catalog.count(*) "
" FROM repmgr.monitoring_history " " FROM repmgr.monitoring_history "
" WHERE age(now(), last_monitor_time) >= '%d days'::interval", " WHERE pg_catalog.age(pg_catalog.now(), last_monitor_time) >= '%d days'::interval",
keep_history); keep_history);
res = PQexec(primary_conn, query.data); res = PQexec(primary_conn, query.data);
@@ -3942,7 +4023,7 @@ delete_monitoring_records(PGconn *primary_conn, int keep_history)
{ {
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"DELETE FROM repmgr.monitoring_history " "DELETE FROM repmgr.monitoring_history "
" WHERE age(now(), last_monitor_time) >= '%d days'::interval ", " WHERE pg_catalog.age(pg_catalog.now(), last_monitor_time) >= '%d days'::interval ",
keep_history); keep_history);
} }
else else
@@ -4240,7 +4321,7 @@ _is_bdr_db(PGconn *conn, PQExpBufferData *output, bool quiet)
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"SELECT COUNT(*) FROM pg_catalog.pg_extension WHERE extname='bdr'"); "SELECT pg_catalog.count(*) FROM pg_catalog.pg_extension WHERE extname='bdr'");
res = PQexec(conn, query.data); res = PQexec(conn, query.data);
termPQExpBuffer(&query); termPQExpBuffer(&query);
@@ -4353,7 +4434,7 @@ is_bdr_repmgr(PGconn *conn)
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"SELECT COUNT(*)" "SELECT pg_catalog.count(*)"
" FROM repmgr.nodes n" " FROM repmgr.nodes n"
" WHERE n.type != 'bdr' "); " WHERE n.type != 'bdr' ");
@@ -4384,8 +4465,8 @@ is_table_in_bdr_replication_set(PGconn *conn, const char *tablename, const char
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
"SELECT COUNT(*) " "SELECT pg_catalog.count(*) "
" FROM UNNEST(bdr.table_get_replication_sets('repmgr.%s')) AS repset " " FROM pg_catalog.unnest(bdr.table_get_replication_sets('repmgr.%s')) AS repset "
" WHERE repset='%s' ", " WHERE repset='%s' ",
tablename, tablename,
set); set);
@@ -4763,8 +4844,8 @@ bdr_node_has_repmgr_set(PGconn *conn, const char *node_name)
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBuffer(&query, appendPQExpBuffer(&query,
" SELECT COUNT(*) " " SELECT pg_catalog.count(*) "
" FROM UNNEST(bdr.connection_get_replication_sets('%s') AS repset " " FROM pg_catalog.unnest(bdr.connection_get_replication_sets('%s') AS repset "
" WHERE repset = 'repmgr'", " WHERE repset = 'repmgr'",
node_name); node_name);
@@ -4799,7 +4880,7 @@ bdr_node_set_repmgr_set(PGconn *conn, const char *node_name)
" SELECT bdr.connection_set_replication_sets( " " SELECT bdr.connection_set_replication_sets( "
" ARRAY( " " ARRAY( "
" SELECT repset::TEXT " " SELECT repset::TEXT "
" FROM UNNEST(bdr.connection_get_replication_sets('%s')) AS repset " " FROM pg_catalog.unnest(bdr.connection_get_replication_sets('%s')) AS repset "
" UNION " " UNION "
" SELECT 'repmgr'::TEXT " " SELECT 'repmgr'::TEXT "
" ), " " ), "

View File

@@ -343,9 +343,6 @@ bool atobool(const char *value);
PGconn *establish_db_connection(const char *conninfo, PGconn *establish_db_connection(const char *conninfo,
const bool exit_on_error); const bool exit_on_error);
PGconn *establish_db_connection_quiet(const char *conninfo); PGconn *establish_db_connection_quiet(const char *conninfo);
PGconn *establish_db_connection_as_user(const char *conninfo,
const char *user,
const bool exit_on_error);
PGconn *establish_db_connection_by_params(t_conninfo_param_list *param_list, PGconn *establish_db_connection_by_params(t_conninfo_param_list *param_list,
const bool exit_on_error); const bool exit_on_error);
@@ -356,10 +353,11 @@ PGconn *get_primary_connection(PGconn *standby_conn, int *primary_id, char *p
PGconn *get_primary_connection_quiet(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out); PGconn *get_primary_connection_quiet(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
bool is_superuser_connection(PGconn *conn, t_connection_user *userinfo); bool is_superuser_connection(PGconn *conn, t_connection_user *userinfo);
void close_connection(PGconn **conn);
/* conninfo manipulation functions */ /* conninfo manipulation functions */
bool get_conninfo_value(const char *conninfo, const char *keyword, char *output); bool get_conninfo_value(const char *conninfo, const char *keyword, char *output);
bool get_conninfo_default_value(const char *param, char *output, int maxlen);
void initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults); void initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults);
void free_conninfo_params(t_conninfo_param_list *param_list); void free_conninfo_params(t_conninfo_param_list *param_list);
void copy_conninfo_params(t_conninfo_param_list *dest_list, t_conninfo_param_list *source_list); void copy_conninfo_params(t_conninfo_param_list *dest_list, t_conninfo_param_list *source_list);
@@ -367,10 +365,11 @@ void conn_to_param_list(PGconn *conn, t_conninfo_param_list *param_list);
void param_set(t_conninfo_param_list *param_list, const char *param, const char *value); void param_set(t_conninfo_param_list *param_list, const char *param, const char *value);
void param_set_ine(t_conninfo_param_list *param_list, const char *param, const char *value); void param_set_ine(t_conninfo_param_list *param_list, const char *param, const char *value);
char *param_get(t_conninfo_param_list *param_list, const char *param); char *param_get(t_conninfo_param_list *param_list, const char *param);
bool parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char *errmsg, bool ignore_local_params); bool parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char **errmsg, bool ignore_local_params);
char *param_list_to_string(t_conninfo_param_list *param_list); char *param_list_to_string(t_conninfo_param_list *param_list);
bool has_passfile(void); bool has_passfile(void);
/* transaction functions */ /* transaction functions */
bool begin_transaction(PGconn *conn); bool begin_transaction(PGconn *conn);
bool commit_transaction(PGconn *conn); bool commit_transaction(PGconn *conn);
@@ -389,7 +388,6 @@ bool get_cluster_size(PGconn *conn, char *size);
int get_server_version(PGconn *conn, char *server_version); int get_server_version(PGconn *conn, char *server_version);
RecoveryType get_recovery_type(PGconn *conn); RecoveryType get_recovery_type(PGconn *conn);
int get_primary_node_id(PGconn *conn); int get_primary_node_id(PGconn *conn);
bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
int get_ready_archive_files(PGconn *conn, const char *data_directory); int get_ready_archive_files(PGconn *conn, const char *data_directory);
bool identify_system(PGconn *repl_conn, t_system_identification *identification); bool identify_system(PGconn *repl_conn, t_system_identification *identification);
bool repmgrd_set_local_node_id(PGconn *conn, int local_node_id); bool repmgrd_set_local_node_id(PGconn *conn, int local_node_id);
@@ -408,6 +406,8 @@ t_server_type parse_node_type(const char *type);
const char *get_node_type_string(t_server_type type); const char *get_node_type_string(t_server_type type);
RecordStatus get_node_record(PGconn *conn, int node_id, t_node_info *node_info); RecordStatus get_node_record(PGconn *conn, int node_id, t_node_info *node_info);
RecordStatus get_node_record_with_upstream(PGconn *conn, int node_id, t_node_info *node_info);
RecordStatus get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_info); RecordStatus get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_info);
t_node_info *get_node_record_pointer(PGconn *conn, int node_id); t_node_info *get_node_record_pointer(PGconn *conn, int node_id);
@@ -419,7 +419,7 @@ void get_downstream_node_records(PGconn *conn, int node_id, NodeInfoList *nodes
void get_active_sibling_node_records(PGconn *conn, int node_id, int upstream_node_id, NodeInfoList *node_list); void get_active_sibling_node_records(PGconn *conn, int node_id, int upstream_node_id, NodeInfoList *node_list);
void get_node_records_by_priority(PGconn *conn, NodeInfoList *node_list); void get_node_records_by_priority(PGconn *conn, NodeInfoList *node_list);
bool get_all_node_records_with_upstream(PGconn *conn, NodeInfoList *node_list); bool get_all_node_records_with_upstream(PGconn *conn, NodeInfoList *node_list);
bool get_downsteam_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoList *noede_list); bool get_downstream_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoList *noede_list);
bool create_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_info); bool create_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_info);
bool update_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_info); bool update_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_info);
@@ -428,6 +428,7 @@ bool truncate_node_records(PGconn *conn);
bool update_node_record_set_active(PGconn *conn, int this_node_id, bool active); bool update_node_record_set_active(PGconn *conn, int this_node_id, bool active);
bool update_node_record_set_primary(PGconn *conn, int this_node_id); bool update_node_record_set_primary(PGconn *conn, int this_node_id);
bool update_node_record_set_active_standby(PGconn *conn, int this_node_id);
bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id); bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
bool update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active); bool update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active);
bool update_node_record_conn_priority(PGconn *conn, t_configuration_options *options); bool update_node_record_conn_priority(PGconn *conn, t_configuration_options *options);
@@ -465,6 +466,8 @@ int wait_connection_availability(PGconn *conn, long long timeout);
/* node availability functions */ /* node availability functions */
bool is_server_available(const char *conninfo); bool is_server_available(const char *conninfo);
bool is_server_available_params(t_conninfo_param_list *param_list);
void connection_ping(PGconn *conn);
/* monitoring functions */ /* monitoring functions */
void void

View File

@@ -24,8 +24,9 @@
series will no longer be actively maintained. series will no longer be actively maintained.
</para> </para>
<para> <para>
repmgr 2.x supports PostgreSQL 9.0 ~ 9.3. While it is compatible &repmgr; 2.x supports PostgreSQL 9.0 ~ 9.3. While it is compatible
with PostgreSQL 9.3, we recommend using repmgr 4.x. with PostgreSQL 9.3, we recommend using repmgr 4.x. &repmgr; 2.x is
no longer maintained.
</para> </para>
</sect2> </sect2>
@@ -35,7 +36,7 @@
Replication slots, introduced in PostgreSQL 9.4, ensure that the Replication slots, introduced in PostgreSQL 9.4, ensure that the
primary server will retain WAL files until they have been consumed primary server will retain WAL files until they have been consumed
by all standby servers. This makes WAL file management much easier, by all standby servers. This makes WAL file management much easier,
and if used `repmgr` will no longer insist on a fixed minimum number and if used &repmgr; will no longer insist on a fixed minimum number
(default: 5000) of WAL files being retained. (default: 5000) of WAL files being retained.
</para> </para>
<para> <para>
@@ -69,12 +70,50 @@
in a streaming replication cluster. in a streaming replication cluster.
</para> </para>
</sect2> </sect2>
<sect2 id="faq-upgrades" xreflabel="Upgrading PostgreSQL with repmgr">
<title>Can &repmgr; assist with upgrading a PostgreSQL cluster?</title>
<para>
For <emphasis>minor</emphasis> version upgrades, e.g. from 9.6.7 to 9.6.8, a common
approach is to upgrade a standby to the latest version, perform a
<link linkend="performing-switchover">switchover</link> promoting it to a primary,
then upgrade the former primary.
</para>
<para>
For <emphasis>major</emphasis> version upgrades (e.g. from PostgreSQL 9.6 to PostgreSQL 10),
the traditional approach is to "reseed" a cluster by upgrading a single
node with <ulink url="https://www.postgresql.org/docs/current/static/pgupgrade.html">pg_upgrade</ulink>
and recloning standbys from this.
</para>
<para>
To minimize downtime during major upgrades, for more recent PostgreSQL
versions (PostgreSQL 9.4 and later),
<ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
can be used to set up a parallel cluster using the newer PostgreSQL version,
which can be kept in sync with the existing production cluster until the
new cluster is ready to be put into production.
</para>
</sect2>
<sect2 id="faq-libdir-repmgr-error">
<title>What does this error mean: <literal>ERROR: could not access file "$libdir/repmgr"</literal>?</title>
<para>
It means the &repmgr; extension code is not installed in the
PostgreSQL application directory. This typically happens when using PostgreSQL
packages provided by a third-party vendor, which often have different
filesystem layouts.
</para>
<para>
We recommend using PostgreSQL packages provided by the community or 2ndQuadrant; if this
is not possible, contact your vendor for assistance.
</para>
</sect2>
</sect1> </sect1>
<sect1 id="faq-repmgr" xreflabel="repmgr"> <sect1 id="faq-repmgr" xreflabel="repmgr">
<title><command>repmgr</command></title> <title><command>repmgr</command></title>
<sect2 id="faq-register-existing-node" xreflabel=""> <sect2 id="faq-register-existing-node" xreflabel="registering an existing node">
<title>Can I register an existing PostgreSQL server with repmgr?</title> <title>Can I register an existing PostgreSQL server with repmgr?</title>
<para> <para>
Yes, any existing PostgreSQL server which is part of the same replication Yes, any existing PostgreSQL server which is part of the same replication
@@ -83,6 +122,26 @@
</para> </para>
</sect2> </sect2>
<sect2 id="faq-repmgr-clone-other-source" >
<title>Can I use a standby not cloned by &repmgr; as a &repmgr; node?</title>
<para>
For a standby which has been manually cloned or recovered from an external
backup manager such as Barman, the command
<command><link linkend="repmgr-standby-clone">repmgr standby clone --recovery-conf-only</link></command>
can be used to create the correct <filename>recovery.conf</filename> file for
use with &repmgr; (and will create a replication slot if required). Once this has been done,
<link linkend="repmgr-standby-register">register the node</link> as usual.
</para>
</sect2>
<sect2 id="faq-repmgr-recovery-conf" >
<title>What does &repmgr; write in <filename>recovery.conf</filename>, and what options can be set there?</title>
<para>
See section <link linkend="repmgr-standby-clone-recovery-conf">Customising recovery.conf</link>.
</para>
</sect2>
<sect2 id="faq-repmgr-failed-primary-standby" xreflabel="Reintegrate a failed primary as a standby"> <sect2 id="faq-repmgr-failed-primary-standby" xreflabel="Reintegrate a failed primary as a standby">
<title>How can a failed primary be re-added as a standby?</title> <title>How can a failed primary be re-added as a standby?</title>
<para> <para>
@@ -91,19 +150,23 @@
needs to be re-registered as a standby. needs to be re-registered as a standby.
</para> </para>
<para> <para>
In PostgreSQL 9.5 and later, it's possible to use <command>pg_rewind</command> It's possible to use <command>pg_rewind</command> to re-synchronise the existing data
to re-synchronise the existing data directory, which will usually be much directory, which will usually be much
faster than re-cloning the server. However <command>pg_rewind</command> can only faster than re-cloning the server. However <command>pg_rewind</command> can only
be used if PostgreSQL either has <varname>wal_log_hints</varname> enabled, or be used if PostgreSQL either has <varname>wal_log_hints</varname> enabled, or
data checksums were enabled when the cluster was initialized. data checksums were enabled when the cluster was initialized.
</para> </para>
<para> <para>
&repmgr; provides the command <command>repmgr node rejoin</command> which can Note that <command>pg_rewind</command> is available as part of the core PostgreSQL
optionally execute <command>pg_rewind</command>; see the <xref linkend="repmgr-node-rejoin"> distribution from PostgreSQL 9.5, and as a third-party utility for PostgreSQL 9.3 and 9.4.
documentation for details.
</para> </para>
<para> <para>
If <command>pg_rewind</command> cannot be used, then the data directory will have &repmgr; provides the command <command>repmgr node rejoin</command> which can
optionally execute <command>pg_rewind</command>; see the <xref linkend="repmgr-node-rejoin">
documentation for details, in particular the section <xref linkend="repmgr-node-rejoin-pg-rewind">.
</para>
<para>
If <command>pg_rewind</command> cannot be used, then the data directory will need
to be re-cloned from scratch. to be re-cloned from scratch.
</para> </para>
@@ -180,6 +243,9 @@
</para> </para>
</sect2> </sect2>
</sect1> </sect1>
<sect1 id="faq-repmgrd" xreflabel="repmgrd"> <sect1 id="faq-repmgrd" xreflabel="repmgrd">

View File

@@ -1,48 +1,119 @@
<appendix id="appendix-packages" xreflabel="Package details"> <appendix id="appendix-packages" xreflabel="Package details">
<indexterm> <indexterm>
<primary>packages</primary> <primary>packages</primary>
</indexterm> </indexterm>
<title>&repmgr; package details</title> <title>&repmgr; package details</title>
<para>
This section provides technical details about various &repmgr; binary
packages, such as location of the installed binaries and
configuration files.
</para>
<sect1 id="packages-centos" xreflabel="CentOS packages">
<title>CentOS, RHEL, Scientific Linux etc.</title>
<para> <para>
Currently packages are provided for versions 6.x and 7.x of CentOS et al. This section provides technical details about various &repmgr; binary
packages, such as location of the installed binaries and
configuration files.
</para> </para>
<note> <sect1 id="packages-centos" xreflabel="CentOS packages">
<title>CentOS Packages</title>
<indexterm>
<primary>packages</primary>
<secondary>CentOS packages</secondary>
</indexterm>
<para> <para>
For PostgreSQL 9.6 and lower, the CentOS packages use a mixture of <literal>9.6</literal> Currently, &repmgr; RPM packages are provided for versions 6.x and 7.x of CentOS. These should also
and <literal>96</literal> in various places to designate the major version; work on matching versions of Red Hat Enterprise Linux, Scientific Linux and Oracle Enterprise Linux;
from PostgreSQL 10, the first part of the version number (e.g. <literal>10</literal>) is together with CentOS, these are the same RedHat-based distributions for which the main community project
the major version, so there is more consistency in file/path/package naming. (PGDG) provides packages (see the <ulink url="https://yum.postgresql.org/">PostgreSQL RPM Building Project</ulink>
page for details).
</para> </para>
</note>
<para>
Note these &repmgr; RPM packages are not designed to work with SuSE/OpenSuSE.
</para>
<note>
<para>
&repmgr; packages are designed to be compatible with community-provided PostgreSQL packages.
They may not work with vendor-specific packages such as those provided by RedHat for RHEL
customers, as the filesystem layout may be different to the community RPMs.
Please contact your support vendor for assistance.
</para>
</note>
<sect2 id="packages-centos-repositories">
<title>CentOS repositories</title>
<para>
&repmgr; packages are available from the public 2ndQuadrant repository, and also the
PostgreSQL community repository. The 2ndQuadrant repository is updated immediately
after each
&repmgr; release.
</para>
<table id="centos-2ndquadrant-repository">
<title>2ndQuadrant public repository</title>
<tgroup cols="2">
<tbody>
<row>
<entry>Repository URL:</entry>
<entry><ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink></entry>
</row>
<row>
<entry>Repository documentation:</entry>
<entry><ulink url="https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
</row>
</tbody>
</tgroup>
</table>
<table id="centos-pgdg-repository">
<title>PostgreSQL community repository (PGDG)</title>
<tgroup cols="2">
<tbody>
<row>
<entry>Repository URL:</entry>
<entry><ulink url="https://yum.postgresql.org/repopackages.php">https://yum.postgresql.org/repopackages.php</ulink></entry>
</row>
<row>
<entry>Repository documentation:</entry>
<entry><ulink url="https://yum.postgresql.org/">https://yum.postgresql.org/</ulink></entry>
</row>
</tbody>
</tgroup>
</table>
</sect2>
<sect2 id="packages-centos-details">
<title>CentOS package details</title>
<para>
The two tables below list relevant information, paths, commands etc. for the &repmgr; packages on
CentOS 7 (with systemd) and CentOS 6 (no systemd). Substitute the appropriate PostgreSQL major
version number for your installation.
</para>
<note>
<para>
For PostgreSQL 9.6 and lower, the CentOS packages use a mixture of <literal>9.6</literal>
and <literal>96</literal> in various places to designate the major version; e.g. the
package name is <literal>repmgr96</literal>, but the data directory is
<filename>/var/lib/pgsql/9.6/data</filename>.
</para>
<para>
From PostgreSQL 10, the first part of the version number (e.g. <literal>10</literal>) is
the major version, so there is more consistency in file/path/package naming
(package <literal>repmgr10</literal>, data directory <filename>/var/lib/pgsql/10/data</filename>).
</para>
</note>
<table id="centos-7-packages"> <table id="centos-7-packages">
<title>CentOS 7 packages</title> <title>CentOS 7 packages</title>
<tgroup cols="2"> <tgroup cols="2">
<tbody> <tbody>
<row>
<entry>Repository URL:</entry>
<entry><ulink url="https://yum.postgresql.org/repopackages.php">https://yum.postgresql.org/repopackages.php</ulink></entry>
</row>
<row>
<entry>Repository documentation:</entry>
<entry><ulink url="https://yum.postgresql.org/">https://yum.postgresql.org/</ulink></entry>
</row>
<row> <row>
<entry>Package name example:</entry> <entry>Package name example:</entry>
<entry><filename>repmgr10-4.0.0-1.rhel7.x86_64</filename></entry> <entry><filename>repmgr10-4.0.4-1.rhel7.x86_64</filename></entry>
</row> </row>
<row> <row>
@@ -52,7 +123,7 @@
<row> <row>
<entry>Installation command:</entry> <entry>Installation command:</entry>
<entry><literal>yum install -y repmgr10</literal></entry> <entry><literal>yum install repmgr10</literal></entry>
</row> </row>
<row> <row>
@@ -61,7 +132,7 @@
</row> </row>
<row> <row>
<entry>In default path:</entry> <entry>repmgr in default path:</entry>
<entry>NO</entry> <entry>NO</entry>
</row> </row>
@@ -70,9 +141,14 @@
<entry><filename>/etc/repmgr/10/repmgr.conf</filename></entry> <entry><filename>/etc/repmgr/10/repmgr.conf</filename></entry>
</row> </row>
<row>
<entry>Data directory:</entry>
<entry><filename>/var/lib/pgsql/10/data</filename></entry>
</row>
<row> <row>
<entry>repmgrd service command:</entry> <entry>repmgrd service command:</entry>
<entry><literal>service repmgr10</literal></entry> <entry><command>systemctl [start|stop|restart|reload] repmgr10</command></entry>
</row> </row>
<row> <row>
@@ -82,7 +158,7 @@
<row> <row>
<entry>repmgrd log file location:</entry> <entry>repmgrd log file location:</entry>
<entry>(not specified)</entry> <entry>(not specified by package; set in <filename>repmgr.conf</filename>)</entry>
</row> </row>
</tbody> </tbody>
@@ -94,29 +170,20 @@
<tgroup cols="2"> <tgroup cols="2">
<tbody> <tbody>
<row>
<entry>Repository URL:</entry>
<entry><ulink url="https://yum.postgresql.org/repopackages.php">https://yum.postgresql.org/repopackages.php</ulink></entry>
</row>
<row>
<entry>Repository documentation:</entry>
<entry><ulink url="https://yum.postgresql.org/">https://yum.postgresql.org/</ulink></entry>
</row>
<row> <row>
<entry>Package name example:</entry> <entry>Package name example:</entry>
<entry><filename>repmgr96-4.0.0-1.rhel6.x86_64</filename></entry> <entry><filename>repmgr96-4.0.4-1.rhel6.x86_64</filename></entry>
</row> </row>
<row> <row>
<entry>Metapackage:</entry> <entry>Metapackage:</entry>
<entry>NO</entry> <entry>(none)</entry>
</row> </row>
<row> <row>
<entry>Installation command:</entry> <entry>Installation command:</entry>
<entry><literal>yum install -y repmgr96</literal></entry> <entry><literal>yum install repmgr96</literal></entry>
</row> </row>
<row> <row>
@@ -125,7 +192,7 @@
</row> </row>
<row> <row>
<entry>In default path:</entry> <entry>repmgr in default path:</entry>
<entry>NO</entry> <entry>NO</entry>
</row> </row>
@@ -134,9 +201,14 @@
<entry><filename>/etc/repmgr/9.6/repmgr.conf</filename></entry> <entry><filename>/etc/repmgr/9.6/repmgr.conf</filename></entry>
</row> </row>
<row>
<entry>Data directory:</entry>
<entry><filename>/var/lib/pgsql/9.6/data</filename></entry>
</row>
<row> <row>
<entry>repmgrd service command:</entry> <entry>repmgrd service command:</entry>
<entry>service repmgr-9.6</entry> <entry><literal>service repmgr-9.6 [start|stop|restart|reload]</literal></entry>
</row> </row>
<row> <row>
@@ -153,6 +225,143 @@
</tgroup> </tgroup>
</table> </table>
</sect2>
</sect1> </sect1>
<sect1 id="packages-debian-ubuntu" xreflabel="Debian/Ubuntu packages">
<title>Debian/Ubuntu Packages</title>
<indexterm>
<primary>packages</primary>
<secondary>Debian/Ubuntu packages</secondary>
</indexterm>
<para>
&repmgr; <literal>.deb</literal> packages are provided via the
PostgreSQL Community APT repository, and are available for each community-supported
PostgreSQL version, currently supported Debian releases, and currently supported
Ubuntu LTS releases.
</para>
<sect2 id="packages-apt-repository">
<title>APT repository</title>
<para>
&repmgr; packages are available from the PostgreSQL Community APT repository,
which is updated immediately after each &repmgr; release.
</para>
<table id="apt-repository">
<title>PostgreSQL Community APT repository (PGDG)</title>
<tgroup cols="2">
<tbody>
<row>
<entry>Repository URL:</entry>
<entry><ulink url="http://apt.postgresql.org/">http://apt.postgresql.org/</ulink></entry>
</row>
<row>
<entry>Repository documentation:</entry>
<entry><ulink url="https://wiki.postgresql.org/wiki/Apt">https://wiki.postgresql.org/wiki/Apt</ulink></entry>
</row>
</tbody>
</tgroup>
</table>
</sect2>
<sect2 id="packages-debian-details">
<title>Debian/Ubuntu package details</title>
<para>
The table below lists relevant information, paths, commands etc. for the &repmgr; packages on
Debian 9.x ("Stretch"). Substitute the appropriate PostgreSQL major
version number for your installation.
</para>
<para>
See also <xref linkend="repmgrd-configuration-debian-ubuntu"> for some specifics related
to configuring the <application>repmgrd</application> daemon.
</para>
<table id="debian-9-packages">
<title>Debian 9.x packages</title>
<tgroup cols="2">
<tbody>
<row>
<entry>Package name example:</entry>
<entry><filename>postgresql-10-repmgr</filename></entry>
</row>
<row>
<entry>Metapackage:</entry>
<entry><filename>repmgr-common</filename></entry>
</row>
<row>
<entry>Installation command:</entry>
<entry><literal>apt-get install postgresql-10-repmgr</literal></entry>
</row>
<row>
<entry>Binary location:</entry>
<entry><filename>/usr/lib/postgresql/10/bin</filename></entry>
</row>
<row>
<entry>repmgr in default path:</entry>
<entry>Yes (via wrapper script <filename>/usr/bin/repmgr</filename>)</entry>
</row>
<row>
<entry>Configuration file location:</entry>
<entry>(not set by package)</entry>
</row>
<row>
<entry>Data directory:</entry>
<entry><filename>/var/lib/postgresql/10/main</filename></entry>
</row>
<row>
<entry>PostgreSQL service command:</entry>
<entry><command>systemctl [start|stop|restart|reload] postgresql@10-main</command></entry>
</row>
<row>
<entry>repmgrd service command:</entry>
<entry><command>systemctl [start|stop|restart|reload] repmgrd</command></entry>
</row>
<row>
<entry>repmgrd service file location:</entry>
<entry><filename>/etc/init.d/repmgrd</filename> (defaults in: <filename>/etc/default/repmgrd</filename>)</entry>
</row>
<row>
<entry>repmgrd log file location:</entry>
<entry>(not specified by package; set in <filename>repmgr.conf</filename>)</entry>
</row>
</tbody>
</tgroup>
</table>
<note>
<para>
Instead of using the <application>systemd</application> service command directly,
it's recommended to execute <command>pg_ctlcluster</command> (as <literal>root</literal>,
either directly or via <command>sudo</command>), e.g.:
<programlisting>
<command>pg_ctlcluster 10 main [start|stop|restart|reload]</command></programlisting>
</para>
<para>
For pre-<application>systemd</application> systems, <command>pg_ctlcluster</command>
can be executed directly by the <literal>postgres</literal> user.
</para>
</note>
</sect2>
</sect1>
</appendix> </appendix>

View File

@@ -15,9 +15,393 @@
See also: <xref linkend="upgrading-repmgr"> See also: <xref linkend="upgrading-repmgr">
</para> </para>
<sect1 id="release-4.0.6">
<title>Release 4.0.6</title>
<para><emphasis>June ??, 2018</emphasis></para>
<para>
&repmgr; 4.0.6 contains a number of bug fixes and usability enhancements.
</para>
<para>
We recommend upgrading to this version as soon as possible.
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.0.5;
<application>repmgrd</application> (if running) should be restarted. See <xref linkend="upgrading-repmgr">
for more details.
</para>
<sect2>
<title>Usability enhancements</title>
<para>
<itemizedlist>
<listitem>
<para>
<command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command> and
<command><link linkend="repmgr-cluster-matrix">repmgr cluster matrix</link></command>:
return non-zero exit code if node connection issues detected (GitHub #447)
</para>
</listitem>
<listitem>
<para>
<command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
Improve handling of external configuration file copying, including consideration in
<option>--dry-run</option> check
(GitHub #443)
</para>
</listitem>
<listitem>
<para>
When using <option>--dry-run</option>, force log level to <literal>INFO</literal>
to ensure output will always be displayed
(GitHub #441)
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
<sect2>
<title>Bug fixes</title>
<para>
<itemizedlist>
<listitem>
<para>
<command><link linkend="repmgr-witness-register">repmgr witness register</link></command>:
prevent registration of a witness server with the same name as an existing node.
</para>
</listitem>
<listitem>
<para>
<command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>:
check node has actually connected to new primary before reporting success
(GitHub #444)
</para>
</listitem>
<listitem>
<para>
<command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
Don't require presence of <varname>user</varname> parameter in conninfo string
(GitHub #437)
</para>
</listitem>
<listitem>
<para>
<command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
Improve documentation of <option>--recovery-conf-only</option> mode
(GitHub #438)
</para>
</listitem>
<listitem>
<para>
<command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>:
Fix bug when parsing <option>--config-files</option> parameter
(GitHub #442)
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: ensure local node is counted as quorum member
(GitHub #439)
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
</sect1>
<sect1 id="release-4.0.5">
<title>Release 4.0.5</title>
<para><emphasis>Wed May 2, 2018</emphasis></para>
<para>
&repmgr; 4.0.5 contains a number of usability enhancements related to
<application>pg_rewind</application> usage, <filename>recovery.conf</filename>
generation and (in <application>repmgrd</application>) handling of various
corner-case situations, as well as a number of bug fixes.
</para>
<sect2>
<title>Usability enhancements</title>
<para>
<itemizedlist>
<listitem>
<para>
Various documentation improvements, with particular emphasis on
the importance of setting appropriate <link linkend="configuration-service-commands">service commands</link>
instead of relying on <application>pg_ctl</application>.
</para>
</listitem>
<listitem>
<para>
Poll demoted primary after restart as a standby during a switchover operation (GitHub #408).
</para>
</listitem>
<listitem>
<para>
Add configuration parameter <option>config_directory</option> (GitHub #424).
</para>
</listitem>
<listitem>
<para>
Add sanity check if <option>--upstream-node-id</option> not supplied when executing
<xref linkend="repmgr-standby-register"> (GitHub #395).
</para>
</listitem>
<listitem>
<para>
Enable <link linkend="repmgr-node-rejoin-pg-rewind">pg_rewind</link> to be used with
PostgreSQL 9.3/9.4 (GitHub #413).
</para>
</listitem>
<listitem>
<para>
When generating replication connection strings, set <literal>dbname=replication</literal>
if appropriate (GitHub #421).
</para>
</listitem>
<listitem>
<para>
Enable provision of <option>archive_cleanup_command</option> in <filename>recovery.conf</filename>
(GitHub #416).
</para>
</listitem>
<listitem>
<para>
Actively check for node to <link linkend="repmgr-node-rejoin">rejoin</link> cluster (GitHub #415).
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: set <literal>connect_timeout=2</literal> (if not explicitly set)
when pinging a server.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
<sect2>
<title>Bug fixes</title>
<para>
<itemizedlist>
<listitem>
<para>
Fix display of conninfo parsing error messages.
</para>
</listitem>
<listitem>
<para>
Fix minimum accepted value for <varname>degraded_monitoring_timeout</varname> (GitHub #411).
</para>
</listitem>
<listitem>
<para>
Fix superuser password handling (GitHub #400).
</para>
</listitem>
<listitem>
<para>
Fix parsing of <varname>archive_ready_critical</varname> configuration file parameter (GitHub #426).
</para>
</listitem>
<listitem>
<para>
Fix <command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command>
output (GitHub #389).
</para>
</listitem>
<listitem>
<para>
Fix memory leaks in witness code (GitHub #402).
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: handle <command>pg_ctl promote</command> timeout (GitHub #425).
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: handle failover situation with only two nodes in the primary
location, and at least one node in another location (GitHub #407).
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: prevent standby connection handle from going stale.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
</sect1>
<sect1 id="release-4.0.4">
<title>Release 4.0.4</title>
<para><emphasis>Fri Mar 9, 2018</emphasis></para>
<para>
&repmgr; 4.0.4 contains some bug fixes and a number of
usability enhancements related to logging/diagnostics,
event notifications and pre-action checks.
</para>
<para>
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.0.3;
<application>repmgrd</application> (if running) should be restarted. See <xref linkend="upgrading-repmgr">
for more details.
</para>
<note>
<para>
It is not possible to perform a switchover where the demotion candidate is
running &repmgr; 4.0.2 or lower; all nodes should be upgraded to the latest version (4.0.4).
This is due to additional checks introduced in 4.0.3 which require the presence of
4.0.3 or later versions on all nodes.
</para>
</note>
<sect2>
<title>Usability enhancements</title>
<para>
<itemizedlist>
<listitem>
<para>
add <command><link linkend="repmgr-standby-clone">repmgr standby clone --recovery-conf-only</link></command>
option to enable integration of a standby cloned from another source into a &repmgr; cluster (GitHub #382)
</para>
</listitem>
<listitem>
<para>
remove restriction on using replication slots when cloning from a Barman server (GitHub #379)
</para>
</listitem>
<listitem>
<para>
make <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
timeout values configurable (GitHub #387)
</para>
</listitem>
<listitem>
<para>
add missing options to main <literal>--help</literal> output (GitHub #391, #392)
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
<sect2>
<title>Bug fixes</title>
<para>
<itemizedlist>
<listitem>
<para>
ensure <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>
honours the <option>--dry-run</option> option (GitHub #383)
</para>
</listitem>
<listitem>
<para>
improve replication slot warnings generated by
<command><link linkend="repmgr-node-status">repmgr node status</link></command>
(GitHub #385)
</para>
</listitem>
<listitem>
<para>
fix --superuser handling when cloning a standby (GitHub #380)
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: improve detection of status change from primary to
standby
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: improve reconnection to the local node after a
failover (previously a connection error due to the node starting up was being
interpreted as the node being unavailable)
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: when running on a witness server, correctly connect
to new primary after a failover
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: add <link linkend="event-notifications">event notification</link>
<literal>repmgrd_shutdown</literal> (GitHub #393)
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
</sect1>
<sect1 id="release-4.0.3"> <sect1 id="release-4.0.3">
<title>Release 4.0.3</title> <title>Release 4.0.3</title>
<para><emphasis>??? Feb ??, 2018</emphasis></para> <para><emphasis>Thu Feb 15, 2018</emphasis></para>
<para> <para>
&repmgr; 4.0.3 contains some bug fixes and and a number of &repmgr; 4.0.3 contains some bug fixes and and a number of
@@ -25,6 +409,18 @@
event notifications and pre-action checks. event notifications and pre-action checks.
</para> </para>
<para>
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.0.2;
repmgrd (if running) should be restarted.
</para>
<note>
<para>
It is not possible to perform a switchover where the demotion candidate is
running &repmgr; 4.0.2 or lower; all nodes should be upgraded to 4.0.3. This is due
to additional checks introduced in 4.0.3 which require the presence of
4.0.3 or later versions on all nodes.
</para>
</note>
<sect2> <sect2>
<title>Usability enhancements</title> <title>Usability enhancements</title>
@@ -65,16 +461,24 @@
<listitem> <listitem>
<para> <para>
add --dry-run mode to <command><link linkend="repmgr-standby-switchover">repmgr standby follow</link></command> add --dry-run mode to <command><link linkend="repmgr-standby-switchover">repmgr standby follow</link></command>
(GitHub #369) (GitHub #368)
</para> </para>
</listitem> </listitem>
<listitem> <listitem>
<para> <para>
add <literal>standby_register_sync</literal> event notification, which is fired when provide information about the primary node for
<command><link linkend="repmgr-standby-register">repmgr standby register</link></command> and
<command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command> event notifications (GitHub #375)
</para>
</listitem>
<listitem>
<para>
add <literal>standby_register_sync</literal> <link linkend="event-notifications">event notification</link>, which is fired when
<command><link linkend="repmgr-standby-register">repmgr standby register</link></command> <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>
is run with the <option>--wait-sync</option> option and the new or updated standby node is run with the <option>--wait-sync</option> option and the new or updated standby node
record has synchronised to the standy (GitHub #374) record has synchronised to the standby (GitHub #374)
</para> </para>
</listitem> </listitem>

View File

@@ -51,7 +51,7 @@
</itemizedlist> </itemizedlist>
</para> </para>
<sect2 id="cloning-from-barman-prerequisites" xreflabel="Prerequisites for cloning from Barman"> <sect2 id="cloning-from-barman-prerequisites">
<title>Prerequisites for cloning from Barman</title> <title>Prerequisites for cloning from Barman</title>
<para> <para>
In order to enable Barman support for <command>repmgr standby clone</command>, following In order to enable Barman support for <command>repmgr standby clone</command>, following
@@ -356,7 +356,7 @@
By default, <command>pg_basebackup</command> performs a checkpoint before beginning the backup By default, <command>pg_basebackup</command> performs a checkpoint before beginning the backup
process. However, a normal checkpoint may take some time to complete; process. However, a normal checkpoint may take some time to complete;
a fast checkpoint can be forced with the <literal>-c/--fast-checkpoint</literal> option. a fast checkpoint can be forced with the <literal>-c/--fast-checkpoint</literal> option.
However this may impact performance of the server being cloned from (typically the primary) Note that this may impact performance of the server being cloned from (typically the primary)
so should be used with care. so should be used with care.
</para> </para>
<tip> <tip>
@@ -384,11 +384,16 @@
<sect2 id="cloning-advanced-managing-passwords" xreflabel="Managing passwords"> <sect2 id="cloning-advanced-managing-passwords" xreflabel="Managing passwords">
<title>Managing passwords</title> <title>Managing passwords</title>
<indexterm>
<primary>cloning</primary>
<secondary>using passwords</secondary>
</indexterm>
<para> <para>
If replication connections to a standby's upstream server are password-protected, If replication connections to a standby's upstream server are password-protected,
the standby must be able to provide the password so it can begin streaming the standby must be able to provide the password so it can begin streaming replication.
replication.
</para> </para>
<para> <para>
The recommended way to do this is to store the password in the <literal>postgres</literal> system The recommended way to do this is to store the password in the <literal>postgres</literal> system
user's <filename>~/.pgpass</filename> file. It's also possible to store the password in the user's <filename>~/.pgpass</filename> file. It's also possible to store the password in the
@@ -396,6 +401,17 @@
security reasons. For more details see the security reasons. For more details see the
<ulink url="https://www.postgresql.org/docs/current/static/libpq-pgpass.html">PostgreSQL password file documentation</ulink>. <ulink url="https://www.postgresql.org/docs/current/static/libpq-pgpass.html">PostgreSQL password file documentation</ulink>.
</para> </para>
<note>
<para>
If using a <filename>pgpass</filename> file, an entry for the replication user (by default the
user who connects to the <literal>repmgr</literal> database) <emphasis>must</emphasis>
be provided, with database name set to <literal>replication</literal>, e.g.:
<programlisting>
node1:5432:replication:repmgr:12345</programlisting>
</para>
</note>
<para> <para>
If, for whatever reason, you wish to include the password in <filename>recovery.conf</filename>, If, for whatever reason, you wish to include the password in <filename>recovery.conf</filename>,
set <varname>use_primary_conninfo_password</varname> to <literal>true</literal> in set <varname>use_primary_conninfo_password</varname> to <literal>true</literal> in
@@ -407,8 +423,7 @@
</para> </para>
<para> <para>
It is of course also possible to include the password value in the <varname>conninfo</varname> It is of course also possible to include the password value in the <varname>conninfo</varname>
string for each node, but this is obviously a security risk and should be string for each node, but this is obviously a security risk and should be avoided.
avoided.
</para> </para>
<para> <para>
From PostgreSQL 9.6, <application>libpq</application> supports the <varname>passfile</varname> From PostgreSQL 9.6, <application>libpq</application> supports the <varname>passfile</varname>

View File

@@ -1,10 +1,10 @@
<sect1 id="configuration-file-settings" xreflabel="configuration file settings"> <sect1 id="configuration-file-settings" xreflabel="configuration file settings">
<indexterm> <indexterm>
<primary>repmgr.conf</primary> <primary>repmgr.conf</primary>
<secondary>settings</secondary> <secondary>basic settings</secondary>
</indexterm> </indexterm>
<title>Configuration file settings</title> <title>Basic configuration file settings</title>
<para> <para>
Each <filename>repmgr.conf</filename> file must contain the following parameters: Each <filename>repmgr.conf</filename> file must contain the following parameters:
</para> </para>
@@ -92,7 +92,10 @@
<para> <para>
For a full list of annotated configuration items, see the file For a full list of annotated configuration items, see the file
<ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</>. <ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink>.
</para>
<para>
For <application>repmgrd</application>-specific settings, see <xref linkend="repmgrd-configuration">.
</para> </para>
<note> <note>

View File

@@ -0,0 +1,115 @@
<sect1 id="configuration-service-commands" xreflabel="service command settings">
<indexterm>
<primary>repmgr.conf</primary>
<secondary>service command settings</secondary>
</indexterm>
<indexterm>
<primary>service command settings</primary>
<secondary>configuration in repmgr.conf</secondary>
</indexterm>
<title>Service command settings</title>
<para>
In some circumstances, &repmgr; (and <application>repmgrd</application>) need to
be able to stop, start or restart PostgreSQL. &repmgr; commands which need to do this
include <link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>,
<link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link> and
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
</para>
<para>
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> to control the PostgreSQL
server. However, this can lead to various problems, particularly when PostgreSQL has been
installed from packages, and especially so if <application>systemd</application> is in use.
</para>
<note>
<para>
If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
</para>
</note>
<para>
With this in mind, we recommend <emphasis>always</emphasis> configuring &repmgr; to use the
available system service commands.
</para>
<para>
To do this, specify the appropriate command for each action
in <filename>repmgr.conf</filename> using the following configuration
parameters:
<programlisting>
service_start_command
service_stop_command
service_restart_command
service_reload_command</programlisting>
</para>
<note>
<para>
It's also possible to specify a <varname>service_promote_command</varname>;
this overrides any value contained in the setting <varname>promote_command</varname>.
This is intended for systems which provide a package-level promote command,
such as Debian's <application>pg_ctlcluster</application>.
</para>
</note>
<para>
To confirm which command &repmgr; will execute for each action, use
<command>repmgr node service --list --action=...</command>, e.g.:
<programlisting>
repmgr -f /etc/repmgr.conf node service --list --action=stop
repmgr -f /etc/repmgr.conf node service --list --action=start
repmgr -f /etc/repmgr.conf node service --list --action=restart
repmgr -f /etc/repmgr.conf node service --list --action=reload</programlisting>
</para>
<para>
These commands will be executed by the system user which &repmgr; runs as (usually <literal>postgres</literal>)
and will probably require passwordless sudo access to be able to execute the command.
</para>
<para>
For example, using <application>systemd</application> on CentOS 7, the service commands can be
set as follows:
<programlisting>
service_start_command = 'sudo systemctl start postgresql-9.6'
service_stop_command = 'sudo systemctl stop postgresql-9.6'
service_restart_command = 'sudo systemctl restart postgresql-9.6'
service_reload_command = 'sudo systemctl reload postgresql-9.6'</programlisting>
and <filename>/etc/sudoers</filename> should be set as follows:
<programlisting>
Defaults:postgres !requiretty
postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-9.6, \
/usr/bin/systemctl start postgresql-9.6, \
/usr/bin/systemctl restart postgresql-9.6, \
/usr/bin/systemctl reload postgresql-9.6</programlisting>
</para>
<important>
<indexterm>
<primary>pg_ctlcluster</primary>
<secondary>service command settings</secondary>
</indexterm>
<para>
Debian/Ubuntu users: instead of calling <command>sudo systemctl</command> directly, use
<command>sudo pg_ctlcluster</command>, e.g.:
<programlisting>
service_start_command = 'sudo pg_ctlcluster 9.6 main start'
service_stop_command = 'sudo pg_ctlcluster 9.6 main stop'
service_restart_command = 'sudo pg_ctlcluster 9.6 main restart'
service_reload_command = 'sudo pg_ctlcluster 9.6 main reload'</programlisting>
and set <filename>/etc/sudoers</filename> accordingly.
</para>
<para>
While <command>pg_ctlcluster</command> will work when executed as user <literal>postgres</literal>,
it's strongly recommended to use <command>sudo pg_ctlcluster</command> on <application>systemd</application>
systems, to ensure <application>systemd</application> has a correct picture of
the PostgreSQL application state.
</para>
</important>
</sect1>

View File

@@ -3,6 +3,7 @@
&configuration-file; &configuration-file;
&configuration-file-settings; &configuration-file-settings;
&configuration-service-commands;
<sect1 id="configuration-permissions" xreflabel="User permissions"> <sect1 id="configuration-permissions" xreflabel="User permissions">
<indexterm> <indexterm>

View File

@@ -205,6 +205,9 @@
<listitem> <listitem>
<simpara><literal>repmgrd_failover_follow</literal></simpara> <simpara><literal>repmgrd_failover_follow</literal></simpara>
</listitem> </listitem>
<listitem>
<simpara><literal>repmgrd_failover_aborted</literal></simpara>
</listitem>
<listitem> <listitem>
<simpara><literal>repmgrd_upstream_disconnect</literal></simpara> <simpara><literal>repmgrd_upstream_disconnect</literal></simpara>
</listitem> </listitem>

View File

@@ -39,6 +39,7 @@
<!ENTITY configuration SYSTEM "configuration.sgml"> <!ENTITY configuration SYSTEM "configuration.sgml">
<!ENTITY configuration-file SYSTEM "configuration-file.sgml"> <!ENTITY configuration-file SYSTEM "configuration-file.sgml">
<!ENTITY configuration-file-settings SYSTEM "configuration-file-settings.sgml"> <!ENTITY configuration-file-settings SYSTEM "configuration-file-settings.sgml">
<!ENTITY configuration-service-commands SYSTEM "configuration-service-commands.sgml">
<!ENTITY cloning-standbys SYSTEM "cloning-standbys.sgml"> <!ENTITY cloning-standbys SYSTEM "cloning-standbys.sgml">
<!ENTITY promoting-standby SYSTEM "promoting-standby.sgml"> <!ENTITY promoting-standby SYSTEM "promoting-standby.sgml">
<!ENTITY follow-new-primary SYSTEM "follow-new-primary.sgml"> <!ENTITY follow-new-primary SYSTEM "follow-new-primary.sgml">

View File

@@ -5,83 +5,107 @@
system. system.
</para> </para>
<sect2 id="installation-packages-redhat" xreflabel="Installing from packages on RHEL, Fedora and CentOS"> <sect2 id="installation-packages-redhat" xreflabel="Installing from packages on RHEL, CentOS and Fedora">
<indexterm> <indexterm>
<primary>installation</primary> <primary>installation</primary>
<secondary>on Redhat/CentOS/Fedora etc.</secondary> <secondary>on Red Hat/CentOS/Fedora etc.</secondary>
</indexterm> </indexterm>
<title>RedHat/Fedora/CentOS</title> <title>RedHat/CentOS/Fedora</title>
<para> <para>
RPM packages for &repmgr; are available via Yum through &repmgr; RPM packages for RedHat/CentOS variants and Fedora are available from the
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>; see following
section for details.
</para>
<para>
RPM packages for &repmgr; are also available via Yum through
the PostgreSQL Global Development Group RPM repository the PostgreSQL Global Development Group RPM repository
(<ulink url="https://yum.postgresql.org/">http://yum.postgresql.org/</ulink>). (<ulink url="https://yum.postgresql.org/">http://yum.postgresql.org/</ulink>).
Follow the instructions for your distribution (RedHat, CentOS, Follow the instructions for your distribution (RedHat, CentOS,
Fedora, etc.) and architecture as detailed there. Fedora, etc.) and architecture as detailed there. Note that it can take some days
for new &repmgr; packages to become available via this repository.
</para> </para>
<note>
<para>
&repmgr; packages are designed to be compatible with the community-provided PostgreSQL packages.
They may not work with vendor-specific packages such as those provided by RedHat for RHEL
customers, as the filesystem layout may be different to the community RPMs.
Please contact your support vendor for assistance.
</para>
</note>
<para> <para>
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> also provides its For more information on the package contents, including details of installation
own RPM packages which are made available paths and relevant <link linkend="configuration-service-commands">service commands</link>,
at the same time as each &repmgr; release, as it can take some days for see the appendix section <xref linkend="packages-centos">.
them to become available via the main PGDG repository. See following section for details:
</para> </para>
<sect3 id="installation-packages-redhat-2ndq"> <sect3 id="installation-packages-redhat-2ndq">
<title>2ndQuadrant repmgr yum repository</title> <title>2ndQuadrant public RPM yum repository</title>
<note>
<para>
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> previously provided a dedicated
&repmgr; repository at
<ulink url="http://packages.2ndquadrant.com/repmgr/">http://packages.2ndquadrant.com/repmgr/</ulink>.
This repository will be deprecated in a future release as it is now replaced by
the <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>
documented below.
</para>
</note>
<para> <para>
Beginning with <ulink url="http://repmgr.org/release-notes-3.1.3.html">repmgr 3.1.3</ulink>, Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a dedicated <literal>yum</literal> <ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a dedicated <literal>yum</literal>
repository for &repmgr; releases. This repository complements the main <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink> for 2ndQuadrant software,
<ulink url="https://yum.postgresql.org/repopackages.php">PGDG community repository</ulink>, including &repmgr;. We recommend using this for all future &repmgr; releases.
but enables repmgr users to access the latest &repmgr; packages before they are </para>
available via the PGDG repository, which can take several days to be updated following <para>
a fresh &repmgr; release. General instructions for using this repository can be found on its
</para> <ulink url="https://rpm.2ndquadrant.com/">homepage</ulink>. Specific instructions
for installing &repmgr; follow below.
</para>
<para> <para>
<emphasis>Installation</emphasis> <emphasis>Installation</emphasis>
<itemizedlist> <itemizedlist>
<listitem> <listitem>
<para> <para>
Import the repository public key (optional but recommended): Locate the repository RPM for your PostgreSQL version from the list at:
<programlisting> <ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink>
rpm --import http://packages.2ndquadrant.com/repmgr/RPM-GPG-KEY-repmgr</programlisting> </para>
</para> </listitem>
</listitem>
<listitem> <listitem>
<para> <para>
Install the repository RPM for your distribution (this enables the 2ndQuadrant Install the repository RPM for your distribution and PostgreSQL version
repository as a source of repmgr packages): (this enables the 2ndQuadrant repository as a source of &repmgr; packages).
<itemizedlist> </para>
<listitem> <para>
<simpara> For example, for PostgreSQL 10 on CentOS, execute:
<emphasis>Fedora:</emphasis> <programlisting>
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink> sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-1.el7.noarch.rpm
</simpara> </programlisting>
</listitem> </para>
<listitem> <para>
<simpara> Verify that the repository is installed with:
<emphasis>RHEL, CentOS etc:</emphasis> <programlisting>
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink> sudo yum repolist</programlisting>
</simpara> The output should contain two entries like this:
</listitem> <programlisting>
</itemizedlist> 2ndquadrant-repo-10/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 1
</para> 2ndquadrant-repo-10-debug/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 - Debug 1</programlisting>
<para> </para>
e.g.: </listitem>
<programlisting>
$ yum install http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</programlisting>
</para>
</listitem>
<listitem> <listitem>
<para> <para>
Install the repmgr version appropriate for your PostgreSQL version (e.g. <literal>repmgr96</literal>), e.g.: Install the &repmgr; version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
<programlisting> <programlisting>
$ yum install repmgr96</programlisting> $ yum install repmgr10</programlisting>
</para> </para>
</listitem> </listitem>
</itemizedlist> </itemizedlist>
@@ -91,13 +115,13 @@
<emphasis>Compatibility with PGDG Repositories</emphasis> <emphasis>Compatibility with PGDG Repositories</emphasis>
</para> </para>
<para> <para>
The 2ndQuadrant &repmgr; yum repository uses exactly the same package definitions as the The 2ndQuadrant &repmgr; yum repository packages use the same definitions and file system layout as the
main PGDG repository and is effectively a selective mirror for &repmgr; packages only. main PGDG repository.
</para> </para>
<para> <para>
Normally yum should prioritize the repository with the most recent &repmgr; version. Normally <application>yum</application> will prioritize the repository with the most recent &repmgr; version.
Once the PGDG repository has been updated, it doesn't matter which repository Once the PGDG repository has been updated, it doesn't matter which repository
the packages are installed from. the packages are installed from.
</para> </para>
<para> <para>
To ensure the 2ndQuadrant repository is always prioritised, install <literal>yum-plugin-priorities</literal> To ensure the 2ndQuadrant repository is always prioritised, install <literal>yum-plugin-priorities</literal>
@@ -111,30 +135,23 @@
To install a specific package version, execute <command>yum --showduplicates list</command> To install a specific package version, execute <command>yum --showduplicates list</command>
for the package in question: for the package in question:
<programlisting> <programlisting>
[root@localhost ~]# yum --showduplicates list repmgr96 [root@localhost ~]# yum --showduplicates list repmgr10
Loaded plugins: fastestmirror Loaded plugins: fastestmirror
Loading mirror speeds from cached hostfile Loading mirror speeds from cached hostfile
* base: ftp.iij.ad.jp * base: ftp.iij.ad.jp
* extras: ftp.iij.ad.jp * extras: ftp.iij.ad.jp
* updates: ftp.iij.ad.jp * updates: ftp.iij.ad.jp
Available Packages Available Packages
repmgr96.x86_64 3.2-1.el6 2ndquadrant-repmgr repmgr10.x86_64 4.0.3-1.rhel7 pgdg10
repmgr96.x86_64 3.2.1-1.el6 2ndquadrant-repmgr repmgr10.x86_64 4.0.4-1.rhel7 pgdg10
repmgr96.x86_64 3.3-1.el6 2ndquadrant-repmgr repmgr10.x86_64 4.0.5-1.el7 2ndquadrant-repo-10</programlisting>
repmgr96.x86_64 3.3.1-1.el6 2ndquadrant-repmgr
repmgr96.x86_64 3.3.2-1.el6 2ndquadrant-repmgr
repmgr96.x86_64 3.3.2-1.rhel6 pgdg96
repmgr96.x86_64 4.0.0-1.el6 2ndquadrant-repmgr
repmgr96.x86_64 4.0.0-1.rhel6 pgdg96</programlisting>
then append the appropriate version number to the package name with a hyphen, e.g.: then append the appropriate version number to the package name with a hyphen, e.g.:
<programlisting> <programlisting>
[root@localhost ~]# yum install repmgr96-3.3.2-1.el6</programlisting> [root@localhost ~]# yum install repmgr10-4.0.3-1.rhel7</programlisting>
</para> </para>
</sect3> </sect3>
</sect2> </sect2>
<sect2 id="installation-packages-debian" xreflabel="Installing from packages on Debian or Ubuntu"> <sect2 id="installation-packages-debian" xreflabel="Installing from packages on Debian or Ubuntu">
<indexterm> <indexterm>
@@ -148,6 +165,85 @@
Instructions can be found in the APT section of the PostgreSQL Wiki Instructions can be found in the APT section of the PostgreSQL Wiki
(<ulink url="https://wiki.postgresql.org/wiki/Apt">https://wiki.postgresql.org/wiki/Apt</ulink>). (<ulink url="https://wiki.postgresql.org/wiki/Apt">https://wiki.postgresql.org/wiki/Apt</ulink>).
</para> </para>
<para>
For more information on the package contents, including details of installation
paths and relevant <link linkend="configuration-service-commands">service commands</link>,
see the appendix section <xref linkend="packages-debian-ubuntu">.
</para>
<sect3 id="installation-packages-debian-ubuntu-2ndq">
<title>2ndQuadrant public apt repository for Debian/Ubuntu</title>
<para>
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a
<ulink url="https://apt.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
including &repmgr;.
</para>
<para>
General instructions for using this repository can be found on its
<ulink url="https://apt.2ndquadrant.com/">homepage</ulink>. Specific instructions
for installing &repmgr; follow below.
</para>
<para>
<emphasis>Installation</emphasis>
<itemizedlist>
<listitem>
<para>
If not already present, install the <application>apt-transport-https</application> package:
<programlisting>
sudo apt-get install apt-transport-https</programlisting>
</para>
</listitem>
<listitem>
<para>
Create <filename>/etc/apt/sources.list.d/2ndquadrant.list</filename> as follows:
<programlisting>
sudo sh -c 'echo "deb https://apt.2ndquadrant.com/ $(lsb_release -cs)-2ndquadrant main" > /etc/apt/sources.list.d/2ndquadrant.list'</programlisting>
</para>
</listitem>
<listitem>
<para>
Install the 2ndQuadrant <ulink url="https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc">repository key</ulink>:
<programlisting>
sudo apt-get install curl ca-certificates
curl https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc | sudo apt-key add -</programlisting>
</para>
</listitem>
<listitem>
<para>
Update the package list:
<programlisting>
sudo apt-get update</programlisting>
</para>
</listitem>
<listitem>
<para>
Install the &repmgr; version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
<programlisting>
$ apt-get install postgresql-10-repmgr</programlisting>
</para>
<note>
<para>
For packages for PostgreSQL 9.6 and earlier, the package name includes
a period between major and minor version numbers, e.g.
<literal>postgresql-9.6-repmgr</literal>.
</para>
</note>
</listitem>
</itemizedlist>
</para>
</sect3>
</sect2> </sect2>
</sect1> </sect1>

View File

@@ -80,7 +80,7 @@
</para> </para>
<para> <para>
There are also tags for each &repmgr; release, e.g. <filename>REL4_0_STABLE</filename>. There are also tags for each &repmgr; release, e.g. <filename>4.0.5</filename>.
</para> </para>
<para> <para>

View File

@@ -2,7 +2,8 @@
<title>repmgr overview</title> <title>repmgr overview</title>
<para> <para>
This chapter provides a high-level overview of repmgr's components and functionality. This chapter provides a high-level overview of &repmgr;'s components and
functionality.
</para> </para>
<sect1 id="repmgr-concepts" xreflabel="Concepts"> <sect1 id="repmgr-concepts" xreflabel="Concepts">

View File

@@ -38,5 +38,34 @@
and therefore determine the state of outbound connections from that node. and therefore determine the state of outbound connections from that node.
</para> </para>
</refsect1> </refsect1>
<refsect1>
<title>Exit codes</title>
<para>
The following exit codes can be emitted by <command>repmgr cluster crosscheck</command>:
</para>
<variablelist>
<varlistentry>
<term><option>SUCCESS (0)</option></term>
<listitem>
<para>
The check completed successfully and all nodes are reachable.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>ERR_CLUSTER_CHECK (25)</option></term>
<listitem>
<para>
One or more nodes could not be reached.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
</refentry> </refentry>

View File

@@ -97,5 +97,35 @@
useful result. useful result.
</para> </para>
</refsect1> </refsect1>
<refsect1>
<title>Exit codes</title>
<para>
Following exit codes can be emitted by <command>repmgr cluster matrix</command>:
</para>
<variablelist>
<varlistentry>
<term><option>SUCCESS (0)</option></term>
<listitem>
<para>
The check completed successfully and all nodes are reachable.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>ERR_CLUSTER_CHECK (25)</option></term>
<listitem>
<para>
One or more nodes could not be reached.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
</refentry> </refentry>

View File

@@ -45,6 +45,77 @@
</para> </para>
</refsect1> </refsect1>
<refsect1>
<title>Options</title>
<variablelist>
<varlistentry>
<term><option>--dry-run</option></term>
<listitem>
<para>
Check prerequisites but don't actually execute the rejoin.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--force-rewind[=/path/to/pg_rewind]</option></term>
<listitem>
<para>
Execute <application>pg_rewind</application> if necessary.
</para>
<para>
It is only necessary to provide the <application>pg_rewind</application> path
if using PostgreSQL 9.3 or 9.4, and <application>pg_rewind</application>
is not installed in the PostgreSQL <filename>bin</filename> directory.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--config-files</option></term>
<listitem>
<para>
Comma-separated list of configuration files to retain after
executing <application>pg_rewind</application>.
</para>
<para>
Currently <application>pg_rewind</application> will overwrite
the local node's configuration files with the files from the source node,
so it's advisable to use this option to ensure they are kept.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--config-archive-dir</option></term>
<listitem>
<para>
Directory to temporarily store configuration files specified with
<option>--config-files</option>; default: <filename>/tmp</filename>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-W/--no-wait</option></term>
<listitem>
<para>
Don't wait for the node to rejoin cluster.
</para>
<para>
If this option is supplied, &repmgr; will restart the node but
not wait for it to connect to the primary.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1> <refsect1>
<title>Event notifications</title> <title>Event notifications</title>
<para> <para>
@@ -77,11 +148,18 @@
</refsect1> </refsect1>
<refsect1 id="repmgr-node-rejoin-pg-rewind" xreflabel="Using pg_rewind"> <refsect1 id="repmgr-node-rejoin-pg-rewind" xreflabel="Using pg_rewind">
<indexterm>
<primary>pg_rewind</primary>
<secondary>using with "repmgr node rejoin"</secondary>
</indexterm>
<title>Using <command>pg_rewind</command></title> <title>Using <command>pg_rewind</command></title>
<para> <para>
<command>repmgr node rejoin</command> can optionally use <command>pg_rewind</command> to re-integrate a <command>repmgr node rejoin</command> can optionally use <command>pg_rewind</command> to re-integrate a
node which has diverged from the rest of the cluster, typically a failed primary. node which has diverged from the rest of the cluster, typically a failed primary.
<command>pg_rewind</command> is available in PostgreSQL 9.5 and later. <command>pg_rewind</command> is available in PostgreSQL 9.5 and later as part of the core distribution,
and can be installed from external sources for PostgreSQL 9.3 and 9.4.
</para> </para>
<note> <note>
<para> <para>

View File

@@ -26,7 +26,7 @@
<refsect1> <refsect1>
<title>Execution</title> <title>Execution</title>
<para> <para>
Execute with the <literal>--dry-run</literal> option to check what would happen without Execute with the <option>--dry-run</option> option to check what would happen without
actually registering the primary. actually registering the primary.
</para> </para>
<para> <para>
@@ -36,7 +36,7 @@
<note> <note>
<para> <para>
If providing the configuration file location with <literal>-f/--config-file</literal>, If providing the configuration file location with <option>-f/--config-file</option>,
avoid using a relative path, as &repmgr; stores the configuration file location avoid using a relative path, as &repmgr; stores the configuration file location
in the repmgr metadata for use when &repmgr; is executed remotely (e.g. during in the repmgr metadata for use when &repmgr; is executed remotely (e.g. during
<xref linkend="repmgr-standby-switchover">). &repmgr; will attempt to convert the <xref linkend="repmgr-standby-switchover">). &repmgr; will attempt to convert the
@@ -48,6 +48,33 @@
</note> </note>
</refsect1> </refsect1>
<refsect1>
<title>Options</title>
<variablelist>
<varlistentry>
<term><option>--dry-run</option></term>
<listitem>
<para>
Check prerequisites but don't actually register the primary.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-F</option>, <option>--force</option></term>
<listitem>
<para>
Overwrite an existing node record
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1> <refsect1>
<title>Event notifications</title> <title>Event notifications</title>
<para> <para>

View File

@@ -22,7 +22,7 @@
<refsect1> <refsect1>
<title>Execution</title> <title>Execution</title>
<para> <para>
<command>repmgr primary unregister</command> should be run on the current primary, <command>repmgr primary unregister</command> can be run on any active &repmgr; node,
with the ID of the node to unregister passed as <option>--node-id</option>. with the ID of the node to unregister passed as <option>--node-id</option>.
</para> </para>
<para> <para>

View File

@@ -25,9 +25,11 @@
<note> <note>
<simpara> <simpara>
<command>repmgr standby clone</command> does not start the standby, and after cloning <command>repmgr standby clone</command> does not start the standby, and after cloning
<command>repmgr standby register</command> must be executed to notify &repmgr; of its presence. a standby, the command <command>repmgr standby register</command> must be executed to
notify &repmgr; of its existence.
</simpara> </simpara>
</note> </note>
</refsect1> </refsect1>
@@ -65,7 +67,71 @@
</tip> </tip>
</refsect1> </refsect1>
<refsect1 id="repmgr-standby-clone-wal-management" xreflabel="Managing WAL during the cloning process"> <refsect1 id="repmgr-standby-clone-recovery-conf">
<indexterm>
<primary>recovery.conf</primary>
<secondary>customising with "repmgr standby clone"</secondary>
</indexterm>
<title>Customising recovery.conf</title>
<para>
By default, &repmgr; will create a minimal <filename>recovery.conf</filename>
containing the following parameters:
</para>
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara><varname>standby_mode</varname> (always <literal>'on'</literal>)</simpara>
</listitem>
<listitem>
<simpara><varname>recovery_target_timeline</varname> (always <literal>'latest'</literal>)</simpara>
</listitem>
<listitem>
<simpara><varname>primary_conninfo</varname></simpara>
</listitem>
<listitem>
<simpara><varname>primary_slot_name</varname> (if replication slots in use)</simpara>
</listitem>
</itemizedlist>
<para>
The following additional parameters can be specified in <filename>repmgr.conf</filename>
for inclusion in <filename>recovery.conf</filename>:
</para>
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara><varname>restore_command</varname></simpara>
</listitem>
<listitem>
<simpara><varname>archive_cleanup_command</varname></simpara>
</listitem>
<listitem>
<simpara><varname>recovery_min_apply_delay</varname></simpara>
</listitem>
</itemizedlist>
<note>
<para>
We recommend using <ulink url="https://www.pgbarman.org/">Barman</ulink> to manage
WAL file archiving. For more details on combining &repmgr; and <application>Barman</application>,
in particular using <varname>restore_command</varname> to configure Barman as a backup source of
WAL files, see <xref linkend="cloning-from-barman">.
</para>
</note>
</refsect1>
<refsect1 id="repmgr-standby-clone-wal-management">
<title>Managing WAL during the cloning process</title> <title>Managing WAL during the cloning process</title>
<para> <para>
When initially cloning a standby, you will need to ensure When initially cloning a standby, you will need to ensure
@@ -100,6 +166,164 @@
</note> </note>
</refsect1> </refsect1>
<refsect1 id="repmgr-standby-create-recovery-conf">
<indexterm>
<primary>recovery.conf</primary>
<secondary>generating for a standby cloned by another method</secondary>
</indexterm>
<title>Using a standby cloned by another method</title>
<para>
&repmgr; supports standbys cloned by another method (e.g. using <application>barman</application>'s
<command><ulink url="http://docs.pgbarman.org/release/2.4/#recover">barman recover</ulink></command> command).
</para>
<para>
To integrate the standby as a &repmgr; node, ensure the <filename>repmgr.conf</filename>
file is created for the node, and that it has been registered using
<command><link linkend="repmgr-standby-register">repmgr standby register</link></command>.
Then execute the command <command>repmgr standby clone --recovery-conf-only</command>.
This will create the <filename>recovery.conf</filename> file needed to attach
the node to its upstream, and will also create a replication slot on the
upstream node if required.
</para>
<para>
Note that the upstream node must be running. An existing
<filename>recovery.conf</filename> will not be overwritten unless the
<option>-F/--force</option> option is provided.
</para>
<para>
Execute <command>repmgr standby clone --recovery-conf-only --dry-run</command>
to check the prerequisites for creating the <filename>recovery.conf</filename> file,
and display the contents of the file without actually creating it.
</para>
<note>
<para>
<option>--recovery-conf-only</option> was introduced in &repmgr; <link linkend="release-4.0.4">4.0.4</link>.
</para>
</note>
</refsect1>
<refsect1>
<title>Options</title>
<variablelist>
<varlistentry>
<term><option>--dry-run</option></term>
<listitem>
<para>
Check prerequisites but don't actually clone the standby.
</para>
<para>
If <option>--recovery-conf-only</option> is specified, the contents of
the generated <filename>recovery.conf</filename> file will be displayed
but the file itself not written.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-c, --fast-checkpoint</option></term>
<listitem>
<para>
Force fast checkpoint (not effective when cloning from Barman).
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--copy-external-config-files[={samepath|pgdata}]</option></term>
<listitem>
<para>
Copy configuration files located outside the data directory on the source
node to the same path on the standby (default) or to the
PostgreSQL data directory.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--no-upstream-connection</option></term>
<listitem>
<para>
When using Barman, do not connect to upstream node.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-R, --remote-user=USERNAME</option></term>
<listitem>
<para>
Remote system username for SSH operations (default: current local system username).
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--recovery-conf-only</option></term>
<listitem>
<para>
Create <filename>recovery.conf</filename> file for a previously cloned instance. &repmgr; 4.0.4 and later.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--replication-user</option></term>
<listitem>
<para>
User to make replication connections with (optional, not usually required).
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--superuser</option></term>
<listitem>
<para>
If the &repmgr; user is not a superuser, the name of a valid superuser must
be provided with this option.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--upstream-conninfo</option></term>
<listitem>
<para>
<literal>primary_conninfo</literal> value to write in recovery.conf
when the intended upstream server does not yet exist.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--upstream-node-id</option></term>
<listitem>
<para>
ID of the upstream node to replicate from (optional, defaults to primary node)
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--without-barman</option></term>
<listitem>
<para>
Do not use Barman even if configured.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1> <refsect1>
<title>Event notifications</title> <title>Event notifications</title>
<para> <para>
@@ -107,5 +331,11 @@
</para> </para>
</refsect1> </refsect1>
<refsect1>
<title>See also</title>
<para>
See <xref linkend="cloning-standbys"> for details about various aspects of cloning.
</para>
</refsect1>
</refentry> </refentry>

View File

@@ -26,10 +26,19 @@
running. It can only be used to attach an active standby to the current primary node running. It can only be used to attach an active standby to the current primary node
(and not to another standby). (and not to another standby).
</para> </para>
<para> <tip>
To re-add an inactive node to the replication cluster, see <para>
<xref linkend="repmgr-node-rejoin"> To re-add an inactive node to the replication cluster, use
</para> <xref linkend="repmgr-node-rejoin">.
</para>
</tip>
<para>
<command>repmgr standby follow</command> will wait up to
<varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
to verify the standby has actually connected to the new primary.
</para>
</refsect1> </refsect1>
<refsect1> <refsect1>
@@ -70,11 +79,14 @@
</varlistentry> </varlistentry>
<varlistentry> <varlistentry>
<term><option>-W</option></term> <term><option>-w</option></term>
<term><option>--wait</option></term> <term><option>--wait</option></term>
<listitem> <listitem>
<para> <para>
Wait for a primary to appear. Wait for a primary to appear. &repmgr; will wait for up to
<varname>primary_follow_timeout</varname> seconds
(default: 60 seconds) to verify that the standby is following the new primary.
This value can be defined in <filename>repmgr.conf</filename>.
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>
@@ -88,7 +100,7 @@
A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated. A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated.
</para> </para>
<para> <para>
If provided, &repmgr; will subsitute the placeholders <literal>%p</literal> with the node ID of the primary If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the primary
being followed, <literal>%c</literal> with its <literal>conninfo</literal> string, and being followed, <literal>%c</literal> with its <literal>conninfo</literal> string, and
<literal>%a</literal> with its node name. <literal>%a</literal> with its node name.
</para> </para>

View File

@@ -26,6 +26,12 @@
by using <xref linkend="repmgr-standby-follow">; if <application>repmgrd</application> by using <xref linkend="repmgr-standby-follow">; if <application>repmgrd</application>
is active, it will handle this automatically. is active, it will handle this automatically.
</para> </para>
<para>
Note that &repmgr; will wait for up to <varname>promote_check_timeout</varname> seconds
(default: 60 seconds) to verify that the standby has been promoted, and will
check the promotion every <varname>promote_check_interval</varname> seconds (default: 1 second).
Both values can be defined in <filename>repmgr.conf</filename>.
</para>
</refsect1> </refsect1>
<refsect1> <refsect1>
@@ -42,6 +48,7 @@
</para> </para>
</refsect1> </refsect1>
<refsect1> <refsect1>
<title>Event notifications</title> <title>Event notifications</title>
<para> <para>

View File

@@ -92,6 +92,73 @@
</para> </para>
</refsect1> </refsect1>
<refsect1 id="repmgr-standby-register-node-cloned-other-source">
<title>Registering a node not cloned by repmgr</title>
<para>
If you've cloned a standby using another method (e.g. <application>barman</application>'s
<command>barman recover</command> command), first execute
<link linkend="repmgr-standby-create-recovery-conf">repmgr standby clone --recovery-conf-only</link>
to add the <filename>recovery.conf</filename> file, then register the standby as usual.
</para>
</refsect1>
<refsect1>
<title>Options</title>
<variablelist>
<varlistentry>
<term><option>--dry-run</option></term>
<listitem>
<para>
Check prerequisites but don't actually register the standby.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-F</option>, <option>--force</option></term>
<listitem>
<para>
Overwrite an existing node record
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--upstream-node-id</option></term>
<listitem>
<para>
ID of the upstream node to replicate from (optional)
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--wait-start</option></term>
<listitem>
<para>
Wait for the standby to start (timeout in seconds, default 30 seconds)
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--wait-sync</option></term>
<listitem>
<para>
Wait for the node record to synchronise to the standby (optional timeout in seconds)
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1> <refsect1>
<title>Event notifications</title> <title>Event notifications</title>
<para> <para>
@@ -106,7 +173,7 @@
</para> </para>
<para> <para>
If provided, &repmgr; will subsitute the placeholders <literal>%p</literal> with the node ID of the If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the
primary node, <literal>%c</literal> with its <literal>conninfo</literal> string, and primary node, <literal>%c</literal> with its <literal>conninfo</literal> string, and
<literal>%a</literal> with its node name. <literal>%a</literal> with its node name.
</para> </para>

View File

@@ -35,6 +35,10 @@
a successful switchover. a successful switchover.
</para> </para>
</note> </note>
<para>
For more details on performing a switchover, including preparation and configuration,
see section <xref linkend="performing-switchover">.
</para>
</refsect1> </refsect1>
<refsect1> <refsect1>
@@ -84,11 +88,14 @@
</varlistentry> </varlistentry>
<varlistentry> <varlistentry>
<term><option>--force-rewind</option></term> <term><option>--force-rewind[=/path/to/pg_rewind]</option></term>
<listitem> <listitem>
<para> <para>
Use <application>pg_rewind</application> to reintegrate the old primary if necessary Use <application>pg_rewind</application> to reintegrate the old primary if necessary
(PostgreSQL 9.5 and later). (and the prerequisites for using <application>pg_rewind</application> are met).
If using PostgreSQL 9.3 or 9.4, and the <application>pg_rewind</application>
binary is not installed in the PostgreSQL <filename>bin</filename> directory,
provide its full path. For more details see also <xref linkend="switchover-pg-rewind">.
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>
@@ -115,6 +122,48 @@
</refsect1> </refsect1>
<refsect1>
<title>Configuration file settings</title>
<para>
Note that the following parameters in <filename>repmgr.conf</filename> are relevant to the
switchover operation:
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara>
<literal>reconnect_attempts</literal>: number of times to check the original primary
for a clean shutdown after executing the shutdown command, before aborting
</simpara>
</listitem>
<listitem>
<simpara>
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
primary for a clean shutdown after executing the shutdown command (up to a maximum
of <literal>reconnect_attempts</literal> tries)
</simpara>
</listitem>
<listitem>
<simpara>
<literal>replication_lag_critical</literal>:
if replication lag (in seconds) on the standby exceeds this value, the
switchover will be aborted (unless the <literal>-F/--force</literal> option
is provided)
</simpara>
</listitem>
<listitem>
<simpara>
<literal>standby_reconnect_timeout</literal>:
Number of seconds to attempt to reconnect to the demoted primary
once it has been restarted.
</simpara>
</listitem>
</itemizedlist>
</para>
</refsect1>
<refsect1> <refsect1>
<title>Execution</title> <title>Execution</title>
@@ -150,7 +199,7 @@
<refsect1> <refsect1>
<title>Exit codes</title> <title>Exit codes</title>
<para> <para>
Following exit codes can be emitted by <literal>repmgr standby switchover</literal>: Following exit codes can be emitted by <command>repmgr standby switchover</command>:
</para> </para>
<variablelist> <variablelist>
@@ -178,7 +227,7 @@
<para> <para>
The switchover was executed but a problem was encountered. The switchover was executed but a problem was encountered.
Typically this means the former primary could not be reattached Typically this means the former primary could not be reattached
as a standby. as a standby. Check preceding log messages for more information.
</para> </para>
</listitem> </listitem>
</varlistentry> </varlistentry>

View File

@@ -43,6 +43,22 @@
</para> </para>
</refsect1> </refsect1>
<refsect1>
<title>Options</title>
<variablelist>
<varlistentry>
<term><option>--node-id</option></term>
<listitem>
<para>
<varname>node_id</varname> of the node to unregister (optional)
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1> <refsect1>
<title>Event notifications</title> <title>Event notifications</title>
<para> <para>

View File

@@ -1,60 +1,233 @@
<chapter id="repmgrd-configuration"> <chapter id="repmgrd-configuration">
<indexterm> <indexterm>
<primary>repmgrd</primary> <primary>repmgrd</primary>
<secondary>configuration</secondary> <secondary>configuration</secondary>
</indexterm> </indexterm>
<title>repmgrd configuration</title> <title>repmgrd configuration</title>
<para>
To use <application>repmgrd</application>, its associated function library must be
included in <filename>postgresql.conf</filename> with:
<programlisting> <para>
shared_preload_libraries = 'repmgr'</programlisting> <application>repmgrd</application> is a daemon which runs on each PostgreSQL node,
</para> monitoring the local node, and (unless it's the primary node) the upstream server
<para> (the primary server or with cascading replication, another standby) which it's
Changing this setting requires a restart of PostgreSQL; for more details see connected to.
the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>. </para>
</para> <para>
<para> <application>repmgrd</application> can be configured to provide failover
Additionally the following <application>repmgrd</application> options *must* be set in capability in case the primary upstream node becomes unreachable, and/or
<filename>repmgr.conf</filename> (adjust configuration file locations as appropriate): provide monitoring data to the &repmgr; metadatabase.
<programlisting> </para>
failover=automatic
promote_command='repmgr standby promote -f /etc/repmgr.conf --log-to-file' <sect1 id="repmgrd-basic-configuration">
follow_command='repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting> <title>repmgrd basic configuration</title>
</para>
<para> <para>
Note that the <literal>--log-to-file</literal> option will cause To use <application>repmgrd</application>, its associated function library <emphasis>must</emphasis> be
output generated by the &repmgr; command, when executed by <application>repmgrd</application>, included in <filename>postgresql.conf</filename> with:
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
See <filename>repmgr.conf.sample</filename> for further <application>repmgrd</application>-specific settings. <programlisting>
</para> shared_preload_libraries = 'repmgr'</programlisting>
<para> </para>
When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure <para>
of the current primary, <application>repmgrd</application> will execute one of Changing this setting requires a restart of PostgreSQL; for more details see
<varname>promote_command</varname> or <varname>follow_command</varname>, the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
depending on whether the current server is to become the new primary, or </para>
needs to follow another server which has become the new primary. Note that
these commands can be any valid shell script which results in one of these <sect2 id="repmgrd-automatic-failover-configuration">
two actions happening, but if &repmgr;'s <command>standby follow</command> or <title>automatic failover configuration</title>
<command>standby promote</command> <para>
commands are not executed (either directly as shown here, or from a script which If using automatic failover, the following <application>repmgrd</application> options *must* be set in
performs other actions), the &repmgr; metadata will not be updated and <filename>repmgr.conf</filename> :
&repmgr; will no longer function reliably. <programlisting>
</para> failover=automatic
<para> promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'
The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal> follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by </para>
<application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr; <para>
will attempt to determine the new primary by itself, but if the Adjust file paths as appropriate; we recommend specifying the full path to the &repmgr; binary.
original primary comes back online after the new primary is promoted, there is a risk that </para>
<command>repmgr standby follow</command> will result in the node continuing to follow <para>
the original primary. Note that the <literal>--log-to-file</literal> option will cause
</para> output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
<sect1 id="repmgrd-connection-settings"> to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
<title>repmgrd connection settings</title> See <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename>
for further <application>repmgrd</application>-specific settings.
</para>
<para>
When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure
of the current primary, <application>repmgrd</application> will execute one of:
</para>
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara>
<varname>promote_command</varname> (if the current server is to become the new primary)
</simpara>
</listitem>
<listitem>
<simpara>
<varname>follow_command</varname> (if the current server needs to follow another server which has
become the new primary)
</simpara>
</listitem>
</itemizedlist>
<note>
<para>
These commands can be any valid shell script which results in one of these
two actions happening, but if &repmgr;'s <command>standby follow</command> or
<command>standby promote</command>
commands are not executed (either directly as shown here, or from a script which
performs other actions), the &repmgr; metadata will not be updated and
&repmgr; will no longer function reliably.
</para>
</note>
<para>
The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal>
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
<application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr;
will attempt to determine the new primary by itself, but if the
original primary comes back online after the new primary is promoted, there is a risk that
<command>repmgr standby follow</command> will result in the node continuing to follow
the original primary.
</para>
</sect2>
<sect2 id="repmgrd-service-configuration">
<indexterm>
<primary>repmgrd</primary>
<secondary>PostgreSQL service configuration</secondary>
</indexterm>
<title>PostgreSQL service configuration</title>
<para>
If using automatic failover, currently <application>repmgrd</application> will need to execute
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>
to restart PostgreSQL on standbys to have them follow a new primary.
</para>
<para>
To ensure this happens smoothly, it's essential to provide the system/service restart
command appropriate to your operating system via <varname>service_restart_command</varname>
in <filename>repmgr.conf</filename>. If you don't do this, <application>repmgrd</application>
will default to using <command>pg_ctl</command>, which can result in unexpected problems,
particularly on <application>systemd</application>-based systems.
</para>
<para>
For more details, see <xref linkend="configuration-service-commands">.
</para>
</sect2>
<sect2 id="repmgrd-monitoring-configuration">
<indexterm>
<primary>repmgrd</primary>
<secondary>monitoring configuration</secondary>
</indexterm>
<title>Monitoring configuration</title>
<para>
To enable monitoring, set:
<programlisting>
monitoring_history=yes</programlisting>
in <filename>repmgr.conf</filename>.
</para>
<para>
The default monitoring interval is 2 seconds; this value can be explicitly set using:
<programlisting>
monitor_interval_secs=&lt;seconds&gt;</programlisting>
in <filename>repmgr.conf</filename>.
</para>
<para>
For more details on monitoring, see <xref linkend="repmgrd-monitoring">.
</para>
</sect2>
</sect1>
<sect1 id="repmgrd-daemon">
<indexterm>
<primary>repmgrd</primary>
<secondary>starting and stopping</secondary>
</indexterm>
<title>repmgrd daemon</title>
<para>
If installed from a package, <application>repmgrd</application> can be started
via the operating system's service command, e.g. in <application>systemd</application>
using <command>systemctl</command>.
</para>
<para>
See appendix <xref linkend="appendix-packages"> for details of service commands
for different distributions.
</para>
<para>
<application>repmgrd</application> can be started manually like this:
<programlisting>
repmgrd -f /etc/repmgr.conf --pid-file /tmp/repmgrd.pid --daemonize</programlisting>
and stopped with <command>kill `cat /tmp/repmgrd.pid`</command>. Adjust paths as appropriate.
</para>
<para>
To apply configuration file changes to a running <application>repmgrd</application>
daemon, execute the operating system's service reload command (for manually started
instances, execute <command>kill -HUP `cat /tmp/repmgrd.pid`</command>).
Note that only a subset of configuration file parameters can be changed on a
running <application>repmgrd</application> daemon.
</para>
<sect2 id="repmgrd-configuration-debian-ubuntu">
<indexterm>
<primary>repmgrd</primary>
<secondary>Debian/Ubuntu and daemon configuration</secondary>
</indexterm>
<indexterm>
<primary>Debian/Ubuntu</primary>
<secondary>repmgrd daemon configuration</secondary>
</indexterm>
<title>repmgrd daemon configuration on Debian/Ubuntu</title>
<para>
If &repmgr; was installed from Debian/Ubuntu packages, additional configuration
is required before <application>repmgrd</application> is started as a daemon.
</para>
<para>
This is done via the file <filename>/etc/default/repmgrd</filename>, which by default
looks like this:
<programlisting>
# default settings for repmgrd. This file is source by /bin/sh from
# /etc/init.d/repmgrd
# disable repmgrd by default so it won't get started upon installation
# valid values: yes/no
REPMGRD_ENABLED=no
# configuration file (required)
#REPMGRD_CONF="/path/to/repmgr.conf"
# additional options
#REPMGRD_OPTS=""
# user to run repmgrd as
#REPMGRD_USER=postgres
# repmgrd binary
#REPMGRD_BIN=/usr/bin/repmgrd
# pid file
#REPMGRD_PIDFILE=/var/run/repmgrd.pid</programlisting>
</para>
<para>
Set <varname>REPMGRD_ENABLED</varname> to <literal>yes</literal>, and <varname>REPMGRD_CONF</varname>
to the <filename>repmgr.conf</filename> file you are using.
</para>
<para>
If using <application>systemd</application>, you may need to execute <command>systemctl daemon-reload</command>.
Also, if you attempted to start <application>repmgrd</application> using <command>systemctl start repmgrd</command>,
you'll need to execute <command>systemctl stop repmgrd</command>. Because that's how <application>systemd</application>
rolls.
</para>
</sect2>
</sect1>
<sect1 id="repmgrd-connection-settings">
<title>repmgrd connection settings</title>
<para> <para>
In addition to the &repmgr; configuration settings, parameters in the In addition to the &repmgr; configuration settings, parameters in the
<varname>conninfo</varname> string influence how &repmgr; makes a network connection to <varname>conninfo</varname> string influence how &repmgr; makes a network connection to
@@ -76,12 +249,21 @@
<ulink url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS">PostgreSQL documentation</ulink>. <ulink url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS">PostgreSQL documentation</ulink>.
</para> </para>
</sect1> </sect1>
<sect1 id="repmgrd-log-rotation"> <sect1 id="repmgrd-log-rotation">
<indexterm>
<primary>log rotation</primary>
<secondary>repmgrd</secondary>
</indexterm>
<title>repmgrd log rotation</title> <title>repmgrd log rotation</title>
<para> <para>
To ensure the current <application>repmgrd</application> logfile does not grow To ensure the current <application>repmgrd</application> logfile
indefinitely, configure your system's <command>logrotate</command> to (specified in <filename>repmgr.conf</filename> with the parameter
regularly rotate it. <option>log_file</option>) does not grow indefinitely, configure your
system's <command>logrotate</command> to regularly rotate it.
</para> </para>
<para> <para>
Sample configuration to rotate logfiles weekly with retention for Sample configuration to rotate logfiles weekly with retention for

View File

@@ -40,7 +40,7 @@
</listitem> </listitem>
<listitem> <listitem>
<simpara>repmgrd is monitoring the primary node, but it is not available</simpara> <simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
</listitem> </listitem>
</itemizedlist> </itemizedlist>
</para> </para>
@@ -69,7 +69,15 @@
By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely. By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>, However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
after which <application>repmgrd</application> will terminate. after which <application>repmgrd</application> will terminate.
</para> </para>
<note>
<para>
If <application>repmgrd</application> is monitoring a primary node which has been stopped
and manually restarted as a standby attached to a new primary, it will automatically detect
the status change and update the node record to reflect the node's new status
as an active standby. It will then resume monitoring the node as a standby.
</para>
</note>
</chapter> </chapter>

View File

@@ -3,6 +3,10 @@
<primary>repmgrd</primary> <primary>repmgrd</primary>
<secondary>monitoring</secondary> <secondary>monitoring</secondary>
</indexterm> </indexterm>
<indexterm>
<primary>monitoring</primary>
<secondary>with repmgrd</secondary>
</indexterm>
<title>Monitoring with repmgrd</title> <title>Monitoring with repmgrd</title>
<para> <para>

View File

@@ -67,13 +67,21 @@
promotion candidate to all standbys attached to the demotion candidate. promotion candidate to all standbys attached to the demotion candidate.
</para> </para>
<note>
<simpara>
&repmgr; expects to find the &repmgr; binary in the same path on the remote
server as on the local server.
</simpara>
</note>
<para> <para>
Double-check which commands will be used to stop/start/restart the current Double-check which commands will be used to stop/start/restart the current
primary; on the primary execute: primary; on the current primary execute:
<programlisting> <programlisting>
repmgr -f /etc/repmgr.conf node service --list --action=stop repmgr -f /etc/repmgr.conf node service --list --action=stop
repmgr -f /etc/repmgr.conf node service --list --action=start repmgr -f /etc/repmgr.conf node service --list --action=start
repmgr -f /etc/repmgr.conf node service --list --action=restart</programlisting> repmgr -f /etc/repmgr.conf node service --list --action=restart</programlisting>
</para> </para>
<para> <para>
@@ -92,7 +100,11 @@
<para> <para>
If the <option>service_*_command</option> options aren't defined, &repmgr; will If the <option>service_*_command</option> options aren't defined, &repmgr; will
fall back to using <application>pg_ctl</application> to stop/start/restart fall back to using <application>pg_ctl</application> to stop/start/restart
PostgreSQL, which may not work properly. PostgreSQL, which may not work properly, particularly when executed on a remote
server.
</para>
<para>
For more details, see <xref linkend="configuration-service-commands">.
</para> </para>
</important> </important>
@@ -109,6 +121,7 @@
</simpara> </simpara>
</note> </note>
<para> <para>
Check that access from applications is minimized or preferably blocked Check that access from applications is minimized or preferably blocked
completely, so applications are not unexpectedly interrupted. completely, so applications are not unexpectedly interrupted.
@@ -163,34 +176,60 @@
</para> </para>
</important> </important>
<para>
Note that the following parameters in <filename>repmgr.conf</filename> are relevant to the
switchover operation:
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara>
<literal>reconnect_attempts</literal>: number of times to check the original primary
for a clean shutdown after executing the shutdown command, before aborting
</simpara>
</listitem>
<listitem>
<simpara>
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
primary for a clean shutdown after executing the shutdown command (up to a maximum
of <literal>reconnect_attempts</literal> tries)
</simpara>
</listitem>
<listitem>
<simpara>
<literal>replication_lag_critical</literal>:
if replication lag (in seconds) on the standby exceeds this value, the
switchover will be aborted (unless the <literal>-F/--force</literal> option
is provided)
</simpara>
</listitem>
</itemizedlist> <note>
</para> <simpara>
See <xref linkend="repmgr-standby-switchover"> for a full list of available
command line options and <filename>repmgr.conf</filename> settings relevant
to performing a switchover.
</simpara>
</note>
<sect2 id="switchover-pg-rewind" xreflabel="Switchover and pg_rewind">
<indexterm>
<primary>pg_rewind</primary>
<secondary>using with "repmgr standby switchover"</secondary>
</indexterm>
<title>Switchover and pg_rewind</title>
<para>
If the demotion candidate does not shut down smoothly or cleanly, there's a risk it
will have a slightly divergent timeline and will not be able to attach to the new
primary. To fix this situation without needing to reclone the old primary, it's
possible to use the <application>pg_rewind</application> utility, which will usually be
able to resync the two servers.
</para>
<para>
To have &repmgr; execute <application>pg_rewind</application> if it detects this
situation after promoting the new primary, add the <option>--force-rewind</option>
option.
</para>
<note>
<simpara>
If &repmgr; detects a situation where it needs to execute <application>pg_rewind</application>,
it will execute a <literal>CHECKPOINT</literal> on the new primary before executing
<application>pg_rewind</application>.
</simpara>
</note>
<para>
For more details on <application>pg_rewind</application>, see:
<ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html">https://www.postgresql.org/docs/current/static/app-pgrewind.html</ulink>.
</para>
<para>
<application>pg_rewind</application> has been part of the core PostgreSQL distribution since
version 9.5. Users of versions 9.3 and 9.4 will need to manually install it; the source code is available here:
<ulink url="https://github.com/vmware/pg_rewind">https://github.com/vmware/pg_rewind</ulink>.
If the <application>pg_rewind</application>
binary is not installed in the PostgreSQL <filename>bin</filename> directory, provide
its full path on the demotion candidate with <option>--force-rewind</option>.
</para>
<para>
Note that building the 9.3/9.4 version of <application>pg_rewind</application> requires the PostgreSQL
source code. Also, PostgreSQL 9.3 does not provide <varname>wal_log_hints</varname>,
meaning data checksums must have been enabled when the database was initialized.
</para>
</sect2>
</sect1> </sect1>
<sect1 id="switchover-execution" xreflabel="Executing the switchover command"> <sect1 id="switchover-execution" xreflabel="Executing the switchover command">

View File

@@ -1 +1 @@
<!ENTITY repmgrversion "4.0.3"> <!ENTITY repmgrversion "4.0.6">

View File

@@ -44,5 +44,8 @@
#define ERR_REGISTRATION_SYNC 20 #define ERR_REGISTRATION_SYNC 20
#define ERR_OUT_OF_MEMORY 21 #define ERR_OUT_OF_MEMORY 21
#define ERR_SWITCHOVER_INCOMPLETE 22 #define ERR_SWITCHOVER_INCOMPLETE 22
#define ERR_FOLLOW_FAIL 23
#define ERR_REJOIN_FAIL 24
#define ERR_CLUSTER_CHECK 25
#endif /* _ERRCODE_H_ */ #endif /* _ERRCODE_H_ */

7
log.c
View File

@@ -329,6 +329,13 @@ logger_set_terse(void)
} }
/*
 * logger_set_min_level()
 *
 * Ensure the current log level is at least "min_log_level": if the
 * current value of "log_level" is numerically lower, it is raised to
 * "min_log_level"; an equal or higher current value is left unchanged.
 */
void
logger_set_min_level(int min_log_level)
{
	if (log_level < min_log_level)
	{
		log_level = min_log_level;
	}
}
int int
detect_log_level(const char *level) detect_log_level(const char *level)
{ {

1
log.h
View File

@@ -128,6 +128,7 @@ bool logger_shutdown(void);
void logger_set_verbose(void); void logger_set_verbose(void);
void logger_set_terse(void); void logger_set_terse(void);
void logger_set_min_level(int min_log_level);
void void
log_detail(const char *fmt,...) log_detail(const char *fmt,...)

View File

@@ -569,6 +569,8 @@ do_cluster_crosscheck(void)
t_node_status_cube **cube; t_node_status_cube **cube;
bool error_found = false;
n = build_cluster_crosscheck(&cube, &name_length); n = build_cluster_crosscheck(&cube, &name_length);
if (runtime_options.output_mode == OM_CSV) if (runtime_options.output_mode == OM_CSV)
{ {
@@ -648,9 +650,11 @@ do_cluster_crosscheck(void)
{ {
case -2: case -2:
c = '?'; c = '?';
error_found = true;
break; break;
case -1: case -1:
c = 'x'; c = 'x';
error_found = true;
break; break;
case 0: case 0:
c = '*'; c = '*';
@@ -689,6 +693,11 @@ do_cluster_crosscheck(void)
free(cube); free(cube);
} }
if (error_found == true)
{
exit(ERR_CLUSTER_CHECK);
}
} }
@@ -704,6 +713,8 @@ do_cluster_matrix()
t_node_matrix_rec **matrix_rec_list; t_node_matrix_rec **matrix_rec_list;
bool error_found = false;
n = build_cluster_matrix(&matrix_rec_list, &name_length); n = build_cluster_matrix(&matrix_rec_list, &name_length);
if (runtime_options.output_mode == OM_CSV) if (runtime_options.output_mode == OM_CSV)
@@ -742,9 +753,11 @@ do_cluster_matrix()
{ {
case -2: case -2:
c = '?'; c = '?';
error_found = true;
break; break;
case -1: case -1:
c = 'x'; c = 'x';
error_found = true;
break; break;
case 0: case 0:
c = '*'; c = '*';
@@ -770,6 +783,11 @@ do_cluster_matrix()
} }
free(matrix_rec_list); free(matrix_rec_list);
if (error_found == true)
{
exit(ERR_CLUSTER_CHECK);
}
} }
@@ -964,8 +982,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
initPQExpBuffer(&command_output); initPQExpBuffer(&command_output);
(void) remote_command( (void) remote_command(host,
host,
runtime_options.remote_user, runtime_options.remote_user,
command.data, command.data,
&command_output); &command_output);
@@ -1144,9 +1161,8 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
/* fix to work with --node-id */ /* fix to work with --node-id */
if (cube[i]->node_id == config_file_options.node_id) if (cube[i]->node_id == config_file_options.node_id)
{ {
(void) local_command( (void) local_command_simple(command.data,
command.data, &command_output);
&command_output);
} }
else else
{ {
@@ -1170,8 +1186,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
log_verbose(LOG_DEBUG, "build_cluster_crosscheck(): executing\n %s", quoted_command.data); log_verbose(LOG_DEBUG, "build_cluster_crosscheck(): executing\n %s", quoted_command.data);
(void) remote_command( (void) remote_command(host,
host,
runtime_options.remote_user, runtime_options.remote_user,
quoted_command.data, quoted_command.data,
&command_output); &command_output);

View File

@@ -92,7 +92,7 @@ do_node_status(void)
/* Check node exists and is really a standby */ /* Check node exists and is really a standby */
if (get_node_record(conn, config_file_options.node_id, &node_info) != RECORD_FOUND) if (get_node_record_with_upstream(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
{ {
log_error(_("no record found for node %i"), config_file_options.node_id); log_error(_("no record found for node %i"), config_file_options.node_id);
PQfinish(conn); PQfinish(conn);
@@ -308,14 +308,16 @@ do_node_status(void)
/* /*
* check for missing replication slots - we do this regardless of * check for missing replication slots - we do this regardless of
* what "max_replication_slots" is set to * what "max_replication_slots" is set to, in case the downstream
* node was configured with "use_replication_slots=true" and is
* expecting a replication slot to be available
*/ */
{ {
NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER; NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
get_downsteam_nodes_with_missing_slot(conn, get_downstream_nodes_with_missing_slot(conn,
config_file_options.node_id, config_file_options.node_id,
&missing_slots); &missing_slots);
if (missing_slots.node_count > 0) if (missing_slots.node_count > 0)
{ {
@@ -938,6 +940,7 @@ do_node_check_replication_connection(void)
return; return;
} }
/* retrieve remote node record from local database */
local_conn = establish_db_connection(config_file_options.conninfo, true); local_conn = establish_db_connection(config_file_options.conninfo, true);
record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record); record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record);
@@ -954,8 +957,12 @@ do_node_check_replication_connection(void)
initialize_conninfo_params(&remote_conninfo, false); initialize_conninfo_params(&remote_conninfo, false);
parse_conninfo_string(node_record.conninfo, &remote_conninfo, NULL, false); parse_conninfo_string(node_record.conninfo, &remote_conninfo, NULL, false);
if (strcmp(param_get(&remote_conninfo, "user"), node_record.repluser) != 0)
{
param_set(&remote_conninfo, "user", node_record.repluser);
param_set(&remote_conninfo, "dbname", "replication");
}
param_set(&remote_conninfo, "replication", "1"); param_set(&remote_conninfo, "replication", "1");
param_set(&remote_conninfo, "user", node_record.repluser);
repl_conn = establish_db_connection_by_params(&remote_conninfo, false); repl_conn = establish_db_connection_by_params(&remote_conninfo, false);
@@ -1511,7 +1518,7 @@ do_node_service(void)
if (data_dir_required_for_action(action)) if (data_dir_required_for_action(action))
{ {
get_node_data_directory(data_dir); get_node_config_directory(data_dir);
if (data_dir[0] == '\0') if (data_dir[0] == '\0')
{ {
@@ -1599,7 +1606,7 @@ _do_node_service_list_actions(t_server_action action)
if (data_dir_required == true) if (data_dir_required == true)
{ {
get_node_data_directory(data_dir); get_node_config_directory(data_dir);
} }
/* show command for specific action only */ /* show command for specific action only */
@@ -1665,6 +1672,13 @@ parse_server_action(const char *action_name)
* *
* Note that "repmgr node rejoin" is also executed by * Note that "repmgr node rejoin" is also executed by
* "repmgr standby switchover" after promoting the new primary. * "repmgr standby switchover" after promoting the new primary.
*
* Parameters:
* --dry-run
* --force-rewind[=VALUE]
* --config-files
* --config-archive-dir
* -W/--no-wait
*/ */
void void
do_node_rejoin(void) do_node_rejoin(void)
@@ -1726,7 +1740,7 @@ do_node_rejoin(void)
{ {
log_error(_("database is not shut down cleanly")); log_error(_("database is not shut down cleanly"));
if (runtime_options.force_rewind == true) if (runtime_options.force_rewind_used == true)
{ {
log_detail(_("pg_rewind will not be able to run")); log_detail(_("pg_rewind will not be able to run"));
} }
@@ -1756,7 +1770,17 @@ do_node_rejoin(void)
PQfinish(upstream_conn); PQfinish(upstream_conn);
/* connect to registered primary and check it's not in recovery */ /* connect to registered primary and check it's not in recovery */
upstream_conn = establish_db_connection(primary_node_record.conninfo, true); upstream_conn = establish_db_connection(primary_node_record.conninfo, false);
if (PQstatus(upstream_conn) != CONNECTION_OK)
{
log_error(_("unable to connect to current primary \"%s\" (node ID: %i)"),
primary_node_record.node_name,
primary_node_record.node_id);
log_detail(_("primay node conninfo is: \"%s\""),
primary_node_record.conninfo);
exit(ERR_BAD_CONFIG);
}
upstream_recovery_type = get_recovery_type(upstream_conn); upstream_recovery_type = get_recovery_type(upstream_conn);
@@ -1772,30 +1796,33 @@ do_node_rejoin(void)
} }
/* /*
* If --force-rewind specified, check pg_rewind can be used, and * --force-rewind specified - check prerequisites, and attempt to execute
* pre-emptively fetch the list of configuration files which should be * (if --dry-run provided, just output the command which would be executed)
* archived
*/ */
if (runtime_options.force_rewind == true)
if (runtime_options.force_rewind_used == true)
{ {
PQExpBufferData reason;
PQExpBufferData msg; PQExpBufferData msg;
PQExpBufferData filebuf;
int ret;
initPQExpBuffer(&reason); /*
* Check that pg_rewind can be used
*/
if (can_use_pg_rewind(upstream_conn, config_file_options.data_directory, &reason) == false) initPQExpBuffer(&msg);
if (can_use_pg_rewind(upstream_conn, config_file_options.data_directory, &msg) == false)
{ {
log_error(_("--force-rewind specified but pg_rewind cannot be used")); log_error(_("--force-rewind specified but pg_rewind cannot be used"));
log_detail("%s", reason.data); log_detail("%s", msg.data);
termPQExpBuffer(&reason); termPQExpBuffer(&msg);
PQfinish(upstream_conn); PQfinish(upstream_conn);
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
termPQExpBuffer(&reason);
initPQExpBuffer(&msg);
appendPQExpBuffer(&msg, appendPQExpBuffer(&msg,
_("prerequisites for using pg_rewind are met")); _("prerequisites for using pg_rewind are met"));
@@ -1808,25 +1835,31 @@ do_node_rejoin(void)
log_verbose(LOG_INFO, "%s", msg.data); log_verbose(LOG_INFO, "%s", msg.data);
} }
termPQExpBuffer(&msg); termPQExpBuffer(&msg);
}
/*
* Forcibly rewind node if requested (this is mainly for use when this
* action is being executed by "repmgr standby switchover")
*/
if (runtime_options.force_rewind == true)
{
int ret;
PQExpBufferData filebuf;
/*
* Archive requested configuration files.
*
* In --dry-run mode this acts as a check that the files can be archived, though
* errors will only be logged; any copied files will be deleted and --dry-run
* execution will continue.
*/
_do_node_archive_config(); _do_node_archive_config();
/* execute pg_rewind */ /* execute pg_rewind */
initPQExpBuffer(&command); initPQExpBuffer(&command);
appendPQExpBuffer(&command, if (runtime_options.force_rewind_path[0] != '\0')
"%s -D ", {
make_pg_path("pg_rewind")); appendPQExpBuffer(&command,
"%s -D ",
runtime_options.force_rewind_path);
}
else
{
appendPQExpBuffer(&command,
"%s -D ",
make_pg_path("pg_rewind"));
}
appendShellString(&command, appendShellString(&command,
config_file_options.data_directory); config_file_options.data_directory);
@@ -1840,115 +1873,128 @@ do_node_rejoin(void)
log_info(_("pg_rewind would now be executed")); log_info(_("pg_rewind would now be executed"));
log_detail(_("pg_rewind command is:\n %s"), log_detail(_("pg_rewind command is:\n %s"),
command.data); command.data);
PQfinish(upstream_conn);
exit(SUCCESS);
} }
else
log_notice(_("executing pg_rewind"));
log_debug("pg_rewind command is:\n %s",
command.data);
initPQExpBuffer(&command_output);
ret = local_command(
command.data,
&command_output);
termPQExpBuffer(&command);
if (ret == false)
{ {
log_error(_("unable to execute pg_rewind")); log_notice(_("executing pg_rewind"));
log_detail("%s", command_output.data); log_debug("pg_rewind command is:\n %s",
command.data);
initPQExpBuffer(&command_output);
ret = local_command(command.data,
&command_output);
termPQExpBuffer(&command);
if (ret == false)
{
log_error(_("unable to execute pg_rewind"));
log_detail("%s", command_output.data);
termPQExpBuffer(&command_output);
exit(ERR_BAD_CONFIG);
}
termPQExpBuffer(&command_output); termPQExpBuffer(&command_output);
exit(ERR_BAD_CONFIG); /* Restore any previously archived config files */
} _do_node_restore_config();
termPQExpBuffer(&command_output); initPQExpBuffer(&filebuf);
/* Restore any previously archived config files */ /* remove any recovery.done file copied in by pg_rewind */
_do_node_restore_config(); appendPQExpBuffer(&filebuf,
"%s/recovery.done",
initPQExpBuffer(&filebuf);
/* remove any recovery.done file copied in by pg_rewind */
appendPQExpBuffer(&filebuf,
"%s/recovery.done",
config_file_options.data_directory);
if (stat(filebuf.data, &statbuf) == 0)
{
log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
if (unlink(filebuf.data) == -1)
{
log_warning(_("unable to delete \"%s\""),
filebuf.data);
log_detail("%s", strerror(errno));
}
}
termPQExpBuffer(&filebuf);
/* delete any replication slots copied in by pg_rewind */
{
PQExpBufferData slotdir_path;
DIR *slotdir;
struct dirent *slotdir_ent;
initPQExpBuffer(&slotdir_path);
appendPQExpBuffer(&slotdir_path,
"%s/pg_replslot",
config_file_options.data_directory); config_file_options.data_directory);
slotdir = opendir(slotdir_path.data); if (stat(filebuf.data, &statbuf) == 0)
if (slotdir == NULL)
{ {
log_warning(_("unable to open replication slot directory \"%s\""), log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
slotdir_path.data);
log_detail("%s", strerror(errno));
}
else
{
while ((slotdir_ent = readdir(slotdir)) != NULL) {
struct stat statbuf;
PQExpBufferData slotdir_ent_path;
if(strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0) if (unlink(filebuf.data) == -1)
continue; {
log_warning(_("unable to delete \"%s\""),
initPQExpBuffer(&slotdir_ent_path); filebuf.data);
log_detail("%s", strerror(errno));
appendPQExpBuffer(&slotdir_ent_path,
"%s/%s",
slotdir_path.data,
slotdir_ent->d_name);
if (stat(slotdir_ent_path.data, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
{
termPQExpBuffer(&slotdir_ent_path);
continue;
}
log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
{
log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
log_detail("%s", strerror(errno));
log_hint(_("directory may need to be manually removed"));
}
termPQExpBuffer(&slotdir_ent_path);
} }
} }
termPQExpBuffer(&slotdir_path); termPQExpBuffer(&filebuf);
/*
* Delete any replication slots copied in by pg_rewind.
*
* TODO:
* - from PostgreSQL 11, this will be handled by pg_rewind, so
* we can skip this step from that version; see commit
* 266b6acb312fc440c1c1a2036aa9da94916beac6
* - possibly delete contents of various other directories
* as per the above commit for pre-PostgreSQL 11
*/
{
PQExpBufferData slotdir_path;
DIR *slotdir;
struct dirent *slotdir_ent;
initPQExpBuffer(&slotdir_path);
appendPQExpBuffer(&slotdir_path,
"%s/pg_replslot",
config_file_options.data_directory);
slotdir = opendir(slotdir_path.data);
if (slotdir == NULL)
{
log_warning(_("unable to open replication slot directory \"%s\""),
slotdir_path.data);
log_detail("%s", strerror(errno));
}
else
{
while ((slotdir_ent = readdir(slotdir)) != NULL) {
struct stat statbuf;
PQExpBufferData slotdir_ent_path;
if(strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0)
continue;
initPQExpBuffer(&slotdir_ent_path);
appendPQExpBuffer(&slotdir_ent_path,
"%s/%s",
slotdir_path.data,
slotdir_ent->d_name);
if (stat(slotdir_ent_path.data, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
{
termPQExpBuffer(&slotdir_ent_path);
continue;
}
log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
{
log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
log_detail("%s", strerror(errno));
log_hint(_("directory may need to be manually removed"));
}
termPQExpBuffer(&slotdir_ent_path);
}
}
termPQExpBuffer(&slotdir_path);
}
} }
} }
if (runtime_options.dry_run == true)
{
log_info(_("prerequisites for executing NODE REJOIN are met"));
exit(SUCCESS);
}
initPQExpBuffer(&follow_output); initPQExpBuffer(&follow_output);
success = do_standby_follow_internal(upstream_conn, success = do_standby_follow_internal(upstream_conn,
@@ -1959,7 +2005,9 @@ do_node_rejoin(void)
if (success == false) if (success == false)
{ {
log_notice(_("NODE REJOIN failed")); log_notice(_("NODE REJOIN failed"));
log_detail("%s", follow_output.data);
if (strlen(follow_output.data))
log_detail("%s", follow_output.data);
create_event_notification(upstream_conn, create_event_notification(upstream_conn,
&config_file_options, &config_file_options,
@@ -1975,22 +2023,99 @@ do_node_rejoin(void)
} }
/* /*
* XXX add checks that node actually started and connected to primary, * Actively check that node actually started and connected to primary,
* if not exit with ERR_REJOIN_FAIL * if not exit with ERR_REJOIN_FAIL.
*
* This check can be overridden with -W/--no-wait, in which case a one-time
* check will be carried out.
*/ */
if (runtime_options.no_wait == false)
{
int i;
create_event_notification(upstream_conn, for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
&config_file_options, {
config_file_options.node_id, if (is_server_available(config_file_options.conninfo))
"node_rejoin", {
success, log_verbose(LOG_INFO, _("demoted primary is pingable"));
follow_output.data); break;
}
PQfinish(upstream_conn); if (i % 5 == 0)
{
log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout);
}
else
{
log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout);
}
log_notice(_("NODE REJOIN successful")); sleep(1);
log_detail("%s", follow_output.data); }
for (; i < config_file_options.standby_reconnect_timeout; i++)
{
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
if (success == true)
{
log_verbose(LOG_INFO, _("node %i has attached to its upstream node"),
config_file_options.node_id);
break;
}
if (i % 5 == 0)
{
log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"),
config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout);
}
else
{
log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
config_file_options.node_id,
i + 1, config_file_options.standby_reconnect_timeout);
}
sleep(1);
}
create_event_notification(upstream_conn,
&config_file_options,
config_file_options.node_id,
"node_rejoin",
success,
follow_output.data);
if (success == false)
{
termPQExpBuffer(&follow_output);
log_notice(_("NODE REJOIN failed"));
exit(ERR_REJOIN_FAIL);
}
}
else
{
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
}
if (success == true)
{
log_notice(_("NODE REJOIN successful"));
log_detail("%s", follow_output.data);
}
else
{
/*
* if we reach here, no record found in upstream node's pg_stat_replication */
log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
log_hint(_("you will need to manually check the node's replication status"));
}
termPQExpBuffer(&follow_output); termPQExpBuffer(&follow_output);
return; return;
@@ -2042,6 +2167,11 @@ _do_node_archive_config(void)
termPQExpBuffer(&archive_dir); termPQExpBuffer(&archive_dir);
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
if (runtime_options.dry_run == true)
{
log_verbose(LOG_INFO, "temporary archive directory \"%s\" created", archive_dir.data);
}
} }
else if (!S_ISDIR(statbuf.st_mode)) else if (!S_ISDIR(statbuf.st_mode))
{ {
@@ -2066,8 +2196,8 @@ _do_node_archive_config(void)
{ {
/* /*
* attempt to remove any existing files in the directory TODO: collate * attempt to remove any existing files in the directory
* problem files into list * TODO: collate problem files into list
*/ */
while ((arcdir_ent = readdir(arcdir)) != NULL) while ((arcdir_ent = readdir(arcdir)) != NULL)
{ {
@@ -2143,7 +2273,11 @@ _do_node_archive_config(void)
if (i < config_file_len) if (i < config_file_len)
{ {
strncpy(filenamebuf, runtime_options.config_files + i, config_file_len - i); int filename_len = config_file_len - i;
strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
filenamebuf[filename_len] = '\0';
initPQExpBuffer(&pathbuf); initPQExpBuffer(&pathbuf);
appendPQExpBuffer(&pathbuf, appendPQExpBuffer(&pathbuf,
@@ -2221,7 +2355,7 @@ _do_node_archive_config(void)
} }
else else
{ {
log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data); log_verbose(LOG_INFO, "temporary archive directory \"%s\" deleted", archive_dir.data);
} }
} }
@@ -2430,15 +2564,15 @@ do_node_help(void)
puts(""); puts("");
printf(_(" Configuration file required, runs on local node only.\n")); printf(_(" Configuration file required, runs on local node only.\n"));
puts(""); puts("");
printf(_(" --csv emit output as CSV\n")); printf(_(" --csv emit output as CSV\n"));
printf(_(" --nagios emit output in Nagios format (individual status output only)\n")); printf(_(" --nagios emit output in Nagios format (individual status output only)\n"));
puts(""); puts("");
printf(_(" Following options check an individual status:\n")); printf(_(" Following options check an individual status:\n"));
printf(_(" --archive-ready number of WAL files ready for archiving\n")); printf(_(" --archive-ready number of WAL files ready for archiving\n"));
printf(_(" --downstream whether all downstream nodes are connected\n")); printf(_(" --downstream whether all downstream nodes are connected\n"));
printf(_(" --replication-lag replication lag in seconds (standbys only)\n")); printf(_(" --replication-lag replication lag in seconds (standbys only)\n"));
printf(_(" --role check node has expected role\n")); printf(_(" --role check node has expected role\n"));
printf(_(" --slots check for inactive replication slots\n")); printf(_(" --slots check for inactive replication slots\n"));
puts(""); puts("");
@@ -2448,13 +2582,16 @@ do_node_help(void)
puts(""); puts("");
printf(_(" Configuration file required, runs on local node only.\n")); printf(_(" Configuration file required, runs on local node only.\n"));
puts(""); puts("");
printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \ printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \
" (including usability of \"pg_rewind\" if requested)\n")); " (including usability of \"pg_rewind\" if requested)\n"));
printf(_(" --force-rewind execute \"pg_rewind\" if necessary\n")); printf(_(" --force-rewind[=VALUE] execute \"pg_rewind\" if necessary\n"));
printf(_(" --config-files comma-separated list of configuration files to retain\n" \ printf(_(" (9.3 and 9.4 - provide full \"pg_rewind\" path)\n"));
" after executing \"pg_rewind\"\n"));
printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \ printf(_(" --config-files comma-separated list of configuration files to retain\n" \
" (default: /tmp)\n")); " after executing \"pg_rewind\"\n"));
printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \
" (default: /tmp)\n"));
printf(_(" -W/--no-wait don't wait for the node to rejoin cluster\n"));
puts(""); puts("");
printf(_("NODE SERVICE\n")); printf(_("NODE SERVICE\n"));

File diff suppressed because it is too large Load Diff

View File

@@ -65,7 +65,7 @@ do_witness_register(void)
if (recovery_type == RECTYPE_STANDBY) if (recovery_type == RECTYPE_STANDBY)
{ {
log_error(_("provided node is a standby")); log_error(_("provided node is a standby"));
log_error(_("a witness node must run on an independent primary server")); log_hint(_("a witness node must run on an independent primary server"));
PQfinish(witness_conn); PQfinish(witness_conn);
@@ -86,6 +86,7 @@ do_witness_register(void)
/* connect to primary with provided parameters */ /* connect to primary with provided parameters */
log_info(_("connecting to primary node")); log_info(_("connecting to primary node"));
/* /*
* Extract the repmgr user and database names from the conninfo string * Extract the repmgr user and database names from the conninfo string
* provided in repmgr.conf * provided in repmgr.conf
@@ -135,8 +136,11 @@ do_witness_register(void)
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
/* XXX sanity check witness node is not part of main cluster */ /*
* TODO: sanity check witness node is not part of main cluster; we could
* add a random application_name to the respective connections,
* and do a simple check of pg_stat_activity
*/
/* create repmgr extension, if does not exist */ /* create repmgr extension, if does not exist */
if (runtime_options.dry_run == false && !create_repmgr_extension(witness_conn)) if (runtime_options.dry_run == false && !create_repmgr_extension(witness_conn))
@@ -182,7 +186,6 @@ do_witness_register(void)
log_error(_("witness node is already registered")); log_error(_("witness node is already registered"));
log_hint(_("use option -F/--force to reregister the node")); log_hint(_("use option -F/--force to reregister the node"));
PQfinish(witness_conn); PQfinish(witness_conn);
PQfinish(primary_conn); PQfinish(primary_conn);
@@ -190,8 +193,26 @@ do_witness_register(void)
} }
} }
/*
* Check that an active node with the same node_name doesn't exist already
*/
// XXX check other node with same name does not exist record_status = get_node_record_by_name(primary_conn,
config_file_options.node_name,
&node_record);
if (record_status == RECORD_FOUND)
{
	/*
	 * A record with this node_name exists; it is only a conflict if it is
	 * active and belongs to a different node ID than the one configured here.
	 */
	if (node_record.active == true && node_record.node_id != config_file_options.node_id)
	{
		log_error(_("node %i exists already with node_name \"%s\""),
				  node_record.node_id,
				  config_file_options.node_name);

		/* close both connections, as the other error paths in this function do */
		PQfinish(witness_conn);
		PQfinish(primary_conn);
		exit(ERR_BAD_CONFIG);
	}
}
/* /*
* if repmgr.nodes contains entries, delete if -F/--force provided, * if repmgr.nodes contains entries, delete if -F/--force provided,
@@ -222,6 +243,7 @@ do_witness_register(void)
PQfinish(witness_conn); PQfinish(witness_conn);
exit(SUCCESS); exit(SUCCESS);
} }
/* create record on primary */ /* create record on primary */
/* /*

View File

@@ -42,6 +42,7 @@ typedef struct
bool force; bool force;
char pg_bindir[MAXLEN]; /* overrides setting in repmgr.conf */ char pg_bindir[MAXLEN]; /* overrides setting in repmgr.conf */
bool wait; bool wait;
bool no_wait;
/* logging options */ /* logging options */
char log_level[MAXLEN]; /* overrides setting in repmgr.conf */ char log_level[MAXLEN]; /* overrides setting in repmgr.conf */
@@ -80,6 +81,7 @@ typedef struct
char replication_user[MAXLEN]; char replication_user[MAXLEN];
char upstream_conninfo[MAXLEN]; char upstream_conninfo[MAXLEN];
bool without_barman; bool without_barman;
bool recovery_conf_only;
/* "standby clone"/"standby follow" options */ /* "standby clone"/"standby follow" options */
int upstream_node_id; int upstream_node_id;
@@ -91,7 +93,8 @@ typedef struct
/* "standby switchover" options */ /* "standby switchover" options */
bool always_promote; bool always_promote;
bool force_rewind; bool force_rewind_used;
char force_rewind_path[MAXPGPATH];
bool siblings_follow; bool siblings_follow;
/* "node status" options */ /* "node status" options */
@@ -132,26 +135,26 @@ typedef struct
/* configuration metadata */ \ /* configuration metadata */ \
false, false, false, false, \ false, false, false, false, \
/* general configuration options */ \ /* general configuration options */ \
"", false, false, "", false, \ "", false, false, "", false, false, \
/* logging options */ \ /* logging options */ \
"", false, false, false, \ "", false, false, false, \
/* output options */ \ /* output options */ \
false, false, false, \ false, false, false, \
/* database connection options */ \ /* database connection options */ \
"", "", "", "", \ "", "", "", "", \
/* other connection options */ \ /* other connection options */ \
"", "", \ "", "", \
/* general node options */ \ /* general node options */ \
UNKNOWN_NODE_ID, "", "", UNKNOWN_NODE_ID, \ UNKNOWN_NODE_ID, "", "", UNKNOWN_NODE_ID, \
/* "standby clone" options */ \ /* "standby clone" options */ \
false, CONFIG_FILE_SAMEPATH, false, false, false, "", "", "", \ false, CONFIG_FILE_SAMEPATH, false, false, false, "", "", "", \
false, \ false, false, \
/* "standby clone"/"standby follow" options */ \ /* "standby clone"/"standby follow" options */ \
NO_UPSTREAM_NODE, \ NO_UPSTREAM_NODE, \
/* "standby register" options */ \ /* "standby register" options */ \
false, 0, DEFAULT_WAIT_START, \ false, 0, DEFAULT_WAIT_START, \
/* "standby switchover" options */ \ /* "standby switchover" options */ \
false, false, false, \ false, false, "", false, \
/* "node status" options */ \ /* "node status" options */ \
false, \ false, \
/* "node check" options */ \ /* "node check" options */ \
@@ -164,7 +167,7 @@ typedef struct
false, "", CLUSTER_EVENT_LIMIT, \ false, "", CLUSTER_EVENT_LIMIT, \
/* "cluster cleanup" options */ \ /* "cluster cleanup" options */ \
0, \ 0, \
/* Following options for internal use */ \ /* following options for internal use */ \
"/tmp", OM_TEXT \ "/tmp", OM_TEXT \
} }
@@ -207,6 +210,7 @@ extern void check_93_config(void);
extern bool create_repmgr_extension(PGconn *conn); extern bool create_repmgr_extension(PGconn *conn);
extern int test_ssh_connection(char *host, char *remote_user); extern int test_ssh_connection(char *host, char *remote_user);
extern bool local_command(const char *command, PQExpBufferData *outputbuf); extern bool local_command(const char *command, PQExpBufferData *outputbuf);
extern bool local_command_simple(const char *command, PQExpBufferData *outputbuf);
extern standy_clone_mode get_standby_clone_mode(void); extern standy_clone_mode get_standby_clone_mode(void);
@@ -227,7 +231,9 @@ extern void print_help_header(void);
/* server control functions */ /* server control functions */
extern void get_server_action(t_server_action action, char *script, char *data_dir); extern void get_server_action(t_server_action action, char *script, char *data_dir);
extern bool data_dir_required_for_action(t_server_action action); extern bool data_dir_required_for_action(t_server_action action);
extern void get_node_config_directory(char *config_dir_buf);
extern void get_node_data_directory(char *data_dir_buf); extern void get_node_data_directory(char *data_dir_buf);
extern void init_node_record(t_node_info *node_record); extern void init_node_record(t_node_info *node_record);
extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
#endif /* _REPMGR_CLIENT_GLOBAL_H_ */ #endif /* _REPMGR_CLIENT_GLOBAL_H_ */

View File

@@ -53,6 +53,7 @@
#include "repmgr.h" #include "repmgr.h"
#include "compat.h" #include "compat.h"
#include "controldata.h"
#include "repmgr-client.h" #include "repmgr-client.h"
#include "repmgr-client-global.h" #include "repmgr-client-global.h"
#include "repmgr-action-primary.h" #include "repmgr-action-primary.h"
@@ -90,6 +91,7 @@ t_node_info target_node_info = T_NODE_INFO_INITIALIZER;
static ItemList cli_errors = {NULL, NULL}; static ItemList cli_errors = {NULL, NULL};
static ItemList cli_warnings = {NULL, NULL}; static ItemList cli_warnings = {NULL, NULL};
static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple);
int int
main(int argc, char **argv) main(int argc, char **argv)
@@ -176,7 +178,7 @@ main(int argc, char **argv)
strncpy(runtime_options.username, pw->pw_name, MAXLEN); strncpy(runtime_options.username, pw->pw_name, MAXLEN);
} }
while ((c = getopt_long(argc, argv, "?Vb:f:FWd:h:p:U:R:S:D:ckL:tvC:", long_options, while ((c = getopt_long(argc, argv, "?Vb:f:FwWd:h:p:U:R:S:D:ck:L:tvC:", long_options,
&optindex)) != -1) &optindex)) != -1)
{ {
/* /*
@@ -241,11 +243,16 @@ main(int argc, char **argv)
strncpy(runtime_options.replication_user, optarg, MAXLEN); strncpy(runtime_options.replication_user, optarg, MAXLEN);
break; break;
/* -W/--wait */ /* -w/--wait */
case 'W': case 'w':
runtime_options.wait = true; runtime_options.wait = true;
break; break;
/* -W/--no-wait */
case 'W':
runtime_options.no_wait = true;
break;
/*---------------------------- /*----------------------------
* database connection options * database connection options
*---------------------------- *----------------------------
@@ -388,6 +395,11 @@ main(int argc, char **argv)
runtime_options.without_barman = true; runtime_options.without_barman = true;
break; break;
case OPT_RECOVERY_CONF_ONLY:
runtime_options.recovery_conf_only = true;
break;
/*--------------------------- /*---------------------------
* "standby register" options * "standby register" options
*--------------------------- *---------------------------
@@ -415,7 +427,13 @@ main(int argc, char **argv)
break; break;
case OPT_FORCE_REWIND: case OPT_FORCE_REWIND:
runtime_options.force_rewind = true; runtime_options.force_rewind_used = true;
if (optarg != NULL)
{
strncpy(runtime_options.force_rewind_path, optarg, MAXPGPATH);
}
break; break;
case OPT_SIBLINGS_FOLLOW: case OPT_SIBLINGS_FOLLOW:
@@ -616,7 +634,7 @@ main(int argc, char **argv)
* If -d/--dbname appears to be a conninfo string, validate by attempting * If -d/--dbname appears to be a conninfo string, validate by attempting
* to parse it (and if successful, store the parsed parameters) * to parse it (and if successful, store the parsed parameters)
*/ */
if (runtime_options.dbname) if (runtime_options.dbname[0])
{ {
if (strncmp(runtime_options.dbname, "postgresql://", 13) == 0 || if (strncmp(runtime_options.dbname, "postgresql://", 13) == 0 ||
strncmp(runtime_options.dbname, "postgres://", 11) == 0 || strncmp(runtime_options.dbname, "postgres://", 11) == 0 ||
@@ -992,32 +1010,10 @@ main(int argc, char **argv)
runtime_options.output_mode = OM_OPTFORMAT; runtime_options.output_mode = OM_OPTFORMAT;
} }
/* check for conflicts between runtime options and configuration file */
/* ================================================================== */
if (action == STANDBY_CLONE)
{
standy_clone_mode mode = get_standby_clone_mode();
if (mode == barman && runtime_options.without_barman == false
&& config_file_options.use_replication_slots == true)
{
log_error(_("STANDBY CLONE in Barman mode is incompatible with configuration option \"use_replication_slots\""));
log_hint(_("set \"use_replication_slots\" to \"no\" in repmgr.conf, or use --without-barman to clone directly from the upstream server"));
exit(ERR_BAD_CONFIG);
}
}
/* /*
* Check for configuration file items which can be overridden by runtime * Check for configuration file items which can be overridden by runtime
* options * options
*/ * =====================================================================
/*
* ============================================================================
*/ */
/* /*
@@ -1071,6 +1067,17 @@ main(int argc, char **argv)
if (runtime_options.terse) if (runtime_options.terse)
logger_set_terse(); logger_set_terse();
/*
* If --dry-run specified, ensure log_level is at least LOG_INFO, regardless
* of what's in the configuration file or -L/--log-level parameter, otherwise
* some output might not be displayed.
*/
if (runtime_options.dry_run == true)
{
logger_set_min_level(LOG_INFO);
}
/* /*
* Node configuration information is not needed for all actions, with * Node configuration information is not needed for all actions, with
* STANDBY CLONE being the main exception. * STANDBY CLONE being the main exception.
@@ -1331,6 +1338,15 @@ check_cli_parameters(const int action)
_("--no-upstream-connection only effective in Barman mode")); _("--no-upstream-connection only effective in Barman mode"));
} }
} }
if (strlen(config_file_options.config_directory))
{
if (runtime_options.copy_external_config_files == false)
{
item_list_append(&cli_warnings,
_("\"config_directory\" set in repmgr.conf, but --copy-external-config-files not provided"));
}
}
} }
break; break;
@@ -1495,6 +1511,39 @@ check_cli_parameters(const int action)
} }
} }
if (runtime_options.replication_user[0])
{
	/*
	 * --replication-user was provided: it is accepted silently for the
	 * register/clone actions, and produces exactly one warning for any
	 * other action.
	 */
	switch (action)
	{
		case PRIMARY_REGISTER:
		case STANDBY_REGISTER:
		case STANDBY_CLONE:
			break;

		case STANDBY_FOLLOW:
			item_list_append_format(&cli_warnings,
									_("--replication-user ignored when executing %s"),
									action_name(action));
			/* do not fall through and emit the generic warning as well */
			break;

		default:
			item_list_append_format(&cli_warnings,
									_("--replication-user not required when executing %s"),
									action_name(action));
			break;
	}
}
if (runtime_options.recovery_conf_only == true)
{
	/*
	 * --recovery-conf-only is only acted on by "standby clone"; warn for any
	 * other action. The warning uses the option name as registered in
	 * long_options ("recovery-conf-only").
	 */
	switch (action)
	{
		case STANDBY_CLONE:
			break;

		default:
			item_list_append_format(&cli_warnings,
									_("--recovery-conf-only will be ignored when executing %s"),
									action_name(action));
			break;
	}
}
if (runtime_options.event[0]) if (runtime_options.event[0])
{ {
switch (action) switch (action)
@@ -1508,25 +1557,6 @@ check_cli_parameters(const int action)
} }
} }
if (runtime_options.replication_user[0])
{
switch (action)
{
case PRIMARY_REGISTER:
case STANDBY_REGISTER:
break;
case STANDBY_CLONE:
case STANDBY_FOLLOW:
item_list_append_format(&cli_warnings,
_("--replication-user ignored when executing %s)"),
action_name(action));
default:
item_list_append_format(&cli_warnings,
_("--replication-user not required when executing %s"),
action_name(action));
}
}
if (runtime_options.limit_provided) if (runtime_options.limit_provided)
{ {
switch (action) switch (action)
@@ -1565,6 +1595,41 @@ check_cli_parameters(const int action)
} }
} }
/*
 * --wait/--no-wait
 *
 * The two options are mutually exclusive. When only one is provided, warn
 * if the requested action does not make use of it.
 */
if (runtime_options.wait == true && runtime_options.no_wait == true)
{
	item_list_append_format(&cli_errors,
							_("both --wait and --no-wait options provided"));
}
else
{
	if (runtime_options.wait)
	{
		switch (action)
		{
			case STANDBY_FOLLOW:
				break;

			default:
				item_list_append_format(&cli_warnings,
										_("--wait will be ignored when executing %s"),
										action_name(action));
				break;
		}
	}
	/* must test "no_wait" here, otherwise this branch can never be reached */
	else if (runtime_options.no_wait)
	{
		switch (action)
		{
			case NODE_REJOIN:
				break;

			default:
				item_list_append_format(&cli_warnings,
										_("--no-wait will be ignored when executing %s"),
										action_name(action));
				break;
		}
	}
}
/* repmgr node service --action */ /* repmgr node service --action */
if (runtime_options.action[0] != '\0') if (runtime_options.action[0] != '\0')
{ {
@@ -1606,7 +1671,7 @@ check_cli_parameters(const int action)
} }
} }
if (runtime_options.force_rewind == true) if (runtime_options.force_rewind_used == true)
{ {
switch (action) switch (action)
{ {
@@ -1777,10 +1842,11 @@ do_help(void)
printf(_("Usage:\n")); printf(_("Usage:\n"));
printf(_(" %s [OPTIONS] primary {register|unregister}\n"), progname()); printf(_(" %s [OPTIONS] primary {register|unregister}\n"), progname());
printf(_(" %s [OPTIONS] standby {register|unregister|clone|promote|follow}\n"), progname()); printf(_(" %s [OPTIONS] standby {register|unregister|clone|promote|follow|switchover}\n"), progname());
printf(_(" %s [OPTIONS] bdr {register|unregister}\n"), progname()); printf(_(" %s [OPTIONS] bdr {register|unregister}\n"), progname());
printf(_(" %s [OPTIONS] node status\n"), progname()); printf(_(" %s [OPTIONS] node {status|check|rejoin|service}\n"), progname());
printf(_(" %s [OPTIONS] cluster {show|event|matrix|crosscheck}\n"), progname()); printf(_(" %s [OPTIONS] cluster {show|event|matrix|crosscheck}\n"), progname());
printf(_(" %s [OPTIONS] witness {register|unregister}\n"), progname());
puts(""); puts("");
@@ -2097,12 +2163,28 @@ test_ssh_connection(char *host, char *remote_user)
} }
/* /*
* Execute a command locally. "outputbuf" should either be an * Execute a command locally. "outputbuf" should either be an
* initialised PQexpbuffer, or NULL * initialised PQexpbuffer, or NULL
*/ */
bool bool
local_command(const char *command, PQExpBufferData *outputbuf) local_command(const char *command, PQExpBufferData *outputbuf)
{
return _local_command(command, outputbuf, false);
}
/*
 * Execute a command locally via _local_command() with the "simple" flag
 * set to true (local_command() passes false). "command" and "outputbuf"
 * are handed through unchanged and the result of _local_command() is
 * returned as-is.
 *
 * NOTE(review): in the visible part of _local_command() the flag suppresses
 * the early "break" in the output-reading loop; confirm against the full
 * definition.
 */
bool
local_command_simple(const char *command, PQExpBufferData *outputbuf)
{
return _local_command(command, outputbuf, true);
}
static bool
_local_command(const char *command, PQExpBufferData *outputbuf, bool simple)
{ {
FILE *fp = NULL; FILE *fp = NULL;
char output[MAXLEN]; char output[MAXLEN];
@@ -2129,7 +2211,8 @@ local_command(const char *command, PQExpBufferData *outputbuf)
while (fgets(output, MAXLEN, fp) != NULL) while (fgets(output, MAXLEN, fp) != NULL)
{ {
appendPQExpBuffer(outputbuf, "%s", output); appendPQExpBuffer(outputbuf, "%s", output);
if (!feof(fp))
if (!feof(fp) && simple == false)
{ {
break; break;
} }
@@ -2151,10 +2234,19 @@ local_command(const char *command, PQExpBufferData *outputbuf)
} }
/*
* get_superuser_connection()
*
* Check if provided connection "conn" is a superuser connection, if not attempt to
* make a superuser connection "superuser_conn" with the provided --superuser parameter.
*
* "privileged_conn" is set to whichever connection is the superuser connection.
*/
void void
get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privileged_conn) get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privileged_conn)
{ {
t_connection_user userinfo = T_CONNECTION_USER_INITIALIZER; t_connection_user userinfo = T_CONNECTION_USER_INITIALIZER;
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
bool is_superuser = false; bool is_superuser = false;
/* this should never happen */ /* this should never happen */
@@ -2163,6 +2255,7 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
log_error(_("no database connection available")); log_error(_("no database connection available"));
exit(ERR_INTERNAL); exit(ERR_INTERNAL);
} }
is_superuser = is_superuser_connection(*conn, &userinfo); is_superuser = is_superuser_connection(*conn, &userinfo);
if (is_superuser == true) if (is_superuser == true)
@@ -2180,9 +2273,11 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
*superuser_conn = establish_db_connection_as_user(config_file_options.conninfo, initialize_conninfo_params(&conninfo_params, false);
runtime_options.superuser, conn_to_param_list(*conn, &conninfo_params);
false); param_set(&conninfo_params, "user", runtime_options.superuser);
*superuser_conn = establish_db_connection_by_params(&conninfo_params, false);
if (PQstatus(*superuser_conn) != CONNECTION_OK) if (PQstatus(*superuser_conn) != CONNECTION_OK)
{ {
@@ -2202,6 +2297,8 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
log_debug("established superuser connection as \"%s\"", runtime_options.superuser);
*privileged_conn = *superuser_conn; *privileged_conn = *superuser_conn;
return; return;
} }
@@ -2343,9 +2440,6 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
} }
/* /*
* Execute a command via ssh on the remote host. * Execute a command via ssh on the remote host.
* *
@@ -2411,7 +2505,7 @@ remote_command(const char *host, const char *user, const char *command, PQExpBuf
if (outputbuf != NULL) if (outputbuf != NULL)
{ {
if (strlen(outputbuf->data)) if (strlen(outputbuf->data))
log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n %s", outputbuf->data); log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
else else
log_verbose(LOG_DEBUG, "remote_command(): no output returned"); log_verbose(LOG_DEBUG, "remote_command(): no output returned");
} }
@@ -2660,6 +2754,33 @@ data_dir_required_for_action(t_server_action action)
} }
/*
 * Write the node's configuration file directory into "config_dir_buf".
 *
 * The "config_directory" setting takes precedence if it is set; failing
 * that, the "data_directory" setting is used. If neither is set the
 * buffer is left untouched.
 *
 * This is primarily intended for use with "pg_ctl" (which itself shouldn't
 * be used outside of development environments).
 */
void
get_node_config_directory(char *config_dir_buf)
{
	const char *chosen_dir = NULL;

	if (config_file_options.config_directory[0] != '\0')
		chosen_dir = config_file_options.config_directory;
	else if (config_file_options.data_directory[0] != '\0')
		chosen_dir = config_file_options.data_directory;

	/* nothing configured: leave the caller's buffer as it is */
	if (chosen_dir == NULL)
		return;

	strncpy(config_dir_buf, chosen_dir, MAXPGPATH);
}
void void
get_node_data_directory(char *data_dir_buf) get_node_data_directory(char *data_dir_buf)
{ {
@@ -2707,7 +2828,7 @@ init_node_record(t_node_info *node_record)
if (config_file_options.replication_user[0] != '\0') if (config_file_options.replication_user[0] != '\0')
{ {
/* replication user explicitly provided */ /* replication user explicitly provided in configuration file */
strncpy(node_record->repluser, config_file_options.replication_user, NAMEDATALEN); strncpy(node_record->repluser, config_file_options.replication_user, NAMEDATALEN);
} }
else else
@@ -2724,3 +2845,77 @@ init_node_record(t_node_info *node_record)
create_slot_name(node_record->slot_name, config_file_options.node_id); create_slot_name(node_record->slot_name, config_file_options.node_id);
} }
} }
/*
 * Report whether the settings inspected below allow "pg_rewind" to be used.
 *
 * Returns true if no problem was found; otherwise returns false and
 * appends a description of each problem to "reason", with multiple
 * problems separated by "; ".
 */
bool
can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason)
{
	bool		rewind_ok = true;

	/* wal_log_hints not available in 9.3, so only data checksums matter there */
	if (get_server_version(conn, NULL) < 90400)
	{
		int			checksum_version = get_data_checksum_version(data_directory);

		if (checksum_version < 0)
		{
			appendPQExpBuffer(reason,
							  _("unable to determine data checksum version"));
			return false;
		}

		if (checksum_version == 0)
		{
			appendPQExpBuffer(reason,
							  _("this cluster was initialised without data checksums"));
			return false;
		}

		return true;
	}

	/*
	 * "full_page_writes" must be on in any case. This is the first check
	 * performed on this path, so "reason" never needs a separator here.
	 */
	if (guc_set(conn, "full_page_writes", "=", "off"))
	{
		appendPQExpBuffer(reason,
						  _("\"full_page_writes\" must be set to \"on\""));
		rewind_ok = false;
	}

	/*
	 * If "wal_log_hints" is not on, data checksums must be enabled instead.
	 * The local pg_control file is consulted, as the value will be the same
	 * throughout the cluster and this saves a round-trip to the demotion
	 * candidate.
	 */
	if (guc_set(conn, "wal_log_hints", "=", "on") == false)
	{
		const char *problem = NULL;
		int			checksum_version = get_data_checksum_version(data_directory);

		if (checksum_version < 0)
			problem = _("\"wal_log_hints\" is set to \"off\" but unable to determine data checksum version");
		else if (checksum_version == 0)
			problem = _("\"wal_log_hints\" is set to \"off\" and data checksums are disabled");

		if (problem != NULL)
		{
			/* separate from an earlier "full_page_writes" message */
			if (rewind_ok == false)
				appendPQExpBuffer(reason, "; ");

			appendPQExpBuffer(reason, "%s", problem);
			rewind_ok = false;
		}
	}

	return rewind_ok;
}

View File

@@ -85,6 +85,8 @@
#define OPT_WAIT_START 1036 #define OPT_WAIT_START 1036
#define OPT_REPL_CONN 1037 #define OPT_REPL_CONN 1037
#define OPT_REMOTE_NODE_ID 1038 #define OPT_REMOTE_NODE_ID 1038
#define OPT_RECOVERY_CONF_ONLY 1039
#define OPT_NO_WAIT 1040
/* deprecated since 3.3 */ /* deprecated since 3.3 */
#define OPT_DATA_DIR 999 #define OPT_DATA_DIR 999
@@ -103,7 +105,8 @@ static struct option long_options[] =
{"dry-run", no_argument, NULL, OPT_DRY_RUN}, {"dry-run", no_argument, NULL, OPT_DRY_RUN},
{"force", no_argument, NULL, 'F'}, {"force", no_argument, NULL, 'F'},
{"pg_bindir", required_argument, NULL, 'b'}, {"pg_bindir", required_argument, NULL, 'b'},
{"wait", no_argument, NULL, 'W'}, {"wait", no_argument, NULL, 'w'},
{"no-wait", no_argument, NULL, 'W'},
/* connection options */ /* connection options */
{"dbname", required_argument, NULL, 'd'}, {"dbname", required_argument, NULL, 'd'},
@@ -139,6 +142,7 @@ static struct option long_options[] =
{"upstream-conninfo", required_argument, NULL, OPT_UPSTREAM_CONNINFO}, {"upstream-conninfo", required_argument, NULL, OPT_UPSTREAM_CONNINFO},
{"upstream-node-id", required_argument, NULL, OPT_UPSTREAM_NODE_ID}, {"upstream-node-id", required_argument, NULL, OPT_UPSTREAM_NODE_ID},
{"without-barman", no_argument, NULL, OPT_WITHOUT_BARMAN}, {"without-barman", no_argument, NULL, OPT_WITHOUT_BARMAN},
{"recovery-conf-only", no_argument, NULL, OPT_RECOVERY_CONF_ONLY},
/* "standby register" options */ /* "standby register" options */
{"wait-start", required_argument, NULL, OPT_WAIT_START}, {"wait-start", required_argument, NULL, OPT_WAIT_START},
@@ -166,7 +170,7 @@ static struct option long_options[] =
/* "node rejoin" options */ /* "node rejoin" options */
{"config-files", required_argument, NULL, OPT_CONFIG_FILES}, {"config-files", required_argument, NULL, OPT_CONFIG_FILES},
{"config-archive-dir", required_argument, NULL, OPT_CONFIG_ARCHIVE_DIR}, {"config-archive-dir", required_argument, NULL, OPT_CONFIG_ARCHIVE_DIR},
{"force-rewind", no_argument, NULL, OPT_FORCE_REWIND}, {"force-rewind", optional_argument, NULL, OPT_FORCE_REWIND},
/* "node service" options */ /* "node service" options */
{"action", required_argument, NULL, OPT_ACTION}, {"action", required_argument, NULL, OPT_ACTION},

View File

@@ -40,18 +40,28 @@
# is not running and there's no other way of determining # is not running and there's no other way of determining
# the data directory. # the data directory.
#replication_user='repmgr' # User to make replication connections with, if not set defaults
# to the user defined in "conninfo".
# ============================================================================= # =============================================================================
# Optional configuration items # Optional configuration items
# ============================================================================= # =============================================================================
#------------------------------------------------------------------------------
# Server settings
#------------------------------------------------------------------------------
#config_directory='' # If configuration files are located outside the data
# directory, specify the directory where the main
# postgresql.conf file is located.
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Replication settings # Replication settings
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
#replication_user='repmgr' # User to make replication connections with, if not set defaults
# to the user defined in "conninfo".
#replication_type=physical # Must be one of 'physical' or 'bdr'. #replication_type=physical # Must be one of 'physical' or 'bdr'.
#location=default # arbitrary string defining the location of the node; this #location=default # arbitrary string defining the location of the node; this
@@ -65,9 +75,6 @@
# at least the number of standbys which will connect # at least the number of standbys which will connect
# to the primary. # to the primary.
#recovery_min_apply_delay= # If provided, "recovery_min_apply_delay" in recovery.conf
# will be set to this value.
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Witness server settings # Witness server settings
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
@@ -91,7 +98,7 @@
#log_facility=STDERR # Logging facility: possible values are STDERR, or for #log_facility=STDERR # Logging facility: possible values are STDERR, or for
# syslog integration, one of LOCAL0, LOCAL1, ..., LOCAL7, USER # syslog integration, one of LOCAL0, LOCAL1, ..., LOCAL7, USER
#log_file='' # stderr can be redirected to an arbitrary file: #log_file='' # stderr can be redirected to an arbitrary file
#log_status_interval=300 # interval (in seconds) for repmgrd to log a status message #log_status_interval=300 # interval (in seconds) for repmgrd to log a status message
@@ -161,7 +168,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Standby clone settings # "standby clone" settings
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# #
# These settings apply when cloning a standby ("repmgr standby clone"). # These settings apply when cloning a standby ("repmgr standby clone").
@@ -175,8 +182,29 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# file system location to another. This # file system location to another. This
# parameter can be provided multiple times. # parameter can be provided multiple times.
#restore_command='' # This will be placed in the recovery.conf #restore_command='' # This will be placed in the recovery.conf file generated
# file generated by repmgr # by repmgr.
#archive_cleanup_command='' # This will be placed in the recovery.conf file generated
# by repmgr. Note we recommend using Barman for managing
# WAL archives (see: https://www.pgbarman.org )
#recovery_min_apply_delay= # If provided, "recovery_min_apply_delay" in recovery.conf
# will be set to this value (PostgreSQL 9.4 and later).
#------------------------------------------------------------------------------
# "standby promote" settings
#------------------------------------------------------------------------------
# These settings apply when instructing a standby to promote itself to the
# new primary ("repmgr standby promote").
#promote_check_timeout=60 # The length of time (in seconds) to wait
# for the new primary to finish promoting
#promote_check_interval=1 # The interval (in seconds) to check whether
# the new primary has finished promoting
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Standby follow settings # Standby follow settings
@@ -185,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# These settings apply when instructing a standby to follow the new primary # These settings apply when instructing a standby to follow the new primary
# ("repmgr standby follow"). # ("repmgr standby follow").
#primary_follow_timeout=60 # The length of time (in seconds) to wait #primary_follow_timeout=60 # The max length of time (in seconds) to wait
# for the new primary to become available # for the new primary to become available
#standby_follow_timeout=30 # The max length of time (in seconds) to wait
# for the standby to connect to the primary
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
@@ -235,6 +265,8 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
#primary_notification_timeout=60 # Interval (in seconds) which repmgrd on a standby #primary_notification_timeout=60 # Interval (in seconds) which repmgrd on a standby
# will wait for a notification from the new primary, # will wait for a notification from the new primary,
# before falling back to degraded monitoring # before falling back to degraded monitoring
#standby_reconnect_timeout=60 # Interval (in seconds) which repmgrd on a standby will wait
# to reconnect to the local node after executing "follow_command"
#monitoring_history=no # Whether to write monitoring data to the "monitoring_history" table #monitoring_history=no # Whether to write monitoring data to the "monitoring_history" table
#monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data #monitor_interval_secs=2 # Interval (in seconds) at which to write monitoring data
@@ -270,6 +302,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# /usr/bin/systemctl start postgresql-9.6, \ # /usr/bin/systemctl start postgresql-9.6, \
# /usr/bin/systemctl restart postgresql-9.6 # /usr/bin/systemctl restart postgresql-9.6
# #
# Debian/Ubuntu users: use "sudo pg_ctlcluster" to execute service control commands.
#
# For more details, see: https://repmgr.org/docs/4.0/configuration-service-commands.html
#service_start_command = '' #service_start_command = ''
#service_stop_command = '' #service_stop_command = ''

View File

@@ -70,6 +70,7 @@
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */ #define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */ #define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */ #define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */
#define DEFAULT_STANDBY_FOLLOW_TIMEOUT 30 /* seconds */
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */ #define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
#define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */ #define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */
#define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */ #define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */
@@ -77,6 +78,9 @@
#define DEFAULT_REPLICATION_LAG_CRITICAL 600 /* seconds */ #define DEFAULT_REPLICATION_LAG_CRITICAL 600 /* seconds */
#define DEFAULT_WITNESS_SYNC_INTERVAL 15 /* seconds */ #define DEFAULT_WITNESS_SYNC_INTERVAL 15 /* seconds */
#define DEFAULT_WAIT_START 30 /* seconds */ #define DEFAULT_WAIT_START 30 /* seconds */
#define DEFAULT_PROMOTE_CHECK_TIMEOUT 60 /* seconds */
#define DEFAULT_PROMOTE_CHECK_INTERVAL 1 /* seconds */
#define DEFAULT_STANDBY_RECONNECT_TIMEOUT 60 /* seconds */
#ifndef RECOVERY_COMMAND_FILE #ifndef RECOVERY_COMMAND_FILE
#define RECOVERY_COMMAND_FILE "recovery.conf" #define RECOVERY_COMMAND_FILE "recovery.conf"

View File

@@ -1,3 +1,3 @@
#define REPMGR_VERSION_DATE "" #define REPMGR_VERSION_DATE ""
#define REPMGR_VERSION "4.0.3" #define REPMGR_VERSION "4.0.6"

View File

@@ -35,6 +35,29 @@ do_bdr_node_check(void)
/* nothing to do at the moment */ /* nothing to do at the moment */
} }
void
handle_sigint_bdr(SIGNAL_ARGS)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
"%s signal received",
postgres_signal_arg == SIGTERM
? "TERM" : "INT");
create_event_notification(local_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_shutdown",
true,
event_details.data);
termPQExpBuffer(&event_details);
terminate(SUCCESS);
}
void void
monitor_bdr(void) monitor_bdr(void)
@@ -98,23 +121,6 @@ monitor_bdr(void)
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
/* Retrieve record for this node from the local database */
record_status = get_node_record(local_conn, config_file_options.node_id, &local_node_info);
/*
* Terminate if we can't find the local node record. This is a
* "fix-the-config" situation, not a lot else we can do.
*/
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve record for local node (ID: %i), terminating"),
local_node_info.node_id);
log_hint(_("check that \"repmgr bdr register\" was executed for this node"));
PQfinish(local_conn);
exit(ERR_BAD_CONFIG);
}
if (local_node_info.active == false) if (local_node_info.active == false)
{ {
log_error(_("local node (ID: %i) is marked as inactive in repmgr"), log_error(_("local node (ID: %i) is marked as inactive in repmgr"),
@@ -152,15 +158,16 @@ monitor_bdr(void)
cell->node_info->node_status = NODE_STATUS_UP; cell->node_info->node_status = NODE_STATUS_UP;
} }
log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id); log_info(_("starting continuous BDR node monitoring on node %i"),
config_file_options.node_id);
log_info(_("starting continuous BDR node monitoring")); INSTR_TIME_SET_CURRENT(log_status_interval_start);
while (true) while (true)
{ {
/* monitoring loop */ /* monitoring loop */
log_verbose(LOG_DEBUG, "BDR check loop..."); log_verbose(LOG_DEBUG, "BDR check loop - checking %i nodes", nodes.node_count);
for (cell = nodes.head; cell; cell = cell->next) for (cell = nodes.head; cell; cell = cell->next)
{ {
@@ -262,7 +269,6 @@ loop:
if (config_file_options.log_status_interval > 0) if (config_file_options.log_status_interval > 0)
{ {
int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start); int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
if (log_status_interval_elapsed >= config_file_options.log_status_interval) if (log_status_interval_elapsed >= config_file_options.log_status_interval)
{ {
log_info(_("monitoring BDR replication status on node \"%s\" (ID: %i)"), log_info(_("monitoring BDR replication status on node \"%s\" (ID: %i)"),
@@ -273,8 +279,7 @@ loop:
{ {
if (cell->node_info->monitoring_state == MS_DEGRADED) if (cell->node_info->monitoring_state == MS_DEGRADED)
{ {
log_detail( log_detail(_("monitoring node \"%s\" (ID: %i) in degraded mode"),
_("monitoring node \"%s\" (ID: %i) in degraded mode"),
cell->node_info->node_name, cell->node_info->node_name,
cell->node_info->node_id); cell->node_info->node_id);
} }

View File

@@ -22,4 +22,5 @@
extern void do_bdr_node_check(void); extern void do_bdr_node_check(void);
extern void monitor_bdr(void); extern void monitor_bdr(void);
extern void handle_sigint_bdr(SIGNAL_ARGS);
#endif /* _REPMGRD_BDR_H_ */ #endif /* _REPMGRD_BDR_H_ */

File diff suppressed because it is too large Load Diff

View File

@@ -24,6 +24,7 @@ void do_physical_node_check(void);
void monitor_streaming_primary(void); void monitor_streaming_primary(void);
void monitor_streaming_standby(void); void monitor_streaming_standby(void);
void monitor_streaming_witness(void); void monitor_streaming_witness(void);
void close_connections_physical(void);
void handle_sigint_physical(SIGNAL_ARGS);
#endif /* _REPMGRD_PHYSICAL_H_ */ #endif /* _REPMGRD_PHYSICAL_H_ */

View File

@@ -53,9 +53,6 @@ bool startup_event_logged = false;
MonitoringState monitoring_state = MS_NORMAL; MonitoringState monitoring_state = MS_NORMAL;
instr_time degraded_monitoring_start; instr_time degraded_monitoring_start;
static void close_connections(void);
void (*_close_connections) (void) = NULL;
/* /*
* Record receipt of SIGHUP; will cause configuration file to be reread * Record receipt of SIGHUP; will cause configuration file to be reread
* at the appropriate point in the main loop. * at the appropriate point in the main loop.
@@ -73,7 +70,6 @@ static void start_monitoring(void);
#ifndef WIN32 #ifndef WIN32
static void setup_event_handlers(void); static void setup_event_handlers(void);
static void handle_sighup(SIGNAL_ARGS); static void handle_sighup(SIGNAL_ARGS);
static void handle_sigint(SIGNAL_ARGS);
#endif #endif
int calculate_elapsed(instr_time start_time); int calculate_elapsed(instr_time start_time);
@@ -255,6 +251,8 @@ main(int argc, char **argv)
strncpy(config_file_options.log_level, cli_log_level, MAXLEN); strncpy(config_file_options.log_level, cli_log_level, MAXLEN);
} }
log_notice(_("repmgrd (repmgr %s) starting up"), REPMGR_VERSION);
/* /*
* -m/--monitoring-history, if provided, will override repmgr.conf's * -m/--monitoring-history, if provided, will override repmgr.conf's
* monitoring_history; this is for backwards compatibility as it's * monitoring_history; this is for backwards compatibility as it's
@@ -329,7 +327,7 @@ main(int argc, char **argv)
{ {
log_error(_("unable to determine status of \"repmgr\" extension")); log_error(_("unable to determine status of \"repmgr\" extension"));
log_detail("%s", PQerrorMessage(local_conn)); log_detail("%s", PQerrorMessage(local_conn));
PQfinish(local_conn); close_connection(&local_conn);
exit(ERR_DB_QUERY); exit(ERR_DB_QUERY);
} }
@@ -346,19 +344,33 @@ main(int argc, char **argv)
} }
log_hint(_("check that this node is part of a repmgr cluster")); log_hint(_("check that this node is part of a repmgr cluster"));
PQfinish(local_conn); close_connection(&local_conn);
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
/* Retrieve record for this node from the local database */ /* Retrieve record for this node from the local database */
record_status = get_node_record(local_conn, config_file_options.node_id, &local_node_info); record_status = get_node_record(local_conn, config_file_options.node_id, &local_node_info);
/*
* Terminate if we can't find the local node record. This is a
* "fix-the-config" situation, not a lot else we can do.
*/
if (record_status != RECORD_FOUND) if (record_status != RECORD_FOUND)
{ {
log_error(_("no metadata record found for this node - terminating")); log_error(_("no metadata record found for this node - terminating"));
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
PQfinish(local_conn); switch (config_file_options.replication_type)
{
case REPLICATION_TYPE_PHYSICAL:
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
break;
case REPLICATION_TYPE_BDR:
log_hint(_("check that 'repmgr bdr register' was executed for this node"));
break;
}
close_connection(&local_conn);
terminate(ERR_BAD_CONFIG); terminate(ERR_BAD_CONFIG);
} }
@@ -377,7 +389,7 @@ main(int argc, char **argv)
{ {
log_error(_("unable to write to shared memory")); log_error(_("unable to write to shared memory"));
log_hint(_("ensure \"shared_preload_libraries\" includes \"repmgr\"")); log_hint(_("ensure \"shared_preload_libraries\" includes \"repmgr\""));
PQfinish(local_conn); close_connection(&local_conn);
terminate(ERR_BAD_CONFIG); terminate(ERR_BAD_CONFIG);
} }
} }
@@ -389,7 +401,6 @@ main(int argc, char **argv)
} }
else else
{ {
_close_connections = close_connections_physical;
log_debug("node id is %i, upstream node id is %i", log_debug("node id is %i, upstream node id is %i",
local_node_info.node_id, local_node_info.node_id,
local_node_info.upstream_node_id); local_node_info.upstream_node_id);
@@ -440,6 +451,7 @@ start_monitoring(void)
break; break;
case WITNESS: case WITNESS:
monitor_streaming_witness(); monitor_streaming_witness();
break;
case BDR: case BDR:
monitor_bdr(); monitor_bdr();
return; return;
@@ -612,11 +624,6 @@ check_and_create_pid_file(const char *pid_file)
#ifndef WIN32 #ifndef WIN32
static void
handle_sigint(SIGNAL_ARGS)
{
terminate(SUCCESS);
}
/* SIGHUP: set flag to re-read config file at next convenient time */ /* SIGHUP: set flag to re-read config file at next convenient time */
static void static void
@@ -629,8 +636,23 @@ static void
setup_event_handlers(void) setup_event_handlers(void)
{ {
pqsignal(SIGHUP, handle_sighup); pqsignal(SIGHUP, handle_sighup);
pqsignal(SIGINT, handle_sigint);
pqsignal(SIGTERM, handle_sigint); /*
* we want to be able to write a "repmgrd_shutdown" event, so delegate
* signal handling to the respective replication type handler, as it
* will know best which database connection to use
*/
switch (config_file_options.replication_type)
{
case REPLICATION_TYPE_BDR:
pqsignal(SIGINT, handle_sigint_bdr);
pqsignal(SIGTERM, handle_sigint_bdr);
break;
case REPLICATION_TYPE_PHYSICAL:
pqsignal(SIGINT, handle_sigint_physical);
pqsignal(SIGTERM, handle_sigint_physical);
break;
}
} }
#endif #endif
@@ -679,17 +701,29 @@ PGconn *
try_reconnect(t_node_info *node_info) try_reconnect(t_node_info *node_info)
{ {
PGconn *conn; PGconn *conn;
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
int i; int i;
int max_attempts = config_file_options.reconnect_attempts; int max_attempts = config_file_options.reconnect_attempts;
initialize_conninfo_params(&conninfo_params, false);
/* we assume by now the conninfo string is parseable */
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
/* set some default values if not explicitly provided */
param_set_ine(&conninfo_params, "connect_timeout", "2");
param_set_ine(&conninfo_params, "fallback_application_name", "repmgr");
for (i = 0; i < max_attempts; i++) for (i = 0; i < max_attempts; i++)
{ {
log_info(_("checking state of node %i, %i of %i attempts"), log_info(_("checking state of node %i, %i of %i attempts"),
node_info->node_id, i + 1, max_attempts); node_info->node_id, i + 1, max_attempts);
if (is_server_available(node_info->conninfo) == true) if (is_server_available_params(&conninfo_params) == true)
{ {
log_notice(_("node has recovered, reconnecting")); log_notice(_("node has recovered, reconnecting"));
/* /*
@@ -697,14 +731,18 @@ try_reconnect(t_node_info *node_info)
* connection denied due to connection exhaustion - fall back to * connection denied due to connection exhaustion - fall back to
* degraded monitoring? - make that configurable * degraded monitoring? - make that configurable
*/ */
conn = establish_db_connection(node_info->conninfo, false);
conn = establish_db_connection_by_params(&conninfo_params, false);
if (PQstatus(conn) == CONNECTION_OK) if (PQstatus(conn) == CONNECTION_OK)
{ {
free_conninfo_params(&conninfo_params);
node_info->node_status = NODE_STATUS_UP; node_info->node_status = NODE_STATUS_UP;
return conn; return conn;
} }
PQfinish(conn); close_connection(&conn);
log_notice(_("unable to reconnect to node")); log_notice(_("unable to reconnect to node"));
} }
@@ -716,13 +754,14 @@ try_reconnect(t_node_info *node_info)
} }
} }
log_warning(_("unable to reconnect to node %i after %i attempts"), log_warning(_("unable to reconnect to node %i after %i attempts"),
node_info->node_id, node_info->node_id,
max_attempts); max_attempts);
node_info->node_status = NODE_STATUS_DOWN; node_info->node_status = NODE_STATUS_DOWN;
free_conninfo_params(&conninfo_params);
return NULL; return NULL;
} }
@@ -758,24 +797,9 @@ print_monitoring_state(MonitoringState monitoring_state)
} }
static void
close_connections()
{
if (_close_connections != NULL)
_close_connections();
if (local_conn != NULL && PQstatus(local_conn) == CONNECTION_OK)
{
PQfinish(local_conn);
local_conn = NULL;
}
}
void void
terminate(int retval) terminate(int retval)
{ {
close_connections();
logger_shutdown(); logger_shutdown();
if (pid_file) if (pid_file)