mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Compare commits
390 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b4ca6851ab | ||
|
|
347948b79f | ||
|
|
83e492d4ef | ||
|
|
1906ea89bd | ||
|
|
eab4fd2795 | ||
|
|
3f1fe9b6c2 | ||
|
|
e672f7e3ee | ||
|
|
fd86160dff | ||
|
|
f19cf62f09 | ||
|
|
8018ba97d6 | ||
|
|
73554c6e16 | ||
|
|
f23a93e12d | ||
|
|
d9947a46e8 | ||
|
|
e3a632e29d | ||
|
|
939cbd0721 | ||
|
|
c45c5abfb8 | ||
|
|
1953ec7459 | ||
|
|
a6eacca6e4 | ||
|
|
948e076ad9 | ||
|
|
a3bd9d33ff | ||
|
|
9dc928a7d5 | ||
|
|
9acf7bdfea | ||
|
|
29acd10f37 | ||
|
|
9df511eee3 | ||
|
|
6441db23ff | ||
|
|
7792de3543 | ||
|
|
94fe3e395e | ||
|
|
ff26173b1e | ||
|
|
4c11a57334 | ||
|
|
1d2d6e3587 | ||
|
|
c03913d32a | ||
|
|
37a41a66f9 | ||
|
|
4c2c8ecbab | ||
|
|
b84b6180ee | ||
|
|
58f55222d9 | ||
|
|
5cbaff8d0a | ||
|
|
a38e229e61 | ||
|
|
272abdd483 | ||
|
|
b4f6043abc | ||
|
|
a7f3f899ff | ||
|
|
3ec43eda36 | ||
|
|
ce8e1cccc4 | ||
|
|
70bfa4c8e1 | ||
|
|
f0d5ad503d | ||
|
|
b9ee57ee0f | ||
|
|
d5d6ed4be7 | ||
|
|
f4655074ae | ||
|
|
67d26ab7e2 | ||
|
|
70a7b45a03 | ||
|
|
4251590833 | ||
|
|
9347d34ce0 | ||
|
|
feb90ee50c | ||
|
|
0a6486bb7f | ||
|
|
39443bbcee | ||
|
|
fc636b1bd2 | ||
|
|
048bad1c88 | ||
|
|
4528eb1796 | ||
|
|
169c9ccd32 | ||
|
|
5f92fbddf2 | ||
|
|
617e466f72 | ||
|
|
435fac297b | ||
|
|
4bc12b4c94 | ||
|
|
91234994e2 | ||
|
|
ee9da30f20 | ||
|
|
2e67bc1341 | ||
|
|
18ab5cab4e | ||
|
|
60bb4e9fc8 | ||
|
|
52bee6b98d | ||
|
|
ecb1f379f5 | ||
|
|
e1cd2c22d4 | ||
|
|
1dea6b76d9 | ||
|
|
702f90fc9d | ||
|
|
c4d1eec6f3 | ||
|
|
b241c606c0 | ||
|
|
45c896d716 | ||
|
|
514595ea10 | ||
|
|
531194fa27 | ||
|
|
2aa67c992c | ||
|
|
37892afcfc | ||
|
|
e4e5e35552 | ||
|
|
b320c1f0ae | ||
|
|
280654bed6 | ||
|
|
ae675059c0 | ||
|
|
454ebabe89 | ||
|
|
d1d6ef8d12 | ||
|
|
5d6eab74f6 | ||
|
|
59b7453bbf | ||
|
|
bde8c7e29c | ||
|
|
bc6584a90d | ||
|
|
074d79b44f | ||
|
|
2eeb288573 | ||
|
|
48a2274b11 | ||
|
|
19bcfa7264 | ||
|
|
486877c3d5 | ||
|
|
9753bcc8c3 | ||
|
|
bd35b450da | ||
|
|
1f256d4d73 | ||
|
|
1524e2449f | ||
|
|
0cd2bd2e91 | ||
|
|
98b78df16c | ||
|
|
b946dce2f0 | ||
|
|
39234afcbf | ||
|
|
23569a19b1 | ||
|
|
c650fd3412 | ||
|
|
c30e65b3f2 | ||
|
|
07097575b1 | ||
|
|
71d151ca87 | ||
|
|
5abec2bb97 | ||
|
|
de70fd42dc | ||
|
|
99550b91bd | ||
|
|
70190c37c4 | ||
|
|
f3fc4e5afb | ||
|
|
629c552348 | ||
|
|
85a97c933f | ||
|
|
3a5a4388c7 | ||
|
|
9338a9e233 | ||
|
|
7fad2ed2c8 | ||
|
|
9305953bd2 | ||
|
|
aeb9639ed9 | ||
|
|
bc9e725d05 | ||
|
|
905e108f8f | ||
|
|
f2362a06fa | ||
|
|
7b85cb9f12 | ||
|
|
790bec21dd | ||
|
|
a0dc673439 | ||
|
|
25019d1cc5 | ||
|
|
d00cb767a6 | ||
|
|
8e0d28d8dc | ||
|
|
e146fb4fc3 | ||
|
|
8773543e10 | ||
|
|
a4cd4ee553 | ||
|
|
a61dd8a750 | ||
|
|
2c84716e66 | ||
|
|
f1667a7e98 | ||
|
|
b91900f831 | ||
|
|
aa1e64ec11 | ||
|
|
5d6037303b | ||
|
|
8aaf6571a0 | ||
|
|
9433f80364 | ||
|
|
aee13aee52 | ||
|
|
f0a0be0248 | ||
|
|
c4332d9a52 | ||
|
|
c7b325e2a4 | ||
|
|
b89941f218 | ||
|
|
2b3b1faa20 | ||
|
|
b9cd321aed | ||
|
|
984ce7420b | ||
|
|
464ec6bec3 | ||
|
|
3bbbf6daa9 | ||
|
|
cd3312496e | ||
|
|
cce8b76171 | ||
|
|
2a529e7e8b | ||
|
|
f62b3b2868 | ||
|
|
701944c194 | ||
|
|
d8048060a2 | ||
|
|
31f25856a2 | ||
|
|
92c73b68a0 | ||
|
|
90909e2e42 | ||
|
|
b036870c83 | ||
|
|
321eb844e4 | ||
|
|
2c9700586c | ||
|
|
f9a1861ded | ||
|
|
59ed86c01a | ||
|
|
f24b30327c | ||
|
|
48381a5b4e | ||
|
|
20b79f998c | ||
|
|
a41e7bb726 | ||
|
|
b9ba97a36d | ||
|
|
d8aa472c5f | ||
|
|
9273e7af73 | ||
|
|
f04f2af8aa | ||
|
|
bdb4f66a9d | ||
|
|
c402b08791 | ||
|
|
64bb034d34 | ||
|
|
ea54aaa290 | ||
|
|
b34c331eba | ||
|
|
19e0b6a1b6 | ||
|
|
9349171b55 | ||
|
|
d4ee4cc14c | ||
|
|
d7420d7274 | ||
|
|
70e4243a1d | ||
|
|
b6264b77c4 | ||
|
|
9e7cb6d01c | ||
|
|
0435bda115 | ||
|
|
a5aa47c1dd | ||
|
|
7654dd615b | ||
|
|
c83e9870fe | ||
|
|
8b13d14294 | ||
|
|
ba13172b3a | ||
|
|
32b81e7d49 | ||
|
|
cbfef17a1d | ||
|
|
a48d408e4e | ||
|
|
e5f50e7b99 | ||
|
|
aeea02b598 | ||
|
|
59eca2be30 | ||
|
|
dfe57d2406 | ||
|
|
061932d023 | ||
|
|
3f5762e03a | ||
|
|
42fa9a2a88 | ||
|
|
f23065e041 | ||
|
|
efe4a9c344 | ||
|
|
0970789b1d | ||
|
|
07b79286b5 | ||
|
|
c3d284e097 | ||
|
|
a9e09d436a | ||
|
|
965984a510 | ||
|
|
1980deb480 | ||
|
|
b6fe91ebcd | ||
|
|
44cbb44500 | ||
|
|
99161c38d2 | ||
|
|
57d3ee768c | ||
|
|
7dce3ed234 | ||
|
|
58efb0f158 | ||
|
|
d261768541 | ||
|
|
aa8547a219 | ||
|
|
9f04a846ec | ||
|
|
ff0e480fdd | ||
|
|
8881b69c06 | ||
|
|
0b3a310802 | ||
|
|
4523137bfc | ||
|
|
666f5cf851 | ||
|
|
e89938e132 | ||
|
|
d97905f6fd | ||
|
|
bed66edfd9 | ||
|
|
ba7ef9e643 | ||
|
|
10be941298 | ||
|
|
75379eab2e | ||
|
|
d4e993a240 | ||
|
|
695a45f9ed | ||
|
|
028c874f81 | ||
|
|
b3c2831bd3 | ||
|
|
e191a32eac | ||
|
|
c66c8ebc98 | ||
|
|
3389491151 | ||
|
|
81eb9d99e7 | ||
|
|
1156f27979 | ||
|
|
b5b9aacc8a | ||
|
|
b89b3c0961 | ||
|
|
9cf5bf3f93 | ||
|
|
9a5bd0d489 | ||
|
|
40408a1734 | ||
|
|
40410e43ab | ||
|
|
3c25d5a03a | ||
|
|
7e21ceb158 | ||
|
|
313aa3c5d7 | ||
|
|
10d46f7e85 | ||
|
|
9e90fcd584 | ||
|
|
c53782cda3 | ||
|
|
66b40ffc68 | ||
|
|
a6a2be2239 | ||
|
|
bdcc4d9e83 | ||
|
|
9f587efb74 | ||
|
|
2aacd29e60 | ||
|
|
311f7e561e | ||
|
|
b498db87aa | ||
|
|
74c44a7178 | ||
|
|
5ff3744895 | ||
|
|
793d83b22c | ||
|
|
0f4e04e61e | ||
|
|
80a280cbf4 | ||
|
|
b223cb4cee | ||
|
|
9d1f5c0de3 | ||
|
|
784c9c4793 | ||
|
|
0caec90d81 | ||
|
|
1458f6e6aa | ||
|
|
a2d38c6084 | ||
|
|
5f1bf0fb8f | ||
|
|
7d99b96717 | ||
|
|
3b10750a7f | ||
|
|
af0a60b8eb | ||
|
|
b419c5fec7 | ||
|
|
2cfcc33a64 | ||
|
|
273db444b2 | ||
|
|
2bf3eeb931 | ||
|
|
c3bc5585d9 | ||
|
|
b84f217710 | ||
|
|
90c49c0c28 | ||
|
|
41c1550788 | ||
|
|
c336e384ab | ||
|
|
bc1956dee9 | ||
|
|
a459c60145 | ||
|
|
65721bbbcd | ||
|
|
96895ba8a8 | ||
|
|
e0d6d906e7 | ||
|
|
dc8ffd30c6 | ||
|
|
24392fa11b | ||
|
|
06b5239ada | ||
|
|
56173d94a9 | ||
|
|
578f11003c | ||
|
|
36bd7cdc9f | ||
|
|
62ac56c3f5 | ||
|
|
c79852cce0 | ||
|
|
3907a545b0 | ||
|
|
d1d057a184 | ||
|
|
b70e3b48c8 | ||
|
|
ab6c3d9b6e | ||
|
|
6999dbb52a | ||
|
|
b2348c9a70 | ||
|
|
7b26180ebb | ||
|
|
d70a5250ab | ||
|
|
024accfbba | ||
|
|
55c967fd14 | ||
|
|
c1edb896df | ||
|
|
fd66d93937 | ||
|
|
40e94635b2 | ||
|
|
9ad41bfb0f | ||
|
|
35c156ce7e | ||
|
|
85f27ff559 | ||
|
|
ad03885b72 | ||
|
|
3e38759c02 | ||
|
|
15a5d2ee9d | ||
|
|
61c91df332 | ||
|
|
b346914d4d | ||
|
|
ac40ef0e43 | ||
|
|
eebf07549f | ||
|
|
a40fd60cb5 | ||
|
|
bd24848ce9 | ||
|
|
7ab81e10de | ||
|
|
455a0bd93f | ||
|
|
11d25e2aef | ||
|
|
b14fbbdc72 | ||
|
|
2491b8ae52 | ||
|
|
fce3c02760 | ||
|
|
1f8f6f3a39 | ||
|
|
401f903456 | ||
|
|
688337dec3 | ||
|
|
b660cb9fe4 | ||
|
|
5d8d9db21d | ||
|
|
9439467958 | ||
|
|
38e3aae053 | ||
|
|
80bef0eb28 | ||
|
|
bea4b03cc2 | ||
|
|
97905b02ae | ||
|
|
b0a2ee2259 | ||
|
|
bb4fdcda98 | ||
|
|
7b33faa09b | ||
|
|
5de2b1ee13 | ||
|
|
f184b1e68a | ||
|
|
bd2f6db1e1 | ||
|
|
1693ec0e90 | ||
|
|
17e75f6b31 | ||
|
|
3b8586d82a | ||
|
|
6acec3e041 | ||
|
|
1d830bf0e2 | ||
|
|
3f99ee8ede | ||
|
|
b5f640d04d | ||
|
|
92a62a958e | ||
|
|
a4a956593c | ||
|
|
ceeb6d7130 | ||
|
|
9681708b1a | ||
|
|
3573950425 | ||
|
|
c1586e39b7 | ||
|
|
7745844078 | ||
|
|
e1e59e85d7 | ||
|
|
6fc79470fc | ||
|
|
b7d576863d | ||
|
|
c1338df5e3 | ||
|
|
221fb63e92 | ||
|
|
987823861f | ||
|
|
7a6eb6321b | ||
|
|
f4df6696ba | ||
|
|
bc584d84f6 | ||
|
|
76f5bcf3cd | ||
|
|
b1aab930af | ||
|
|
58994365ff | ||
|
|
c3949b2aea | ||
|
|
6ba49de44e | ||
|
|
b61f853a69 | ||
|
|
f2bc898761 | ||
|
|
7bcf87b8ed | ||
|
|
6983547325 | ||
|
|
34c4f4c3f8 | ||
|
|
f8667c1aac | ||
|
|
08ab6290c1 | ||
|
|
97cafd8c54 | ||
|
|
78b969f208 | ||
|
|
3f558416f3 | ||
|
|
410fa5e54d | ||
|
|
44a224ad92 | ||
|
|
33dedf4e96 | ||
|
|
4f4d20c30b | ||
|
|
69cb87322d | ||
|
|
4351836520 | ||
|
|
a87f18682c | ||
|
|
1a630d079e | ||
|
|
d2929f6426 | ||
|
|
f3f002bea5 | ||
|
|
93471b8d68 | ||
|
|
46e0a9a8db | ||
|
|
3620fa79e8 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -47,6 +47,9 @@ lib*.pc
|
||||
# other
|
||||
/.lineno
|
||||
*.dSYM
|
||||
*.orig
|
||||
*.rej
|
||||
|
||||
# generated binaries
|
||||
repmgr
|
||||
repmgrd
|
||||
|
||||
@@ -2,7 +2,7 @@ License and Contributions
|
||||
=========================
|
||||
|
||||
`repmgr` is licensed under the GPL v3. All of its code and documentation is
|
||||
Copyright 2010-2018, 2ndQuadrant Limited. See the files COPYRIGHT and LICENSE for
|
||||
Copyright 2010-2019, 2ndQuadrant Limited. See the files COPYRIGHT and LICENSE for
|
||||
details.
|
||||
|
||||
The development of repmgr has primarily been sponsored by 2ndQuadrant customers.
|
||||
@@ -24,7 +24,7 @@ Code style
|
||||
Code in repmgr should be formatted to the same standards as the main PostgreSQL
|
||||
project. For more details see:
|
||||
|
||||
https://www.postgresql.org/docs/current/static/source-format.html
|
||||
https://www.postgresql.org/docs/current/source-format.html
|
||||
|
||||
Contributors should reformat their code similarly before submitting code to
|
||||
the project, in order to minimize merge conflicts with other work.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
Copyright (c) 2010-2018, 2ndQuadrant Limited
|
||||
Copyright (c) 2010-2019, 2ndQuadrant Limited
|
||||
All rights reserved.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
|
||||
4
FAQ.md
4
FAQ.md
@@ -1,8 +1,10 @@
|
||||
FAQ - Frequently Asked Questions about repmgr
|
||||
=============================================
|
||||
|
||||
The repmgr 4 FAQ is located here: [repmgr FAQ (Frequently Asked Questions)](https://repmgr.org/docs/4.0/appendix-faq.html "repmgr FAQ")
|
||||
The repmgr 4 FAQ is located here: [repmgr FAQ (Frequently Asked Questions)](https://repmgr.org/docs/current/appendix-faq.html "repmgr FAQ")
|
||||
|
||||
The repmgr 3.x FAQ can be found here:
|
||||
|
||||
https://github.com/2ndQuadrant/repmgr/blob/REL3_3_STABLE/FAQ.md
|
||||
|
||||
Note that repmgr 3.x is no longer supported.
|
||||
|
||||
60
HISTORY
60
HISTORY
@@ -1,4 +1,62 @@
|
||||
4.1.0 2018-??-??
|
||||
4.3 2019-??
|
||||
repmgr: add "daemon (start|stop)" command; GitHub #528 (Ian)
|
||||
repmgr: add --version-number command line option (Ian)
|
||||
repmgr: add --compact option to "cluster show"; GitHub #521 (Ian)
|
||||
repmgr: cluster show - differentiate between unreachable nodes
|
||||
and nodes which are running but rejecting connections (Ian)
|
||||
repmgr: add --dry-run option to "standby promote"; GitHub #522 (Ian)
|
||||
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
||||
repmgr: prevent potential race condition in "standby switchover"
|
||||
when checking received WAL location; GitHub #518 (Ian)
|
||||
repmgr: ensure "standby switchover" verifies repmgr can read the
|
||||
data directory on the demotion candidate; GitHub #523 (Ian)
|
||||
repmgr: ensure "standby switchover" verifies replication connection
|
||||
exists; GitHub #519 (Ian)
|
||||
repmgr: add sanity check for correct extension version (Ian)
|
||||
repmgr: ensure "witness register --dry-run" does not attempt to read node
|
||||
tables if repmgr extension not installed; GitHub #513 (Ian)
|
||||
repmgr: ensure "standby register" fails when --upstream-node-id is the
|
||||
same as the local node ID (Ian)
|
||||
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
||||
repmgrd: on a cascaded standby, don't fail over if "failover=manual";
|
||||
GitHub #531 (Ian)
|
||||
repmgrd: don't consider nodes where repmgrd is not running as promotion
|
||||
candidates (Ian)
|
||||
repmgrd: add option "connection_check_type" (Ian)
|
||||
repmgrd: improve witness monitoring when primary node not available (Ian)
|
||||
repmgrd: handle situation where a primary has unexpectedly appeared
|
||||
during failover; GitHub #420 (Ian)
|
||||
|
||||
4.2 2018-10-24
|
||||
repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover";
|
||||
GitHub #504 (Ian)
|
||||
repmgr: add "--node-id" option to "repmgr cluster cleanup"; GitHub #493 (Ian)
|
||||
repmgr: report unreachable nodes when running "repmgr cluster (matrix|crosscheck);
|
||||
GitHub #246 (Ian)
|
||||
repmgr: add configuration file parameter "repmgr_bindir"; GitHub #246 (Ian)
|
||||
repmgr: fix "Missing replication slots" label in "node check"; GitHub #507 (Ian)
|
||||
repmgrd: fix parsing of -d/--daemonize option (Ian)
|
||||
repmgrd: support "pausing" of repmgrd (Ian)
|
||||
|
||||
4.1.1 2018-09-05
|
||||
logging: explicitly log the text of failed queries as ERRORs to
|
||||
assist logfile analysis; GitHub #498
|
||||
repmgr: truncate version string, if necessary; GitHub #490 (Ian)
|
||||
repmgr: improve messages emitted during "standby promote" (Ian)
|
||||
repmgr: "standby clone" - don't copy external config files in --dry-run
|
||||
mode; GitHub #491 (Ian)
|
||||
repmgr: add "cluster_cleanup" event; GitHub #492 (Ian)
|
||||
repmgr: (standby switchover) improve detection of free walsenders;
|
||||
GitHub #495 (Ian)
|
||||
repmgr: (node rejoin) improve replication slot handling; GitHub #499 (Ian)
|
||||
repmgrd: ensure that sending SIGHUP always results in the log file
|
||||
being reopened; GitHub #485 (Ian)
|
||||
repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian)
|
||||
repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian)
|
||||
repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian)
|
||||
repmgrd: improve reconnection handling (Ian)
|
||||
|
||||
4.1.0 2018-07-31
|
||||
repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian)
|
||||
repmgr: add "--missing-slots" check to "repmgr node check" (Ian)
|
||||
repmgr: improve command line error handling; GitHub #464 (Ian)
|
||||
|
||||
45
Makefile.in
45
Makefile.in
@@ -13,8 +13,11 @@ DATA = \
|
||||
repmgr--unpackaged--4.0.sql \
|
||||
repmgr--4.0.sql \
|
||||
repmgr--4.0--4.1.sql \
|
||||
repmgr--4.1.sql
|
||||
|
||||
repmgr--4.1.sql \
|
||||
repmgr--4.1--4.2.sql \
|
||||
repmgr--4.2.sql \
|
||||
repmgr--4.2--4.3.sql \
|
||||
repmgr--4.3.sql
|
||||
|
||||
REGRESS = repmgr_extension
|
||||
|
||||
@@ -29,21 +32,26 @@ all: \
|
||||
PG_CPPFLAGS = -std=gnu89 -I$(includedir_internal) -I$(libpq_srcdir) -Wall -Wmissing-prototypes -Wmissing-declarations $(EXTRA_CFLAGS)
|
||||
SHLIB_LINK = $(libpq)
|
||||
|
||||
HEADERS = $(wildcard *.h)
|
||||
|
||||
|
||||
OBJS = \
|
||||
repmgr.o
|
||||
|
||||
include Makefile.global
|
||||
|
||||
ifeq ($(vpath_build),yes)
|
||||
HEADERS = $(wildcard *.h)
|
||||
else
|
||||
HEADERS_built = $(wildcard *.h)
|
||||
endif
|
||||
|
||||
$(info Building against PostgreSQL $(MAJORVERSION))
|
||||
|
||||
REPMGR_CLIENT_OBJS = repmgr-client.o \
|
||||
repmgr-action-primary.o repmgr-action-standby.o repmgr-action-witness.o \
|
||||
repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o \
|
||||
configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o
|
||||
REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o
|
||||
repmgr-action-bdr.o repmgr-action-cluster.o repmgr-action-node.o repmgr-action-daemon.o \
|
||||
configfile.o log.o strutil.o controldata.o dirutil.o compat.o dbutils.o sysutils.o
|
||||
REPMGRD_OBJS = repmgrd.o repmgrd-physical.o repmgrd-bdr.o configfile.o log.o dbutils.o strutil.o controldata.o compat.o sysutils.o
|
||||
DATE=$(shell date "+%Y-%m-%d")
|
||||
|
||||
repmgr_version.h: repmgr_version.h.in
|
||||
@@ -78,28 +86,15 @@ clean: additional-clean
|
||||
maintainer-clean: additional-maintainer-clean
|
||||
|
||||
additional-clean:
|
||||
rm -f repmgr-client.o
|
||||
rm -f repmgr-action-primary.o
|
||||
rm -f repmgr-action-standby.o
|
||||
rm -f repmgr-action-witness.o
|
||||
rm -f repmgr-action-bdr.o
|
||||
rm -f repmgr-action-node.o
|
||||
rm -f repmgr-action-cluster.o
|
||||
rm -f repmgrd.o
|
||||
rm -f repmgrd-physical.o
|
||||
rm -f repmgrd-bdr.o
|
||||
rm -f compat.o
|
||||
rm -f configfile.o
|
||||
rm -f controldata.o
|
||||
rm -f dbutils.o
|
||||
rm -f dirutil.o
|
||||
rm -f log.o
|
||||
rm -f strutil.o
|
||||
rm -f *.o
|
||||
|
||||
maintainer-additional-clean: clean
|
||||
rm -f configure
|
||||
additional-maintainer-clean: clean
|
||||
$(MAKE) -C doc maintainer-clean
|
||||
rm -f config.status config.log
|
||||
rm -f config.h
|
||||
rm -f repmgr_version.h
|
||||
rm -f Makefile
|
||||
rm -f Makefile.global
|
||||
@rm -rf autom4te.cache/
|
||||
|
||||
ifeq ($(MAJORVERSION),$(filter $(MAJORVERSION),9.3 9.4))
|
||||
|
||||
@@ -10,7 +10,7 @@ operations.
|
||||
`repmgr 4` is a complete rewrite of the existing `repmgr` codebase, allowing
|
||||
the use of all of the latest features in PostgreSQL replication.
|
||||
|
||||
PostgreSQL 10, 9.6 and 9.5 are fully supported.
|
||||
PostgreSQL 11, 10, 9.6 and 9.5 are fully supported.
|
||||
PostgreSQL 9.4 and 9.3 are supported, with some restrictions.
|
||||
|
||||
`repmgr` is distributed under the GNU GPL 3 and maintained by 2ndQuadrant.
|
||||
@@ -19,7 +19,7 @@ PostgreSQL 9.4 and 9.3 are supported, with some restrictions.
|
||||
|
||||
`repmgr 4` supports monitoring of a two-node BDR 2.0 cluster on PostgreSQL 9.6
|
||||
only. Note that BDR 2.0 is not publicly available; please contact 2ndQuadrant
|
||||
for details. `repmgr 4` will support future public BDR releases.
|
||||
for details.
|
||||
|
||||
|
||||
Documentation
|
||||
@@ -27,7 +27,7 @@ Documentation
|
||||
|
||||
The main `repmgr` documentation is available here:
|
||||
|
||||
> [repmgr 4 documentation](https://repmgr.org/docs/4.0/index.html)
|
||||
> [repmgr 4 documentation](https://repmgr.org/docs/4.2/index.html)
|
||||
|
||||
The `README` file for `repmgr` 3.x is available here:
|
||||
|
||||
|
||||
35
compat.c
35
compat.c
@@ -6,7 +6,7 @@
|
||||
* supported PostgreSQL versions. They're unlikely to change but
|
||||
* it would be worth keeping an eye on them for any fixes/improvements.
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
@@ -98,9 +98,42 @@ appendShellString(PQExpBuffer buf, const char *str)
|
||||
|
||||
if (*p == '\'')
|
||||
appendPQExpBufferStr(buf, "'\"'\"'");
|
||||
else if (*p == '&')
|
||||
appendPQExpBufferStr(buf, "\\&");
|
||||
else
|
||||
appendPQExpBufferChar(buf, *p);
|
||||
}
|
||||
|
||||
appendPQExpBufferChar(buf, '\'');
|
||||
}
|
||||
|
||||
/*
|
||||
* Adapted from: src/fe_utils/string_utils.c
|
||||
*/
|
||||
void
|
||||
appendRemoteShellString(PQExpBuffer buf, const char *str)
|
||||
{
|
||||
const char *p;
|
||||
|
||||
appendPQExpBufferStr(buf, "\\'");
|
||||
|
||||
for (p = str; *p; p++)
|
||||
{
|
||||
if (*p == '\n' || *p == '\r')
|
||||
{
|
||||
fprintf(stderr,
|
||||
_("shell command argument contains a newline or carriage return: \"%s\"\n"),
|
||||
str);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (*p == '\'')
|
||||
appendPQExpBufferStr(buf, "'\"'\"'");
|
||||
else if (*p == '&')
|
||||
appendPQExpBufferStr(buf, "\\&");
|
||||
else
|
||||
appendPQExpBufferChar(buf, *p);
|
||||
}
|
||||
|
||||
appendPQExpBufferStr(buf, "\\'");
|
||||
}
|
||||
|
||||
4
compat.h
4
compat.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* compat.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
@@ -27,4 +27,6 @@ extern void appendConnStrVal(PQExpBuffer buf, const char *str);
|
||||
|
||||
extern void appendShellString(PQExpBuffer buf, const char *str);
|
||||
|
||||
extern void appendRemoteShellString(PQExpBuffer buf, const char *str);
|
||||
|
||||
#endif
|
||||
|
||||
259
configfile.c
259
configfile.c
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* config.c - parse repmgr.conf and other configuration-related functionality
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -28,6 +28,7 @@ char config_file_path[MAXPGPATH] = "";
|
||||
static bool config_file_provided = false;
|
||||
bool config_file_found = false;
|
||||
|
||||
static void parse_config(t_configuration_options *options, bool terse);
|
||||
static void _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *warning_list);
|
||||
|
||||
static void _parse_line(char *buf, char *name, char *value);
|
||||
@@ -87,8 +88,7 @@ load_config(const char *config_file, bool verbose, bool terse, t_configuration_o
|
||||
|
||||
if (pwd != NULL)
|
||||
{
|
||||
appendPQExpBuffer(&fullpath,
|
||||
"%s", pwd);
|
||||
appendPQExpBufferStr(&fullpath, pwd);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -104,9 +104,7 @@ load_config(const char *config_file, bool verbose, bool terse, t_configuration_o
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&fullpath,
|
||||
"%s",
|
||||
cwd);
|
||||
appendPQExpBufferStr(&fullpath, cwd);
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&fullpath,
|
||||
@@ -125,9 +123,9 @@ load_config(const char *config_file, bool verbose, bool terse, t_configuration_o
|
||||
|
||||
if (stat(config_file_path, &stat_config) != 0)
|
||||
{
|
||||
log_error(_("provided configuration file \"%s\" not found: %s"),
|
||||
config_file,
|
||||
strerror(errno));
|
||||
log_error(_("provided configuration file \"%s\" not found"),
|
||||
config_file);
|
||||
log_detail("%s", strerror(errno));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -238,7 +236,7 @@ end_search:
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
static void
|
||||
parse_config(t_configuration_options *options, bool terse)
|
||||
{
|
||||
/* Collate configuration file errors here for friendlier reporting */
|
||||
@@ -287,6 +285,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
memset(options->data_directory, 0, sizeof(options->data_directory));
|
||||
memset(options->config_directory, 0, sizeof(options->data_directory));
|
||||
memset(options->pg_bindir, 0, sizeof(options->pg_bindir));
|
||||
memset(options->repmgr_bindir, 0, sizeof(options->repmgr_bindir));
|
||||
options->replication_type = REPLICATION_TYPE_PHYSICAL;
|
||||
|
||||
/*-------------
|
||||
@@ -334,7 +333,9 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
* standby switchover settings
|
||||
*------------------------
|
||||
*/
|
||||
options->shutdown_check_timeout = DEFAULT_SHUTDOWN_CHECK_TIMEOUT;
|
||||
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
||||
options->wal_receive_check_timeout = DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT;
|
||||
|
||||
/*-----------------
|
||||
* repmgrd settings
|
||||
@@ -357,6 +358,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
|
||||
memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
|
||||
options->standby_disconnect_on_failover = false;
|
||||
options->sibling_nodes_disconnect_timeout = DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT;
|
||||
options->connection_check_type = CHECK_PING;
|
||||
options->primary_visibility_consensus = false;
|
||||
memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
|
||||
options->election_rerun_interval = DEFAULT_ELECTION_RERUN_INTERVAL;
|
||||
|
||||
/*-------------
|
||||
* witness settings
|
||||
@@ -371,17 +378,24 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->bdr_local_monitoring_only = false;
|
||||
options->bdr_recovery_timeout = DEFAULT_BDR_RECOVERY_TIMEOUT;
|
||||
|
||||
/*-----------------
|
||||
* service settings
|
||||
*-----------------
|
||||
/*-------------------------
|
||||
* service command settings
|
||||
*-------------------------
|
||||
*/
|
||||
memset(options->pg_ctl_options, 0, sizeof(options->pg_ctl_options));
|
||||
memset(options->service_stop_command, 0, sizeof(options->service_stop_command));
|
||||
memset(options->service_start_command, 0, sizeof(options->service_start_command));
|
||||
memset(options->service_stop_command, 0, sizeof(options->service_stop_command));
|
||||
memset(options->service_restart_command, 0, sizeof(options->service_restart_command));
|
||||
memset(options->service_reload_command, 0, sizeof(options->service_reload_command));
|
||||
memset(options->service_promote_command, 0, sizeof(options->service_promote_command));
|
||||
|
||||
/*---------------------------------
|
||||
* repmgrd service command settings
|
||||
*---------------------------------
|
||||
*/
|
||||
memset(options->repmgrd_service_start_command, 0, sizeof(options->repmgrd_service_start_command));
|
||||
memset(options->repmgrd_service_stop_command, 0, sizeof(options->repmgrd_service_stop_command));
|
||||
|
||||
/*----------------------------
|
||||
* event notification settings
|
||||
*----------------------------
|
||||
@@ -466,11 +480,18 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
/* Copy into correct entry in parameters struct */
|
||||
if (strcmp(name, "node_id") == 0)
|
||||
{
|
||||
options->node_id = repmgr_atoi(value, name, error_list, 1);
|
||||
options->node_id = repmgr_atoi(value, name, error_list, MIN_NODE_ID);
|
||||
node_id_found = true;
|
||||
}
|
||||
else if (strcmp(name, "node_name") == 0)
|
||||
strncpy(options->node_name, value, MAXLEN);
|
||||
{
|
||||
if (strlen(value) < sizeof(options->node_name))
|
||||
strncpy(options->node_name, value, sizeof(options->node_name));
|
||||
else
|
||||
item_list_append_format(error_list,
|
||||
_("value for \"node_name\" must contain fewer than %lu characters"),
|
||||
sizeof(options->node_name));
|
||||
}
|
||||
else if (strcmp(name, "conninfo") == 0)
|
||||
strncpy(options->conninfo, value, MAXLEN);
|
||||
else if (strcmp(name, "data_directory") == 0)
|
||||
@@ -480,14 +501,17 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
|
||||
else if (strcmp(name, "replication_user") == 0)
|
||||
{
|
||||
if (strlen(value) < NAMEDATALEN)
|
||||
strncpy(options->replication_user, value, NAMEDATALEN);
|
||||
if (strlen(value) < sizeof(options->replication_user))
|
||||
strncpy(options->replication_user, value, sizeof(options->replication_user));
|
||||
else
|
||||
item_list_append(error_list,
|
||||
_("value for \"replication_user\" must contain fewer than " STR(NAMEDATALEN) " characters"));
|
||||
item_list_append_format(error_list,
|
||||
_("value for \"replication_user\" must contain fewer than %lu characters"),
|
||||
sizeof(options->replication_user));
|
||||
}
|
||||
else if (strcmp(name, "pg_bindir") == 0)
|
||||
strncpy(options->pg_bindir, value, MAXPGPATH);
|
||||
else if (strcmp(name, "repmgr_bindir") == 0)
|
||||
strncpy(options->repmgr_bindir, value, MAXPGPATH);
|
||||
|
||||
else if (strcmp(name, "replication_type") == 0)
|
||||
{
|
||||
@@ -544,8 +568,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
/* standby switchover settings */
|
||||
else if (strcmp(name, "shutdown_check_timeout") == 0)
|
||||
options->shutdown_check_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
||||
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "wal_receive_check_timeout") == 0)
|
||||
options->wal_receive_check_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
/* node rejoin settings */
|
||||
else if (strcmp(name, "node_rejoin_timeout") == 0)
|
||||
@@ -581,11 +609,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
else if (strcmp(name, "priority") == 0)
|
||||
options->priority = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "location") == 0)
|
||||
strncpy(options->location, value, MAXLEN);
|
||||
strncpy(options->location, value, sizeof(options->location));
|
||||
else if (strcmp(name, "promote_command") == 0)
|
||||
strncpy(options->promote_command, value, MAXLEN);
|
||||
strncpy(options->promote_command, value, sizeof(options->promote_command));
|
||||
else if (strcmp(name, "follow_command") == 0)
|
||||
strncpy(options->follow_command, value, MAXLEN);
|
||||
strncpy(options->follow_command, value, sizeof(options->follow_command));
|
||||
else if (strcmp(name, "reconnect_attempts") == 0)
|
||||
options->reconnect_attempts = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "reconnect_interval") == 0)
|
||||
@@ -604,6 +632,36 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "repmgrd_pid_file") == 0)
|
||||
strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
|
||||
else if (strcmp(name, "standby_disconnect_on_failover") == 0)
|
||||
options->standby_disconnect_on_failover = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "sibling_nodes_disconnect_timeout") == 0)
|
||||
options->sibling_nodes_disconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "connection_check_type") == 0)
|
||||
{
|
||||
if (strcasecmp(value, "ping") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_PING;
|
||||
}
|
||||
else if (strcasecmp(value, "connection") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_CONNECTION;
|
||||
}
|
||||
else if (strcasecmp(value, "query") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_QUERY;
|
||||
}
|
||||
else
|
||||
{
|
||||
item_list_append(error_list,
|
||||
_("value for \"connection_check_type\" must be \"ping\", \"connection\" or \"query\"\n"));
|
||||
}
|
||||
}
|
||||
else if (strcmp(name, "primary_visibility_consensus") == 0)
|
||||
options->primary_visibility_consensus = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "failover_validation_command") == 0)
|
||||
strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
|
||||
else if (strcmp(name, "election_rerun_interval") == 0)
|
||||
options->election_rerun_interval = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
/* witness settings */
|
||||
else if (strcmp(name, "witness_sync_interval") == 0)
|
||||
@@ -617,41 +675,48 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
|
||||
/* service settings */
|
||||
else if (strcmp(name, "pg_ctl_options") == 0)
|
||||
strncpy(options->pg_ctl_options, value, MAXLEN);
|
||||
else if (strcmp(name, "service_stop_command") == 0)
|
||||
strncpy(options->service_stop_command, value, MAXLEN);
|
||||
strncpy(options->pg_ctl_options, value, sizeof(options->pg_ctl_options));
|
||||
else if (strcmp(name, "service_start_command") == 0)
|
||||
strncpy(options->service_start_command, value, MAXLEN);
|
||||
strncpy(options->service_start_command, value, sizeof(options->service_start_command));
|
||||
else if (strcmp(name, "service_stop_command") == 0)
|
||||
strncpy(options->service_stop_command, value, sizeof(options->service_stop_command));
|
||||
else if (strcmp(name, "service_restart_command") == 0)
|
||||
strncpy(options->service_restart_command, value, MAXLEN);
|
||||
strncpy(options->service_restart_command, value, sizeof(options->service_restart_command));
|
||||
else if (strcmp(name, "service_reload_command") == 0)
|
||||
strncpy(options->service_reload_command, value, MAXLEN);
|
||||
strncpy(options->service_reload_command, value, sizeof(options->service_reload_command));
|
||||
else if (strcmp(name, "service_promote_command") == 0)
|
||||
strncpy(options->service_promote_command, value, MAXLEN);
|
||||
strncpy(options->service_promote_command, value, sizeof(options->service_promote_command));
|
||||
|
||||
/* repmgrd service settings */
|
||||
else if (strcmp(name, "repmgrd_service_start_command") == 0)
|
||||
strncpy(options->repmgrd_service_start_command, value, sizeof(options->repmgrd_service_start_command));
|
||||
else if (strcmp(name, "repmgrd_service_stop_command") == 0)
|
||||
strncpy(options->repmgrd_service_stop_command, value, sizeof(options->repmgrd_service_stop_command));
|
||||
|
||||
|
||||
/* event notification settings */
|
||||
else if (strcmp(name, "event_notification_command") == 0)
|
||||
strncpy(options->event_notification_command, value, MAXLEN);
|
||||
strncpy(options->event_notification_command, value, sizeof(options->event_notification_command));
|
||||
else if (strcmp(name, "event_notifications") == 0)
|
||||
{
|
||||
/* store unparsed value for comparison when reloading config */
|
||||
strncpy(options->event_notifications_orig, value, MAXLEN);
|
||||
strncpy(options->event_notifications_orig, value, sizeof(options->event_notifications_orig));
|
||||
parse_event_notifications_list(options, value);
|
||||
}
|
||||
|
||||
/* barman settings */
|
||||
else if (strcmp(name, "barman_host") == 0)
|
||||
strncpy(options->barman_host, value, MAXLEN);
|
||||
strncpy(options->barman_host, value, sizeof(options->barman_host));
|
||||
else if (strcmp(name, "barman_server") == 0)
|
||||
strncpy(options->barman_server, value, MAXLEN);
|
||||
strncpy(options->barman_server, value, sizeof(options->barman_server));
|
||||
else if (strcmp(name, "barman_config") == 0)
|
||||
strncpy(options->barman_config, value, MAXLEN);
|
||||
strncpy(options->barman_config, value, sizeof(options->barman_config));
|
||||
|
||||
/* rsync/ssh settings */
|
||||
else if (strcmp(name, "rsync_options") == 0)
|
||||
strncpy(options->rsync_options, value, MAXLEN);
|
||||
strncpy(options->rsync_options, value, sizeof(options->rsync_options));
|
||||
else if (strcmp(name, "ssh_options") == 0)
|
||||
strncpy(options->ssh_options, value, MAXLEN);
|
||||
strncpy(options->ssh_options, value, sizeof(options->ssh_options));
|
||||
|
||||
/* undocumented settings for testing */
|
||||
else if (strcmp(name, "promote_delay") == 0)
|
||||
@@ -785,7 +850,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
PQconninfoFree(conninfo_options);
|
||||
}
|
||||
|
||||
|
||||
/* set values for parameters which default to other parameters */
|
||||
|
||||
/*
|
||||
@@ -813,13 +877,13 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
if (options->archive_ready_warning >= options->archive_ready_critical)
|
||||
{
|
||||
item_list_append(error_list,
|
||||
_("\archive_ready_critical\" must be greater than \"archive_ready_warning\""));
|
||||
_("\"archive_ready_critical\" must be greater than \"archive_ready_warning\""));
|
||||
}
|
||||
|
||||
if (options->replication_lag_warning >= options->replication_lag_critical)
|
||||
{
|
||||
item_list_append(error_list,
|
||||
_("\replication_lag_critical\" must be greater than \"replication_lag_warning\""));
|
||||
_("\"replication_lag_critical\" must be greater than \"replication_lag_warning\""));
|
||||
}
|
||||
|
||||
if (options->standby_reconnect_timeout < options->node_rejoin_timeout)
|
||||
@@ -1029,15 +1093,19 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
* loop is started up; it therefore only needs to reload options required
|
||||
* by repmgrd, which are as follows:
|
||||
*
|
||||
* changeable options:
|
||||
* changeable options (keep the list in "doc/repmgrd-configuration.sgml" in sync
|
||||
* with these):
|
||||
*
|
||||
* - async_query_timeout
|
||||
* - bdr_local_monitoring_only
|
||||
* - bdr_recovery_timeout
|
||||
* - connection_check_type
|
||||
* - conninfo
|
||||
* - degraded_monitoring_timeout
|
||||
* - event_notification_command
|
||||
* - event_notifications
|
||||
* - failover
|
||||
* - failover_validation_command
|
||||
* - follow_command
|
||||
* - log_facility
|
||||
* - log_file
|
||||
@@ -1045,18 +1113,27 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
* - log_status_interval
|
||||
* - monitor_interval_secs
|
||||
* - monitoring_history
|
||||
* - primary_notification_timeout
|
||||
* - primary_visibility_consensus
|
||||
* - promote_command
|
||||
* - promote_delay
|
||||
* - reconnect_attempts
|
||||
* - reconnect_interval
|
||||
* - repmgrd_standby_startup_timeout
|
||||
* - retry_promote_interval_secs
|
||||
* - sibling_nodes_disconnect_timeout
|
||||
* - standby_disconnect_on_failover
|
||||
*
|
||||
* non-changeable options
|
||||
*
|
||||
* Not publicly documented:
|
||||
* - promote_delay
|
||||
*
|
||||
* non-changeable options (repmgrd references these from the "repmgr.nodes"
|
||||
* table, not the configuration file)
|
||||
*
|
||||
* - node_id
|
||||
* - node_name
|
||||
* - data_directory
|
||||
* - location
|
||||
* - priority
|
||||
* - replication_type
|
||||
*
|
||||
@@ -1065,7 +1142,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
|
||||
*/
|
||||
bool
|
||||
reload_config(t_configuration_options *orig_options)
|
||||
reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
{
|
||||
PGconn *conn;
|
||||
t_configuration_options new_options = T_CONFIGURATION_OPTIONS_INITIALIZER;
|
||||
@@ -1081,6 +1158,20 @@ reload_config(t_configuration_options *orig_options)
|
||||
|
||||
_parse_config(&new_options, &config_errors, &config_warnings);
|
||||
|
||||
|
||||
if (server_type == PRIMARY || server_type == STANDBY)
|
||||
{
|
||||
if (new_options.promote_command[0] == '\0')
|
||||
{
|
||||
item_list_append(&config_errors, _("\"promote_command\": required parameter was not found"));
|
||||
}
|
||||
|
||||
if (new_options.follow_command[0] == '\0')
|
||||
{
|
||||
item_list_append(&config_errors, _("\"follow_command\": required parameter was not found"));
|
||||
}
|
||||
}
|
||||
|
||||
if (config_errors.head != NULL)
|
||||
{
|
||||
ItemListCell *cell = NULL;
|
||||
@@ -1089,8 +1180,8 @@ reload_config(t_configuration_options *orig_options)
|
||||
|
||||
initPQExpBuffer(&errors);
|
||||
|
||||
appendPQExpBuffer(&errors,
|
||||
"following errors were detected:\n");
|
||||
appendPQExpBufferStr(&errors,
|
||||
"following errors were detected:\n");
|
||||
|
||||
for (cell = config_errors.head; cell; cell = cell->next)
|
||||
{
|
||||
@@ -1113,13 +1204,12 @@ reload_config(t_configuration_options *orig_options)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (strncmp(new_options.node_name, orig_options->node_name, MAXLEN) != 0)
|
||||
if (strncmp(new_options.node_name, orig_options->node_name, sizeof(orig_options->node_name)) != 0)
|
||||
{
|
||||
log_warning(_("\"node_name\" cannot be changed, keeping current configuration"));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* No configuration problems detected - copy any changed values
|
||||
*
|
||||
@@ -1169,8 +1259,8 @@ reload_config(t_configuration_options *orig_options)
|
||||
{
|
||||
strncpy(orig_options->conninfo, new_options.conninfo, MAXLEN);
|
||||
log_info(_("\"conninfo\" is now \"%s\""), new_options.conninfo);
|
||||
|
||||
}
|
||||
|
||||
PQfinish(conn);
|
||||
}
|
||||
|
||||
@@ -1248,7 +1338,6 @@ reload_config(t_configuration_options *orig_options)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
|
||||
/* promote_command */
|
||||
if (strncmp(orig_options->promote_command, new_options.promote_command, MAXLEN) != 0)
|
||||
{
|
||||
@@ -1258,7 +1347,7 @@ reload_config(t_configuration_options *orig_options)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* promote_delay */
|
||||
/* promote_delay (for testing use only; not documented */
|
||||
if (orig_options->promote_delay != new_options.promote_delay)
|
||||
{
|
||||
orig_options->promote_delay = new_options.promote_delay;
|
||||
@@ -1294,6 +1383,51 @@ reload_config(t_configuration_options *orig_options)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* standby_disconnect_on_failover */
|
||||
if (orig_options->standby_disconnect_on_failover != new_options.standby_disconnect_on_failover)
|
||||
{
|
||||
orig_options->standby_disconnect_on_failover = new_options.standby_disconnect_on_failover;
|
||||
log_info(_("\"standby_disconnect_on_failover\" is now \"%s\""),
|
||||
new_options.standby_disconnect_on_failover == true ? "TRUE" : "FALSE");
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* sibling_nodes_disconnect_timeout */
|
||||
if (orig_options->sibling_nodes_disconnect_timeout != new_options.sibling_nodes_disconnect_timeout)
|
||||
{
|
||||
orig_options->sibling_nodes_disconnect_timeout = new_options.sibling_nodes_disconnect_timeout;
|
||||
log_info(_("\"sibling_nodes_disconnect_timeout\" is now \"%i\""),
|
||||
new_options.sibling_nodes_disconnect_timeout);
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* connection_check_type */
|
||||
if (orig_options->connection_check_type != new_options.connection_check_type)
|
||||
{
|
||||
orig_options->connection_check_type = new_options.connection_check_type;
|
||||
log_info(_("\"connection_check_type\" is now \"%s\""),
|
||||
print_connection_check_type(new_options.connection_check_type));
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* primary_visibility_consensus */
|
||||
if (orig_options->primary_visibility_consensus != new_options.primary_visibility_consensus)
|
||||
{
|
||||
orig_options->primary_visibility_consensus = new_options.primary_visibility_consensus;
|
||||
log_info(_("\"primary_visibility_consensus\" is now \"%s\""),
|
||||
new_options.primary_visibility_consensus == true ? "TRUE" : "FALSE");
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* failover_validation_command */
|
||||
if (strncmp(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH) != 0)
|
||||
{
|
||||
strncpy(orig_options->failover_validation_command, new_options.failover_validation_command, MAXPGPATH);
|
||||
log_info(_("\"failover_validation_command\" is now \"%s\""), new_options.failover_validation_command);
|
||||
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle changes to logging configuration
|
||||
*/
|
||||
@@ -1505,13 +1639,16 @@ repmgr_atoi(const char *value, const char *config_item, ItemList *error_list, in
|
||||
*
|
||||
* TODO: accept "any unambiguous prefix of one of these" as per postgresql.conf:
|
||||
*
|
||||
* https://www.postgresql.org/docs/current/static/config-setting.html
|
||||
* https://www.postgresql.org/docs/current/config-setting.html
|
||||
*/
|
||||
bool
|
||||
parse_bool(const char *s, const char *config_item, ItemList *error_list)
|
||||
{
|
||||
PQExpBufferData errors;
|
||||
|
||||
if (s == NULL)
|
||||
return true;
|
||||
|
||||
if (strcasecmp(s, "0") == 0)
|
||||
return false;
|
||||
|
||||
@@ -1888,3 +2025,21 @@ parse_pg_basebackup_options(const char *pg_basebackup_options, t_basebackup_opti
|
||||
|
||||
return backup_options_ok;
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
print_connection_check_type(ConnectionCheckType type)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
case CHECK_PING:
|
||||
return "ping";
|
||||
case CHECK_QUERY:
|
||||
return "query";
|
||||
case CHECK_CONNECTION:
|
||||
return "connection";
|
||||
}
|
||||
|
||||
/* should never reach here */
|
||||
return "UNKNOWN";
|
||||
}
|
||||
|
||||
54
configfile.h
54
configfile.h
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* configfile.h
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
@@ -37,6 +37,13 @@ typedef enum
|
||||
FAILOVER_AUTOMATIC
|
||||
} failover_mode_opt;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
CHECK_PING,
|
||||
CHECK_QUERY,
|
||||
CHECK_CONNECTION
|
||||
} ConnectionCheckType;
|
||||
|
||||
typedef struct EventNotificationListCell
|
||||
{
|
||||
struct EventNotificationListCell *next;
|
||||
@@ -69,12 +76,13 @@ typedef struct
|
||||
{
|
||||
/* node information */
|
||||
int node_id;
|
||||
char node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
char conninfo[MAXLEN];
|
||||
char replication_user[NAMEDATALEN];
|
||||
char data_directory[MAXPGPATH];
|
||||
char config_directory[MAXPGPATH];
|
||||
char pg_bindir[MAXPGPATH];
|
||||
char repmgr_bindir[MAXPGPATH];
|
||||
int replication_type;
|
||||
|
||||
/* log settings */
|
||||
@@ -103,7 +111,9 @@ typedef struct
|
||||
int standby_follow_timeout;
|
||||
|
||||
/* standby switchover settings */
|
||||
int shutdown_check_timeout;
|
||||
int standby_reconnect_timeout;
|
||||
int wal_receive_check_timeout;
|
||||
|
||||
/* node rejoin settings */
|
||||
int node_rejoin_timeout;
|
||||
@@ -132,6 +142,12 @@ typedef struct
|
||||
int primary_notification_timeout;
|
||||
int repmgrd_standby_startup_timeout;
|
||||
char repmgrd_pid_file[MAXPGPATH];
|
||||
bool standby_disconnect_on_failover;
|
||||
int sibling_nodes_disconnect_timeout;
|
||||
ConnectionCheckType connection_check_type;
|
||||
bool primary_visibility_consensus;
|
||||
char failover_validation_command[MAXPGPATH];
|
||||
int election_rerun_interval;
|
||||
|
||||
/* BDR settings */
|
||||
bool bdr_local_monitoring_only;
|
||||
@@ -139,14 +155,18 @@ typedef struct
|
||||
|
||||
/* service settings */
|
||||
char pg_ctl_options[MAXLEN];
|
||||
char service_stop_command[MAXLEN];
|
||||
char service_start_command[MAXLEN];
|
||||
char service_restart_command[MAXLEN];
|
||||
char service_reload_command[MAXLEN];
|
||||
char service_promote_command[MAXLEN];
|
||||
char service_start_command[MAXPGPATH];
|
||||
char service_stop_command[MAXPGPATH];
|
||||
char service_restart_command[MAXPGPATH];
|
||||
char service_reload_command[MAXPGPATH];
|
||||
char service_promote_command[MAXPGPATH];
|
||||
|
||||
/* repmgrd service settings */
|
||||
char repmgrd_service_start_command[MAXPGPATH];
|
||||
char repmgrd_service_stop_command[MAXPGPATH];
|
||||
|
||||
/* event notification settings */
|
||||
char event_notification_command[MAXLEN];
|
||||
char event_notification_command[MAXPGPATH];
|
||||
char event_notifications_orig[MAXLEN];
|
||||
EventNotificationList event_notifications;
|
||||
|
||||
@@ -170,9 +190,9 @@ typedef struct
|
||||
|
||||
#define T_CONFIGURATION_OPTIONS_INITIALIZER { \
|
||||
/* node information */ \
|
||||
UNKNOWN_NODE_ID, "", "", "", "", "", "", REPLICATION_TYPE_PHYSICAL, \
|
||||
UNKNOWN_NODE_ID, "", "", "", "", "", "", "", REPLICATION_TYPE_PHYSICAL, \
|
||||
/* log settings */ \
|
||||
"", "", "", DEFAULT_LOG_STATUS_INTERVAL, \
|
||||
"", "", "", DEFAULT_LOG_STATUS_INTERVAL, \
|
||||
/* standby clone settings */ \
|
||||
false, "", "", { NULL, NULL }, "", false, "", false, "", \
|
||||
/* standby promote settings */ \
|
||||
@@ -181,7 +201,9 @@ typedef struct
|
||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
|
||||
/* standby switchover settings */ \
|
||||
DEFAULT_SHUTDOWN_CHECK_TIMEOUT, \
|
||||
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
||||
DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT, \
|
||||
/* node rejoin settings */ \
|
||||
DEFAULT_NODE_REJOIN_TIMEOUT, \
|
||||
/* node check settings */ \
|
||||
@@ -196,12 +218,15 @@ typedef struct
|
||||
DEFAULT_RECONNECTION_INTERVAL, \
|
||||
false, -1, \
|
||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
-1, "", \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, \
|
||||
CHECK_PING, true, "", DEFAULT_ELECTION_RERUN_INTERVAL, \
|
||||
/* BDR settings */ \
|
||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||
/* service settings */ \
|
||||
"", "", "", "", "", "", \
|
||||
/* repmgrd service settings */ \
|
||||
"", "", \
|
||||
/* event notification settings */ \
|
||||
"", "", { NULL, NULL }, \
|
||||
/* barman settings */ \
|
||||
@@ -273,13 +298,13 @@ typedef struct
|
||||
"", "", "", "" \
|
||||
}
|
||||
|
||||
#include "dbutils.h"
|
||||
|
||||
void set_progname(const char *argv0);
|
||||
const char *progname(void);
|
||||
|
||||
void load_config(const char *config_file, bool verbose, bool terse, t_configuration_options *options, char *argv0);
|
||||
void parse_config(t_configuration_options *options, bool terse);
|
||||
bool reload_config(t_configuration_options *orig_options);
|
||||
bool reload_config(t_configuration_options *orig_options, t_server_type server_type);
|
||||
|
||||
bool parse_recovery_conf(const char *data_dir, t_recovery_conf *conf);
|
||||
|
||||
@@ -304,5 +329,6 @@ void free_parsed_argv(char ***argv_array);
|
||||
/* called by repmgr-client and repmgrd */
|
||||
void exit_with_cli_errors(ItemList *error_list, const char *repmgr_command);
|
||||
void print_item_list(ItemList *item_list);
|
||||
const char *print_connection_check_type(ConnectionCheckType type);
|
||||
|
||||
#endif /* _REPMGR_CONFIGFILE_H_ */
|
||||
|
||||
38
configure
vendored
38
configure
vendored
@@ -1,8 +1,8 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.1.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.3.
|
||||
#
|
||||
# Report bugs to <pgsql-bugs@postgresql.org>.
|
||||
# Report bugs to <repmgr@googlegroups.com>.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -11,7 +11,7 @@
|
||||
# This configure script is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy, distribute and modify it.
|
||||
#
|
||||
# Copyright (c) 2010-2018, 2ndQuadrant Ltd.
|
||||
# Copyright (c) 2010-2019, 2ndQuadrant Ltd.
|
||||
## -------------------- ##
|
||||
## M4sh Initialization. ##
|
||||
## -------------------- ##
|
||||
@@ -269,7 +269,7 @@ fi
|
||||
$as_echo "$0: be upgraded to zsh 4.3.4 or later."
|
||||
else
|
||||
$as_echo "$0: Please tell bug-autoconf@gnu.org and
|
||||
$0: pgsql-bugs@postgresql.org about your system, including
|
||||
$0: repmgr@googlegroups.com about your system, including
|
||||
$0: any error possibly output before this message. Then
|
||||
$0: install a modern shell, or manually run the script
|
||||
$0: under such a shell if you do have one."
|
||||
@@ -582,10 +582,10 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='repmgr'
|
||||
PACKAGE_TARNAME='repmgr'
|
||||
PACKAGE_VERSION='4.1'
|
||||
PACKAGE_STRING='repmgr 4.1'
|
||||
PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org'
|
||||
PACKAGE_URL='https://2ndquadrant.com/en/resources/repmgr/'
|
||||
PACKAGE_VERSION='4.3'
|
||||
PACKAGE_STRING='repmgr 4.3'
|
||||
PACKAGE_BUGREPORT='repmgr@googlegroups.com'
|
||||
PACKAGE_URL='https://repmgr.org/'
|
||||
|
||||
ac_subst_vars='LTLIBOBJS
|
||||
LIBOBJS
|
||||
@@ -1178,7 +1178,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures repmgr 4.1 to adapt to many kinds of systems.
|
||||
\`configure' configures repmgr 4.3 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1239,7 +1239,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of repmgr 4.1:";;
|
||||
short | recursive ) echo "Configuration of repmgr 4.3:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1249,8 +1249,8 @@ Some influential environment variables:
|
||||
Use these variables to override the choices made by `configure' or to help
|
||||
it to find libraries and programs with nonstandard names/locations.
|
||||
|
||||
Report bugs to <pgsql-bugs@postgresql.org>.
|
||||
repmgr home page: <https://2ndquadrant.com/en/resources/repmgr/>.
|
||||
Report bugs to <repmgr@googlegroups.com>.
|
||||
repmgr home page: <https://repmgr.org/>.
|
||||
_ACEOF
|
||||
ac_status=$?
|
||||
fi
|
||||
@@ -1313,14 +1313,14 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
repmgr configure 4.1
|
||||
repmgr configure 4.3
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
This configure script is free software; the Free Software Foundation
|
||||
gives unlimited permission to copy, distribute and modify it.
|
||||
|
||||
Copyright (c) 2010-2018, 2ndQuadrant Ltd.
|
||||
Copyright (c) 2010-2019, 2ndQuadrant Ltd.
|
||||
_ACEOF
|
||||
exit
|
||||
fi
|
||||
@@ -1332,7 +1332,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by repmgr $as_me 4.1, which was
|
||||
It was created by repmgr $as_me 4.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2359,7 +2359,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by repmgr $as_me 4.1, which was
|
||||
This file was extended by repmgr $as_me 4.3, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -2415,14 +2415,14 @@ $config_files
|
||||
Configuration headers:
|
||||
$config_headers
|
||||
|
||||
Report bugs to <pgsql-bugs@postgresql.org>.
|
||||
repmgr home page: <https://2ndquadrant.com/en/resources/repmgr/>."
|
||||
Report bugs to <repmgr@googlegroups.com>.
|
||||
repmgr home page: <https://repmgr.org/>."
|
||||
|
||||
_ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
repmgr config.status 4.1
|
||||
repmgr config.status 4.3
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
AC_INIT([repmgr], [4.1], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
|
||||
AC_INIT([repmgr], [4.3], [repmgr@googlegroups.com], [repmgr], [https://repmgr.org/])
|
||||
|
||||
AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.])
|
||||
AC_COPYRIGHT([Copyright (c) 2010-2019, 2ndQuadrant Ltd.])
|
||||
|
||||
AC_CONFIG_HEADER(config.h)
|
||||
|
||||
|
||||
185
controldata.c
185
controldata.c
@@ -1,6 +1,12 @@
|
||||
/*
|
||||
* controldata.c
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* controldata.c - functions for reading the pg_control file
|
||||
*
|
||||
* The functions provided here enable repmgr to read a pg_control file
|
||||
* in a version-indepent way, even if the PostgreSQL instance is not
|
||||
* running. For that reason we can't use on the pg_control_*() functions
|
||||
* provided in PostgreSQL 9.6 and later.
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
@@ -30,6 +36,53 @@
|
||||
|
||||
static ControlFileInfo *get_controlfile(const char *DataDir);
|
||||
|
||||
int
|
||||
get_pg_version(const char *data_directory, char *version_string)
|
||||
{
|
||||
char PgVersionPath[MAXPGPATH] = "";
|
||||
FILE *fp = NULL;
|
||||
char *endptr = NULL;
|
||||
char file_version_string[MAX_VERSION_STRING] = "";
|
||||
long file_major, file_minor;
|
||||
int ret;
|
||||
|
||||
snprintf(PgVersionPath, MAXPGPATH, "%s/PG_VERSION", data_directory);
|
||||
|
||||
fp = fopen(PgVersionPath, "r");
|
||||
|
||||
if (fp == NULL)
|
||||
{
|
||||
log_warning(_("could not open file \"%s\" for reading"),
|
||||
PgVersionPath);
|
||||
log_detail("%s", strerror(errno));
|
||||
return UNKNOWN_SERVER_VERSION_NUM;
|
||||
}
|
||||
|
||||
file_version_string[0] = '\0';
|
||||
|
||||
ret = fscanf(fp, "%23s", file_version_string);
|
||||
fclose(fp);
|
||||
|
||||
if (ret != 1 || endptr == file_version_string)
|
||||
{
|
||||
log_warning(_("unable to determine major version number from PG_VERSION"));
|
||||
|
||||
return UNKNOWN_SERVER_VERSION_NUM;
|
||||
}
|
||||
|
||||
file_major = strtol(file_version_string, &endptr, 10);
|
||||
file_minor = 0;
|
||||
|
||||
if (*endptr == '.')
|
||||
file_minor = strtol(endptr + 1, NULL, 10);
|
||||
|
||||
if (version_string != NULL)
|
||||
strncpy(version_string, file_version_string, MAX_VERSION_STRING);
|
||||
|
||||
return ((int) file_major * 10000) + ((int) file_minor * 100);
|
||||
}
|
||||
|
||||
|
||||
uint64
|
||||
get_system_identifier(const char *data_directory)
|
||||
{
|
||||
@@ -44,6 +97,7 @@ get_system_identifier(const char *data_directory)
|
||||
return system_identifier;
|
||||
}
|
||||
|
||||
|
||||
DBState
|
||||
get_db_state(const char *data_directory)
|
||||
{
|
||||
@@ -60,7 +114,7 @@ get_db_state(const char *data_directory)
|
||||
}
|
||||
|
||||
|
||||
extern XLogRecPtr
|
||||
XLogRecPtr
|
||||
get_latest_checkpoint_location(const char *data_directory)
|
||||
{
|
||||
ControlFileInfo *control_file_info = NULL;
|
||||
@@ -112,10 +166,59 @@ describe_db_state(DBState state)
|
||||
case DB_IN_PRODUCTION:
|
||||
return _("in production");
|
||||
}
|
||||
|
||||
return _("unrecognized status code");
|
||||
}
|
||||
|
||||
|
||||
TimeLineID
|
||||
get_timeline(const char *data_directory)
|
||||
{
|
||||
ControlFileInfo *control_file_info = NULL;
|
||||
TimeLineID timeline = -1;
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
|
||||
timeline = (int) control_file_info->timeline;
|
||||
|
||||
pfree(control_file_info);
|
||||
|
||||
return timeline;
|
||||
}
|
||||
|
||||
|
||||
TimeLineID
|
||||
get_min_recovery_end_timeline(const char *data_directory)
|
||||
{
|
||||
ControlFileInfo *control_file_info = NULL;
|
||||
TimeLineID timeline = -1;
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
|
||||
timeline = (int) control_file_info->minRecoveryPointTLI;
|
||||
|
||||
pfree(control_file_info);
|
||||
|
||||
return timeline;
|
||||
}
|
||||
|
||||
|
||||
XLogRecPtr
|
||||
get_min_recovery_location(const char *data_directory)
|
||||
{
|
||||
ControlFileInfo *control_file_info = NULL;
|
||||
XLogRecPtr minRecoveryPoint = InvalidXLogRecPtr;
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
|
||||
minRecoveryPoint = control_file_info->minRecoveryPoint;
|
||||
|
||||
pfree(control_file_info);
|
||||
|
||||
return minRecoveryPoint;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* We maintain our own version of get_controlfile() as we need cross-version
|
||||
* compatibility, and also don't care if the file isn't readable.
|
||||
@@ -123,14 +226,10 @@ describe_db_state(DBState state)
|
||||
static ControlFileInfo *
|
||||
get_controlfile(const char *DataDir)
|
||||
{
|
||||
char file_version_string[MAX_VERSION_STRING] = "";
|
||||
ControlFileInfo *control_file_info;
|
||||
FILE *fp = NULL;
|
||||
int fd, ret, version_num;
|
||||
char PgVersionPath[MAXPGPATH] = "";
|
||||
int fd, version_num;
|
||||
char ControlFilePath[MAXPGPATH] = "";
|
||||
char file_version_string[64] = "";
|
||||
long file_major, file_minor;
|
||||
char *endptr = NULL;
|
||||
void *ControlFileDataPtr = NULL;
|
||||
int expected_size = 0;
|
||||
|
||||
@@ -142,50 +241,32 @@ get_controlfile(const char *DataDir)
|
||||
control_file_info->state = DB_SHUTDOWNED;
|
||||
control_file_info->checkPoint = InvalidXLogRecPtr;
|
||||
control_file_info->data_checksum_version = -1;
|
||||
control_file_info->timeline = -1;
|
||||
control_file_info->minRecoveryPointTLI = -1;
|
||||
control_file_info->minRecoveryPoint = InvalidXLogRecPtr;
|
||||
|
||||
/*
|
||||
* Read PG_VERSION, as we'll need to determine which struct to read
|
||||
* the control file contents into
|
||||
*/
|
||||
snprintf(PgVersionPath, MAXPGPATH, "%s/PG_VERSION", DataDir);
|
||||
|
||||
fp = fopen(PgVersionPath, "r");
|
||||
version_num = get_pg_version(DataDir, file_version_string);
|
||||
|
||||
if (fp == NULL)
|
||||
if (version_num == UNKNOWN_SERVER_VERSION_NUM)
|
||||
{
|
||||
log_warning(_("could not open file \"%s\" for reading"),
|
||||
PgVersionPath);
|
||||
log_detail("%s", strerror(errno));
|
||||
log_warning(_("unable to determine server version number from PG_VERSION"));
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
file_version_string[0] = '\0';
|
||||
|
||||
ret = fscanf(fp, "%63s", file_version_string);
|
||||
fclose(fp);
|
||||
|
||||
if (ret != 1 || endptr == file_version_string)
|
||||
if (version_num < MIN_SUPPORTED_VERSION_NUM)
|
||||
{
|
||||
log_warning(_("unable to determine major version number from PG_VERSION"));
|
||||
|
||||
log_warning(_("data directory appears to be initialised for %s"),
|
||||
file_version_string);
|
||||
log_detail(_("minimum supported PostgreSQL version is %s"),
|
||||
MIN_SUPPORTED_VERSION);
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
file_major = strtol(file_version_string, &endptr, 10);
|
||||
file_minor = 0;
|
||||
|
||||
if (*endptr == '.')
|
||||
file_minor = strtol(endptr + 1, NULL, 10);
|
||||
|
||||
version_num = ((int) file_major * 10000) + ((int) file_minor * 100);
|
||||
|
||||
if (version_num < 90300)
|
||||
{
|
||||
log_warning(_("Data directory appears to be initialised for %s"), file_version_string);
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
|
||||
snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
|
||||
|
||||
if ((fd = open(ControlFilePath, O_RDONLY | PG_BINARY, 0)) == -1)
|
||||
@@ -220,6 +301,8 @@ get_controlfile(const char *DataDir)
|
||||
ControlFilePath);
|
||||
log_detail("%s", strerror(errno));
|
||||
|
||||
close(fd);
|
||||
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
@@ -227,13 +310,27 @@ get_controlfile(const char *DataDir)
|
||||
|
||||
control_file_info->control_file_processed = true;
|
||||
|
||||
if (version_num >= 90500)
|
||||
if (version_num >= 110000)
|
||||
{
|
||||
ControlFileData11 *ptr = (struct ControlFileData11 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
|
||||
control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
|
||||
control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
|
||||
}
|
||||
else if (version_num >= 90500)
|
||||
{
|
||||
ControlFileData95 *ptr = (struct ControlFileData95 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
|
||||
control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
|
||||
control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
|
||||
}
|
||||
else if (version_num >= 90400)
|
||||
{
|
||||
@@ -242,6 +339,9 @@ get_controlfile(const char *DataDir)
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
|
||||
control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
|
||||
control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
|
||||
}
|
||||
else if (version_num >= 90300)
|
||||
{
|
||||
@@ -250,6 +350,9 @@ get_controlfile(const char *DataDir)
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
control_file_info->timeline = ptr->checkPointCopy.ThisTimeLineID;
|
||||
control_file_info->minRecoveryPointTLI = ptr->minRecoveryPointTLI;
|
||||
control_file_info->minRecoveryPoint = ptr->minRecoveryPoint;
|
||||
}
|
||||
|
||||
pfree(ControlFileDataPtr);
|
||||
@@ -257,9 +360,7 @@ get_controlfile(const char *DataDir)
|
||||
/*
|
||||
* We don't check the CRC here as we're potentially checking a pg_control
|
||||
* file from a different PostgreSQL version to the one repmgr was compiled
|
||||
* against. However we're only interested in the first few fields, which
|
||||
* should be constant across supported versions
|
||||
*
|
||||
* against.
|
||||
*/
|
||||
|
||||
return control_file_info;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* controldata.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "postgres_fe.h"
|
||||
#include "catalog/pg_control.h"
|
||||
|
||||
#define MAX_VERSION_STRING 24
|
||||
/*
|
||||
* A simplified representation of pg_control containing only those fields
|
||||
* required by repmgr.
|
||||
@@ -23,6 +24,9 @@ typedef struct
|
||||
DBState state;
|
||||
XLogRecPtr checkPoint;
|
||||
uint32 data_checksum_version;
|
||||
TimeLineID timeline;
|
||||
TimeLineID minRecoveryPointTLI;
|
||||
XLogRecPtr minRecoveryPoint;
|
||||
} ControlFileInfo;
|
||||
|
||||
|
||||
@@ -134,13 +138,11 @@ typedef struct ControlFileData93
|
||||
|
||||
|
||||
/*
|
||||
* Following fields added since 9.3:
|
||||
* Following field added since 9.3:
|
||||
*
|
||||
* int max_worker_processes;
|
||||
* int max_prepared_xacts;
|
||||
* int max_locks_per_xact;
|
||||
*
|
||||
*/
|
||||
|
||||
typedef struct ControlFileData94
|
||||
{
|
||||
uint64 system_identifier;
|
||||
@@ -265,12 +267,80 @@ typedef struct ControlFileData95
|
||||
|
||||
} ControlFileData95;
|
||||
|
||||
/*
|
||||
* Following field removed in 11:
|
||||
*
|
||||
* XLogRecPtr prevCheckPoint;
|
||||
*
|
||||
* In 10, following field appended *after* "data_checksum_version":
|
||||
*
|
||||
* char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN];
|
||||
*
|
||||
* (but we don't care about that)
|
||||
*/
|
||||
|
||||
typedef struct ControlFileData11
|
||||
{
|
||||
uint64 system_identifier;
|
||||
|
||||
uint32 pg_control_version; /* PG_CONTROL_VERSION */
|
||||
uint32 catalog_version_no; /* see catversion.h */
|
||||
|
||||
DBState state; /* see enum above */
|
||||
pg_time_t time; /* time stamp of last pg_control update */
|
||||
XLogRecPtr checkPoint; /* last check point record ptr */
|
||||
|
||||
CheckPoint95 checkPointCopy; /* copy of last check point record */
|
||||
|
||||
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
|
||||
|
||||
XLogRecPtr minRecoveryPoint;
|
||||
TimeLineID minRecoveryPointTLI;
|
||||
XLogRecPtr backupStartPoint;
|
||||
XLogRecPtr backupEndPoint;
|
||||
bool backupEndRequired;
|
||||
|
||||
int wal_level;
|
||||
bool wal_log_hints;
|
||||
int MaxConnections;
|
||||
int max_worker_processes;
|
||||
int max_prepared_xacts;
|
||||
int max_locks_per_xact;
|
||||
bool track_commit_timestamp;
|
||||
|
||||
uint32 maxAlign; /* alignment requirement for tuples */
|
||||
double floatFormat; /* constant 1234567.0 */
|
||||
|
||||
uint32 blcksz; /* data block size for this DB */
|
||||
uint32 relseg_size; /* blocks per segment of large relation */
|
||||
|
||||
uint32 xlog_blcksz; /* block size within WAL files */
|
||||
uint32 xlog_seg_size; /* size of each WAL segment */
|
||||
|
||||
uint32 nameDataLen; /* catalog name field width */
|
||||
uint32 indexMaxKeys; /* max number of columns in an index */
|
||||
|
||||
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
|
||||
uint32 loblksize; /* chunk size in pg_largeobject */
|
||||
|
||||
bool enableIntTimes; /* int64 storage enabled? */
|
||||
|
||||
bool float4ByVal; /* float4 pass-by-value? */
|
||||
bool float8ByVal; /* float8, int8, etc pass-by-value? */
|
||||
|
||||
uint32 data_checksum_version;
|
||||
|
||||
} ControlFileData11;
|
||||
|
||||
|
||||
extern int get_pg_version(const char *data_directory, char *version_string);
|
||||
extern DBState get_db_state(const char *data_directory);
|
||||
extern const char *describe_db_state(DBState state);
|
||||
extern int get_data_checksum_version(const char *data_directory);
|
||||
extern uint64 get_system_identifier(const char *data_directory);
|
||||
extern XLogRecPtr get_latest_checkpoint_location(const char *data_directory);
|
||||
extern TimeLineID get_timeline(const char *data_directory);
|
||||
extern TimeLineID get_min_recovery_end_timeline(const char *data_directory);
|
||||
extern XLogRecPtr get_min_recovery_location(const char *data_directory);
|
||||
|
||||
#endif /* _CONTROLDATA_H_ */
|
||||
|
||||
123
dbutils.h
123
dbutils.h
@@ -1,7 +1,7 @@
|
||||
/*
|
||||
* dbutils.h
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -20,6 +20,7 @@
|
||||
#ifndef _REPMGR_DBUTILS_H_
|
||||
#define _REPMGR_DBUTILS_H_
|
||||
|
||||
#include "access/timeline.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#include "pqexpbuffer.h"
|
||||
#include "portability/instr_time.h"
|
||||
@@ -47,6 +48,7 @@ typedef enum
|
||||
typedef enum
|
||||
{
|
||||
REPMGR_INSTALLED = 0,
|
||||
REPMGR_OLD_VERSION_INSTALLED,
|
||||
REPMGR_AVAILABLE,
|
||||
REPMGR_UNAVAILABLE,
|
||||
REPMGR_UNKNOWN
|
||||
@@ -78,7 +80,8 @@ typedef enum
|
||||
NODE_STATUS_UP,
|
||||
NODE_STATUS_SHUTTING_DOWN,
|
||||
NODE_STATUS_DOWN,
|
||||
NODE_STATUS_UNCLEAN_SHUTDOWN
|
||||
NODE_STATUS_UNCLEAN_SHUTDOWN,
|
||||
NODE_STATUS_REJECTED
|
||||
} NodeStatus;
|
||||
|
||||
typedef enum
|
||||
@@ -104,6 +107,24 @@ typedef enum
|
||||
} BackupState;
|
||||
|
||||
|
||||
/*
|
||||
* Struct to store extension version information
|
||||
*/
|
||||
|
||||
typedef struct s_extension_versions {
|
||||
char default_version[8];
|
||||
int default_version_num;
|
||||
char installed_version[8];
|
||||
int installed_version_num;
|
||||
} t_extension_versions;
|
||||
|
||||
#define T_EXTENSION_VERSIONS_INITIALIZER { \
|
||||
"", \
|
||||
UNKNOWN_SERVER_VERSION_NUM, \
|
||||
"", \
|
||||
UNKNOWN_SERVER_VERSION_NUM \
|
||||
}
|
||||
|
||||
/*
|
||||
* Struct to store node information
|
||||
*/
|
||||
@@ -113,8 +134,8 @@ typedef struct s_node_info
|
||||
int node_id;
|
||||
int upstream_node_id;
|
||||
t_server_type type;
|
||||
char node_name[MAXLEN];
|
||||
char upstream_node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
char upstream_node_name[NAMEDATALEN];
|
||||
char conninfo[MAXLEN];
|
||||
char repluser[NAMEDATALEN];
|
||||
char location[MAXLEN];
|
||||
@@ -163,7 +184,7 @@ typedef struct s_node_info
|
||||
MS_NORMAL, \
|
||||
NULL, \
|
||||
/* for ad-hoc use e.g. when working with a list of nodes */ \
|
||||
"", true, true \
|
||||
"", true, true, \
|
||||
/* various statistics */ \
|
||||
-1, -1, -1, -1, -1, -1 \
|
||||
}
|
||||
@@ -281,22 +302,16 @@ typedef struct BdrNodeInfoList
|
||||
typedef struct
|
||||
{
|
||||
char current_timestamp[MAXLEN];
|
||||
uint64 last_wal_receive_lsn;
|
||||
uint64 last_wal_replay_lsn;
|
||||
bool in_recovery;
|
||||
XLogRecPtr last_wal_receive_lsn;
|
||||
XLogRecPtr last_wal_replay_lsn;
|
||||
char last_xact_replay_timestamp[MAXLEN];
|
||||
int replication_lag_time;
|
||||
bool receiving_streamed_wal;
|
||||
bool wal_replay_paused;
|
||||
int upstream_last_seen;
|
||||
} ReplInfo;
|
||||
|
||||
#define T_REPLINFO_INTIALIZER { \
|
||||
"", \
|
||||
InvalidXLogRecPtr, \
|
||||
InvalidXLogRecPtr, \
|
||||
"", \
|
||||
0 \
|
||||
}
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char filepath[MAXPGPATH];
|
||||
@@ -327,9 +342,24 @@ typedef struct
|
||||
UNKNOWN_TIMELINE_ID, \
|
||||
InvalidXLogRecPtr \
|
||||
}
|
||||
/* global variables */
|
||||
|
||||
extern int server_version_num;
|
||||
|
||||
typedef struct RepmgrdInfo {
|
||||
int node_id;
|
||||
int pid;
|
||||
char pid_text[MAXLEN];
|
||||
char pid_file[MAXLEN];
|
||||
bool pg_running;
|
||||
char pg_running_text[MAXLEN];
|
||||
RecoveryType recovery_type;
|
||||
bool running;
|
||||
char repmgrd_running[MAXLEN];
|
||||
bool paused;
|
||||
bool wal_paused_pending_wal;
|
||||
int upstream_last_seen;
|
||||
char upstream_last_seen_text[MAXLEN];
|
||||
} RepmgrdInfo;
|
||||
|
||||
|
||||
/* macros */
|
||||
|
||||
@@ -346,15 +376,13 @@ __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||
bool atobool(const char *value);
|
||||
|
||||
/* connection functions */
|
||||
PGconn *establish_db_connection(const char *conninfo,
|
||||
PGconn *establish_db_connection(const char *conninfo,
|
||||
const bool exit_on_error);
|
||||
PGconn *establish_db_connection_quiet(const char *conninfo);
|
||||
|
||||
PGconn *establish_db_connection_by_params(t_conninfo_param_list *param_list,
|
||||
PGconn *establish_db_connection_by_params(t_conninfo_param_list *param_list,
|
||||
const bool exit_on_error);
|
||||
PGconn *establish_primary_db_connection(PGconn *conn,
|
||||
PGconn *establish_primary_db_connection(PGconn *conn,
|
||||
const bool exit_on_error);
|
||||
|
||||
PGconn *get_primary_connection(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
||||
PGconn *get_primary_connection_quiet(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
||||
|
||||
@@ -373,6 +401,7 @@ void param_set_ine(t_conninfo_param_list *param_list, const char *param, const
|
||||
char *param_get(t_conninfo_param_list *param_list, const char *param);
|
||||
bool parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char **errmsg, bool ignore_local_params);
|
||||
char *param_list_to_string(t_conninfo_param_list *param_list);
|
||||
char *normalize_conninfo_string(const char *conninfo_str);
|
||||
bool has_passfile(void);
|
||||
|
||||
|
||||
@@ -380,7 +409,6 @@ bool has_passfile(void);
|
||||
bool begin_transaction(PGconn *conn);
|
||||
bool commit_transaction(PGconn *conn);
|
||||
bool rollback_transaction(PGconn *conn);
|
||||
bool check_cluster_schema(PGconn *conn);
|
||||
|
||||
/* GUC manipulation functions */
|
||||
bool set_config(PGconn *conn, const char *config_param, const char *config_value);
|
||||
@@ -388,31 +416,47 @@ bool set_config_bool(PGconn *conn, const char *config_param, bool state);
|
||||
int guc_set(PGconn *conn, const char *parameter, const char *op, const char *value);
|
||||
int guc_set_typed(PGconn *conn, const char *parameter, const char *op, const char *value, const char *datatype);
|
||||
bool get_pg_setting(PGconn *conn, const char *setting, char *output);
|
||||
bool alter_system_int(PGconn *conn, const char *name, int value);
|
||||
bool pg_reload_conf(PGconn *conn);
|
||||
|
||||
/* server information functions */
|
||||
bool get_cluster_size(PGconn *conn, char *size);
|
||||
int get_server_version(PGconn *conn, char *server_version);
|
||||
int get_server_version(PGconn *conn, char *server_version_buf);
|
||||
|
||||
RecoveryType get_recovery_type(PGconn *conn);
|
||||
int get_primary_node_id(PGconn *conn);
|
||||
int get_ready_archive_files(PGconn *conn, const char *data_directory);
|
||||
bool identify_system(PGconn *repl_conn, t_system_identification *identification);
|
||||
TimeLineHistoryEntry *get_timeline_history(PGconn *repl_conn, TimeLineID tli);
|
||||
|
||||
/* repmgrd shared memory functions */
|
||||
bool repmgrd_set_local_node_id(PGconn *conn, int local_node_id);
|
||||
int repmgrd_get_local_node_id(PGconn *conn);
|
||||
bool repmgrd_check_local_node_id(PGconn *conn);
|
||||
BackupState server_in_exclusive_backup_mode(PGconn *conn);
|
||||
void repmgrd_set_pid(PGconn *conn, pid_t repmgrd_pid, const char *pidfile);
|
||||
pid_t repmgrd_get_pid(PGconn *conn);
|
||||
bool repmgrd_is_running(PGconn *conn);
|
||||
bool repmgrd_is_paused(PGconn *conn);
|
||||
bool repmgrd_pause(PGconn *conn, bool pause);
|
||||
pid_t get_wal_receiver_pid(PGconn *conn);
|
||||
|
||||
/* extension functions */
|
||||
ExtensionStatus get_repmgr_extension_status(PGconn *conn);
|
||||
ExtensionStatus get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions);
|
||||
|
||||
/* node management functions */
|
||||
void checkpoint(PGconn *conn);
|
||||
bool vacuum_table(PGconn *conn, const char *table);
|
||||
|
||||
bool promote_standby(PGconn *conn, bool wait, int wait_seconds);
|
||||
bool resume_wal_replay(PGconn *conn);
|
||||
|
||||
/* node record functions */
|
||||
t_server_type parse_node_type(const char *type);
|
||||
const char *get_node_type_string(t_server_type type);
|
||||
|
||||
RecordStatus get_node_record(PGconn *conn, int node_id, t_node_info *node_info);
|
||||
RecordStatus refresh_node_record(PGconn *conn, int node_id, t_node_info *node_info);
|
||||
|
||||
RecordStatus get_node_record_with_upstream(PGconn *conn, int node_id, t_node_info *node_info);
|
||||
|
||||
RecordStatus get_node_record_by_name(PGconn *conn, const char *node_name, t_node_info *node_info);
|
||||
@@ -421,7 +465,7 @@ t_node_info *get_node_record_pointer(PGconn *conn, int node_id);
|
||||
bool get_local_node_record(PGconn *conn, int node_id, t_node_info *node_info);
|
||||
bool get_primary_node_record(PGconn *conn, t_node_info *node_info);
|
||||
|
||||
void get_all_node_records(PGconn *conn, NodeInfoList *node_list);
|
||||
bool get_all_node_records(PGconn *conn, NodeInfoList *node_list);
|
||||
void get_downstream_node_records(PGconn *conn, int node_id, NodeInfoList *nodes);
|
||||
void get_active_sibling_node_records(PGconn *conn, int node_id, int upstream_node_id, NodeInfoList *node_list);
|
||||
void get_node_records_by_priority(PGconn *conn, NodeInfoList *node_list);
|
||||
@@ -459,7 +503,7 @@ PGresult *get_event_records(PGconn *conn, int node_id, const char *node_name,
|
||||
|
||||
/* replication slot functions */
|
||||
void create_slot_name(char *slot_name, int node_id);
|
||||
bool create_replication_slot(PGconn *conn, char *slot_name, int server_version_num, PQExpBufferData *error_msg);
|
||||
bool create_replication_slot(PGconn *conn, char *slot_name, PQExpBufferData *error_msg);
|
||||
bool drop_replication_slot(PGconn *conn, char *slot_name);
|
||||
RecordStatus get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record);
|
||||
int get_free_replication_slot_count(PGconn *conn);
|
||||
@@ -470,12 +514,14 @@ bool get_tablespace_name_by_location(PGconn *conn, const char *location, char *
|
||||
|
||||
/* asynchronous query functions */
|
||||
bool cancel_query(PGconn *conn, int timeout);
|
||||
int wait_connection_availability(PGconn *conn, long long timeout);
|
||||
int wait_connection_availability(PGconn *conn, int timeout);
|
||||
|
||||
/* node availability functions */
|
||||
bool is_server_available(const char *conninfo);
|
||||
bool is_server_available_quiet(const char *conninfo);
|
||||
bool is_server_available_params(t_conninfo_param_list *param_list);
|
||||
void connection_ping(PGconn *conn);
|
||||
ExecStatusType connection_ping(PGconn *conn);
|
||||
ExecStatusType connection_ping_reconnect(PGconn *conn);
|
||||
|
||||
/* monitoring functions */
|
||||
void
|
||||
@@ -491,8 +537,8 @@ add_monitoring_record(PGconn *primary_conn,
|
||||
long long unsigned int apply_lag_bytes
|
||||
);
|
||||
|
||||
int get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history);
|
||||
bool delete_monitoring_records(PGconn *primary_conn, int keep_history);
|
||||
int get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_history, int node_id);
|
||||
bool delete_monitoring_records(PGconn *primary_conn, int keep_history, int node_id);
|
||||
|
||||
|
||||
|
||||
@@ -506,12 +552,17 @@ bool get_new_primary(PGconn *conn, int *primary_node_id);
|
||||
void reset_voting_status(PGconn *conn);
|
||||
|
||||
/* replication status functions */
|
||||
XLogRecPtr get_current_wal_lsn(PGconn *conn);
|
||||
XLogRecPtr get_primary_current_lsn(PGconn *conn);
|
||||
XLogRecPtr get_node_current_lsn(PGconn *conn);
|
||||
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
|
||||
bool get_replication_info(PGconn *conn, ReplInfo *replication_info);
|
||||
void init_replication_info(ReplInfo *replication_info);
|
||||
bool get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info);
|
||||
int get_replication_lag_seconds(PGconn *conn);
|
||||
void get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *node_info);
|
||||
void get_node_replication_stats(PGconn *conn, t_node_info *node_info);
|
||||
bool is_downstream_node_attached(PGconn *conn, char *node_name);
|
||||
void set_upstream_last_seen(PGconn *conn);
|
||||
int get_upstream_last_seen(PGconn *conn, t_server_type node_type);
|
||||
bool is_wal_replay_paused(PGconn *conn, bool check_pending_wal);
|
||||
|
||||
/* BDR functions */
|
||||
int get_bdr_version_num(void);
|
||||
|
||||
35
dirutil.c
35
dirutil.c
@@ -3,7 +3,7 @@
|
||||
* dirmod.c
|
||||
* directory handling functions
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -50,7 +50,7 @@ typedef long pgpid_t;
|
||||
* and tablespace directories.
|
||||
*/
|
||||
DataDirState
|
||||
check_dir(char *path)
|
||||
check_dir(const char *path)
|
||||
{
|
||||
DIR *chkdir = NULL;
|
||||
struct dirent *file = NULL;
|
||||
@@ -91,12 +91,17 @@ check_dir(char *path)
|
||||
* Create directory with error log message when failing
|
||||
*/
|
||||
bool
|
||||
create_dir(char *path)
|
||||
create_dir(const char *path)
|
||||
{
|
||||
if (mkdir_p(path, 0700) == 0)
|
||||
char create_dir_path[MAXPGPATH];
|
||||
|
||||
/* mkdir_p() may modify the supplied path */
|
||||
strncpy(create_dir_path, path, MAXPGPATH);
|
||||
|
||||
if (mkdir_p(create_dir_path, 0700) == 0)
|
||||
return true;
|
||||
|
||||
log_error(_("unable to create directory \"%s\""), path);
|
||||
log_error(_("unable to create directory \"%s\""), create_dir_path);
|
||||
log_detail("%s", strerror(errno));
|
||||
|
||||
return false;
|
||||
@@ -104,13 +109,12 @@ create_dir(char *path)
|
||||
|
||||
|
||||
bool
|
||||
set_dir_permissions(char *path)
|
||||
set_dir_permissions(const char *path)
|
||||
{
|
||||
return (chmod(path, 0700) != 0) ? false : true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* function from initdb.c */
|
||||
/* source adapted from FreeBSD /src/bin/mkdir/mkdir.c */
|
||||
|
||||
@@ -198,9 +202,9 @@ mkdir_p(char *path, mode_t omode)
|
||||
|
||||
|
||||
bool
|
||||
is_pg_dir(char *path)
|
||||
is_pg_dir(const char *path)
|
||||
{
|
||||
char dirpath[MAXPGPATH];
|
||||
char dirpath[MAXPGPATH] = "";
|
||||
struct stat sb;
|
||||
|
||||
/* test pgdata */
|
||||
@@ -223,7 +227,7 @@ is_pg_dir(char *path)
|
||||
* any further useful progress can be made.
|
||||
*/
|
||||
PgDirState
|
||||
is_pg_running(char *path)
|
||||
is_pg_running(const char *path)
|
||||
{
|
||||
long pid;
|
||||
FILE *pidf;
|
||||
@@ -272,6 +276,8 @@ is_pg_running(char *path)
|
||||
log_warning(_("invalid data in PostgreSQL PID file \"%s\""), path);
|
||||
}
|
||||
|
||||
fclose(pidf);
|
||||
|
||||
return PG_DIR_NOT_RUNNING;
|
||||
}
|
||||
|
||||
@@ -291,7 +297,7 @@ is_pg_running(char *path)
|
||||
|
||||
|
||||
bool
|
||||
create_pg_dir(char *path, bool force)
|
||||
create_pg_dir(const char *path, bool force)
|
||||
{
|
||||
/* Check this directory can be used as a PGDATA dir */
|
||||
switch (check_dir(path))
|
||||
@@ -347,8 +353,9 @@ create_pg_dir(char *path, bool force)
|
||||
}
|
||||
break;
|
||||
case DIR_ERROR:
|
||||
log_error(_("could not access directory \"%s\": %s"),
|
||||
path, strerror(errno));
|
||||
log_error(_("could not access directory \"%s\"")
|
||||
, path);
|
||||
log_detail("%s", strerror(errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -358,7 +365,7 @@ create_pg_dir(char *path, bool force)
|
||||
|
||||
|
||||
int
|
||||
rmdir_recursive(char *path)
|
||||
rmdir_recursive(const char *path)
|
||||
{
|
||||
return nftw(path, unlink_dir_callback, 64, FTW_DEPTH | FTW_PHYS);
|
||||
}
|
||||
|
||||
16
dirutil.h
16
dirutil.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* dirutil.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -35,13 +35,13 @@ typedef enum
|
||||
} PgDirState;
|
||||
|
||||
extern int mkdir_p(char *path, mode_t omode);
|
||||
extern bool set_dir_permissions(char *path);
|
||||
extern bool set_dir_permissions(const char *path);
|
||||
|
||||
extern DataDirState check_dir(char *path);
|
||||
extern bool create_dir(char *path);
|
||||
extern bool is_pg_dir(char *path);
|
||||
extern PgDirState is_pg_running(char *path);
|
||||
extern bool create_pg_dir(char *path, bool force);
|
||||
extern int rmdir_recursive(char *path);
|
||||
extern DataDirState check_dir(const char *path);
|
||||
extern bool create_dir(const char *path);
|
||||
extern bool is_pg_dir(const char *path);
|
||||
extern PgDirState is_pg_running(const char *path);
|
||||
extern bool create_pg_dir(const char *path, bool force);
|
||||
extern int rmdir_recursive(const char *path);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -61,7 +61,7 @@ clean:
|
||||
|
||||
maintainer-clean:
|
||||
rm -rf html
|
||||
rm -rf Makefile
|
||||
rm -f Makefile
|
||||
|
||||
zip: html
|
||||
cp -r html repmgr-docs-$(REPMGR_VERSION)
|
||||
|
||||
@@ -21,13 +21,17 @@
|
||||
in PostgreSQL 9.3, as well as improved automated failover support
|
||||
via <application>repmgrd</application>, and is not compatible with PostgreSQL 9.2
|
||||
and earlier. We recommend upgrading to &repmgr; 4, as the &repmgr; 3.x
|
||||
series will no longer be actively maintained.
|
||||
series is no longer maintained.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; 2.x supports PostgreSQL 9.0 ~ 9.3. While it is compatible
|
||||
with PostgreSQL 9.3, we recommend using repmgr 4.x. &repmgr; 2.x is
|
||||
no longer maintained.
|
||||
</para>
|
||||
<para>
|
||||
See also <link linkend="install-compatibility-matrix">&repmgr; compatibility matrix</link>
|
||||
and <link linkend="faq-upgrade-repmgr">Should I upgrade &repmgr;?</link>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-replication-slots-advantage" xreflabel="Advantages of replication slots">
|
||||
@@ -35,15 +39,25 @@
|
||||
<para>
|
||||
Replication slots, introduced in PostgreSQL 9.4, ensure that the
|
||||
primary server will retain WAL files until they have been consumed
|
||||
by all standby servers. This makes WAL file management much easier,
|
||||
and if used &repmgr; will no longer insist on a fixed minimum number
|
||||
(default: 5000) of WAL files being retained.
|
||||
by all standby servers. This means standby servers should never
|
||||
fail due to not being able to retrieve required WAL files from the
|
||||
primary.
|
||||
</para>
|
||||
<para>
|
||||
However this does mean that if a standby is no longer connected to the
|
||||
primary, the presence of the replication slot will cause WAL files
|
||||
to be retained indefinitely.
|
||||
to be retained indefinitely, and eventually lead to disk space
|
||||
exhaustion.
|
||||
</para>
|
||||
|
||||
<tip>
|
||||
<para>
|
||||
2ndQuadrant's recommended configuration is to configure
|
||||
<ulink url="https://www.pgbarman.org/">Barman</ulink> as a fallback
|
||||
source of WAL files, rather than maintain replication slots for
|
||||
each standby. See also: <link linkend="cloning-from-barman-restore-command">Using Barman as a WAL file source</link>.
|
||||
</para>
|
||||
</tip>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-replication-slots-number" xreflabel="Number of replication slots">
|
||||
@@ -62,7 +76,7 @@
|
||||
<para>
|
||||
Before PostgreSQL 10, hash indexes were not WAL logged and are therefore not suitable
|
||||
for use in streaming replication in PostgreSQL 9.6 and earlier. See the
|
||||
<ulink url="https://www.postgresql.org/docs/9.6/static/sql-createindex.html#AEN80279">PostgreSQL documentation</ulink>
|
||||
<ulink url="https://www.postgresql.org/docs/9.6/sql-createindex.html#AEN80279">PostgreSQL documentation</ulink>
|
||||
for details.
|
||||
</para>
|
||||
<para>
|
||||
@@ -82,12 +96,11 @@
|
||||
<para>
|
||||
For <emphasis>major</emphasis> version upgrades (e.g. from PostgreSQL 9.6 to PostgreSQL 10),
|
||||
the traditional approach is to "reseed" a cluster by upgrading a single
|
||||
node with <ulink url="https://www.postgresql.org/docs/current/static/pgupgrade.html">pg_upgrade</ulink>
|
||||
node with <ulink url="https://www.postgresql.org/docs/current/pgupgrade.html">pg_upgrade</ulink>
|
||||
and recloning standbys from this.
|
||||
</para>
|
||||
<para>
|
||||
To minimize downtime during major upgrades, for more recent PostgreSQL
|
||||
versions (PostgreSQL 9.4 and later),
|
||||
To minimize downtime during major upgrades from PostgreSQL 9.4 and later,
|
||||
<ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
|
||||
can be used to set up a parallel cluster using the newer PostgreSQL version,
|
||||
which can be kept in sync with the existing production cluster until the
|
||||
@@ -108,6 +121,82 @@
|
||||
is not possible, contact your vendor for assistance.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-old-packages">
|
||||
<title>How can I obtain old versions of &repmgr; packages?</title>
|
||||
<para>
|
||||
See appendix <xref linkend="packages-old-versions"> for details.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgr-required-for-replication">
|
||||
<title>Is &repmgr; required for streaming replication?</title>
|
||||
<para>
|
||||
No.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; (together with <application>repmgrd</application>) assists with
|
||||
<emphasis>managing</emphasis> replication. It does not actually perform replication, which
|
||||
is part of the core PostgreSQL functionality.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-what-if-repmgr-uninstalled">
|
||||
<title>Will replication stop working if &repmgr; is uninstalled?</title>
|
||||
<para>
|
||||
No. See preceding question.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-version-mix">
|
||||
<title>Does it matter if different &repmgr; versions are present in the replication cluster?</title>
|
||||
<para>
|
||||
Yes. If different "major" &repmgr; versions (e.g. 3.3.x and 4.1.x) are present,
|
||||
&repmgr; (in particular <application>repmgrd</application>)
|
||||
may not run, or run properly, or in the worst case (if different <application>repmgrd</application>
|
||||
versions are running and there are differences in the failover implementation) break
|
||||
your replication cluster.
|
||||
</para>
|
||||
<para>
|
||||
If different "minor" &repmgr; versions (e.g. 4.1.1 and 4.1.6) are installed,
|
||||
&repmgr; will function, but we strongly recommend always running the same version
|
||||
to ensure there are no unexpected suprises, e.g. a newer version behaving slightly
|
||||
differently to the older version.
|
||||
</para>
|
||||
<para>
|
||||
See also <link linkend="faq-upgrade-repmgr">Should I upgrade &repmgr;?</link>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-upgrade-repmgr">
|
||||
<title>Should I upgrade &repmgr;?</title>
|
||||
<para>
|
||||
Yes.
|
||||
</para>
|
||||
<para>
|
||||
We don't release new versions for fun, you know. Upgrading may require a little effort,
|
||||
but running an older &repmgr; version with bugs which have since been fixed may end up
|
||||
costing you more effort. The same applies to PostgreSQL itself.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgr-conf-data-directory">
|
||||
<title>Why do I need to specify the data directory location in repmgr.conf?</title>
|
||||
<para>
|
||||
In some circumstances &repmgr; may need to access a PostgreSQL data
|
||||
directory while the PostgreSQL server is not running, e.g. to confirm
|
||||
it shut down cleanly during a <link linkend="performing-switchover">switchover</link>.
|
||||
</para>
|
||||
<para>
|
||||
Additionally, this provides support when using &repmgr; on PostgreSQL 9.6 and
|
||||
earlier, where the <literal>repmgr</literal> user is not a superuser; in that
|
||||
case the <literal>repmgr</literal> user will not be able to access the
|
||||
<literal>data_directory</literal> configuration setting, access to which is restricted
|
||||
to superusers. (In PostgreSQL 10 and later, non-superusers can be added to the
|
||||
group <option>pg_read_all_settings</option> which will enable them to read this setting).
|
||||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="faq-repmgr" xreflabel="repmgr">
|
||||
@@ -239,11 +328,22 @@
|
||||
Under some circumstances event notifications can be generated for servers
|
||||
which have not yet been registered; it's also useful to retain a record
|
||||
of events which includes servers removed from the replication cluster
|
||||
which no longer have an entry in the <literal>repmrg.nodes</literal> table.
|
||||
which no longer have an entry in the <literal>repmgr.nodes</literal> table.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2 id="faq-repmgr-recovery-conf-quoted-values" xreflabel="Quoted values in recovery.conf">
|
||||
<title>Why are some values in <filename>recovery.conf</filename> surrounded by pairs of single quotes?</title>
|
||||
<para>
|
||||
This is to ensure that user-supplied values which are written as parameter values in <filename>recovery.conf</filename>
|
||||
are escaped correctly and do not cause errors when <filename>recovery.conf</filename> is parsed.
|
||||
</para>
|
||||
<para>
|
||||
The escaping is performed by an internal PostgreSQL routine, which leaves strings consisting
|
||||
of digits and alphabetical characters only as-is, but wraps everything else in pairs of single quotes,
|
||||
even if the string does not contain any characters which need escaping.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
@@ -255,7 +355,7 @@
|
||||
<sect2 id="faq-repmgrd-prevent-promotion" xreflabel="Prevent standby from being promoted to primary">
|
||||
<title>How can I prevent a node from ever being promoted to primary?</title>
|
||||
<para>
|
||||
In `repmgr.conf`, set its priority to a value of 0 or less; apply the changed setting with
|
||||
In <filename>repmgr.conf</filename>, set its priority to a value of <literal>0</literal>; apply the changed setting with
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>.
|
||||
</para>
|
||||
<para>
|
||||
@@ -303,5 +403,36 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgrd-pg-bindir" xreflabel="repmgrd does not apply pg_bindir to promote_command or follow_command">
|
||||
<title>
|
||||
<application>repmgrd</application> ignores pg_bindir when executing <varname>promote_command</varname> or <varname>follow_command</varname>
|
||||
</title>
|
||||
<para>
|
||||
<varname>promote_command</varname> or <varname>follow_command</varname> can be user-defined scripts,
|
||||
so &repmgr; will not apply <option>pg_bindir</option> even if excuting &repmgr;. Always provide the full
|
||||
path; see <xref linkend="repmgrd-automatic-failover-configuration"> for more details.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgrd-startup-no-upstream" xreflabel="repmgrd does not start if upstream node is not running">
|
||||
<title>
|
||||
<application>repmgrd</application> aborts startup with the error "<literal>upstream node must be running before repmgrd can start</literal>"
|
||||
</title>
|
||||
<para>
|
||||
<application>repmgrd</application> does this to avoid starting up on a replication cluster
|
||||
which is not in a healthy state. If the upstream is unavailable, <application>repmgrd</application>
|
||||
may initiate a failover immediately after starting up, which could have unintended side-effects,
|
||||
particularly if <application>repmgrd</application> is not running on other nodes.
|
||||
</para>
|
||||
<para>
|
||||
In particular, it's possible that the node's local copy of the <literal>repmgr.nodes</literal> copy
|
||||
is out-of-date, which may lead to incorrect failover behaviour.
|
||||
</para>
|
||||
<para>
|
||||
The onus is therefore on the adminstrator to manually set the cluster to a stable, healthy state before
|
||||
starting <application>repmgrd</application>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
</appendix>
|
||||
|
||||
@@ -12,10 +12,17 @@
|
||||
|
||||
<sect1 id="packages-centos" xreflabel="CentOS packages">
|
||||
<title>CentOS Packages</title>
|
||||
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>CentOS packages</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>CentOS</primary>
|
||||
<secondary>package information</secondary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
Currently, &repmgr; RPM packages are provided for versions 6.x and 7.x of CentOS. These should also
|
||||
work on matching versions of Red Hat Enterprise Linux, Scientific Linux and Oracle Enterprise Linux;
|
||||
@@ -53,11 +60,11 @@
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink></entry>
|
||||
<entry><ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/current/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/current/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
@@ -237,6 +244,12 @@
|
||||
<primary>packages</primary>
|
||||
<secondary>Debian/Ubuntu packages</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>Debian/Ubuntu</primary>
|
||||
<secondary>package information</secondary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
&repmgr; <literal>.deb</literal> packages are provided via the
|
||||
PostgreSQL Community APT repository, and are available for each community-supported
|
||||
@@ -253,6 +266,23 @@
|
||||
</para>
|
||||
|
||||
|
||||
<table id="apt-2ndquadrant-repository">
|
||||
<title>2ndQuadrant public repository</title>
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/current/installation-packages.html#INSTALLATION-PACKAGES-DEBIAN">https://repmgr.org/docs/current/installation-packages.html#INSTALLATION-PACKAGES-DEBIAN</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
|
||||
<table id="apt-repository">
|
||||
<title>PostgreSQL Community APT repository (PGDG)</title>
|
||||
<tgroup cols="2">
|
||||
@@ -263,7 +293,7 @@
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://wiki.postgresql.org/wiki/Apt)">https://wiki.postgresql.org/wiki/Apt)</ulink></entry>
|
||||
<entry><ulink url="https://wiki.postgresql.org/wiki/Apt">https://wiki.postgresql.org/wiki/Apt</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
@@ -365,6 +395,134 @@
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="packages-snapshot" xreflabel="Snapshot packages">
|
||||
<title>Snapshot packages</title>
|
||||
<indexterm>
|
||||
<primary>snapshot packages</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>snaphots</secondary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
For testing new features and bug fixes, from time to time 2ndQuadrant provides
|
||||
so-called "snapshot packages" via its public repository. These packages
|
||||
are built from the &repmgr; source at a particular point in time, and are not formal
|
||||
releases.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
We do not recommend installing these packages in a production environment
|
||||
unless specifically advised.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
To install a snapshot package, it's necessary to install the 2ndQuadrant public snapshot repository,
|
||||
following the instructions here: <ulink url="https://dl.2ndquadrant.com/default/release/site/">https://dl.2ndquadrant.com/default/release/site/</ulink> but replace <literal>release</literal> with <literal>snapshot</literal>
|
||||
in the appropriate URL.
|
||||
</para>
|
||||
<para>
|
||||
For example, to install the snapshot RPM repository for PostgreSQL 9.6, execute (as <literal>root</literal>):
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/snapshot/get/9.6/rpm | bash</programlisting>
|
||||
|
||||
or as a normal user with root sudo access:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/snapshot/get/9.6/rpm | sudo bash</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Alternatively you can browse the repository here:
|
||||
<ulink url="https://dl.2ndquadrant.com/default/snapshot/browse/">https://dl.2ndquadrant.com/default/snapshot/browse/</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
Once the repository is installed, installing or updating &repmgr; will result in the latest snapshot
|
||||
package being installed.
|
||||
</para>
|
||||
<para>
|
||||
The package name will be formatted like this:
|
||||
<programlisting>
|
||||
repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
|
||||
containg the snapshot build number (here: <literal>320</literal>) and the hash
|
||||
of the <application>git</application> commit it was built from (here: <literal>g5113ab0</literal>).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that the next formal release (in the above example <literal>4.1.1</literal>), once available,
|
||||
will install in place of any snapshot builds.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="packages-old-versions" xreflabel="Installing old package versions">
|
||||
<title>Installing old package versions</title>
|
||||
|
||||
<indexterm>
|
||||
<primary>old packages</primary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>old versions</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>installation</primary>
|
||||
<secondary>old package versions</secondary>
|
||||
</indexterm>
|
||||
|
||||
<sect2 id="packages-old-versions-debian" xreflabel="old Debian package versions">
|
||||
<title>Debian/Ubuntu</title>
|
||||
<para>
|
||||
An archive of old packages (<literal>3.3.2</literal> and later) for Debian/Ubuntu-based systems is available here:
|
||||
<ulink url="http://atalia.postgresql.org/morgue/r/repmgr/">http://atalia.postgresql.org/morgue/r/repmgr/</ulink>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="packages-old-versions-rhel-centos" xreflabel="old RHEL/CentOS package versions">
|
||||
<title>RHEL/CentOS</title>
|
||||
<para>
|
||||
Old RPM packages (<literal>3.2</literal> and later) can be retrieved from the
|
||||
(deprecated) 2ndQuadrant repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/">http://packages.2ndquadrant.com/</ulink>
|
||||
by installing the appropriate repository RPM:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
Old versions can be located with e.g.:
|
||||
<programlisting>
|
||||
yum --showduplicates list repmgr96</programlisting>
|
||||
(substitute the appropriate package name; see <xref linkend="packages-centos">) and installed with:
|
||||
<programlisting>
|
||||
yum install {package_name}-{version}</programlisting>
|
||||
where <literal>{package_name}</literal> is the base package name (e.g. <literal>repmgr96</literal>)
|
||||
and <literal>{version}</literal> is the version listed by the
|
||||
<command> yum --showduplicates list ...</command> command, e.g. <literal>4.0.6-1.rhel6</literal>.
|
||||
</para>
|
||||
<para>For example:
|
||||
<programlisting>
|
||||
yum install repmgr96-4.0.6-1.rhel6</programlisting>
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="packages-packager-info" xreflabel="Information for packagers">
|
||||
<title>Information for packagers</title>
|
||||
@@ -373,7 +531,7 @@
|
||||
<secondary>information for packagers</secondary>
|
||||
</indexterm>
|
||||
<para>
|
||||
We recommend patching the following parameters when
|
||||
We recommend patching the following parameters when
|
||||
building the package as built-in default values for user convenience.
|
||||
These values can nevertheless be overridden by the user, if desired.
|
||||
</para>
|
||||
|
||||
@@ -15,9 +15,589 @@
|
||||
See also: <xref linkend="upgrading-repmgr">
|
||||
</para>
|
||||
|
||||
<sect1 id="release-4.3">
|
||||
<title>Release 4.3</title>
|
||||
<para><emphasis>Mar ???, 2019</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.3 is a major release.
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
|
||||
please ensure that in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
|
||||
<varname>REPMGRD_OPTS</varname> contains "<literal>--daemonize=false</literal>", e.g.:
|
||||
<programlisting>
|
||||
# additional options
|
||||
REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd configuration on Debian/Ubuntu</link>.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<sect2>
|
||||
<title>repmgr enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>:
|
||||
option <option>--upstream-node-id</option> can now be used to specify another standby
|
||||
to follow.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>:
|
||||
verify that it is actually possible to follow another node.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>:
|
||||
verify that it is actually possible to attach the node to the current primary.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
New commands <link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link> and
|
||||
<link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link>:
|
||||
these provide a standardized way of starting and stopping <application>repmgrd</application>.
|
||||
GitHub #528.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
These commands require the configuration file settings
|
||||
<varname>repmgrd_service_start_command</varname> and <varname>repmgrd_service_stop_command</varname>
|
||||
in <filename>repmgr.conf</filename> to be set.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>
|
||||
additionally displays the node priority and the interval (in seconds) since the
|
||||
<application>repmgrd</application> instance last verified its upstream node was available.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add <option>--compact</option> option to <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command> (GitHub #521).
|
||||
</para>
|
||||
<para>
|
||||
This makes it easier to copy the output into emails, chats etc. as a compact table.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||
differentiate between unreachable nodes and nodes which are running but rejecting connections.
|
||||
</para>
|
||||
<para>
|
||||
This makes it possible to see whether a node is unreachable at network level,
|
||||
or if it is running but rejecting connections for some reason.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add <option>--dry-run</option> to <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> (GitHub #522).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr --version-number</command> outputs the "raw"
|
||||
repmgr version number (e.g. <literal>40300</literal>). This is intended
|
||||
for use by scripts etc. requiring an easily parseable representation
|
||||
of the &repmgr; version.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="repmgr-node-check"><command>repmgr node check --data-directory-config</command></link>
|
||||
option added; this is to confirm &repmgr; is correctly configured. GitHub #523.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add check to <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>
|
||||
to ensure the data directory on the demotion candidate is configured correctly in <filename>repmgr.conf</filename>.
|
||||
This is to ensure that &repmgr;, when remotely executed on the demotion candidate, can correctly verify
|
||||
that PostgreSQL on the demotion candidate was shut down cleanly. GitHub #523.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>repmgrd enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> will no longer consider nodes where <application>repmgrd</application>
|
||||
is not running as promotion candidates.
|
||||
</para>
|
||||
<para>
|
||||
Previously, if <application>repmgrd</application> was not running on a node, but
|
||||
that node qualified as the promotion candidate, it would never be promoted due to
|
||||
the absence of a running <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add option <option>connection_check_type</option> to enable selection of the method
|
||||
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||
</para>
|
||||
<para>
|
||||
Possible values are <literal>ping</literal> (default; uses <command>PQping()</command> to
|
||||
determine server availability), <literal>connection</literal> (attempst to make a new connection to
|
||||
the upstream node), and <literal>query</literal> (determines server availability
|
||||
by executing an SQL statement on the node via the existing connection).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
New configuration option <link linkend="repmgrd-failover-validation"><option>failover_validation_command</option></link>
|
||||
to allow an external mechanism to validate the failover decision made by <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
New configuration option <link linkend="repmgrd-standby-disconnection-on-failover"><option>standby_disconnect_on_failover</option></link>
|
||||
to force standbys to disconnect their WAL receivers before making a failover decision.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
In a failover situation, <application>repmgrd</application> will not attempt to promote a
|
||||
node if another standby has already appeared (e.g. by being promoted manually).
|
||||
GitHub #420.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>,
|
||||
prevent escaping issues with connection URIs when executing <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>
|
||||
on the demotion candidate. GitHub #525.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>,
|
||||
check the node to connected is actually the primary (i.e. not the witness server). GitHub #528.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <link linkend="repmgr-standby-clone"><command>repmgr standby clone</command></link>,
|
||||
recheck primary/upstream connection(s) after the data copy operation is complete, as these may
|
||||
have gone away.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
avoid a potential race condition when comparing received WAL on the standby to the primary's shutdown location,
|
||||
as the standby's walreceiver may not have yet flushed all received WAL to disk. GitHub #518.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: when executing <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
verify the standby (promotion candidate) is currently attached to the primary (demotion candidate). GitHub #519.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: on a cascaded standby, don't fail over if
|
||||
<literal>failover=manual</literal>. GitHub #531.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
|
||||
fix display of node IDs with multiple digits.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
ensure <command><link linkend="repmgr-primary-unregister">repmgr primary unregister</link></command>
|
||||
behaves correctly when executed on a witness server. GitHub #548.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
ensure <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>
|
||||
fails when <option>--upstream-node-id</option> is the same as the local node ID.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-node-check">repmgr node check</link></command>
|
||||
will only consider physical replication slots, as the purpose
|
||||
of slot checks is to warn about potential issues with
|
||||
streaming replication standbys which are no longer attached.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="release-4.2">
|
||||
<title>Release 4.2</title>
|
||||
<para><emphasis>Wed October 24, 2018</emphasis></para>
|
||||
|
||||
<para>
|
||||
&repmgr; 4.2 is a major release, with the main new feature being the
|
||||
ability to <link linkend="repmgrd-pausing">pause repmgrd</link>, e.g. during planned maintenance
|
||||
operations. Various other usability enhancements and a couple of bug fixes are also included;
|
||||
see notes below for details.
|
||||
</para>
|
||||
<para>
|
||||
A restart of the PostgreSQL server <emphasis>is</emphasis> required
|
||||
for this release. For detailed upgrade instructions, see
|
||||
<link linkend="upgrading-major-version">Upgrading a major version release</link>.
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
On Debian-based systems, including Ubuntu, if using <application>repmgrd</application>
|
||||
please ensure that the in the file <filename>/etc/init.d/repmgrd</filename>, the parameter
|
||||
<varname>REPMGRD_OPTS</varname> contains "<literal>--daemonize=false</literal>", e.g.:
|
||||
<programlisting>
|
||||
# additional options
|
||||
REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
For further details, see <link linkend="repmgrd-configuration-debian-ubuntu">repmgrd daemon configuration on Debian/Ubuntu</link>.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<sect2>
|
||||
<title>Configuration file changes</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
New parameter <varname>shutdown_check_timeout</varname> (default: 60 seconds) added;
|
||||
this provides an explicit timeout for
|
||||
<command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>
|
||||
to check that the demotion candidate (current primary) has shut down. Previously, the parameters
|
||||
<literal>reconnect_attempts</literal> and <literal>reconnect_interval</literal>
|
||||
were used to calculate a timeout, but these are actually
|
||||
intended for primary failure detection. (GitHub #504).
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
New parameter <varname>repmgr_bindir</varname> added, to facilitate remote invocation of repmgr
|
||||
when the repmgr binary is located somewhere other than the PostgreSQL binary directory, as it
|
||||
cannot be assumed all package maintainers will install &repmgr; there.
|
||||
</para>
|
||||
<para>
|
||||
This parameter is optional; if not set (the default), &repmgr; will fall back
|
||||
to <option>pg_bindir</option> (if set).
|
||||
</para>
|
||||
<para>
|
||||
(GitHub #246).
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>repmgr enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-cleanup">repmgr cluster cleanup</link></command>
|
||||
now accepts the <option>--node-id</option> option to delete records for only one
|
||||
node. (GitHub #493).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
When running
|
||||
<command><link linkend="repmgr-cluster-matrix">repmgr cluster matrix</link></command> and
|
||||
<command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command>,
|
||||
&repmgr; will report nodes unreachable via SSH, and emit return code <literal>ERR_BAD_SSH</literal>.
|
||||
(GitHub #246).
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
Users relying on
|
||||
<command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command>
|
||||
to return a non-zero return code as a way of detecting connectivity errors should be aware
|
||||
that <literal>ERR_BAD_SSH</literal> will be returned if there is an SSH connection error
|
||||
from the node where the command is executed, even if the command is able to establish
|
||||
that PostgreSQL connectivity is fine. Therefore the exact return code should be checked
|
||||
to determine what kind of connectivity error has been detected.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2>
|
||||
<title>repmgrd enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> can now be "paused", i.e. instructed
|
||||
not to take any action such as a failover, even if the prerequisites for such an
|
||||
action are detected.
|
||||
</para>
|
||||
<para>
|
||||
This removes the need to stop <application>repmgrd</application> on all nodes when
|
||||
performing a planned operation such as a switchover.
|
||||
</para>
|
||||
<para>
|
||||
For further details, see <link linkend="repmgrd-pausing">Pausing repmgrd</link>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr;: fix "Missing replication slots" label in
|
||||
<command><link linkend="repmgr-node-check">repmgr node check</link></command>. (GitHub #507)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: fix parsing of <option>-d/--daemonize</option> option.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="release-4.1.1">
|
||||
<title>Release 4.1.1</title>
|
||||
<para><emphasis>Wed September 5, 2018</emphasis></para>
|
||||
<para>
|
||||
repmgr 4.1.1 contains a number of usability enhancements and bug fixes.
|
||||
</para>
|
||||
<para>
|
||||
We recommend upgrading to this version as soon as possible.
|
||||
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.1.0;
|
||||
<application>repmgrd</application> (if running) should be restarted.
|
||||
See <xref linkend="upgrading-repmgr"> for more details.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>repmgr enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-switchover">repmgr standby switchover --dry-run</link></command>
|
||||
no longer copies external configuration files to test they can be copied; this avoids making
|
||||
any changes to the target system. (GitHub #491).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-cleanup">repmgr cluster cleanup</link></command>:
|
||||
add <literal>cluster_cleanup</literal> event. (GitHub #492).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>:
|
||||
improve detection of free walsenders. (GitHub #495).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve messages emitted during
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2>
|
||||
<title>repmgrd enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Always reopen the log file after
|
||||
receiving <literal>SIGHUP</literal>. Previously this only happened if
|
||||
a configuration file change was detected.
|
||||
(GitHub #485).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Report version number <emphasis>after</emphasis>
|
||||
logger initialisation. (GitHub #487).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve cascaded standby failover handling. (GitHub #480).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve reconnection handling after brief network outages; if
|
||||
monitoring data being collected, this could lead to orphaned
|
||||
sessions on the primary. (GitHub #480).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Check <varname>promote_command</varname> and <varname>follow_command</varname>
|
||||
are defined when reloading configuration. These were checked on startup but
|
||||
not reload by <application>repmgrd</application>, which made it possible to
|
||||
make <application>repmgrd</application> with invalid values. It's unlikely
|
||||
anyone would want to do this, but we should make it impossible anyway.
|
||||
(GitHub #486).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Other</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Text of any failed queries will now be logged as <literal>ERROR</literal> to assist
|
||||
logfile analysis at log levels higher than <literal>DEBUG</literal>.
|
||||
(GitHub #498).
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>:
|
||||
remove new upstream's replication slot if it still exists on the rejoined
|
||||
standby. (GitHub #499).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: fix startup on witness node when local data is stale. (GitHub #488, #489).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Truncate version string reported by PostgreSQL if necessary; some
|
||||
distributions insert additional detail after the actual version.
|
||||
(GitHub #490).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
|
||||
<sect1 id="release-4.1.0">
|
||||
<title>Release 4.1.0</title>
|
||||
<para><emphasis>???? ??, 2018</emphasis></para>
|
||||
<para><emphasis>Tue July 31, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.1.0 introduces some changes to <application>repmgrd</application>
|
||||
behaviour and some additional configuration parameters.
|
||||
@@ -29,19 +609,20 @@
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> (if running) must be restarted.
|
||||
Execute <command>ALTER EXTENSION repmgr UPDATE</command>
|
||||
on the primary server in the database where &repmgr; is installed.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Execute <command>ALTER EXTENSION repmgr UPGRADE</command>
|
||||
on the primary server in the database where &repmgr; is installed.
|
||||
<application>repmgrd</application> must be restarted on all nodes where it is running.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
A restart of the PostgreSQL server is <emphasis>not</emphasis> required
|
||||
for this release.
|
||||
for this release (unless upgrading from repmgr 3.x).
|
||||
</para>
|
||||
<para>
|
||||
See <xref linkend="upgrading-repmgr-extension"> for more details.
|
||||
@@ -53,6 +634,17 @@
|
||||
review the changes listed below.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Repository changes</emphasis>
|
||||
</para>
|
||||
<para>
|
||||
Coinciding with this release, the 2ndQuadrant repository structure has changed.
|
||||
See section <xref linkend="installation-packages"> for details, particularly
|
||||
if you are using a RPM-based system.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2>
|
||||
<title>Configuration file changes</title>
|
||||
|
||||
@@ -214,7 +806,7 @@
|
||||
|
||||
<sect1 id="release-4.0.6">
|
||||
<title>Release 4.0.6</title>
|
||||
<para><emphasis>June 14, 2018</emphasis></para>
|
||||
<para><emphasis>Thu June 14, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.0.6 contains a number of bug fixes and usability enhancements.
|
||||
</para>
|
||||
@@ -965,7 +1557,7 @@
|
||||
<emphasis>easier upgrades</emphasis>: &repmgr; is now implemented as a native
|
||||
PostgreSQL extension, which means future upgrades can be carried out by
|
||||
installing the upgraded package and issuing
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/sql-alterextension.html">ALTER EXTENSION repmgr UPDATE</ulink>.
|
||||
<ulink url="https://www.postgresql.org/docs/current/sql-alterextension.html">ALTER EXTENSION repmgr UPDATE</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
@@ -5,14 +5,14 @@
|
||||
<title>repmgr source code signing key</title>
|
||||
<para>
|
||||
The signing key ID used for <application>repmgr</application> source code bundles is:
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/SOURCE-GPG-KEY-repmgr">
|
||||
<ulink url="https://repmgr.org/download/SOURCE-GPG-KEY-repmgr">
|
||||
<literal>0x297F1DCC</literal></ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To download the <application>repmgr</application> source key to your computer:
|
||||
<programlisting>
|
||||
curl -s http://packages.2ndquadrant.com/repmgr/SOURCE-GPG-KEY-repmgr | gpg --import
|
||||
curl -s https://repmgr.org/download/SOURCE-GPG-KEY-repmgr | gpg --import
|
||||
gpg --fingerprint 0x297F1DCC
|
||||
</programlisting>
|
||||
then verify that the fingerprint is the expected value:
|
||||
|
||||
96
doc/appendix-support.sgml
Normal file
96
doc/appendix-support.sgml
Normal file
@@ -0,0 +1,96 @@
|
||||
<appendix id="appendix-support" xreflabel="repmgr support">
|
||||
<indexterm>
|
||||
<primary>support</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>&repmgr; support</title>
|
||||
<para>
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides 24x7
|
||||
production support for &repmgr; and other PostgreSQL
|
||||
products, including configuration assistance, installation
|
||||
verification and training for running a robust replication cluster.
|
||||
</para>
|
||||
<para>
|
||||
For further details see: <ulink url="https://2ndquadrant.com/en/support/">https://2ndquadrant.com/en/support/</ulink>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A mailing list/forum is provided via Google groups to discuss contributions or issues: <ulink url="https://groups.google.com/group/repmgr">https://groups.google.com/group/repmgr</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
Please report bugs and other issues to: <ulink url="https://github.com/2ndQuadrant/repmgr">https://github.com/2ndQuadrant/repmgr</ulink>.
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
Please read the <link linkend="appendix-support-reporting-issues">following section</link> before submitting questions or issue reports.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<sect1 id="appendix-support-reporting-issues" xreflabel="Reportins Issues">
|
||||
<indexterm>
|
||||
<primary>support</primary>
|
||||
<secondary>reporting issues</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Reporting Issues</title>
|
||||
|
||||
<para>
|
||||
When asking questions or reporting issues, it is extremely helpful if the following information is included:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
&repmgr; version
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
How was &repmgr installed? From source? From packages? If
|
||||
so from which repository?
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<filename>repmpgr.conf</filename> files (suitably anonymized if necessary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
Contents of the <literal>repmgr.nodes</literal> table (suitably anonymized if necessary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
PostgreSQL version
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>
|
||||
If issues are encountered with a &repmgr; client command, please provide
|
||||
the output of that command executed with the options
|
||||
<option>-LDEBUG --verbose</option>, which will ensure &repmgr; emits
|
||||
the maximum level of logging output.
|
||||
</para>
|
||||
<para>
|
||||
If issues are encountered with <application>repmgrd</application>,
|
||||
please provide relevant extracts from the &repmgr; log files
|
||||
and if possible the PostgreSQL log itself. Please ensure these
|
||||
logs do not contain any confidential data.
|
||||
</para>
|
||||
<para>
|
||||
In all cases it is <emphasis>extremely</emphasis> useful to receive
|
||||
information on how to reliably reproduce an issue with as much detail as
|
||||
possible.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
</appendix>
|
||||
@@ -4,5 +4,5 @@ BDR failover with repmgrd
|
||||
This document has been integrated into the main `repmgr` documentation
|
||||
and is now located here:
|
||||
|
||||
> [BDR failover with repmgrd](https://repmgr.org/docs/4.0/repmgrd-bdr.html)
|
||||
> [BDR failover with repmgrd](https://repmgr.org/docs/current/repmgrd-bdr.html)
|
||||
|
||||
|
||||
@@ -4,4 +4,4 @@ Changes in repmgr 4
|
||||
This document has been integrated into the main `repmgr` documentation
|
||||
and is now located here:
|
||||
|
||||
> [Release notes](https://repmgr.org/docs/4.0/release-4.0.html)
|
||||
> [Release notes](https://repmgr.org/docs/current/release-4.0.html)
|
||||
|
||||
@@ -243,8 +243,8 @@
|
||||
</simpara>
|
||||
<simpara>
|
||||
As an alternative we recommend using 2ndQuadrant's <ulink url="https://www.pgbarman.org/">Barman</ulink>,
|
||||
which offloads WAL management to a separate server, negating the need to use replication
|
||||
slots to reserve WAL. See section <xref linkend="cloning-from-barman">
|
||||
which offloads WAL management to a separate server, removing the requirement to use a replication
|
||||
slot for each individual standby to reserve WAL. See section <xref linkend="cloning-from-barman">
|
||||
for more details on using &repmgr; together with Barman.
|
||||
</simpara>
|
||||
</tip>
|
||||
@@ -262,7 +262,7 @@
|
||||
meaning replication changes "cascade" down through a hierarchy of servers. This
|
||||
can be used to reduce load on the primary and minimize bandwith usage between
|
||||
sites. For more details, see the
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/warm-standby.html#CASCADING-REPLICATION">
|
||||
<ulink url="https://www.postgresql.org/docs/current/warm-standby.html#CASCADING-REPLICATION">
|
||||
PostgreSQL cascading replication documentation</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
@@ -352,10 +352,12 @@
|
||||
provide additional parameters for <command>pg_basebackup</command> to customise the
|
||||
cloning process.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
By default, <command>pg_basebackup</command> performs a checkpoint before beginning the backup
|
||||
process. However, a normal checkpoint may take some time to complete;
|
||||
a fast checkpoint can be forced with the <literal>-c/--fast-checkpoint</literal> option.
|
||||
a fast checkpoint can be forced with <command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>'s
|
||||
<literal>-c/--fast-checkpoint</literal> option.
|
||||
Note that this may impact performance of the server being cloned from (typically the primary)
|
||||
so should be used with care.
|
||||
</para>
|
||||
@@ -370,6 +372,18 @@
|
||||
Other options can be passed to <command>pg_basebackup</command> by including them
|
||||
in the <filename>repmgr.conf</filename> setting <varname>pg_basebackup_options</varname>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Not that by default, &repmgr; executes <command>pg_basebackup</command> with <option>-X/--wal-method</option>
|
||||
(PostgreSQL 9.6 and earlier: <option>-X/--xlog-method</option>) set to <literal>stream</literal>.
|
||||
From PostgreSQL 9.6, if replication slots are in use, it will also create a replication slot before
|
||||
running the base backup, and execute <command>pg_basebackup</command> with the
|
||||
<option>-S/--slot</option> option set to the name of the previously created replication slot.
|
||||
</para>
|
||||
<para>
|
||||
These parameters can set by the user in <varname>pg_basebackup_options</varname>, in which case they
|
||||
will override the &repmgr; default values. However normally there's no reason to do this.
|
||||
</para>
|
||||
<para>
|
||||
If using a separate directory to store WAL files, provide the option <literal>--waldir</literal>
|
||||
(<literal>--xlogdir</literal> in PostgreSQL 9.6 and earlier) with the absolute path to the
|
||||
@@ -377,7 +391,7 @@
|
||||
a symlink will automatically be created from the main data directory.
|
||||
</para>
|
||||
<para>
|
||||
See the <ulink url="https://www.postgresql.org/docs/current/static/app-pgbasebackup.html">PostgreSQL pg_basebackup documentation</ulink>
|
||||
See the <ulink url="https://www.postgresql.org/docs/current/app-pgbasebackup.html">PostgreSQL pg_basebackup documentation</ulink>
|
||||
for more details of available options.
|
||||
</para>
|
||||
</sect2>
|
||||
@@ -399,7 +413,7 @@
|
||||
user's <filename>~/.pgpass</filename> file. It's also possible to store the password in the
|
||||
environment variable <varname>PGPASSWORD</varname>, however this is not recommended for
|
||||
security reasons. For more details see the
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/libpq-pgpass.html">PostgreSQL password file documentation</ulink>.
|
||||
<ulink url="https://www.postgresql.org/docs/current/libpq-pgpass.html">PostgreSQL password file documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
|
||||
@@ -39,6 +39,10 @@
|
||||
called <varname>standby1</varname> (for example), things will be confusing
|
||||
to say the least.
|
||||
</para>
|
||||
<para>
|
||||
The string's maximum length is 63 characters and it should
|
||||
contain only printable ASCII characters.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
@@ -56,7 +60,7 @@
|
||||
</para>
|
||||
<para>
|
||||
For details on conninfo strings, see section <ulink
|
||||
url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING">Connection Strings</>
|
||||
url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING">Connection Strings</>
|
||||
in the PosgreSQL documentation.
|
||||
</para>
|
||||
<para>
|
||||
@@ -64,7 +68,7 @@
|
||||
<varname>connect_timeout</varname> in the <varname>conninfo</varname>
|
||||
string to determine the length of time which elapses before a network
|
||||
connection attempt is abandoned; for details see <ulink
|
||||
url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT">
|
||||
url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT">
|
||||
the PostgreSQL documentation</>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
@@ -17,15 +17,15 @@
|
||||
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
|
||||
</para>
|
||||
<para>
|
||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> to control the PostgreSQL
|
||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> utility to control the PostgreSQL
|
||||
server. However this can lead to various problems, particularly when PostgreSQL has been
|
||||
installed from packages, and expecially so if <application>systemd</application> is in use.
|
||||
installed from packages, and especially so if <application>systemd</application> is in use.
|
||||
</para>
|
||||
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If using <application>systemd</application>, ensure you have <varname>RemoteIPC</varname> set to <literal>off</literal>.
|
||||
If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
|
||||
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
|
||||
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
|
||||
</para>
|
||||
@@ -48,6 +48,13 @@
|
||||
service_reload_command</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing any of these commands;
|
||||
these can be user-defined scripts so must always be specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
It's also possible to specify a <varname>service_promote_command</varname>.
|
||||
@@ -67,12 +74,12 @@
|
||||
|
||||
<para>
|
||||
To confirm which command &repmgr; will execute for each action, use
|
||||
<command>repmgr node service --list --action=...</command>, e.g.:
|
||||
<command><link linkend="repmgr-node-service">repmgr node service --list-actions --action=...</link></command>, e.g.:
|
||||
<programlisting>
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=stop
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=start
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=restart
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=reload</programlisting>
|
||||
repmgr -f /etc/repmgr.conf node service --list-actions --action=stop
|
||||
repmgr -f /etc/repmgr.conf node service --list-actions --action=start
|
||||
repmgr -f /etc/repmgr.conf node service --list-actions --action=restart
|
||||
repmgr -f /etc/repmgr.conf node service --list-actions --action=reload</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
@@ -92,7 +99,7 @@
|
||||
Defaults:postgres !requiretty
|
||||
postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-9.6, \
|
||||
/usr/bin/systemctl start postgresql-9.6, \
|
||||
/usr/bin/systemctl restart postgresql-9.6 \
|
||||
/usr/bin/systemctl restart postgresql-9.6, \
|
||||
/usr/bin/systemctl reload postgresql-9.6</programlisting>
|
||||
</para>
|
||||
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
<sect1 id="configuration-file" xreflabel="configuration file location">
|
||||
<sect1 id="configuration-file" xreflabel="configuration file">
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>location</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>configuration</primary>
|
||||
<secondary>repmgr.conf location</secondary>
|
||||
<secondary>repmgr.conf</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Configuration file location</title>
|
||||
<title>Configuration file</title>
|
||||
|
||||
<para>
|
||||
<application>repmgr</application> and <application>repmgrd</application>
|
||||
use a common configuration file, by default called
|
||||
@@ -21,6 +21,55 @@
|
||||
for more details.
|
||||
</para>
|
||||
|
||||
<sect2 id="configuration-file-format" xreflabel="configuration file format">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>format</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Configuration file format</title>
|
||||
|
||||
<para>
|
||||
<filename>repmgr.conf</filename> is a plain text file with one parameter/value
|
||||
combination per line.
|
||||
</para>
|
||||
<para>
|
||||
Whitespace is insignificant (except within a quoted parameter value) and blank lines are ignored.
|
||||
Hash marks (<literal>#</literal>) designate the remainder of the line as a comment.
|
||||
Parameter values that are not simple identifiers or numbers should be single-quoted.
|
||||
Note that single quote cannot be embedded in a parameter value.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
&repmgr; will interpret double-quotes as being part of a string value; only use single quotes
|
||||
to quote parameter values.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<para>
|
||||
Example of a valid <filename>repmgr.conf</filename> file:
|
||||
<programlisting>
|
||||
# repmgr.conf
|
||||
|
||||
node_id=1
|
||||
node_name= node1
|
||||
conninfo ='host=node1 dbname=repmgr user=repmgr connect_timeout=2'
|
||||
data_directory = /var/lib/pgsql/11/data</programlisting>
|
||||
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
|
||||
<sect2 id="configuration-file-location" xreflabel="configuration file location">
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>location</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Configuration file location</title>
|
||||
|
||||
<para>
|
||||
The configuration file will be searched for in the following locations:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
@@ -50,7 +99,7 @@
|
||||
Note that if a file is explicitly specified with <literal>-f/--config-file</literal>,
|
||||
an error will be raised if it is not found or not readable, and no attempt will be made to
|
||||
check default locations; this is to prevent <application>repmgr</application> unexpectedly
|
||||
reading the wrong configuraton file.
|
||||
reading the wrong configuration file.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
@@ -65,5 +114,7 @@
|
||||
to <filename>/path/to/./repmgr.conf</filename>, whereas you'd normally write
|
||||
<filename>/path/to/repmgr.conf</filename>).
|
||||
</para>
|
||||
</note>
|
||||
</sect1>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
@@ -1,6 +1,292 @@
|
||||
<chapter id="configuration" xreflabel="Configuration">
|
||||
<title>repmgr configuration</title>
|
||||
|
||||
<sect1 id="configuration-prerequisites" xreflabel="Prerequisites for configuration">
|
||||
<indexterm>
|
||||
<primary>configuration</primary>
|
||||
<secondary>prerequisites</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>configuration</primary>
|
||||
<secondary>ssh</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Prerequisites for configuration</title>
|
||||
<para>
|
||||
Following software must be installed on both servers:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><application>PostgreSQL</application></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgr</application>
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
At network level, connections between the PostgreSQL port (default: <literal>5432</literal>)
|
||||
must be possible between all nodes.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Passwordless <command>SSH</command> connectivity between all servers in the replication cluster
|
||||
is not required, but is necessary in the following cases:
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>if you need &repmgr; to copy configuration files from outside the PostgreSQL
|
||||
data directory (as is the case with e.g. <link linkend="packages-debian-ubuntu">Debian packages</link>);
|
||||
in this case <command>rsync</command> must also be installed on all servers.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>to perform <link linkend="performing-switchover">switchover operations</link></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
when executing <command><link linkend="repmgr-cluster-matrix">repmgr cluster matrix</link></command>
|
||||
and <command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command>
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<tip>
|
||||
<simpara>
|
||||
Consider setting <varname>ConnectTimeout</varname> to a low value in your SSH configuration.
|
||||
This will make it faster to detect any SSH connection errors.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
<sect2 id="configuration-postgresql" xreflabel="PostgreSQL configuration">
|
||||
<indexterm>
|
||||
<primary>configuration</primary>
|
||||
<secondary>PostgreSQL</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>PostgreSQL configuration</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>PostgreSQL configuration for &repmgr;</title>
|
||||
<para>
|
||||
The following PostgreSQL configuration parameters may need to be changed in order
|
||||
for &repmgr; (and replication itself) to function correctly.
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>hot_standby</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>hot_standby</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<option>hot_standby</option> must always be set to <literal>on</literal>, as &repmgr; needs
|
||||
to be able to connect to each server it manages.
|
||||
</para>
|
||||
<para>
|
||||
Note that <option>hot_standby</option> defaults to <literal>on</literal> from PostgreSQL 10
|
||||
and later; in PostgreSQL 9.6 and earlier, the default was <literal>off</literal>.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-HOT-STANDBY">hot_standby</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>wal_level</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>wal_level</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<option>wal_level</option> must be one of <option>replica</option> or <option>logical</option>
|
||||
(PostgreSQL 9.5 and earlier: one of <option>hot_standby</option> or <option>logical</option>).
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-WAL-LEVEL">wal_level</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>max_wal_senders</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>max_wal_senders</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<option>max_wal_senders</option> must be set to a value of <literal>2</literal> or greater.
|
||||
In general you will need one WAL sender for each standby which will attach to the PostgreSQL
|
||||
instance; additionally &repmgr; will require two free WAL senders in order to clone further
|
||||
standbys.
|
||||
</para>
|
||||
<para>
|
||||
<option>max_wal_senders</option> should be set to an appropriate value on all PostgreSQL
|
||||
instances in the replication cluster which may potentially become a primary server or
|
||||
(in cascading replication) the upstream server of a standby.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-MAX-WAL-SENDERS">max_wal_senders</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>max_replication_slots</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>max_replication_slots</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If you are intending to use replication slots, <option>max_replication_slots</option>
|
||||
must be set to a non-zero value.
|
||||
</para>
|
||||
<para>
|
||||
<option>max_replication_slots</option> should be set to an appropriate value on all PostgreSQL
|
||||
instances in the replication cluster which may potentially become a primary server or
|
||||
(in cascading replication) the upstream server of a standby.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-MAX-REPLICATION-SLOTS">max_replication_slots</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>wal_log_hints</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>wal_log_hints</option></term>
|
||||
<listitem>
|
||||
<para>If you are intending to use <application>pg_rewind</application>,
|
||||
and the cluster was not initialised using data checksums, you may want to consider enabling
|
||||
<option>wal_log_hints</option>.
|
||||
</para>
|
||||
<para>
|
||||
For more details see <xref linkend="repmgr-node-rejoin-pg-rewind">.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-WAL-LOG-HINTS">wal_log_hints</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>archive_mode</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>archive_mode</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
We suggest setting <option>archive_mode</option> to <literal>on</literal> (and
|
||||
<option>archive_command</option> to <literal>/bin/true</literal>; see below)
|
||||
even if you are currently not planning to use WAL file archiving.
|
||||
</para>
|
||||
<para>
|
||||
This will make it simpler to set up WAL file archiving if it is ever required,
|
||||
as changes to <option>archive_mode</option> require a full PostgreSQL server
|
||||
restart, while <option>archive_command</option> changes can be applied via a normal
|
||||
configuration reload.
|
||||
</para>
|
||||
<para>
|
||||
However, &repmgr; itself does not require WAL file archiving.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-ARCHIVE-MODE">archive_mode</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>archive_command</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>archive_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If you have set <option>archive_mode</option> to <literal>on</literal> but are not currently planning
|
||||
to use WAL file archiving, set <option>archive_command</option> to a command which does nothing but returns
|
||||
<literal>true</literal>, such as <command>/bin/true</command>. See above for details.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-ARCHIVE-COMMAND">archive_command</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>wal_keep_segments</primary>
|
||||
<secondary>PostgreSQL configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>wal_keep_segments</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Normally there is no need to set <option>wal_keep_segments</option> (default: <literal>0</literal>), as it
|
||||
is <emphasis>not</emphasis> a reliable way of ensuring that all required WAL segments are available to standbys.
|
||||
Replication slots and/or an archiving solution such as Barman are recommended to ensure standbys have a reliable
|
||||
source of WAL segments at all times.
|
||||
</para>
|
||||
<para>
|
||||
The only reason ever to set <option>wal_keep_segments</option> is you have
|
||||
you have configured <option>pg_basebackup_options</option>
|
||||
in <filename>repmgr.conf</filename> to include the setting <literal>--wal-method=fetch</literal>
|
||||
(PostgreSQL 9.6 and earlier: <literal>--xlog-method=fetch</literal>)
|
||||
<emphasis>and</emphasis> you have <emphasis>not</emphasis> set <option>restore_command</option>
|
||||
in <filename>repmgr.conf</filename> to fetch WAL files from a reliable source such as Barman,
|
||||
in which case you'll need to set <option>wal_keep_segments</option>
|
||||
to a sufficiently high number to ensure that all WAL files required by the standby
|
||||
are retained. However we do not recommend managing replication in this way.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL documentation: <ulink url="https://www.postgresql.org/docs/current/runtime-config-replication.html#GUC-WAL-KEEP-SEGMENTS">wal_keep_segments</ulink>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
<para>
|
||||
See also the <link linkend="quickstart-postgresql-configuration">PostgreSQL configuration</link> section in the
|
||||
<link linkend="quickstart">Quick-start guide</link>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
&configuration-file;
|
||||
&configuration-file-required-settings;
|
||||
&configuration-file-log-settings;
|
||||
|
||||
@@ -1,86 +0,0 @@
|
||||
<chapter id="using-witness-server">
|
||||
<indexterm>
|
||||
<primary>witness server</primary>
|
||||
<seealso>Using a witness server with repmgrd</seealso>
|
||||
</indexterm>
|
||||
|
||||
|
||||
<title>Using a witness server</title>
|
||||
<para>
|
||||
A <xref linkend="witness-server"> is a normal PostgreSQL instance which
|
||||
is not part of the streaming replication cluster; its purpose is, if a
|
||||
failover situation occurs, to provide proof that the primary server
|
||||
itself is unavailable.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A typical use case for a witness server is a two-node streaming replication
|
||||
setup, where the primary and standby are in different locations (data centres).
|
||||
By creating a witness server in the same location as the primary, if the primary
|
||||
becomes unavailable it's possible for the standby to decide whether it can
|
||||
promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
witness or the primary server, it's likely there's a network-level interruption
|
||||
and it should not promote itself. If it can seen the witness but not the primary,
|
||||
this proves there is no network interruption and the primary itself is unavailable,
|
||||
and it can therefore promote itself (and ideally take action to fence the
|
||||
former primary).
|
||||
</para>
|
||||
<para>
|
||||
For more complex replication scenarios,e.g. with multiple datacentres, it may
|
||||
be preferable to use location-based failover, which ensures that only nodes
|
||||
in the same location as the primary will ever be promotion candidates;
|
||||
see <xref linkend="repmgrd-network-split"> for more details.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
A witness server will only be useful if <application>repmgrd</application>
|
||||
is in use.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<sect1 id="creating-witness-server">
|
||||
<title>Creating a witness server</title>
|
||||
<para>
|
||||
To create a witness server, set up a normal PostgreSQL instance on a server
|
||||
in the same physical location as the cluster's primary server.
|
||||
</para>
|
||||
<para>
|
||||
This instance should *not* be on the same physical host as the primary server,
|
||||
as otherwise if the primary server fails due to hardware issues, the witness
|
||||
server will be lost too.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
&repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
|
||||
command, which would automatically create a PostgreSQL instance. However
|
||||
this often resulted in an unsatisfactory, hard-to-customise instance.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
The witness server should be configured in the same way as a normal
|
||||
&repmgr; node; see section <xref linkend="configuration">.
|
||||
</para>
|
||||
<para>
|
||||
Register the witness server with <xref linkend="repmgr-witness-register">.
|
||||
This will create the &repmgr; extension on the witness server, and make
|
||||
a copy of the &repmgr; metadata.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
As the witness server is not part of the replication cluster, further
|
||||
changes to the &repmgr; metadata will be synchronised by
|
||||
<application>repmgrd</application>.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
Once the witness server has been configured, <application>repmgrd</application>
|
||||
should be started; for more details see <xref linkend="repmgrd-witness-server">.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
</chapter>
|
||||
@@ -88,7 +88,7 @@
|
||||
|
||||
<para>
|
||||
The values provided for <literal>%t</literal> and <literal>%d</literal>
|
||||
will probably contain spaces, so should be quoted in the provided command
|
||||
may contain spaces, so should be quoted in the provided command
|
||||
configuration, e.g.:
|
||||
<programlisting>
|
||||
event_notification_command='/path/to/some/script %n %e %s "%t" "%d"'
|
||||
@@ -147,58 +147,76 @@
|
||||
<para>
|
||||
By default, all notification types will be passed to the designated script;
|
||||
the notification types can be filtered to explicitly named ones using the
|
||||
<varname>event_notifications</varname> parameter:
|
||||
<varname>event_notifications</varname> parameter.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by the &repmgr; command:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>primary_register</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-register-events">cluster_created</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>primary_unregister</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-register-events">primary_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_register</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-unregister-events">primary_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-standby-clone-events">standby_clone</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_register_sync</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-register-events">standby_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_unregister</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-register-events">standby_register_sync</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_clone</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-unregister-events">standby_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-standby-promote-events">standby_promote</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_promote</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-follow-events">standby_follow</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_follow</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-switchover-events">standby_switchover</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-witness-register-events">witness_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_disconnect_manual</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-witness-unregister-events">witness_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_failure</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-node-rejoin-events">node_rejoin</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_recovery</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>witness_register</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>witness_unregister</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>node_rejoin</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-cluster-cleanup-events">cluster_cleanup</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by <application>repmgrd</application> (streaming replication mode):
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_start</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_shutdown</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_reload</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_promote</literal></simpara>
|
||||
</listitem>
|
||||
@@ -208,15 +226,41 @@
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_aborted</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_standby_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_promote_error</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_local_disconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_local_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_upstream_disconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_upstream_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_promote_error</literal></simpara>
|
||||
<simpara><literal>standby_disconnect_manual</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_failure</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_recovery</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by <application>repmgrd</application> (BDR mode):
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>bdr_failover</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
@@ -45,19 +45,14 @@
|
||||
<!ENTITY promoting-standby SYSTEM "promoting-standby.sgml">
|
||||
<!ENTITY follow-new-primary SYSTEM "follow-new-primary.sgml">
|
||||
<!ENTITY switchover SYSTEM "switchover.sgml">
|
||||
<!ENTITY configuring-witness-server SYSTEM "configuring-witness-server.sgml">
|
||||
|
||||
<!ENTITY event-notifications SYSTEM "event-notifications.sgml">
|
||||
<!ENTITY upgrading-repmgr SYSTEM "upgrading-repmgr.sgml">
|
||||
|
||||
<!ENTITY repmgrd-overview SYSTEM "repmgrd-overview.sgml">
|
||||
<!ENTITY repmgrd-automatic-failover SYSTEM "repmgrd-automatic-failover.sgml">
|
||||
<!ENTITY repmgrd-configuration SYSTEM "repmgrd-configuration.sgml">
|
||||
<!ENTITY repmgrd-demonstration SYSTEM "repmgrd-demonstration.sgml">
|
||||
<!ENTITY repmgrd-monitoring SYSTEM "repmgrd-monitoring.sgml">
|
||||
<!ENTITY repmgrd-degraded-monitoring SYSTEM "repmgrd-degraded-monitoring.sgml">
|
||||
<!ENTITY repmgrd-cascading-replication SYSTEM "repmgrd-cascading-replication.sgml">
|
||||
<!ENTITY repmgrd-network-split SYSTEM "repmgrd-network-split.sgml">
|
||||
<!ENTITY repmgrd-witness-server SYSTEM "repmgrd-witness-server.sgml">
|
||||
<!ENTITY repmgrd-operation SYSTEM "repmgrd-operation.sgml">
|
||||
<!ENTITY repmgrd-bdr SYSTEM "repmgrd-bdr.sgml">
|
||||
|
||||
<!ENTITY repmgr-primary-register SYSTEM "repmgr-primary-register.sgml">
|
||||
@@ -73,16 +68,23 @@
|
||||
<!ENTITY repmgr-node-status SYSTEM "repmgr-node-status.sgml">
|
||||
<!ENTITY repmgr-node-check SYSTEM "repmgr-node-check.sgml">
|
||||
<!ENTITY repmgr-node-rejoin SYSTEM "repmgr-node-rejoin.sgml">
|
||||
<!ENTITY repmgr-node-service SYSTEM "repmgr-node-service.sgml">
|
||||
<!ENTITY repmgr-cluster-show SYSTEM "repmgr-cluster-show.sgml">
|
||||
<!ENTITY repmgr-cluster-matrix SYSTEM "repmgr-cluster-matrix.sgml">
|
||||
<!ENTITY repmgr-cluster-crosscheck SYSTEM "repmgr-cluster-crosscheck.sgml">
|
||||
<!ENTITY repmgr-cluster-event SYSTEM "repmgr-cluster-event.sgml">
|
||||
<!ENTITY repmgr-cluster-cleanup SYSTEM "repmgr-cluster-cleanup.sgml">
|
||||
<!ENTITY repmgr-daemon-status SYSTEM "repmgr-daemon-status.sgml">
|
||||
<!ENTITY repmgr-daemon-start SYSTEM "repmgr-daemon-start.sgml">
|
||||
<!ENTITY repmgr-daemon-stop SYSTEM "repmgr-daemon-stop.sgml">
|
||||
<!ENTITY repmgr-daemon-pause SYSTEM "repmgr-daemon-pause.sgml">
|
||||
<!ENTITY repmgr-daemon-unpause SYSTEM "repmgr-daemon-unpause.sgml">
|
||||
|
||||
<!ENTITY appendix-release-notes SYSTEM "appendix-release-notes.sgml">
|
||||
<!ENTITY appendix-faq SYSTEM "appendix-faq.sgml">
|
||||
<!ENTITY appendix-signatures SYSTEM "appendix-signatures.sgml">
|
||||
<!ENTITY appendix-packages SYSTEM "appendix-packages.sgml">
|
||||
<!ENTITY appendix-support SYSTEM "appendix-support.sgml">
|
||||
|
||||
<!ENTITY bookindex SYSTEM "bookindex.sgml">
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
end of the preceding section (<xref linkend="promoting-standby">),
|
||||
execute this:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf repmgr standby follow
|
||||
$ repmgr -f /etc/repmgr.conf standby follow
|
||||
INFO: changing node 3's primary to node 2
|
||||
NOTICE: restarting server using "pg_ctl -l /var/log/postgresql/startup.log -w -D '/var/lib/postgresql/data' restart"
|
||||
waiting for server to shut down......... done
|
||||
|
||||
@@ -1,5 +1,11 @@
|
||||
<sect1 id="installation-packages" xreflabel="Installing from packages">
|
||||
<title>Installing &repmgr; from packages</title>
|
||||
|
||||
<indexterm>
|
||||
<primary>installation</primary>
|
||||
<secondary>from packages</secondary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
We recommend installing &repmgr; using the available packages for your
|
||||
system.
|
||||
@@ -16,7 +22,7 @@
|
||||
<para>
|
||||
&repmgr; RPM packages for RedHat/CentOS variants and Fedora are available from the
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>; see following
|
||||
<ulink url="https://dl.2ndquadrant.com/">public repository</ulink>; see following
|
||||
section for details.
|
||||
</para>
|
||||
<para>
|
||||
@@ -29,9 +35,10 @@
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; packages are designed to be compatible with the community-provided PostgreSQL packages.
|
||||
&repmgr; RPM packages are designed to be compatible with the community-provided PostgreSQL packages
|
||||
and 2ndQuadrant's <ulink url="https://www.2ndquadrant.com/en/resources/2ndqpostgres/">2ndQPostgres</ulink>.
|
||||
They may not work with vendor-specific packages such as those provided by RedHat for RHEL
|
||||
customers, as the filesystem layout may be different to the community RPMs.
|
||||
customers, as the PostgreSQL filesystem layout may be different to the community RPMs.
|
||||
Please contact your support vendor for assistance.
|
||||
</para>
|
||||
</note>
|
||||
@@ -46,67 +53,77 @@
|
||||
<sect3 id="installation-packages-redhat-2ndq">
|
||||
<title>2ndQuadrant public RPM yum repository</title>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> previously provided a dedicated
|
||||
&repmgr; repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/">http://packages.2ndquadrant.com/repmgr/</ulink>.
|
||||
This repository will be deprecated in a future release as it is now replaced by
|
||||
the <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>
|
||||
documented below.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a dedicated <literal>yum</literal>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;. We recommend using this for all future &repmgr; releases.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://rpm.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
<ulink url="https://dl.2ndquadrant.com/">public repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;. We recommend using this for all future &repmgr; releases.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://dl.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
<para>
|
||||
<emphasis>Installation</emphasis>
|
||||
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Locate the repository RPM for your PostgreSQL version from the list at:
|
||||
<ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Locate the repository RPM for your PostgreSQL version from the list at:
|
||||
<ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the repository RPM for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages).
|
||||
</para>
|
||||
<para>
|
||||
For example, for PostgreSQL 10 on CentOS, execute:
|
||||
<programlisting>
|
||||
sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-1.el7.noarch.rpm
|
||||
</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Verify that the repository is installed with:
|
||||
<programlisting>
|
||||
Install the repository definition for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages).
|
||||
</para>
|
||||
<para>
|
||||
For example, for PostgreSQL 10 on CentOS, execute:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/release/get/10/rpm | sudo bash</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
For PostgreSQL 9.6 on CentOS, execute:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/release/get/9.6/rpm | sudo bash</programlisting>
|
||||
</para>
|
||||
|
||||
|
||||
<para>
|
||||
Verify that the repository is installed with:
|
||||
<programlisting>
|
||||
sudo yum repolist</programlisting>
|
||||
The output should contain two entries like this:
|
||||
<programlisting>
|
||||
2ndquadrant-repo-10/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 1
|
||||
2ndquadrant-repo-10-debug/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 - Debug 1</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
The output should contain two entries like this:
|
||||
<programlisting>
|
||||
2ndquadrant-dl-default-release-pg10/7/x86_64 2ndQuadrant packages (PG10) for 7 - x86_64 4
|
||||
2ndquadrant-dl-default-release-pg10-debug/7/x86_64 2ndQuadrant packages (PG10) for 7 - x86_64 - Debug 3</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the &repmgr version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
|
||||
<programlisting>
|
||||
$ yum install repmgr10</programlisting>
|
||||
sudo yum install repmgr10</programlisting>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
For packages for PostgreSQL 9.6 and earlier, the package name does not contain
|
||||
a period between major and minor version numbers, e.g.
|
||||
<literal>repmgr96</literal>.
|
||||
</para>
|
||||
</note>
|
||||
<tip>
|
||||
<para>
|
||||
To determine the names of available packages, execute:
|
||||
<programlisting>
|
||||
yum search repmgr</programlisting>
|
||||
</para>
|
||||
</tip>
|
||||
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
@@ -149,7 +166,17 @@ $ yum install repmgr10</programlisting>
|
||||
<programlisting>
|
||||
[root@localhost ~]# yum install repmgr10-4.0.3-1.rhel7</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<emphasis>Installing old packages</emphasis>
|
||||
</para>
|
||||
<para>
|
||||
See appendix <link linkend="packages-old-versions-rhel-centos">Installing old package versions</link>
|
||||
for details on how to retrieve older package versions.
|
||||
</para>
|
||||
|
||||
</sect3>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="installation-packages-debian" xreflabel="Installing from packages on Debian or Ubuntu">
|
||||
@@ -175,61 +202,50 @@ $ yum install repmgr10</programlisting>
|
||||
<title>2ndQuadrant public apt repository for Debian/Ubuntu</title>
|
||||
|
||||
<para>
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a
|
||||
<ulink url="https://apt.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://apt.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
<ulink url="https://dl.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://dl.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
<emphasis>Installation</emphasis>
|
||||
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
If not already present, install the <application>apt-transport-https</application> package:
|
||||
<programlisting>
|
||||
sudo apt-get install apt-transport-https</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Install the repository definition for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages) by executing:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/release/get/deb | sudo bash</programlisting>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
This will automatically install the following additional packages, if not already present:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>lsb-release</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>apt-transport-https</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Create <filename>/etc/apt/sources.list.d/2ndquadrant.list</filename> as follows:
|
||||
<programlisting>
|
||||
sudo sh -c 'echo "deb https://apt.2ndquadrant.com/ $(lsb_release -cs)-2ndquadrant main" > /etc/apt/sources.list.d/2ndquadrant.list'</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the 2ndQuadrant <ulink url="https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc">repository key</ulink>:
|
||||
<programlisting>
|
||||
sudo apt-get install curl ca-certificates
|
||||
curl https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc | sudo apt-key add -</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Update the package list
|
||||
<programlisting>
|
||||
sudo apt-get update</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<listitem>
|
||||
<para>
|
||||
Install the &repmgr version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
|
||||
<programlisting>
|
||||
$ apt-get install postgresql-10-repmgr</programlisting>
|
||||
</para>
|
||||
sudo apt-get install postgresql-10-repmgr</programlisting>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
For packages for PostgreSQL 9.6 and earlier, the package name includes
|
||||
@@ -237,11 +253,20 @@ $ apt-get install postgresql-10-repmgr</programlisting>
|
||||
<literal>postgresql-9.6-repmgr</literal>.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</itemizedlist>
|
||||
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<emphasis>Installing old packages</emphasis>
|
||||
</para>
|
||||
<para>
|
||||
See appendix <link linkend="packages-old-versions-debian">Installing old package versions</link>
|
||||
for details on how to retrieve older package versions.
|
||||
</para>
|
||||
|
||||
</para>
|
||||
|
||||
</sect3>
|
||||
</sect2>
|
||||
|
||||
@@ -13,8 +13,9 @@
|
||||
</para>
|
||||
|
||||
<para>
|
||||
From version 4.0, repmgr is compatible with all PostgreSQL versions from 9.3, including PostgreSQL 10.
|
||||
Note that some &repmgr; functionality is not available in PostgreSQL 9.3 and PostgreSQL 9.4.
|
||||
&repmgr; 4.x is compatible with all PostgreSQL versions from 9.3. See
|
||||
section <link linkend="install-compatibility-matrix">&repmgr; compatibility matrix</link>
|
||||
for an overview of version compatibility.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
@@ -31,34 +32,33 @@
|
||||
<para>
|
||||
&repmgr; must be installed on each server in the replication cluster.
|
||||
If installing repmgr from packages, the package version must match the PostgreSQL
|
||||
version. If installing from source, repmgr must be compiled against the same
|
||||
version. If installing from source, &repmgr; must be compiled against the same
|
||||
major version.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
The same "major" &repmgr; version (e.g. <literal>4.2.x</literal>) <emphasis>must</emphasis>
|
||||
be installed on all node in the replication cluster. We strongly recommend keeping all
|
||||
nodes on the same (preferably latest) "minor" &repmgr; version to minimize the risk
|
||||
of incompatibilities.
|
||||
</simpara>
|
||||
<simpara>
|
||||
If different "major" &repmgr; versions (e.g. 3.3.x and 4.1.x)
|
||||
are installed on different nodes, in the best case &repmgr; (in particular <application>repmgrd</application>)
|
||||
will not run. In the worst case, you will end up with a broken cluster.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
A dedicated system user for &repmgr; is *not* required; as many &repmgr; and
|
||||
A dedicated system user for &repmgr; is <emphasis>not</emphasis> required; as many &repmgr; and
|
||||
<application>repmgrd</application> actions require direct access to the PostgreSQL data directory,
|
||||
these commands should be executed by the <literal>postgres</literal> user.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Passwordless <command>ssh</command> connectivity between all servers in the replication cluster
|
||||
is not required, but is necessary in the following cases:
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>if you need &repmgr; to copy configuration files from outside the PostgreSQL
|
||||
data directory (in which case <command>rsync</command> is also required)</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>to perform <link linkend="performing-switchover">switchover operations</link></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
when executing <command><link linkend="repmgr-cluster-matrix">repmgr cluster matrix</link></command>
|
||||
and <command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command>
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
See also <link linkend="configuration-prerequisites">Prerequisites for configuration</link>
|
||||
for information on networking requirements.
|
||||
</para>
|
||||
|
||||
<tip>
|
||||
@@ -69,4 +69,111 @@
|
||||
terminated if your <command>ssh</command> session to the server is interrupted or closed.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
<sect2 id="install-compatibility-matrix">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgr</primary>
|
||||
<secondary>compatibility matrix</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>compatibility matrix</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>&repmgr; compatibility matrix</title>
|
||||
<para>
|
||||
The following table provides an overview of which &repmgr; version supports
|
||||
which PostgreSQL version.
|
||||
</para>
|
||||
|
||||
|
||||
<table id="repmgr-compatibility-matrix">
|
||||
<title>&repmgr; compatibility matrix</title>
|
||||
|
||||
<tgroup cols="2">
|
||||
<thead>
|
||||
<row>
|
||||
<entry>
|
||||
&repmgr; version
|
||||
</entry>
|
||||
<entry>
|
||||
Latest release
|
||||
</entry>
|
||||
<entry>
|
||||
Supported PostgreSQL versions
|
||||
</entry>
|
||||
</row>
|
||||
</thead>
|
||||
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>
|
||||
&repmgr; 4.x
|
||||
</entry>
|
||||
<entry>
|
||||
<link linkend="release-4.2">4.2</link> (2018-10-24)
|
||||
</entry>
|
||||
<entry>
|
||||
9.3, 9.4, 9.5, 9.6, 10, 11
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>
|
||||
&repmgr; 3.x
|
||||
</entry>
|
||||
<entry>
|
||||
<ulink url="https://repmgr.org/release-notes-3.3.2.html">3.3.2</ulink> (2017-05-30)
|
||||
</entry>
|
||||
<entry>
|
||||
9.3, 9.4, 9.5, 9.6
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>
|
||||
&repmgr; 2.x
|
||||
</entry>
|
||||
<entry>
|
||||
<ulink url="https://repmgr.org/release-notes-2.0.3.html">2.0.3</ulink> (2015-04-16)
|
||||
</entry>
|
||||
<entry>
|
||||
9.0, 9.1, 9.2, 9.3, 9.4
|
||||
</entry>
|
||||
</row>
|
||||
</tbody>
|
||||
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
The &repmgr; 2.x and 3.x series are no longer maintained or supported.
|
||||
We strongly recommend upgrading to the latest &repmgr; version.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
|
||||
<para>
|
||||
Note that some &repmgr; functionality is not available in PostgreSQL 9.3 and PostgreSQL 9.4.
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
PostgreSQL 9.3 does not support replication slots, so corresponding &repmgr; functionality
|
||||
is not available.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
In PostgreSQL 9.3 and PostgreSQL 9.4, <command>pg_rewind</command> is not part of the core
|
||||
distribution. <command>pg_rewind</command> will need to be compiled separately to be able
|
||||
to use any &repmgr; functionality which takes advantage of it.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
@@ -26,12 +26,68 @@
|
||||
add the <ulink
|
||||
url="http://apt.postgresql.org/">apt.postgresql.org</ulink>
|
||||
repository to your <filename>sources.list</filename> if you
|
||||
have not already done so. Then install the pre-requisites for
|
||||
building PostgreSQL with:
|
||||
have not already done so, and ensure the source repository is enabled.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
If not configured, the source repository can be added by including
|
||||
a <literal>deb-src</literal> line as a copy of the existing <literal>deb</literal>
|
||||
line in the repository file, which is usually
|
||||
<filename>/etc/apt/sources.list.d/pgdg.list</filename>, e.g.:
|
||||
<programlisting>
|
||||
deb http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main
|
||||
deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisting>
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
Then install the prerequisites for
|
||||
building PostgreSQL with e.g.:
|
||||
<programlisting>
|
||||
sudo apt-get update
|
||||
sudo apt-get build-dep postgresql-9.6</programlisting>
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<simpara>
|
||||
Select the appropriate PostgreSQL version for your target repmgr version.
|
||||
</simpara>
|
||||
</important>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If using <command>apt-get build-dep</command> is not possible, the
|
||||
following packages may need to be installed manually:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>llibedit-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibkrb5-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibpam0g-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibreadline-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibselinux1-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibssl-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibxml2-dev</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>llibxslt1-dev</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
@@ -45,15 +101,55 @@
|
||||
sudo yum install yum-utils openjade docbook-dtds docbook-style-dsssl docbook-style-xsl
|
||||
sudo yum-builddep postgresql96</programlisting>
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<simpara>
|
||||
Select the appropriate PostgreSQL version for your target repmgr version.
|
||||
</simpara>
|
||||
</important>
|
||||
<note>
|
||||
<para>
|
||||
If using <command>yum-builddep</command> is not possible, the
|
||||
following packages may need to be installed manually:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>libselinux-devel</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>libxml2-devel</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>libxslt-devel</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>openssl-devel</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>pam-devel</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>readline-devel</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<tip>
|
||||
<para>
|
||||
If building against PostgreSQL 11 or later configured with the <option>--with-llvm</option> option
|
||||
(this is the case with the PGDG-provided packages) you'll also need to install the
|
||||
<literal>llvm-toolset-7-clang</literal> package. This is available via the
|
||||
<ulink url="https://wiki.centos.org/AdditionalResources/Repositories/SCL">Software Collections (SCL) Repository</ulink>.
|
||||
</para>
|
||||
</tip>
|
||||
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
Select the appropriate PostgreSQL versions for your target repmgr version.
|
||||
</simpara>
|
||||
</note>
|
||||
</sect2>
|
||||
|
||||
|
||||
@@ -80,7 +176,7 @@
|
||||
</para>
|
||||
|
||||
<para>
|
||||
There are also tags for each &repmgr; release, e.g. <filename>4.0.5</filename>.
|
||||
There are also tags for each &repmgr; release, e.g. <literal>v4.2.0</literal>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
@@ -146,7 +242,7 @@
|
||||
The &repmgr; documentation is (like the main PostgreSQL project)
|
||||
written in DocBook format. To build it locally as HTML, you'll need to
|
||||
install the required packages as described in the
|
||||
<ulink url="https://www.postgresql.org/docs/9.6/static/docguide-toolsets.html">
|
||||
<ulink url="https://www.postgresql.org/docs/9.6/docguide-toolsets.html">
|
||||
PostgreSQL documentation</ulink> then execute:
|
||||
<programlisting>
|
||||
./configure && make install-doc</programlisting>
|
||||
@@ -165,7 +261,7 @@
|
||||
<note>
|
||||
<simpara>
|
||||
Due to changes in PostgreSQL's documentation build system from PostgreSQL 10,
|
||||
the documentation can currently only be built agains PostgreSQL 9.6 or earlier.
|
||||
the documentation can currently only be built against PostgreSQL 9.6 or earlier.
|
||||
This limitation will be fixed when time and resources permit.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
<date>2017</date>
|
||||
|
||||
<copyright>
|
||||
<year>2010-2018</year>
|
||||
<year>2010-2019</year>
|
||||
<holder>2ndQuadrant, Ltd.</holder>
|
||||
</copyright>
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
<title>Legal Notice</title>
|
||||
|
||||
<para>
|
||||
<productname>repmgr</productname> is Copyright © 2010-2018
|
||||
<productname>repmgr</productname> is Copyright © 2010-2019
|
||||
by 2ndQuadrant, Ltd. All rights reserved.
|
||||
</para>
|
||||
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
<chapter id="quickstart" xreflabel="Quick-start guide">
|
||||
<title>Quick-start guide</title>
|
||||
|
||||
<indexterm>
|
||||
<primary>quickstart</primary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
This section gives a quick introduction to &repmgr;, including setting up a
|
||||
sample &repmgr; installation and a basic replication cluster.
|
||||
@@ -50,7 +54,8 @@
|
||||
</para>
|
||||
<para>
|
||||
If you want <application>repmgr</application> to copy configuration files which are
|
||||
located outside the PostgreSQL data directory, and/or to test <command>switchover</command>
|
||||
located outside the PostgreSQL data directory, and/or to test
|
||||
<command><link linkend="repmgr-standby-switchover">switchover</link></command>
|
||||
functionality, you will also need passwordless SSH connections between both servers, and
|
||||
<application>rsync</application> should be installed.
|
||||
</para>
|
||||
@@ -63,7 +68,7 @@
|
||||
</tip>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="quickstart-postgresql-configuration">
|
||||
<sect1 id="quickstart-postgresql-configuration" xreflabel="PostgreSQL configuration">
|
||||
<title>PostgreSQL configuration</title>
|
||||
<para>
|
||||
On the primary server, a PostgreSQL instance must be initialised and running.
|
||||
@@ -78,6 +83,13 @@
|
||||
|
||||
max_wal_senders = 10
|
||||
|
||||
# Enable replication slots; set this figure to at least one more
|
||||
# than the number of standbys which will connect to this server.
|
||||
# Note that repmgr will only make use of replication slots if
|
||||
# "use_replication_slots" is set to "true" in repmgr.conf
|
||||
|
||||
max_replication_slots = 0
|
||||
|
||||
# Ensure WAL files contain enough information to enable read-only queries
|
||||
# on the standby.
|
||||
#
|
||||
@@ -85,7 +97,7 @@
|
||||
# PostgreSQL 9.6 and later: one of 'replica' or 'logical'
|
||||
# ('hot_standby' will still be accepted as an alias for 'replica')
|
||||
#
|
||||
# See: https://www.postgresql.org/docs/current/static/runtime-config-wal.html#GUC-WAL-LEVEL
|
||||
# See: https://www.postgresql.org/docs/current/runtime-config-wal.html#GUC-WAL-LEVEL
|
||||
|
||||
wal_level = 'hot_standby'
|
||||
|
||||
@@ -102,16 +114,6 @@
|
||||
# you WALs in a secure place. /bin/true is an example of a command that
|
||||
# ignores archiving. Use something more sensible.
|
||||
archive_command = '/bin/true'
|
||||
|
||||
# If you have configured "pg_basebackup_options"
|
||||
# in "repmgr.conf" to include the setting "--xlog-method=fetch" (from
|
||||
# PostgreSQL 10 "--wal-method=fetch"), *and* you have not set
|
||||
# "restore_command" in "repmgr.conf"to fetch WAL files from another
|
||||
# source such as Barman, you'll need to set "wal_keep_segments" to a
|
||||
# high enough value to ensure that all WAL files generated while
|
||||
# the standby is being cloned are retained until the standby starts up.
|
||||
#
|
||||
# wal_keep_segments = 5000
|
||||
</programlisting>
|
||||
<tip>
|
||||
<simpara>
|
||||
@@ -126,6 +128,9 @@
|
||||
and the cluster was not initialised using data checksums, you may want to consider enabling
|
||||
<varname>wal_log_hints</varname>; for more details see <xref linkend="repmgr-node-rejoin-pg-rewind">.
|
||||
</para>
|
||||
<para>
|
||||
See also the <link linkend="configuration-postgresql">PostgreSQL configuration</link> section in the <link linkend="configuration">repmgr configuaration guide</link>.
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="quickstart-repmgr-user-database">
|
||||
@@ -196,11 +201,20 @@
|
||||
<sect1 id="quickstart-standby-preparation">
|
||||
<title>Preparing the standby</title>
|
||||
<para>
|
||||
On the standby, do not create a PostgreSQL instance, but do ensure the destination
|
||||
On the standby, do <emphasis>not</emphasis> create a PostgreSQL instance (i.e.
|
||||
do not execute <application>initdb</application> or any database creation
|
||||
scripts provided by packages), but do ensure the destination
|
||||
data directory (and any other directories which you want PostgreSQL to use)
|
||||
exist and are owned by the <literal>postgres</literal> system user. Permissions
|
||||
must be set to <literal>0700</literal> (<literal>drwx------</literal>).
|
||||
</para>
|
||||
<tip>
|
||||
<simpara>
|
||||
&repmgr; will place a copy of the primary's database files in this directory.
|
||||
It will however refuse to run if a PostgreSQL instance has already been
|
||||
created there.
|
||||
</simpara>
|
||||
</tip>
|
||||
<para>
|
||||
Check the primary database is reachable from the standby using <application>psql</application>:
|
||||
</para>
|
||||
@@ -210,7 +224,7 @@
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; stores connection information as <ulink
|
||||
url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING">libpq
|
||||
url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING">libpq
|
||||
connection strings</ulink> throughout. This documentation refers to them as <literal>conninfo</literal>
|
||||
strings; an alternative name is <literal>DSN</literal> (<literal>data source name</literal>).
|
||||
We'll use these in place of the <command>-h hostname -d databasename -U username</command> syntax.
|
||||
@@ -237,14 +251,42 @@
|
||||
server. See sections <xref linkend="configuration"> and <xref linkend="configuration-file">
|
||||
for further details about <filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; only uses <option>pg_bindir</option> when it executes
|
||||
PostgreSQL binaries directly.
|
||||
</para>
|
||||
<para>
|
||||
For user-defined scripts such as <option>promote_command</option> and the
|
||||
various <option>service_*_command</option>s, you <emphasis>must</emphasis>
|
||||
always explicitly provide the full path to the binary or script being
|
||||
executed, even if it is &repmgr; itself.
|
||||
</para>
|
||||
<para>
|
||||
This is because these options can contain user-defined scripts in arbitrary
|
||||
locations, so prepending <option>pg_bindir</option> may break them.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<tip>
|
||||
<simpara>
|
||||
For Debian-based distributions we recommend explictly setting
|
||||
<literal>pg_bindir</literal> to the directory where <command>pg_ctl</command> and other binaries
|
||||
<option>pg_bindir</option> to the directory where <command>pg_ctl</command> and other binaries
|
||||
not in the standard path are located. For PostgreSQL 9.6 this would be <filename>/usr/lib/postgresql/9.6/bin/</filename>.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
<tip>
|
||||
<simpara>
|
||||
If your distribution places the &repmgr; binaries in a location other than the
|
||||
PostgreSQL installation directory, specify this with <option>repmgr_bindir</option>
|
||||
to enable &repmgr; to perform operations (e.g.
|
||||
<command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command>)
|
||||
on other nodes.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
<para>
|
||||
See the file
|
||||
<ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</>
|
||||
@@ -404,7 +446,7 @@
|
||||
</para>
|
||||
<para>
|
||||
From PostgreSQL 9.6 you can also use the view
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/monitoring-stats.html#PG-STAT-WAL-RECEIVER-VIEW">
|
||||
<ulink url="https://www.postgresql.org/docs/current/monitoring-stats.html#PG-STAT-WAL-RECEIVER-VIEW">
|
||||
<literal>pg_stat_wal_receiver</literal></ulink> to check the replication status from the standby.
|
||||
|
||||
<programlisting>
|
||||
|
||||
@@ -15,9 +15,14 @@
|
||||
<title>Description</title>
|
||||
<para>
|
||||
Purges monitoring history from the <literal>repmgr.monitoring_history</literal> table to
|
||||
prevent excessive table growth. Use the <literal>-k/--keep-history</literal> to specify the
|
||||
number of days of monitoring history to retain. This command can be used
|
||||
manually or as a cronjob.
|
||||
prevent excessive table growth.
|
||||
</para>
|
||||
<para>
|
||||
By default <emphasis>all</emphasis> data will be removed; Use the <option>-k/--keep-history</option>
|
||||
option to specify the number of days of monitoring history to retain.
|
||||
</para>
|
||||
<para>
|
||||
This command can be executed manually or as a cronjob.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
@@ -38,4 +43,35 @@
|
||||
<filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-cluster-cleanup-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>cluster_cleanup</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--node-id</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Only delete monitoring records for the specified node.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
For more details see the sections <xref linkend="repmgrd-monitoring"> and
|
||||
<xref linkend="repmgrd-monitoring-configuration">.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
Following exit codes can be emitted by <command>repmgr cluster crosscheck</command>:
|
||||
One of the following exit codes will be emitted by <command>repmgr cluster crosscheck</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
@@ -55,12 +55,37 @@
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_SSH (12)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more nodes could not be accessed via SSH.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
This only applies to nodes unreachable from the node where
|
||||
this command is executed.
|
||||
</simpara>
|
||||
<simpara>
|
||||
It's also possible that the crosscheck establishes that
|
||||
connections between PostgreSQL on all nodes are functioning,
|
||||
even if SSH access between some nodes is not possible.
|
||||
</simpara>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_NODE_STATUS (25)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more nodes could not be reached.
|
||||
PostgreSQL on one or more nodes could not be reached.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
This error code overrides <option>ERR_BAD_SSH</option>.
|
||||
</simpara>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
Following exit codes can be emitted by <command>repmgr cluster matrix</command>:
|
||||
One of the following exit codes will be emitted by <command>repmgr cluster matrix</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
@@ -115,12 +115,26 @@
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_SSH (12)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more nodes could not be accessed via SSH.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_NODE_STATUS (25)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more nodes could not be reached.
|
||||
PostgreSQL on one or more nodes could not be reached.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
This error code overrides <option>ERR_BAD_SSH</option>.
|
||||
</simpara>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
@@ -22,6 +22,14 @@
|
||||
directly and can be run on any node in the cluster; this is also useful when analyzing
|
||||
connectivity from a particular node.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Node availability is tested by connecting from the node where
|
||||
<command>repmgr cluster show</command> is executed, and does not necessarily imply the node
|
||||
is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
|
||||
better overviews of connections between nodes.
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
@@ -44,80 +52,138 @@
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show
|
||||
|
||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
||||
----+-------+---------+-----------+----------+----------+-----------------------------------------
|
||||
1 | node1 | primary | * running | | default | host=db_node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | running | node1 | default | host=db_node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | host=db_node3 dbname=repmgr user=repmgr</programlisting>
|
||||
ID | Name | Role | Status | Upstream | Location | Priority | Connection string
|
||||
----+-------+---------+-----------+----------+----------+----------+-----------------------------------------
|
||||
1 | node1 | primary | * running | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | running | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr</programlisting>
|
||||
</para>
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
<title>Notes</title>
|
||||
<para>
|
||||
The column <literal>Role</literal> shows the expected server role according to the
|
||||
&repmgr; metadata. <literal>Status</literal> shows whether the server is running or unreachable.
|
||||
&repmgr; metadata.
|
||||
</para>
|
||||
<para>
|
||||
<literal>Status</literal> shows whether the server is running or unreachable.
|
||||
If the node has an unexpected role not reflected in the &repmgr; metadata, e.g. a node was manually
|
||||
promoted to primary, this will be highlighted with an exclamation mark, e.g.:
|
||||
promoted to primary, this will be highlighted with an exclamation mark.
|
||||
If a connection to the node cannot be made, this will be highlighted with a question mark.
|
||||
Note that the node will only be shown as <literal>? unreachable</literal>
|
||||
if a connection is not possible at network level; if the PostgreSQL instance on the
|
||||
node is pingable but not accepting connections, it will be shown as <literal>? running</literal>.
|
||||
</para>
|
||||
<para>
|
||||
In the following example, executed on <literal>node3</literal>, <literal>node1</literal> is not reachable
|
||||
at network level and assumed to be down; <literal>node2</literal> has been promoted to primary
|
||||
(but <literal>node3</literal> is not attached to it, and its metadata has not yet been updated);
|
||||
<literal>node4</literal> is running but rejecting connections (from <literal>node3</literal> at least).
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show
|
||||
ID | Name | Role | Status | Upstream | Location | Priority | Connection string
|
||||
----+-------+---------+----------------------+----------+----------+----------+-----------------------------------------
|
||||
1 | node1 | primary | ? unreachable | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | ! running as primary | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr
|
||||
4 | node4 | standby | ? running | node1 | default | 100 | host=db_node4 dbname=repmgr user=repmgr
|
||||
|
||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
||||
----+-------+---------+----------------------+----------+----------+-----------------------------------------
|
||||
1 | node1 | primary | ? unreachable | | default | host=db_node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | ! running as primary | node1 | default | host=db_node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | host=db_node3 dbname=repmgr user=repmgr
|
||||
|
||||
WARNING: following issues were detected
|
||||
node "node1" (ID: 1) is registered as an active primary but is unreachable
|
||||
node "node2" (ID: 2) is registered as standby but running as primary</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Node availability is tested by connecting from the node where
|
||||
<command>repmgr cluster show</command> is executed, and does not necessarily imply the node
|
||||
is down. See <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck"> to get
|
||||
a better overviews of connections between nodes.
|
||||
WARNING: following issues were detected
|
||||
- unable to connect to node "node1" (ID: 1)
|
||||
- node "node1" (ID: 1) is registered as an active primary but is unreachable
|
||||
- node "node2" (ID: 2) is registered as standby but running as primary
|
||||
- unable to connect to node "node4" (ID: 4)
|
||||
HINT: execute with --verbose option to see connection error messages</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
To diagnose connection issues, execute <command>repmgr cluster show</command>
|
||||
with the <option>--verbose</option> option; this will display the error message
|
||||
for each failed connection attempt.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
Use <xref linkend="repmgr-cluster-matrix"> and <xref linkend="repmgr-cluster-crosscheck">
|
||||
to diagnose connection issues across the whole replication cluster.
|
||||
</para>
|
||||
</tip>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
<para>
|
||||
<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts:
|
||||
<programlisting>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--csv</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show --csv
|
||||
1,-1,-1
|
||||
2,0,0
|
||||
3,0,1</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
availability (0 = available, -1 = unavailable)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
recovery state (0 = not in recovery, 1 = in recovery, -1 = unknown)
|
||||
</simpara>
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--compact</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Suppress display of the <literal>conninfo</literal> column.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--terse</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Suppress warnings about connection issues.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--verbose</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Display the full text of any database connection error messages
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
Following exit codes can be emitted by <command>repmgr cluster show</command>:
|
||||
One of the following exit codes will be emitted by <command>repmgr cluster show</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
@@ -130,11 +196,31 @@
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_CONFIG (1)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
An issue was encountered while attempting to retrieve
|
||||
&repmgr; metadata.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_DB_CONN (6)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr; was unable to connect to the local PostgreSQL instance.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_NODE_STATUS (25)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more issues were detected.
|
||||
One or more issues were detected with the replication configuration,
|
||||
e.g. a node was not in its expected state.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -145,7 +231,7 @@
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
<xref linkend="repmgr-node-status">, <xref linkend="repmgr-node-check">
|
||||
<xref linkend="repmgr-node-status">, <xref linkend="repmgr-node-check">, <xref linkend="repmgr-daemon-status">
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
114
doc/repmgr-daemon-pause.sgml
Normal file
114
doc/repmgr-daemon-pause.sgml
Normal file
@@ -0,0 +1,114 @@
|
||||
<refentry id="repmgr-daemon-pause">
|
||||
<indexterm>
|
||||
<primary>repmgr daemon pause</primary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>pausing</secondary>
|
||||
</indexterm>
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>repmgr daemon pause</refentrytitle>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>repmgr daemon pause</refname>
|
||||
<refpurpose>Instruct all <application>repmgrd</application> instances in the replication cluster to pause failover operations</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
<para>
|
||||
This command can be run on any active node in the replication cluster to instruct all
|
||||
running <application>repmgrd</application> instances to "pause" themselves, i.e. take no
|
||||
action (such as promoting themselves or following a new primary) if a failover event is detected.
|
||||
</para>
|
||||
<para>
|
||||
This functionality is useful for performing maintenance operations, such as switchovers
|
||||
or upgrades, which might otherwise trigger a failover if <application>repmgrd</application>
|
||||
is running normally.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
It's important to wait a few seconds after restarting PostgreSQL on any node before running
|
||||
<command>repmgr daemon pause</command>, as the <application>repmgrd</application> instance
|
||||
on the restarted node will take a second or two before it has updated its status.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
<xref linkend="repmgr-daemon-unpause"> will instruct all previously paused <application>repmgrd</application>
|
||||
instances to resume normal failover operation.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Execution</title>
|
||||
<para>
|
||||
<command>repmgr daemon pause</command> can be executed on any active node in the
|
||||
replication cluster. A valid <filename>repmgr.conf</filename> file is required.
|
||||
It will have no effect on previously paused nodes.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Example</title>
|
||||
<para>
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon pause
|
||||
NOTICE: node 1 (node1) paused
|
||||
NOTICE: node 2 (node2) paused
|
||||
NOTICE: node 3 (node3) paused</programlisting>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Check if nodes are reachable but don't pause <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
One of the following exit codes will be emitted by <command>repmgr daemon unpause</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> could be paused on all nodes.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_REPMGRD_PAUSE (26)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> could not be paused on one or mode nodes.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
<xref linkend="repmgr-daemon-unpause">, <xref linkend="repmgr-daemon-status">
|
||||
</para>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
||||
203
doc/repmgr-daemon-start.sgml
Normal file
203
doc/repmgr-daemon-start.sgml
Normal file
@@ -0,0 +1,203 @@
|
||||
<refentry id="repmgr-daemon-start">
|
||||
<indexterm>
|
||||
<primary>repmgr daemon start</primary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>starting</secondary>
|
||||
</indexterm>
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>repmgr daemon start</refentrytitle>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>repmgr daemon start</refname>
|
||||
<refpurpose>Start the <application>repmgrd</application> daemon</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
<para>
|
||||
This command starts the <application>repmgrd</application> daemon on the
|
||||
local node.
|
||||
</para>
|
||||
<para>
|
||||
By default, &repmgr; will wait for up to 15 seconds to confirm that <application>repmgrd</application>
|
||||
started. This behaviour can be overridden by specifying a diffent value using the <option>--wait</option>
|
||||
option, or disabled altogether with the <option>--no-wait</option> option.
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
The <filename>repmgr.conf</filename> parameter <varname>repmgrd_service_start_command</varname>
|
||||
must be set for <command>repmgr daemon start</command> to work; see section
|
||||
<xref linkend="repmgr-daemon-start-configuration"> for details.
|
||||
</para>
|
||||
</important>
|
||||
</refsect1>
|
||||
|
||||
|
||||
|
||||
<refsect1>
|
||||
|
||||
<title>Options</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Check prerequisites but don't actually attempt to start <application>repmgrd</application>.
|
||||
</para>
|
||||
<para>
|
||||
This action will output the command which would be executed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-w</option></term>
|
||||
<term><option>--wait</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Wait for the specified number of seconds to confirm that <application>repmgrd</application>
|
||||
started successfully.
|
||||
</para>
|
||||
<para>
|
||||
Note that providing <option>--wait=0</option> is the equivalent of <option>--no-wait</option>.
|
||||
</para>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--no-wait</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Don't wait to confirm that <application>repmgrd</application>
|
||||
started successfully.
|
||||
</para>
|
||||
<para>
|
||||
This is equivalent to providing <option>--wait=0</option>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-daemon-start-configuration" xreflabel="repmgr daemon start configuration">
|
||||
<title>Configuration file settings</title>
|
||||
<para>
|
||||
The following parameter in <filename>repmgr.conf</filename> is relevant
|
||||
to <command>repmgr daemon start</command>:
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>repmgrd_service_start_command</primary>
|
||||
<secondary>with "repmgr daemon start"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>repmgrd_service_start_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr daemon start</command> will execute the command defined by the
|
||||
<varname>repmgrd_service_start_command</varname> parameter in <filename>repmgr.conf</filename>.
|
||||
This must be set to a shell command which will start <application>repmgrd</application>;
|
||||
if &repmgr; was installed from a package, this will be the service command defined by the
|
||||
package. For more details see <link linkend="appendix-packages">Appendix: &repmgr; package details</link>.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
If &repmgr; was installed from a system package, and you do not configure
|
||||
<varname>repmgrd_service_start_command</varname> to an appropriate service command, this may
|
||||
result in the system becoming confused about the state of the <application>repmgrd</application>
|
||||
service; this is particularly the case with <literal>systemd</literal>.
|
||||
</para>
|
||||
</important>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
One of the following exit codes will be emitted by <command>repmgr daemon start</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The <application>repmgrd</application> start command (defined in
|
||||
<varname>repmgrd_service_start_command</varname>) was successfully executed.
|
||||
</para>
|
||||
<para>
|
||||
If the <option>--wait</option> option was provided, &repmgr; will confirm that
|
||||
<application>repmgrd</application> has actually started up.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_CONFIG (1)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<varname>repmgrd_service_start_command</varname> is not defined in
|
||||
<filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_DB_CONN (6)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr; was unable to connect to the local PostgreSQL node.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL must be running before <application>repmgrd</application>
|
||||
can be started. Additionally, unless the <option>--no-wait</option> option was
|
||||
provided, &repmgr; needs to be able to connect to the local PostgreSQL node
|
||||
to determine the state of <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_REPMGRD_SERVICE (27)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The <application>repmgrd</application> start command (defined in
|
||||
<varname>repmgrd_service_start_command</varname>) was not successfully executed.
|
||||
</para>
|
||||
<para>
|
||||
This can also mean that &repmgr; was unable to confirm whether <application>repmgrd</application>
|
||||
successfully started (unless the <option>--no-wait</option> option was provided).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
<xref linkend="repmgr-daemon-stop">, <xref linkend="repmgr-daemon-status">, <xref linkend="repmgrd-daemon">
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
186
doc/repmgr-daemon-status.sgml
Normal file
186
doc/repmgr-daemon-status.sgml
Normal file
@@ -0,0 +1,186 @@
|
||||
<refentry id="repmgr-daemon-status">
|
||||
<indexterm>
|
||||
<primary>repmgr daemon status</primary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>displaying daemon status</secondary>
|
||||
</indexterm>
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>repmgr daemon status</refentrytitle>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>repmgr daemon status</refname>
|
||||
<refpurpose>display information about the status of <application>repmgrd</application> on each node in the cluster</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
<para>
|
||||
This command provides an overview over all active nodes in the cluster and the state
|
||||
of each node's <application>repmgrd</application> instance. It can be used to check
|
||||
the result of <xref linkend="repmgr-daemon-pause"> and <xref linkend="repmgr-daemon-unpause">
|
||||
operations.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Execution</title>
|
||||
<para>
|
||||
<command>repmgr daemon status</command> can be executed on any active node in the
|
||||
replication cluster. A valid <filename>repmgr.conf</filename> file is required.
|
||||
</para>
|
||||
<para>
|
||||
If PostgreSQL is not running on a node, &repmgr; will not be able to determine the
|
||||
status of that node's <application>repmgrd</application> instance.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
After restarting PostgreSQL on any node, the <application>repmgrd</application> instance
|
||||
will take a second or two before it is able to update its status. Until then,
|
||||
<application>repmgrd</application> will be shown as not running.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Examples</title>
|
||||
<para>
|
||||
<application>repmgrd</application> running normally on all nodes:
|
||||
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
||||
ID | Name | Role | Priority | Status | repmgrd | PID | Paused? | Upstream last seen
|
||||
----+-------+---------+----------+---------+---------+-------+---------+--------------------
|
||||
1 | node1 | primary | 100 | running | running | 71987 | no | n/a
|
||||
2 | node2 | standby | 100 | running | running | 71996 | no | 1 second(s) ago
|
||||
3 | node3 | standby | 100 | running | running | 72042 | no | 1 second(s) ago
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<application>repmgrd</application> paused on all nodes (using <xref linkend="repmgr-daemon-pause">):
|
||||
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
||||
ID | Name | Role | Priority | Status | repmgrd | PID | Paused? | Upstream last seen
|
||||
----+-------+---------+----------+---------+---------+-------+---------+--------------------
|
||||
1 | node1 | primary | 100 | running | running | 71987 | yes | n/a
|
||||
2 | node2 | standby | 100 | running | running | 71996 | yes | 0 second(s) ago
|
||||
3 | node3 | standby | 100 | running | running | 72042 | yes | 0 second(s) ago
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<application>repmgrd</application> not running on one node:
|
||||
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
||||
ID | Name | Role | Priority | Status | repmgrd | PID | Paused? | Upstream last seen
|
||||
----+-------+---------+----------+---------+-------------+-------+---------+--------------------
|
||||
1 | node1 | primary | 100 | running | running | 71987 | yes | n/a
|
||||
2 | node2 | standby | 100 | running | not running | n/a | n/a | n/a
|
||||
3 | node3 | standby | 100 | running | running | 72042 | yes | 0 second(s) ago</programlisting>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--csv</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr daemon status</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon status --csv
|
||||
1,node1,primary,1,1,5722,1,100,-1
|
||||
2,node2,standby,1,0,-1,1,100,1
|
||||
3,node3,standby,1,1,5779,1,100,1</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
node name
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
node type (primary or standby)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
PostgreSQL server running (1 = running, 0 = not running)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> running (1 = running, 0 = not running, -1 = unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> PID (-1 if not running or status unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> paused (1 = paused, 0 = not paused, -1 = unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> node priority
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
interval in seconds since the node's upstream was last seen (this will be -1 if the value could not be retrieved, or the node is primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--verbose</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Display the full text of any database connection error messages
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
<xref linkend="repmgr-daemon-pause">, <xref linkend="repmgr-daemon-unpause">, <xref linkend="repmgr-cluster-show">
|
||||
</para>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
200
doc/repmgr-daemon-stop.sgml
Normal file
200
doc/repmgr-daemon-stop.sgml
Normal file
@@ -0,0 +1,200 @@
|
||||
<refentry id="repmgr-daemon-stop">
|
||||
<indexterm>
|
||||
<primary>repmgr daemon stop</primary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>stopping</secondary>
|
||||
</indexterm>
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>repmgr daemon stop</refentrytitle>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>repmgr daemon stop</refname>
|
||||
<refpurpose>Stop the <application>repmgrd</application> daemon</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
<para>
|
||||
This command stops the <application>repmgrd</application> daemon on the
|
||||
local node.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
By default, &repmgr; will wait for up to 15 seconds to confirm that <application>repmgrd</application>
|
||||
stopped. This behaviour can be overridden by specifying a diffent value using the <option>--wait</option>
|
||||
option, or disabled altogether with the <option>--no-wait</option> option.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
If PostgreSQL is not running on the local node, under some circumstances &repmgr; may not
|
||||
be able to confirm if <application>repmgrd</application> has actually stopped.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
The <filename>repmgr.conf</filename> parameter <varname>repmgrd_service_stop_command</varname>
|
||||
must be set for <command>repmgr daemon stop</command> to work; see section
|
||||
<xref linkend="repmgr-daemon-stop-configuration"> for details.
|
||||
</para>
|
||||
</important>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Configuration</title>
|
||||
<para>
|
||||
<command>repmgr daemon stop</command> will execute the command defined by the
|
||||
<varname>repmgrd_service_stop_command</varname> parameter in <filename>repmgr.conf</filename>.
|
||||
This must be set to a shell command which will stop <application>repmgrd</application>;
|
||||
if &repmgr; was installed from a package, this will be the service command defined by the
|
||||
package. For more details see <link linkend="appendix-packages">Appendix: &repmgr; package details</link>.
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<para>
|
||||
If &repmgr; was installed from a system package, and you do not configure
|
||||
<varname>repmgrd_service_stop_command</varname> to an appropriate service command, this may
|
||||
result in the system becoming confused about the state of the <application>repmgrd</application>
|
||||
service; this is particularly the case with <literal>systemd</literal>.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
|
||||
<title>Options</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Check prerequisites but don't actually attempt to stop <application>repmgrd</application>.
|
||||
</para>
|
||||
<para>
|
||||
This action will output the command which would be executed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-w</option></term>
|
||||
<term><option>--wait</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Wait for the specified number of seconds to confirm that <application>repmgrd</application>
|
||||
stopped successfully.
|
||||
</para>
|
||||
<para>
|
||||
Note that providing <option>--wait=0</option> is the equivalent of <option>--no-wait</option>.
|
||||
</para>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--no-wait</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Don't wait to confirm that <application>repmgrd</application>
|
||||
stopped successfully.
|
||||
</para>
|
||||
<para>
|
||||
This is equivalent to providing <option>--wait=0</option>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-daemon-stop-configuration" xreflabel="repmgr daemon stop configuration">
|
||||
<title>Configuration file settings</title>
|
||||
<para>
|
||||
The following parameter in <filename>repmgr.conf</filename> is relevant
|
||||
to <command>repmgr daemon stop</command>:
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>repmgrd_service_stop_command</primary>
|
||||
<secondary>with "repmgr daemon stop"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>repmgrd_service_stop_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr daemon stop</command> will execute the command defined by the
|
||||
<varname>repmgrd_service_stop_command</varname> parameter in <filename>repmgr.conf</filename>.
|
||||
This must be set to a shell command which will stop <application>repmgrd</application>;
|
||||
if &repmgr; was installed from a package, this will be the service command defined by the
|
||||
package. For more details see <link linkend="appendix-packages">Appendix: &repmgr; package details</link>.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
If &repmgr; was installed from a system package, and you do not configure
|
||||
<varname>repmgrd_service_stop_command</varname> to an appropriate service command, this may
|
||||
result in the system becoming confused about the state of the <application>repmgrd</application>
|
||||
service; this is particularly the case with <literal>systemd</literal>.
|
||||
</para>
|
||||
</important>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
One of the following exit codes will be emitted by <command>repmgr daemon stop</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> could be stopped.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_CONFIG (1)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<varname>repmgrd_service_stop_command</varname> is not defined in
|
||||
<filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_REPMGRD_SERVICE (27)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> could not be stopped.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
<xref linkend="repmgr-daemon-start">, <xref linkend="repmgr-daemon-status">, <xref linkend="repmgrd-daemon">
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
109
doc/repmgr-daemon-unpause.sgml
Normal file
109
doc/repmgr-daemon-unpause.sgml
Normal file
@@ -0,0 +1,109 @@
|
||||
<refentry id="repmgr-daemon-unpause">
|
||||
<indexterm>
|
||||
<primary>repmgr daemon unpause</primary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>unpausing</secondary>
|
||||
</indexterm>
|
||||
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>repmgr daemon unpause</refentrytitle>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>repmgr daemon unpause</refname>
|
||||
<refpurpose>Instruct all <application>repmgrd</application> instances in the replication cluster to resume failover operations</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
<para>
|
||||
This command can be run on any active node in the replication cluster to instruct all
|
||||
running <application>repmgrd</application> instances to "unpause"
|
||||
(following a previous execution of <xref linkend="repmgr-daemon-pause">)
|
||||
and resume normal failover/monitoring operation.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
It's important to wait a few seconds after restarting PostgreSQL on any node before running
|
||||
<command>repmgr daemon pause</command>, as the <application>repmgrd</application> instance
|
||||
on the restarted node will take a second or two before it has updated its status.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Execution</title>
|
||||
<para>
|
||||
<command>repmgr daemon unpause</command> can be executed on any active node in the
|
||||
replication cluster. A valid <filename>repmgr.conf</filename> file is required.
|
||||
It will have no effect on nodes which are not already paused.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Example</title>
|
||||
<para>
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon unpause
|
||||
NOTICE: node 1 (node1) unpaused
|
||||
NOTICE: node 2 (node2) unpaused
|
||||
NOTICE: node 3 (node3) unpaused</programlisting>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Check if nodes are reachable but don't unpause <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
One of the following exit codes will be emitted by <command>repmgr daemon unpause</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> could be unpaused on all nodes.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_REPMGRD_PAUSE (26)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> could not be unpaused on one or mode nodes.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
<xref linkend="repmgr-daemon-pause">, <xref linkend="repmgr-daemon-status">
|
||||
</para>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
||||
@@ -18,6 +18,14 @@
|
||||
Performs some health checks on a node from a replication perspective.
|
||||
This command must be run on the local node.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
Currently &repmgr; performs health checks on physical replication
|
||||
slots only, with the aim of warning about streaming replication standbys which
|
||||
have become detached and the associated risk of uncontrolled WAL file
|
||||
growth.
|
||||
</para>
|
||||
</note>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
@@ -30,7 +38,8 @@
|
||||
Replication lag: OK (N/A - node is primary)
|
||||
WAL archiving: OK (0 pending files)
|
||||
Downstream servers: OK (2 of 2 downstream nodes attached)
|
||||
Replication slots: OK (node has no replication slots)</programlisting>
|
||||
Replication slots: OK (node has no physical replication slots)
|
||||
Missing replication slots: OK (node has no missing physical replication slots)</programlisting>
|
||||
</para>
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
@@ -43,7 +52,7 @@
|
||||
OK (node is primary)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Parameters for individual checks are as follows:
|
||||
Parameters for individual checks are as follows:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
@@ -75,16 +84,26 @@
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>--slots</literal>: checks there are no inactive replication slots
|
||||
<literal>--slots</literal>: checks there are no inactive physical replication slots
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>--missing-slots</literal>: checks there are no missing replication slots
|
||||
<literal>--missing-slots</literal>: checks there are no missing physical replication slots
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>--data-directory-config</literal>: checks the data directory configured in
|
||||
<filename>repmgr.conf</filename> matches the actual data directory.
|
||||
This check is not directly related to replication, but is useful to verify &repmgr;
|
||||
is correctly configured.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</refsect1>
|
||||
@@ -104,6 +123,7 @@
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>--nagios</literal>: generate output in a Nagios-compatible format
|
||||
(for individual checks only)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
@@ -150,9 +170,10 @@
|
||||
|
||||
|
||||
<para>
|
||||
Following exit codes can be emitted by <command>repmgr status check</command>
|
||||
One of the following exit codes will be emitted by <command>repmgr status check</command>
|
||||
if no individual check was specified.
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
@@ -174,6 +195,7 @@
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
|
||||
@@ -28,6 +28,10 @@
|
||||
If the node is running and needs to be attached to the current primary, use
|
||||
<xref linkend="repmgr-standby-follow">.
|
||||
</para>
|
||||
<para>
|
||||
Note <xref linkend="repmgr-standby-follow"> can only be used for standbys which have not diverged
|
||||
from the rest of the cluster.
|
||||
</para>
|
||||
</tip>
|
||||
</refsect1>
|
||||
|
||||
@@ -63,10 +67,10 @@
|
||||
<term><option>--force-rewind[=/path/to/pg_rewind]</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Execute <application>pg_rewind</application> if necessary.
|
||||
Execute <application>pg_rewind</application>.
|
||||
</para>
|
||||
<para>
|
||||
It is only necessary to provide the <application>pg_rewind</application>
|
||||
It is only necessary to provide the <application>pg_rewind</application> path
|
||||
if using PostgreSQL 9.3 or 9.4, and <application>pg_rewind</application>
|
||||
is not installed in the PostgreSQL <filename>bin</filename> directory.
|
||||
</para>
|
||||
@@ -115,7 +119,8 @@
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Configuration file settings</title>
|
||||
|
||||
<para>
|
||||
@@ -128,17 +133,72 @@
|
||||
the value set in <literal>standby_reconnect_timeout</literal>,
|
||||
60 seconds).
|
||||
</simpara>
|
||||
<simpara>
|
||||
Note that <literal>standby_reconnect_timeout</literal> must be
|
||||
set to a value equal to or greater than
|
||||
<literal>node_rejoin_timeout</literal>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-node-rejoin-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>node_rejoin</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
</para>
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
One of the following exit codes will be emitted by <command>repmgr node rejoin</command>:
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The node rejoin succeeded; or if <option>--dry-run</option> was provided,
|
||||
no issues were detected which would prevent the node rejoin.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_CONFIG (1)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
A configuration issue was detected which prevented &repmgr; from
|
||||
continuing with the node rejoin.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_NO_RESTART (4)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The node could not be restarted.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_REJOIN_FAIL (24)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The node rejoin operation failed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Notes</title>
|
||||
@@ -162,6 +222,10 @@
|
||||
postgres --single -D /var/lib/pgsql/data/ < /dev/null</programlisting>
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
&repmgr; will attempt to verify whether the node can rejoin as-is, or whether
|
||||
<command>pg_rewind</command> must be used (see following section).
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-node-rejoin-pg-rewind" xreflabel="Using pg_rewind">
|
||||
@@ -183,64 +247,137 @@
|
||||
<command>pg_rewind</command> <emphasis>requires</emphasis> that either
|
||||
<varname>wal_log_hints</varname> is enabled, or that
|
||||
data checksums were enabled when the cluster was initialized. See the
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html"><command>pg_rewind</command> documentation</ulink> for details.
|
||||
<ulink url="https://www.postgresql.org/docs/current/app-pgrewind.html"><command>pg_rewind</command> documentation</ulink> for details.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
To have <command>repmgr node rejoin</command> use <command>pg_rewind</command> if required,
|
||||
We strongly recommend familiarizing yourself with <command>pg_rewind</command> before attempting
|
||||
to use it with &repmgr;, as while it is an extremely useful tool, it is <emphasis>not</emphasis>
|
||||
a "magic bullet" which can resolve all problematic replication situations.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A typical use-case for <command>pg_rewind</command> is when a scenario like the following
|
||||
is encountered:
|
||||
<programlisting>
|
||||
$ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
|
||||
--force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose --dry-run
|
||||
INFO: replication connection to the rejoin target node was successful
|
||||
INFO: local and rejoin target system identifiers match
|
||||
DETAIL: system identifier is 6652184002263212600
|
||||
ERROR: this node cannot attach to rejoin target node 3
|
||||
DETAIL: rejoin target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/610D710
|
||||
HINT: use --force-rewind to execute pg_rewind</programlisting>
|
||||
|
||||
Here, <literal>node3</literal> was promoted to a primary while the local node was
|
||||
still attached to the previous primary; this can potentially happen during e.g. a
|
||||
network split. <command>pg_rewind</command> can re-sync the local node with <literal>node3</literal>,
|
||||
removing the need for a full reclone.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To have <command>repmgr node rejoin</command> use <command>pg_rewind</command>,
|
||||
pass the command line option <literal>--force-rewind</literal>, which will tell &repmgr;
|
||||
to execute <command>pg_rewind</command> to ensure the node can be rejoined successfully.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Be aware that if <command>pg_rewind</command> is executed and actually performs a
|
||||
rewind operation, any configuration files in the PostgreSQL data directory will be
|
||||
overwritten with those from the source server.
|
||||
</para>
|
||||
<para>
|
||||
To prevent this happening, provide a comma-separated list of files to retain
|
||||
using the <literal>--config-file</literal> command line option; the specified files
|
||||
will be archived in a temporary directory (whose parent directory can be specified with
|
||||
<literal>--config-archive-dir</literal>) and restored once the rewind operation is
|
||||
complete.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
Be aware that if <command>pg_rewind</command> is executed and actually performs a
|
||||
rewind operation, any configuration files in the PostgreSQL data directory will be
|
||||
overwritten with those from the source server.
|
||||
</para>
|
||||
<para>
|
||||
To prevent this happening, provide a comma-separated list of files to retain
|
||||
using the <literal>--config-file</literal> command line option; the specified files
|
||||
will be archived in a temporary directory (whose parent directory can be specified with
|
||||
<literal>--config-archive-dir</literal>) and restored once the rewind operation is
|
||||
complete.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<para>
|
||||
Example, first using <literal>--dry-run</literal>, then actually executing the
|
||||
<literal>node rejoin command</literal>.
|
||||
<programlisting>
|
||||
$ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \
|
||||
--force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose --dry-run
|
||||
NOTICE: using provided configuration file "/etc/repmgr.conf"
|
||||
$ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
|
||||
--config-files=postgresql.local.conf,postgresql.conf --verbose --force-rewind --dry-run
|
||||
INFO: replication connection to the rejoin target node was successful
|
||||
INFO: local and rejoin target system identifiers match
|
||||
DETAIL: system identifier is 6652460429293670710
|
||||
NOTICE: pg_rewind execution required for this node to attach to rejoin target node 3
|
||||
DETAIL: rejoin target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/610D710
|
||||
INFO: prerequisites for using pg_rewind are met
|
||||
INFO: file "postgresql.local.conf" would be copied to "/tmp/repmgr-config-archive-node1/postgresql.local.conf"
|
||||
INFO: file "postgresql.conf" would be copied to "/tmp/repmgr-config-archive-node1/postgresql.local.conf"
|
||||
INFO: 2 files would have been copied to "/tmp/repmgr-config-archive-node1"
|
||||
INFO: directory "/tmp/repmgr-config-archive-node1" deleted
|
||||
INFO: file "postgresql.local.conf" would be copied to "/tmp/repmgr-config-archive-node2/postgresql.local.conf"
|
||||
INFO: file "postgresql.replication-setup.conf" would be copied to "/tmp/repmgr-config-archive-node2/postgresql.replication-setup.conf"
|
||||
INFO: pg_rewind would now be executed
|
||||
DETAIL: pg_rewind command is:
|
||||
pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node1 dbname=repmgr user=repmgr'</programlisting>
|
||||
pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node3 dbname=repmgr user=repmgr'
|
||||
INFO: prerequisites for executing NODE REJOIN are met</programlisting>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <option>--force-rewind</option> is used with the <option>--dry-run</option> option,
|
||||
this checks the prerequisites for using <application>pg_rewind</application>, but is
|
||||
not an absolute guarantee that actually executing <application>pg_rewind</application>
|
||||
will succeed. See also section <xref linkend="repmgr-node-rejoin-caveats"> below.
|
||||
</para>
|
||||
|
||||
</note>
|
||||
|
||||
<programlisting>
|
||||
$ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \
|
||||
--force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose
|
||||
NOTICE: using provided configuration file "/etc/repmgr.conf"
|
||||
INFO: prerequisites for using pg_rewind are met
|
||||
INFO: 2 files copied to "/tmp/repmgr-config-archive-node1"
|
||||
$ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node3 dbname=repmgr user=repmgr' \
|
||||
--config-files=postgresql.local.conf,postgresql.conf --verbose --force-rewind
|
||||
NOTICE: pg_rewind execution required for this node to attach to rejoin target node 3
|
||||
DETAIL: rejoin target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/610D710
|
||||
NOTICE: executing pg_rewind
|
||||
NOTICE: 2 files copied to /var/lib/pgsql/data
|
||||
INFO: directory "/tmp/repmgr-config-archive-node1" deleted
|
||||
INFO: deleting "recovery.done"
|
||||
INFO: setting node 1's primary to node 2
|
||||
NOTICE: starting server using "pg_ctl-l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' start"
|
||||
waiting for server to start.... done
|
||||
server started
|
||||
DETAIL: pg_rewind command is "pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node3 dbname=repmgr user=repmgr'"
|
||||
NOTICE: 2 files copied to /var/lib/postgresql/data
|
||||
NOTICE: setting node 2's upstream to node 3
|
||||
NOTICE: starting server using "pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' start"
|
||||
NOTICE: NODE REJOIN successful
|
||||
DETAIL: node 1 is now attached to node 2</programlisting>
|
||||
DETAIL: node 2 is now attached to node 3</programlisting>
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-node-rejoin-caveats" xreflabel="Caveats">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgr node rejoin</primary>
|
||||
<secondary>caveats</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Caveats when using <command>repmgr node rejoin</command></title>
|
||||
<para>
|
||||
<command>repmgr node rejoin</command> attempts to determine whether it will succeed by
|
||||
comparing the timelines and relative WAL positions of the local node (rejoin candidate) and primary
|
||||
(rejoin target). This is particularly important if planning to use <application>pg_rewind</application>,
|
||||
which currently (as of PostgreSQL 11) may appear to succeed (or indicate there is no action
|
||||
needed) but potentially allow an impossible action, such as trying to rejoin a standby to a
|
||||
primary which is behind the standby. &repmgr; will prevent this situation from occurring.
|
||||
</para>
|
||||
<para>
|
||||
Currently it is <emphasis>not</emphasis> possible to detect a situation where the rejoin target
|
||||
is a standby which has been "promoted" by removing <filename>recovery.conf</filename>
|
||||
(PostgreSQL 12 and later: <filename>standby.signal</filename>) and restarting it.
|
||||
In this case there will be no information about the point the rejoin target diverged
|
||||
from the current standby; the rejoin operation will fail and
|
||||
the current standby's PostgreSQL log will contain entries with the text
|
||||
"<literal>record with incorrect prev-link</literal>".
|
||||
</para>
|
||||
<para>
|
||||
We strongly recommend running <command>repmgr node rejoin</command> with the
|
||||
<option>--dry-run</option> option first. Additionally it might be a good idea
|
||||
to execute the <application>pg_rewind</application> command displayed by
|
||||
&repmgr; with the <application>pg_rewind</application> <option>--dry-run</option>
|
||||
option. Note that <application>pg_rewind</application> does not indicate that it
|
||||
is running in <option>--dry-run</option> mode.
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
|
||||
151
doc/repmgr-node-service.sgml
Normal file
151
doc/repmgr-node-service.sgml
Normal file
@@ -0,0 +1,151 @@
|
||||
<refentry id="repmgr-node-service">
|
||||
<indexterm>
|
||||
<primary>repmgr node service</primary>
|
||||
</indexterm>
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>repmgr node service</refentrytitle>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>repmgr node service</refname>
|
||||
<refpurpose>show or execute the system service command to stop/start/restart/reload/promote a node</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
<para>
|
||||
Shows or executes the system service command to stop/start/restart/reload a node.
|
||||
</para>
|
||||
<para>
|
||||
This command is mainly meant for internal &repmgr; usage, but is useful for
|
||||
confirming the command configuration.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
|
||||
<title>Options</title>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Log the steps which would be taken, including displaying the command which would be executed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--action</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The action to perform. One of <literal>start</literal>, <literal>stop</literal>,
|
||||
<literal>restart</literal>, <literal>reload</literal> or <literal>promote</literal>.
|
||||
</para>
|
||||
<para>
|
||||
If the parameter <option>--list-actions</option> is provided together with
|
||||
<option>--action</option>, the command which would be executed will be printed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--list-actions</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
List all configured commands.
|
||||
</para>
|
||||
<para>
|
||||
If the parameter <option>--action</option> is provided together with
|
||||
<option>--list-actions</option>, the command which would be executed for that
|
||||
particular action will be printed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--checkpoint</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Issue a <command>CHECKPOINT</command> before stopping or restarting the node.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
One of the following exit codes will be emitted by <command>repmgr node service</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
No issues were detected.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_LOCAL_COMMAND (5)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Execution of the system service command failed.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Examples</title>
|
||||
<para>
|
||||
See what action would be taken for a restart:
|
||||
<programlisting>
|
||||
[postgres@node1 ~]$ repmgr -f /etc/repmgr/11/repmgr.conf node service --action=restart --checkpoint --dry-run
|
||||
INFO: a CHECKPOINT would be issued here
|
||||
INFO: would execute server command "sudo service postgresql-11 restart"</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Restart the PostgreSQL instance:
|
||||
<programlisting>
|
||||
[postgres@node1 ~]$ repmgr -f /etc/repmgr/11/repmgr.conf node service --action=restart --checkpoint
|
||||
NOTICE: issuing CHECKPOINT
|
||||
DETAIL: executing server command "sudo service postgresql-11 restart"
|
||||
Redirecting to /bin/systemctl restart postgresql-11.service</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
List all commands:
|
||||
<programlisting>
|
||||
[postgres@node1 ~]$ repmgr -f /etc/repmgr/11/repmgr.conf node service --list-actions
|
||||
Following commands would be executed for each action:
|
||||
|
||||
start: "sudo service postgresql-11 start"
|
||||
stop: "sudo service postgresql-11 stop"
|
||||
restart: "sudo service postgresql-11 restart"
|
||||
reload: "sudo service postgresql-11 reload"
|
||||
promote: "/usr/pgsql-11/bin/pg_ctl -w -D '/var/lib/pgsql/11/data' promote"</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
List a single command:
|
||||
<programlisting>
|
||||
[postgres@node1 ~]$ repmgr -f /etc/repmgr/11/repmgr.conf node service --list-actions --action=promote
|
||||
/usr/pgsql-11/bin/pg_ctl -w -D '/var/lib/pgsql/11/data' promote </programlisting>
|
||||
</para>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
@@ -55,7 +55,7 @@
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
Following exit codes can be emitted by <command>repmgr node status</command>:
|
||||
One of the following exit codes will be emitted by <command>repmgr node status</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
|
||||
@@ -21,6 +21,15 @@
|
||||
installing the &repmgr; extension. This command needs to be executed before any
|
||||
standby nodes are registered.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
It's possibly to install the &repmgr; extension manually before executing
|
||||
<command>repmgr primary register</command>; in this case &repmgr; will
|
||||
detect the presence of the extension and skip that step.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
@@ -35,16 +44,16 @@
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If providing the configuration file location with <option>-f/--config-file</option>,
|
||||
avoid using a relative path, as &repmgr; stores the configuration file location
|
||||
in the repmgr metadata for use when &repmgr; is executed remotely (e.g. during
|
||||
<xref linkend="repmgr-standby-switchover">). &repmgr; will attempt to convert the
|
||||
a relative path into an absolute one, but this may not be the same as the path you
|
||||
would explicitly provide (e.g. <filename>./repmgr.conf</filename> might be converted
|
||||
to <filename>/path/to/./repmgr.conf</filename>, whereas you'd normally write
|
||||
<filename>/path/to/repmgr.conf</filename>).
|
||||
</para>
|
||||
<para>
|
||||
If providing the configuration file location with <option>-f/--config-file</option>,
|
||||
avoid using a relative path, as &repmgr; stores the configuration file location
|
||||
in the repmgr metadata for use when &repmgr; is executed remotely (e.g. during
|
||||
<xref linkend="repmgr-standby-switchover">). &repmgr; will attempt to convert the
|
||||
a relative path into an absolute one, but this may not be the same as the path you
|
||||
would explicitly provide (e.g. <filename>./repmgr.conf</filename> might be converted
|
||||
to <filename>/path/to/./repmgr.conf</filename>, whereas you'd normally write
|
||||
<filename>/path/to/repmgr.conf</filename>).
|
||||
</para>
|
||||
</note>
|
||||
</refsect1>
|
||||
|
||||
@@ -75,10 +84,18 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-primary-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>primary_register</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
Following <link linkend="event-notifications">event notifications</link> will be generated:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>cluster_created</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>primary_register</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-primary-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>primary_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -49,7 +49,7 @@
|
||||
not be copied by default. &repmgr; can copy these files, either to the same
|
||||
location on the standby server (provided appropriate directory and file permissions
|
||||
are available), or into the standby's data directory. This requires passwordless
|
||||
SSH access to the primary server. Add the option <literal>--copy-external-config-files</literal>
|
||||
SSH access to the primary server. Add the option <option>--copy-external-config-files</option>
|
||||
to the <command>repmgr standby clone</command> command; by default files will be copied to
|
||||
the same path as on the upstream server. Note that the user executing <command>repmgr</command>
|
||||
must have write access to those directories.
|
||||
@@ -59,18 +59,35 @@
|
||||
<literal>--copy-external-config-files=pgdata</literal>, but note that
|
||||
any include directives in the copied files may need to be updated.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
When executing <command>repmgr standby clone</command> with the
|
||||
<option>--copy-external-config-files</option> aand <option>--dry-run</option>
|
||||
options, &repmgr; will check the SSH connection to the source node, but
|
||||
will not verify whether the files can actually be copied.
|
||||
</para>
|
||||
<para>
|
||||
During the actual clone operation, a check will be made before the database itself
|
||||
is cloned to determine whether the files can actually be copied; if any problems are
|
||||
encountered, the clone operation will be aborted, enabling the user to fix
|
||||
any issues before retrying the clone operation.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<tip>
|
||||
<simpara>
|
||||
For reliable configuration file management we recommend using a
|
||||
configuration management tool such as Ansible, Chef, Puppet or Salt.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-standby-clone-recovery-conf">
|
||||
<indexterm>
|
||||
<primary>recovery.conf</primary>
|
||||
<secondary>customising with "repmgr standby clone"</secondary>
|
||||
<secondary>customising with "repmgr standby clone"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Customising recovery.conf</title>
|
||||
@@ -153,7 +170,7 @@
|
||||
pg_basebackup_options='--xlog-method=fetch'</programlisting>
|
||||
|
||||
and ensure that <literal>wal_keep_segments</literal> is set to an appropriately high value.
|
||||
See the <ulink url="https://www.postgresql.org/docs/current/static/app-pgbasebackup.html">
|
||||
See the <ulink url="https://www.postgresql.org/docs/current/app-pgbasebackup.html">
|
||||
pg_basebackup</ulink> documentation for details.
|
||||
</para>
|
||||
|
||||
@@ -177,10 +194,11 @@
|
||||
<title>Using a standby cloned by another method</title>
|
||||
<para>
|
||||
&repmgr; supports standbys cloned by another method (e.g. using <application>barman</application>'s
|
||||
<command><ulink url="http://docs.pgbarman.org/release/2.4/#recover">barman recover</ulink></command> command).
|
||||
<command><ulink url="http://docs.pgbarman.org/release/2.5/#recover">barman recover</ulink></command> command).
|
||||
</para>
|
||||
<para>
|
||||
To integrate the standby as a &repmgr; node, ensure the <filename>repmgr.conf</filename>
|
||||
To integrate the standby as a &repmgr; node, once the standby has been cloned,
|
||||
ensure the <filename>repmgr.conf</filename>
|
||||
file is created for the node, and that it has been registered using
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register</link></command>.
|
||||
Then execute the command <command>repmgr standby clone --recovery-conf-only</command>.
|
||||
@@ -333,7 +351,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-clone-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_clone</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -9,23 +9,35 @@
|
||||
|
||||
<refnamediv>
|
||||
<refname>repmgr standby follow</refname>
|
||||
<refpurpose>attach a standby to a new primary</refpurpose>
|
||||
<refpurpose>attach a running standby to a new upstream node</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
|
||||
<para>
|
||||
Attaches the standby to a new primary. This command requires a valid
|
||||
Attaches the standby ("follow candidate") to a new upstream node
|
||||
("follow target"). Typically this will be the primary, but this
|
||||
command can also be used to attach the standby to another standby.
|
||||
</para>
|
||||
<para>
|
||||
This command requires a valid
|
||||
<filename>repmgr.conf</filename> file for the standby, either specified
|
||||
explicitly with <literal>-f/--config-file</literal> or located in a
|
||||
default location; no additional arguments are required.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
By default &repmgr; will attempt to attach the standby to the current primary.
|
||||
If <option>--upstream-node-id</option> is provided, &repmgr; will attempt
|
||||
to attach the standby to the specified node, which can be another standby.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This command will force a restart of the standby server, which must be
|
||||
running. It can only be used to attach an active standby to the current primary node
|
||||
(and not to another standby).
|
||||
running.
|
||||
</para>
|
||||
|
||||
<tip>
|
||||
<para>
|
||||
To re-add an inactive node to the replication cluster, use
|
||||
@@ -36,9 +48,22 @@
|
||||
<para>
|
||||
<command>repmgr standby follow</command> will wait up to
|
||||
<varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
|
||||
to verify the standby has actually connected to the new primary.
|
||||
to verify the standby has actually connected to the new upstream node.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <option>recovery_min_apply_delay</option> is set for the standby, it
|
||||
will not attach to the new upstream node until it has replayed available
|
||||
WAL.
|
||||
</para>
|
||||
<para>
|
||||
Conversely, if the standby is attached to an upstream standby
|
||||
which has <option>recovery_min_apply_delay</option> set, the upstream
|
||||
standby's replay state may actually be behind that of its new downstream node.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
@@ -65,19 +90,46 @@
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Check prerequisites but don't actually follow a new standby.
|
||||
Check prerequisites but don't actually follow a new upstream node.
|
||||
</para>
|
||||
<para>
|
||||
This will also verify whether the standby is capable of following the new upstream node.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
This does not guarantee the standby can follow the primary; in
|
||||
particular, whether the primary and standby timelines have diverged,
|
||||
can currently only be determined by actually attempting to
|
||||
attach the standby to the primary.
|
||||
If a standby was turned into a primary by removing <filename>recovery.conf</filename>
|
||||
(<application>PostgreSQL 12</application> and later: <filename>standby.signal</filename>),
|
||||
&repmgr; will <emphasis>not</emphasis> be able to determine whether that primary's timeline
|
||||
has diverged from the timeline of the standby ("follow candidate").
|
||||
</para>
|
||||
<para>
|
||||
We recommend always to use <link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>
|
||||
to promote a standby to primary, as this will ensure that the new primary
|
||||
will perform a timeline switch (making it practical to check for timeline divergence)
|
||||
and also that &repmgr; metadata is updated correctly.
|
||||
</para>
|
||||
</important>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--upstream-node-id</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Node ID of the new upstream node ("follow target").
|
||||
</para>
|
||||
<para>
|
||||
If not provided, &repmgr; will attempt to follow the current primary node.
|
||||
</para>
|
||||
<para>
|
||||
Note that when using <application>repmgrd</application>, <option>--upstream-node-id</option>
|
||||
should always be configured;
|
||||
see <link linkend="repmgrd-automatic-failover-configuration">Automatic failover configuration</link>
|
||||
for details.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-w</option></term>
|
||||
<term><option>--wait</option></term>
|
||||
@@ -95,12 +147,103 @@
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Execution</title>
|
||||
|
||||
<para>
|
||||
Execute with the <literal>--dry-run</literal> option to test the follow operation as
|
||||
far as possible, without actually changing the status of the node.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that &repmgr; will first attempt to determine whether the standby
|
||||
("follow candidate") is capable of following the
|
||||
new upstream node ("follow target").
|
||||
</para>
|
||||
<para>
|
||||
If, for example, the new upstream node has diverged from this node's timeline,
|
||||
for example if the new upstream node was promoted to primary while this node
|
||||
was still attached to the original primary, it will <emphasis>not</emphasis>
|
||||
be possible to follow the new upstream node, and &repmgr; will emit an error
|
||||
message like this:
|
||||
<programlisting>
|
||||
ERROR: this node cannot attach to follow target node 3
|
||||
DETAIL: follow target server's timeline 2 forked off current database system timeline 1 before current recovery point 0/6108880</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
In this case, it may be possible to have this node follow the new upstream
|
||||
using <command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>
|
||||
with the <option>--force-rewind</option> to execute <command>pg_rewind</command>.
|
||||
This does mean that transactions which exist on this node, but not the new upstream,
|
||||
will be lost.
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
One of the following exit codes will be emitted by <command>repmgr standby follow</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The follow operation succeeded; or if <option>--dry-run</option> was provided,
|
||||
no issues were detected which would prevent the follow operation.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_BAD_CONFIG (1)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
A configuration issue was detected which prevented &repmgr; from
|
||||
continuing with the follow operation.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_NO_RESTART (4)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The node could not be restarted.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_DB_CONN (6)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr; was unable to establish a database connection to one of the nodes.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_FOLLOW_FAIL (23)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr; was unable to complete the follow command.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-standby-follow-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
</para>
|
||||
<para>
|
||||
If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the primary
|
||||
If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the node
|
||||
being followed, <literal>%c</literal> with its <literal>conninfo</literal> string, and
|
||||
<literal>%a</literal> with its node name.
|
||||
</para>
|
||||
@@ -113,4 +256,3 @@
|
||||
</para>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
||||
|
||||
@@ -33,8 +33,26 @@
|
||||
Both values can be defined in <filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If WAL replay is paused on the standby, and not all WAL files on the standby have been
|
||||
replayed, &repmgr; will not attempt to promote it.
|
||||
</para>
|
||||
<para>
|
||||
This is because if WAL replay is paused, PostgreSQL itself will not react to a promote command
|
||||
until WAL replay is resumed and all pending WAL has been replayed. This means
|
||||
attempting to promote PostgreSQL in this state will leave PostgreSQL in a condition where the
|
||||
promotion may occur at a unpredictable point in the future.
|
||||
</para>
|
||||
<para>
|
||||
Note that if the standby is in archive recovery, &repmgr; will not be able to determine
|
||||
if more WAL is pending replay, and will abort the promotion attempt if WAL replay is paused.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Example</title>
|
||||
<para>
|
||||
@@ -51,6 +69,127 @@
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Check if this node can be promoted, but don't carry out the promotion
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Configuration file settings</title>
|
||||
<para>
|
||||
The following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||
promote operation:
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<indexterm>
|
||||
<primary>promote_check_interval</primary>
|
||||
<secondary>with "repmgr standby promote "</secondary>
|
||||
</indexterm>
|
||||
<simpara>
|
||||
<literal>promote_check_interval</literal>:
|
||||
interval (in seconds, default: 1 second) to wait between each check
|
||||
to determine whether the standby has been promoted.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<indexterm>
|
||||
<primary>promote_check_timeout</primary>
|
||||
<secondary>with "repmgr standby promote "</secondary>
|
||||
</indexterm>
|
||||
<simpara>
|
||||
<literal>promote_check_timeout</literal>:
|
||||
time (in seconds, default: 60 seconds) to wait to verify that the standby has been promoted
|
||||
before exiting with <literal>ERR_PROMOTION_FAIL</literal>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
Following exit codes can be emitted by <command>repmgr standby promote</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The standby was successfully promoted to primary.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_DB_CONN (6)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
&repmgr; was unable to connect to the local PostgreSQL node.
|
||||
</para>
|
||||
<para>
|
||||
PostgreSQL must be running before the node can be promoted.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_PROMOTION_FAIL (8)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The node could not be promoted to primary for one of the following
|
||||
reasons:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
there is an existing primary node in the replication cluster
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
the node is not a standby
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
WAL replay is paused on the node
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
execution of the PostgreSQL promote command failed
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1 id="repmgr-standby-promote-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_promote</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -159,7 +159,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_register</literal> <link linkend="event-notifications">event notification</link>
|
||||
|
||||
@@ -35,6 +35,10 @@
|
||||
&repmgr; will attempt to check for potential issues but cannot guarantee
|
||||
a successful switchover.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; will refuse to perform the switchover if an exclusive backup is running on
|
||||
the current primary, or if WAL replay is paused on the standby.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
For more details on performing a switchover, including preparation and configuration,
|
||||
@@ -43,11 +47,14 @@
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<application>repmgrd</application> should not be active on any nodes while a switchover is being
|
||||
executed. This restriction may be lifted in a later version.
|
||||
From <link linkend="release-4.2">repmgr 4.2</link>, &repmgr; will instruct any running
|
||||
<application>repmgrd</application> instances to pause operations while the switchover
|
||||
is being carried out, to prevent <application>repmgrd</application> from
|
||||
unintentionally promoting a node. For more details, see <xref linkend="repmgrd-pausing">.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; will not perform the switchover if an exclusive backup is running on the current primary.
|
||||
Users of &repmgr; versions prior to 4.2 should ensure that <application>repmgrd</application>
|
||||
is not running on any nodes while a switchover is being executed.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
@@ -61,8 +68,9 @@
|
||||
<term><option>--always-promote</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Promote standby to primary, even if it is behind original primary
|
||||
(original primary will be shut down in any case).
|
||||
Promote standby to primary, even if it is behind or has diverged
|
||||
from the original primary. The original primary will be shut down in any case,
|
||||
and will need to be manually reintegrated into the replication cluster.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -122,7 +130,23 @@
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--repmgrd-no-pause</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Don't pause <application>repmgrd</application> while executing a switchover.
|
||||
</para>
|
||||
<para>
|
||||
This option should not be used unless you take steps by other means
|
||||
to ensure <application>repmgrd</application> is paused or not
|
||||
running on all nodes.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<term><option>--siblings-follow</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
@@ -138,41 +162,119 @@
|
||||
<title>Configuration file settings</title>
|
||||
|
||||
<para>
|
||||
Note that following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||
The following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||
switchover operation:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_attempts</literal>: number of times to check the original primary
|
||||
for a clean shutdown after executing the shutdown command, before aborting
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
|
||||
primary for a clean shutdown after executing the shutdown command (up to a maximum
|
||||
of <literal>reconnect_attempts</literal> tries)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>replication_lag_critical</literal>:
|
||||
if replication lag (in seconds) on the standby exceeds this value, the
|
||||
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
||||
is provided)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>standby_reconnect_timeout</literal>:
|
||||
number of seconds to attempt to wait for the demoted primary
|
||||
to reconnect to the promoted primary (default: 60 seconds)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>replication_lag_critical</primary>
|
||||
<secondary>with "repmgr standby switchover"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>replication_lag_critical</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If replication lag (in seconds) on the standby exceeds this value, the
|
||||
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
||||
is provided)
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>shutdown_check_timeout</primary>
|
||||
<secondary>with "repmgr standby switchover"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>shutdown_check_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The maximum number of seconds to wait for the
|
||||
demotion candidate (current primary) to shut down, before aborting the switchover.
|
||||
</para>
|
||||
<para>
|
||||
Note that this parameter is set on the node where <command>repmgr standby switchover</command>
|
||||
is executed (promotion candidate); setting it on the demotion candidate (former primary) will
|
||||
have no effect.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
In versions prior to <link linkend="release-4.2">&repmgr; 4.2</link>, <command>repmgr standby switchover</command> would
|
||||
use the values defined in <literal>reconnect_attempts</literal> and <literal>reconnect_interval</literal>
|
||||
to determine the timeout for demotion candidate shutdown.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>wal_receive_check_timeout</primary>
|
||||
<secondary>with "repmgr standby switchover"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>wal_receive_check_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
After the primary has shut down, the maximum number of seconds to wait for the
|
||||
walreceiver on the standby to flush WAL to disk before comparing WAL receive location
|
||||
with the primary's shut down location.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>standby_reconnect_timeout</primary>
|
||||
<secondary>with "repmgr standby switchover"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>standby_reconnect_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The maximum number of seconds to attempt to wait for the demotion candidate (former primary)
|
||||
to reconnect to the promoted primary (default: 60 seconds)
|
||||
</para>
|
||||
<para>
|
||||
Note that this parameter is set on the node where <command>repmgr standby switchover</command>
|
||||
is executed (promotion candidate); setting it on the demotion candidate (former primary) will
|
||||
have no effect.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>node_rejoin_timeout</primary>
|
||||
<secondary>with "repmgr standby switchover"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<term><option>node_rejoin_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
maximum number of seconds to attempt to wait for the demotion candidate (former primary)
|
||||
to reconnect to the promoted primary (default: 60 seconds)
|
||||
</para>
|
||||
<para>
|
||||
Note that this parameter is set on the the demotion candidate (former primary);
|
||||
setting it on the node where <command>repmgr standby switchover</command> is
|
||||
executed will have no effect.
|
||||
</para>
|
||||
<para>
|
||||
However, this value <emphasis>must</emphasis> be less than <option>standby_reconnect_timeout</option> on the
|
||||
promotion candidate (the node where <command>repmgr standby switchover</command> is executed).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -183,12 +285,7 @@
|
||||
Execute with the <literal>--dry-run</literal> option to test the switchover as far as
|
||||
possible without actually changing the status of either node.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
<application>repmgrd</application> must be shut down on all nodes while a switchover is being
|
||||
executed. This restriction will be removed in a future &repmgr; version.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<para>
|
||||
External database connections, e.g. from an application, should not be permitted while
|
||||
the switchover is taking place. In particular, active transactions on the primary
|
||||
@@ -196,7 +293,7 @@
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-switchover-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
<literal>standby_switchover</literal> and <literal>standby_promote</literal>
|
||||
@@ -213,7 +310,7 @@
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
Following exit codes can be emitted by <command>repmgr standby switchover</command>:
|
||||
One of the following exit codes will be emitted by <command>repmgr standby switchover</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
@@ -221,7 +318,8 @@
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The switchover completed successfully.
|
||||
The switchover completed successfully; or if <option>--dry-run</option> was provided,
|
||||
no issues were detected which would prevent the switchover operation.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -252,7 +350,10 @@
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
For more details see the section <xref linkend="performing-switchover">.
|
||||
<xref linkend="repmgr-standby-follow">, <xref linkend="repmgr-node-rejoin">
|
||||
</para>
|
||||
<para>
|
||||
For more details on performing a switchover operation, see the section <xref linkend="performing-switchover">.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -23,14 +23,27 @@
|
||||
use of the witness server with <application>repmgrd</application>.
|
||||
</para>
|
||||
<para>
|
||||
When executing <command>repmgr witness register</command>, connection information
|
||||
for the cluster primary server must also be provided. &repmgr; will automatically
|
||||
use the <varname>user</varname> and <varname>dbname</varname> values defined
|
||||
in the <varname>conninfo</varname> string defined in the witness node's
|
||||
<filename>repmgr.conf</filename>, if these are not explicitly provided.
|
||||
When executing <command>repmgr witness register</command>, database connection
|
||||
information for the cluster primary server must also be provided.
|
||||
</para>
|
||||
<para>
|
||||
Execute with the <literal>--dry-run</literal> option to check what would happen
|
||||
In most cases it's only necessary to provide the primary's hostname with
|
||||
the <option>-h</option>/<option>--host</option> option; &repmgr; will
|
||||
automatically use the <varname>user</varname> and <varname>dbname</varname>
|
||||
values defined in the <varname>conninfo</varname> string defined in the
|
||||
witness node's <filename>repmgr.conf</filename>, unless these are explicitly
|
||||
provided as command line options.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
The primary server must be registered with <command><link linkend="repmgr-primary-register">repmgr primary register</link></command> before the witness
|
||||
server can be registered.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Execute with the <option>--dry-run</option> option to check what would happen
|
||||
without actually registering the witness server.
|
||||
</para>
|
||||
</refsect1>
|
||||
@@ -50,7 +63,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-witness-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>witness_register</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -92,7 +92,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-witness-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>witness_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -24,26 +24,26 @@
|
||||
<abstract>
|
||||
<para>
|
||||
This is the official documentation of &repmgr; &repmgrversion; for
|
||||
use with PostgreSQL 9.3 - PostgreSQL 10.
|
||||
use with PostgreSQL 9.3 - PostgreSQL 11.
|
||||
It describes the functionality supported by the current version of &repmgr;.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
&repmgr; was developed by
|
||||
&repmgr; is developed by
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
||||
along with contributions from other individuals and companies.
|
||||
Contributions from the community are appreciated and welcome - get
|
||||
in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</>
|
||||
or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</>.
|
||||
in touch via <ulink url="https://github.com/2ndQuadrant/repmgr">github</ulink>
|
||||
or <ulink url="https://groups.google.com/group/repmgr">the mailing list/forum</ulink>.
|
||||
Multiple 2ndQuadrant customers contribute funding
|
||||
to make repmgr development possible.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
2ndQuadrant, a Platinum sponsor of the PostgreSQL project,
|
||||
continues to develop repmgr to meet internal needs and those of customers.
|
||||
Other companies as well as individual developers
|
||||
are welcome to participate in the efforts.
|
||||
&repmgr; is fully supported by 2ndQuadrant's
|
||||
<ulink url="https://www.2ndquadrant.com/en/support/support-postgresql/">24/7 Production Support</ulink>.
|
||||
2ndQuadrant, a Major Sponsor of the PostgreSQL project, continues to develop and maintain &repmgr;.
|
||||
Other companies as well as individual developers are welcome to participate in the efforts.
|
||||
</para>
|
||||
</abstract>
|
||||
|
||||
@@ -73,21 +73,16 @@
|
||||
&promoting-standby;
|
||||
&follow-new-primary;
|
||||
&switchover;
|
||||
&configuring-witness-server;
|
||||
&event-notifications;
|
||||
&upgrading-repmgr;
|
||||
</part>
|
||||
|
||||
<part id="using-repmgrd">
|
||||
<title>Using repmgrd</title>
|
||||
&repmgrd-overview;
|
||||
&repmgrd-automatic-failover;
|
||||
&repmgrd-configuration;
|
||||
&repmgrd-demonstration;
|
||||
&repmgrd-cascading-replication;
|
||||
&repmgrd-network-split;
|
||||
&repmgrd-witness-server;
|
||||
&repmgrd-degraded-monitoring;
|
||||
&repmgrd-monitoring;
|
||||
&repmgrd-operation;
|
||||
&repmgrd-bdr;
|
||||
</part>
|
||||
|
||||
@@ -107,17 +102,24 @@
|
||||
&repmgr-node-status;
|
||||
&repmgr-node-check;
|
||||
&repmgr-node-rejoin;
|
||||
&repmgr-node-service;
|
||||
&repmgr-cluster-show;
|
||||
&repmgr-cluster-matrix;
|
||||
&repmgr-cluster-crosscheck;
|
||||
&repmgr-cluster-event;
|
||||
&repmgr-cluster-cleanup;
|
||||
&repmgr-daemon-status;
|
||||
&repmgr-daemon-start;
|
||||
&repmgr-daemon-stop;
|
||||
&repmgr-daemon-pause;
|
||||
&repmgr-daemon-unpause;
|
||||
</part>
|
||||
|
||||
&appendix-release-notes;
|
||||
&appendix-signatures;
|
||||
&appendix-faq;
|
||||
&appendix-packages;
|
||||
&appendix-support;
|
||||
|
||||
<![%include-index;[&bookindex;]]>
|
||||
<![%include-xslt-index;[<index id="bookindex"></index>]]>
|
||||
|
||||
@@ -13,5 +13,285 @@
|
||||
providing monitoring information about the state of each standby.
|
||||
</para>
|
||||
|
||||
<sect1 id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>witness server</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>witness server</primary>
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
<title>Using a witness server</title>
|
||||
<para>
|
||||
A <xref linkend="witness-server"> is a normal PostgreSQL instance which
|
||||
is not part of the streaming replication cluster; its purpose is, if a
|
||||
failover situation occurs, to provide proof that it is the primary server
|
||||
itself which is unavailable, rather than e.g. a network split between
|
||||
different physical locations.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A typical use case for a witness server is a two-node streaming replication
|
||||
setup, where the primary and standby are in different locations (data centres).
|
||||
By creating a witness server in the same location (data centre) as the primary,
|
||||
if the primary becomes unavailable it's possible for the standby to decide whether
|
||||
it can promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
witness or the primary server, it's likely there's a network-level interruption
|
||||
and it should not promote itself. If it can see the witness but not the primary,
|
||||
this proves there is no network interruption and the primary itself is unavailable,
|
||||
and it can therefore promote itself (and ideally take action to fence the
|
||||
former primary).
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Never</emphasis> install a witness server on the same physical host
|
||||
as another node in the replication cluster managed by &repmgr; - it's essential
|
||||
the witness is not affected in any way by failure of another node.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
For more complex replication scenarios,e.g. with multiple datacentres, it may
|
||||
be preferable to use location-based failover, which ensures that only nodes
|
||||
in the same location as the primary will ever be promotion candidates;
|
||||
see <xref linkend="repmgrd-network-split"> for more details.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
A witness server will only be useful if <application>repmgrd</application>
|
||||
is in use.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<sect2 id="creating-witness-server">
|
||||
<title>Creating a witness server</title>
|
||||
<para>
|
||||
To create a witness server, set up a normal PostgreSQL instance on a server
|
||||
in the same physical location as the cluster's primary server.
|
||||
</para>
|
||||
<para>
|
||||
This instance should <emphasis>not</emphasis> be on the same physical host as the primary server,
|
||||
as otherwise if the primary server fails due to hardware issues, the witness
|
||||
server will be lost too.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
&repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
|
||||
command, which would automatically create a PostgreSQL instance. However
|
||||
this often resulted in an unsatisfactory, hard-to-customise instance.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
The witness server should be configured in the same way as a normal
|
||||
&repmgr; node; see section <xref linkend="configuration">.
|
||||
</para>
|
||||
<para>
|
||||
Register the witness server with <xref linkend="repmgr-witness-register">.
|
||||
This will create the &repmgr; extension on the witness server, and make
|
||||
a copy of the &repmgr; metadata.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
As the witness server is not part of the replication cluster, further
|
||||
changes to the &repmgr; metadata will be synchronised by
|
||||
<application>repmgrd</application>.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
Once the witness server has been configured, <application>repmgrd</application>
|
||||
should be started.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>network splits</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>network splits</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Handling network splits with repmgrd</title>
|
||||
<para>
|
||||
A common pattern for replication cluster setups is to spread servers over
|
||||
more than one datacentre. This can provide benefits such as geographically-
|
||||
distributed read replicas and DR (disaster recovery capability). However
|
||||
this also means there is a risk of disconnection at network level between
|
||||
datacentre locations, which would result in a split-brain scenario if
|
||||
servers in a secondary data centre were no longer able to see the primary
|
||||
in the main data centre and promoted a standby among themselves.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; enables provision of "<xref linkend="witness-server">" to
|
||||
artificially create a quorum of servers in a particular location, ensuring
|
||||
that nodes in another location will not elect a new primary if they
|
||||
are unable to see the majority of nodes. However this approach does not
|
||||
scale well, particularly with more complex replication setups, e.g.
|
||||
where the majority of nodes are located outside of the primary datacentre.
|
||||
It also means the <literal>witness</literal> node needs to be managed as an
|
||||
extra PostgreSQL instance outside of the main replication cluster, which
|
||||
adds administrative and programming complexity.
|
||||
</para>
|
||||
<para>
|
||||
<literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
|
||||
each node is associated with an arbitrary location string (default is
|
||||
<literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
|
||||
<programlisting>
|
||||
node_id=1
|
||||
node_name=node1
|
||||
conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
|
||||
data_directory='/var/lib/postgresql/data'
|
||||
location='dc1'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
||||
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
||||
will assume a network interruption and not promote any node in any
|
||||
other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
|
||||
mode until a primary becomes visible).
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-standby-disconnection-on-failover" xreflabel="Standby disconnection on failover">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>standby disconnection on failover</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>standby disconnection on failover</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Standby disconnection on failover</title>
|
||||
<para>
|
||||
If <option>standby_disconnect_on_failover</option> is set to <literal>true</literal> in
|
||||
<filename>repmgr.conf</filename>, in a failover situation <application>repmgrd</application> will forcibly disconnect
|
||||
the local node's WAL receiver before making a failover decision.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<option>standby_disconnect_on_failover</option> is available from PostgreSQL 9.5 and later.
|
||||
Additionally this requires that the <literal>repmgr</literal> database user is a superuser.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
By doing this, it's possible to ensure that, at the point the failover decision is made, no nodes
|
||||
are receiving data from the primary and their LSN location will be static.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
<option>standby_disconnect_on_failover</option> <emphasis>must</emphasis> be set to the same value on
|
||||
all nodes.
|
||||
</para>
|
||||
</important>
|
||||
<para>
|
||||
Note that when using <option>standby_disconnect_on_failover</option> there will be a delay of 5 seconds
|
||||
plus however many seconds it takes to confirm the WAL receiver is disconnected before
|
||||
<application>repmgrd</application> proceeds with the failover decision.
|
||||
</para>
|
||||
<para>
|
||||
Following the failover operation, no matter what the outcome, each node will reconnect its WAL receiver.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-failover-validation" xreflabel="Failover validation">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>failover validation</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>failover validation</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Failover validation</title>
|
||||
<para>
|
||||
From <link linkend="release-4.3">repmgr 4.3</link>, &repmgr; makes it possible to provide a script
|
||||
to <application>repmgrd</application> which, in a failover situation,
|
||||
will be executed by the promotion candidate (the node which has been selected
|
||||
to be the new primary) to confirm whether the node should actually be promoted.
|
||||
</para>
|
||||
<para>
|
||||
To use this, <option>failover_validation_command</option> in <filename>repmgr.conf</filename>
|
||||
to a script executable by the <literal>postgres</literal> system user, e.g.:
|
||||
<programlisting>
|
||||
failover_validation_command=/path/to/script.sh %n %a</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The <literal>%n</literal> parameter will be replaced with the node ID, and the
|
||||
<literal>%a</literal> parameter will be replaced by the node name when the script is executed.
|
||||
</para>
|
||||
<para>
|
||||
This script must return an exit code of <literal>0</literal> to indicate the node should promote itself.
|
||||
Any other value will result in the promotion being aborted and the election rerun.
|
||||
There is a pause of <option>election_rerun_interval</option> seconds before the election is rerun.
|
||||
</para>
|
||||
<para>
|
||||
Sample <application>repmgrd</application> log file output during which the failover validation
|
||||
script rejects the proposed promotion candidate:
|
||||
<programlisting>
|
||||
[2019-03-13 21:01:30] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
|
||||
[2019-03-13 21:01:30] [NOTICE] promotion candidate is "node2" (ID: 2)
|
||||
[2019-03-13 21:01:30] [NOTICE] executing "failover_validation_command"
|
||||
[2019-03-13 21:01:30] [DETAIL] /usr/local/bin/failover-validation.sh 2
|
||||
[2019-03-13 21:01:30] [INFO] output returned by failover validation command:
|
||||
Node ID: 2
|
||||
|
||||
[2019-03-13 21:01:30] [NOTICE] failover validation command returned a non-zero value: "1"
|
||||
[2019-03-13 21:01:30] [NOTICE] promotion candidate election will be rerun
|
||||
[2019-03-13 21:01:30] [INFO] 1 followers to notify
|
||||
[2019-03-13 21:01:30] [NOTICE] notifying node "node3" (node ID: 3) to rerun promotion candidate selection
|
||||
INFO: node 3 received notification to rerun promotion candidate election
|
||||
[2019-03-13 21:01:30] [NOTICE] rerunning election after 15 seconds ("election_rerun_interval")</programlisting>
|
||||
</para>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="cascading-replication" xreflabel="Cascading replication">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>cascading replication</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>cascading replication</primary>
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd and cascading replication</title>
|
||||
<para>
|
||||
Cascading replication - where a standby can connect to an upstream node and not
|
||||
the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
|
||||
<application>repmgrd</application> support cascading replication by keeping track of the relationship
|
||||
between standby servers - each node record is stored with the node id of its
|
||||
upstream ("parent") server (except of course the primary server).
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation where the primary node fails and a top-level standby
|
||||
is promoted, a standby connected to another standby will not be affected
|
||||
and continue working as normal (even if the upstream standby it's connected
|
||||
to becomes the primary node). If however the node's direct upstream fails,
|
||||
the "cascaded standby" will attempt to reconnect to that node's parent
|
||||
(unless <varname>failover</varname> is set to <literal>manual</literal> in
|
||||
<filename>repmgr.conf</filename>).
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
</chapter>
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
Due to the nature of BDR, it's only safe to use this solution for
|
||||
Due to the nature of BDR 1.x/2.x, it's only safe to use this solution for
|
||||
a two-node scenario. Introducing additional nodes will create an inherent
|
||||
risk of node desynchronisation if a node goes down without being cleanly
|
||||
removed from the cluster.
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
<chapter id="repmgrd-cascading-replication">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>cascading replication</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd and cascading replication</title>
|
||||
<para>
|
||||
Cascading replication - where a standby can connect to an upstream node and not
|
||||
the primary server itself - was introduced in PostgreSQL 9.2. &repmgr; and
|
||||
<application>repmgrd</application> support cascading replication by keeping track of the relationship
|
||||
between standby servers - each node record is stored with the node id of its
|
||||
upstream ("parent") server (except of course the primary server).
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation where the primary node fails and a top-level standby
|
||||
is promoted, a standby connected to another standby will not be affected
|
||||
and continue working as normal (even if the upstream standby it's connected
|
||||
to becomes the primary node). If however the node's direct upstream fails,
|
||||
the "cascaded standby" will attempt to reconnect to that node's parent.
|
||||
</para>
|
||||
</chapter>
|
||||
@@ -5,7 +5,7 @@
|
||||
<secondary>configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd configuration</title>
|
||||
<title>repmgrd setup and configuration</title>
|
||||
|
||||
<para>
|
||||
<application>repmgrd</application> is a daemon which runs on each PostgreSQL node,
|
||||
@@ -20,7 +20,7 @@
|
||||
</para>
|
||||
|
||||
<sect1 id="repmgrd-basic-configuration">
|
||||
<title>repmgrd basic configuration</title>
|
||||
<title>repmgrd configuration</title>
|
||||
|
||||
<para>
|
||||
To use <application>repmgrd</application>, its associated function library <emphasis>must</emphasis> be
|
||||
@@ -31,88 +31,417 @@
|
||||
</para>
|
||||
<para>
|
||||
Changing this setting requires a restart of PostgreSQL; for more details see
|
||||
the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||
the <ulink url="https://www.postgresql.org/docs/current/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To apply configuration file changes to a running <application>repmgrd</application>
|
||||
daemon, execute the operating system's r<application>repmgrd</application> service reload command
|
||||
(see <xref linkend="appendix-packages"> for examples),
|
||||
or for instances which were manually started, execute <command>kill -HUP</command>, e.g.
|
||||
<command>kill -HUP `cat /tmp/repmgrd.pid`</command>.
|
||||
The following configuraton options apply to <application>repmgrd</application> in all circumstances:
|
||||
</para>
|
||||
<note>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>monitor_interval_secs</primary>
|
||||
</indexterm>
|
||||
<term><option>monitor_interval_secs</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The interval (in seconds, default: <literal>2</literal>) to check the availability of the upstream node.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry id="connection-check-type">
|
||||
|
||||
<indexterm>
|
||||
<primary>connection_check_type</primary>
|
||||
</indexterm>
|
||||
<term><option>connection_check_type</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The option <option>connection_check_type</option> is used to select the method
|
||||
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||
</para>
|
||||
<para>
|
||||
Possible values are:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>ping</literal> (default) - uses <command>PQping()</command> to
|
||||
determine server availability
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>connection</literal> - determines server availability
|
||||
by attempt ingto make a new connection to the upstream node
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>query</literal> - determines server availability
|
||||
by executing an SQL statement on the node via the existing connection
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>reconnect_attempts</primary>
|
||||
</indexterm>
|
||||
<term><option>reconnect_attempts</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The number of attempts (default: <literal>6</literal>) will be made to reconnect to an unreachable
|
||||
upstream node before initiating a failover.
|
||||
</para>
|
||||
<para>
|
||||
There will be an interval of <option>reconnect_interval</option> seconds between each reconnection
|
||||
attempt.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>reconnect_interval</primary>
|
||||
</indexterm>
|
||||
<term><option>reconnect_interval</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Interval (in seconds, default: <literal>10</literal>) between attempts to reconnect to an unreachable
|
||||
upstream node.
|
||||
</para>
|
||||
<para>
|
||||
The number of reconnection attempts is defined by the parameter <option>reconnect_attempts</option>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>degraded_monitoring_timeout</primary>
|
||||
</indexterm>
|
||||
<term><option>degraded_monitoring_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Interval (in seconds) after which <application>repmgrd</application> will terminate if
|
||||
either of the servers (local node and or upstream node) being monitored is no longer available
|
||||
(<link linkend="repmgrd-degraded-monitoring">degraded monitoring mode</link>).
|
||||
</para>
|
||||
<para>
|
||||
<literal>-1</literal> (default) disables this timeout completely.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
Check the <application>repmgrd</application> log to see what changes were
|
||||
applied, or if any issues were encountered when reloading the configuration.
|
||||
See also <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename> for an annotated sample configuration file.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
Note that only a subset of configuration file parameters can be changed on a
|
||||
running <application>repmgrd</application> daemon.
|
||||
</para>
|
||||
|
||||
|
||||
<sect2 id="repmgrd-automatic-failover-configuration">
|
||||
<title>automatic failover configuration</title>
|
||||
<title>Required configuration for automatic failover</title>
|
||||
|
||||
<para>
|
||||
If using automatic failover, the following <application>repmgrd</application> options *must* be set in
|
||||
<filename>repmgr.conf</filename> :
|
||||
The following <application>repmgrd</application> options <emphasis>must</emphasis> be set in
|
||||
<filename>repmgr.conf</filename>:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><option>failover</option></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><option>promote_command</option></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><option>follow_command</option></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
|
||||
<para>
|
||||
Example:
|
||||
<programlisting>
|
||||
failover=automatic
|
||||
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'
|
||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Adjust file paths as appropriate; we recomment specifying the full path to the &repmgr; binary.
|
||||
Details of each option are as follows:
|
||||
</para>
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
See <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename>
|
||||
for further <application>repmgrd</application>-specific settings.
|
||||
</para>
|
||||
<para>
|
||||
When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure
|
||||
of the current primary, <application>repmgrd</application> will execute one of:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>promote_command</varname> (if the current server is to become the new primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>follow_command</varname> (if the current server needs to follow another server which has
|
||||
become the new primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<note>
|
||||
<para>
|
||||
These commands can be any valid shell script which results in one of these
|
||||
two actions happening, but if &repmgr;'s <command>standby follow</command> or
|
||||
<command>standby promote</command>
|
||||
commands are not executed (either directly as shown here, or from a script which
|
||||
performs other actions), the &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
</note>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>failover</primary>
|
||||
</indexterm>
|
||||
<term><option>failover</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<option>failover</option> can be one of <literal>automatic</literal> or <literal>manual</literal>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
If <option>failover</option> is set to <literal>manual</literal>, <application>repmgrd</application>
|
||||
will not take any action if a failover situation is detected, and the node may need to
|
||||
be modified manually (e.g. by executing <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>).
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>promote_command</primary>
|
||||
</indexterm>
|
||||
<term><option>promote_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The program or script defined in <option>promote_command</option> will be executed
|
||||
in a failover situation when <application>repmgrd</application> determines that
|
||||
the current node is to become the new primary node.
|
||||
</para>
|
||||
<para>
|
||||
Normally <option>promote_command</option> is set as &repmgr;'s
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command> command.
|
||||
</para>
|
||||
<para>
|
||||
It is also possible to provide e.g. a shell script to e.g. perform user-defined tasks
|
||||
before promoting the current node. In this case the script <emphasis>must</emphasis>
|
||||
at some point execute <command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
||||
to promote the node; if this is not done, &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
<para>
|
||||
Example:
|
||||
<programlisting>
|
||||
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||
specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>follow_command</primary>
|
||||
</indexterm>
|
||||
<term><option>follow_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The program or script defined in <option>follow_command</option> will be executed
|
||||
in a failover situation when <application>repmgrd</application> determines that
|
||||
the current node is to follow the new primary node.
|
||||
</para>
|
||||
<para>
|
||||
Normally <option>follow_command</option> is set as &repmgr;'s
|
||||
<command><link linkend="repmgr-standby-follow">repmgr standby promote</link></command> command.
|
||||
</para>
|
||||
<para>
|
||||
The <option>follow_command</option> parameter
|
||||
should provide the <literal>--upstream-node-id=%n</literal>
|
||||
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
||||
<application>repmgrd</application> with the ID of the new primary node. If this is not provided,
|
||||
<command>repmgr standby follow</command> will attempt to determine the new primary by itself, but if the
|
||||
original primary comes back online after the new primary is promoted, there is a risk that
|
||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||
the original primary.
|
||||
</para>
|
||||
<para>
|
||||
It is also possible to provide e.g. a shell script to e.g. perform user-defined tasks
|
||||
before promoting the current node. In this case the script <emphasis>must</emphasis>
|
||||
at some point execute <command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>
|
||||
to promote the node; if this is not done, &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
<para>
|
||||
Example:
|
||||
<programlisting>
|
||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||
specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
|
||||
<para>
|
||||
The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal>
|
||||
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
||||
<application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr;
|
||||
will attempt to determine the new primary by itself, but if the
|
||||
original primary comes back online after the new primary is promoted, there is a risk that
|
||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||
the original primary.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-service-configuration">
|
||||
<sect2 id="repmgrd-automatic-failover-configuration-optional">
|
||||
<title>Optional configuration for automatic failover</title>
|
||||
|
||||
<para>
|
||||
The following configuraton options can be use to fine-tune automatic failover:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>priority</primary>
|
||||
</indexterm>
|
||||
<term><option>priority</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Indicates a preferred priority (default: <literal>100</literal>) for promoting nodes;
|
||||
a value of zero prevents the node being promoted to primary.
|
||||
</para>
|
||||
<para>
|
||||
Note that the priority setting is only applied if two or more nodes are
|
||||
determined as promotion candidates; in that case the node with the
|
||||
higher priority is selected.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>failover_validation_command</primary>
|
||||
</indexterm>
|
||||
<term><option>failover_validation_command</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
User-defined script to execute for an external mechanism to validate the failover
|
||||
decision made by <application>repmgrd</application>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
This option <emphasis>must</emphasis> be identically configured
|
||||
on all nodes.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
One or both of the following parameter placeholders
|
||||
should be provided, which will be replaced by repmgrd with the appropriate
|
||||
value:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>%n</literal>: node ID</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>%a</literal>: node name</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>
|
||||
See also: <link linkend="repmgrd-failover-validation">Failover validation</link>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
|
||||
<indexterm>
|
||||
<primary>standby_disconnect_on_failover</primary>
|
||||
</indexterm>
|
||||
<term><option>standby_disconnect_on_failover</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
In a failover situation, disconnect the local node's WAL receiver.
|
||||
</para>
|
||||
<para>
|
||||
This option is available from PostgreSQL 9.5 and later.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
This option <emphasis>must</emphasis> be identically configured
|
||||
on all nodes.
|
||||
</para>
|
||||
<para>
|
||||
Additionally the &repmgr; user <emphasis>must</emphasis> be a superuser
|
||||
for this option.
|
||||
</para>
|
||||
<para>
|
||||
<application>repmgrd</application> will refuse to start if this option is set
|
||||
but either of these prerequisites is not met.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
See also: <link linkend="repmgrd-standby-disconnection-on-failover">Standby disconnection on failover</link>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
The following options can be used to further fine-tune failover behaviour.
|
||||
In practice it's unlikely these will need to be changed from their default
|
||||
values, but are available as configuration options should the need arise.
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>election_rerun_interval</primary>
|
||||
</indexterm>
|
||||
<term><option>election_rerun_interval</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If <option>failover_validation_command</option> is set, and the command returns
|
||||
an error, pause the specified amount of seconds (default: 15) before rerunning the election.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>sibling_nodes_disconnect_timeout</primary>
|
||||
</indexterm>
|
||||
<term><option>sibling_nodes_disconnect_timeout</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If <option>standby_disconnect_on_failover</option> is <literal>true</literal>, the
|
||||
maximum length of time (in seconds, default: <literal>30</literal>)
|
||||
to wait for other standbys to confirm they have disconnected their
|
||||
WAL receivers.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
|
||||
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="postgresql-service-configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>PostgreSQL service configuration</secondary>
|
||||
@@ -135,7 +464,43 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-monitoring-configuration">
|
||||
<sect2 id="repmgrd-service-configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>repmgrd service configuration</secondary>
|
||||
</indexterm>
|
||||
<title>repmgrd service configuration</title>
|
||||
<para>
|
||||
If you are intending to use the <link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link>
|
||||
and <link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link> commands, the following
|
||||
parameters <emphasis>must</emphasis> be set in <filename>repmgr.conf</filename>:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>repmgrd_service_start_command</varname></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>repmgrd_service_stop_command</varname></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
Example (for &repmgr; with PostgreSQL 11 on CentOS 7):
|
||||
<programlisting>
|
||||
repmgrd_service_start_command='sudo systemctl repmgr11 start'
|
||||
repmgrd_service_stop_command='sudo systemctl repmgr11 stop'
|
||||
</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
For more details see the reference page for each command.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2 id="repmgrd-monitoring-configuration" xreflabel="repmgrd monitoring configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring configuration</secondary>
|
||||
@@ -148,19 +513,245 @@
|
||||
in <filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
<para>
|
||||
The default monitoring interval is 2 seconds; this value can be explicitly set using:
|
||||
<programlisting>
|
||||
monitor_interval_secs=<seconds></programlisting>
|
||||
in <filename>repmgr.conf</filename>.
|
||||
Monitoring data is written at the interval defined by
|
||||
the option <option>monitor_interval_secs</option> (see above).
|
||||
</para>
|
||||
<para>
|
||||
For more details on monitoring, see <xref linkend="repmgrd-monitoring">.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-reloading-configuration"xreflabel="reloading repmgrd configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>applying configuration changes</secondary>
|
||||
</indexterm>
|
||||
<title>Applying configuration changes to repmgrd</title>
|
||||
<para>
|
||||
To apply configuration file changes to a running <application>repmgrd</application>
|
||||
daemon, execute the operating system's <application>repmgrd</application> service reload command
|
||||
(see <xref linkend="appendix-packages"> for examples),
|
||||
or for instances which were manually started, execute <command>kill -HUP</command>, e.g.
|
||||
<command>kill -HUP `cat /tmp/repmgrd.pid`</command>.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
Check the <application>repmgrd</application> log to see what changes were
|
||||
applied, or if any issues were encountered when reloading the configuration.
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
Note that only the following subset of configuration file parameters can be changed on a
|
||||
running <application>repmgrd</application> daemon:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>async_query_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>bdr_local_monitoring_only</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>bdr_recovery_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>connection_check_type</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>conninfo</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>degraded_monitoring_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>event_notification_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>event_notifications</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>failover_validation_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>failover</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>follow_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_facility</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_file</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_level</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_status_interval</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>monitor_interval_secs</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>monitoring_history</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>primary_notification_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>promote_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>reconnect_attempts</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>reconnect_interval</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>retry_promote_interval_secs</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>repmgrd_standby_startup_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>sibling_nodes_disconnect_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>standby_disconnect_on_failover</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
The following set of configuration file parameters must be updated via
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>,
|
||||
as they require changes to the <literal>repmgr.nodes</literal> table so they are visible to
|
||||
all nodes in the replication cluster:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>node_id</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>node_name</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>data_directory</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>location</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>priority</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
After executing <command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>,
|
||||
<application>repmgrd</application> <emphasis>must</emphasis> be restarted for the changes to take effect.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-daemon">
|
||||
<sect1 id="repmgrd-daemon" xreflabel="repmgrd daemon">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>starting and stopping</secondary>
|
||||
@@ -175,6 +766,20 @@
|
||||
See appendix <xref linkend="appendix-packages"> for details of service commands
|
||||
for different distributions.
|
||||
</para>
|
||||
<para>
|
||||
The commands <link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link> and
|
||||
<link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link> can be used
|
||||
as convenience wrappers to start and stop <application>repmgrd</application>.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-start"><command>repmgr daemon start</command></link> and
|
||||
<link linkend="repmgr-daemon-stop"><command>repmgr daemon stop</command></link> require
|
||||
that the appropriate start/stop commands are configured as
|
||||
<varname>repmgrd_service_start_command</varname> and <varname>repmgrd_service_stop_command</varname>
|
||||
in <filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
</important>
|
||||
<para>
|
||||
<application>repmgrd</application> can be started manually like this:
|
||||
<programlisting>
|
||||
@@ -199,7 +804,7 @@
|
||||
<simpara>
|
||||
This is a behaviour change from previous versions (earlier than 4.1), where
|
||||
the PID file had to be explicitly specified with the command line
|
||||
parameter <option> --pid-file</option>.
|
||||
parameter <option>--pid-file</option>.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
@@ -219,7 +824,7 @@
|
||||
</para>
|
||||
<para>
|
||||
If none of the above apply, <application>repmgrd</application> will create a PID file
|
||||
in the operating system's temporary directory (das etermined by the environment variable
|
||||
in the operating system's temporary directory (as setermined by the environment variable
|
||||
<varname>TMPDIR</varname>, or if that is not set, will use <filename>/tmp</filename>).
|
||||
</para>
|
||||
<para>
|
||||
@@ -266,7 +871,7 @@ REPMGRD_ENABLED=no
|
||||
#REPMGRD_CONF="/path/to/repmgr.conf"
|
||||
|
||||
# additional options
|
||||
#REPMGRD_OPTS=""
|
||||
REPMGRD_OPTS="--daemonize=false"
|
||||
|
||||
# user to run repmgrd as
|
||||
#REPMGRD_USER=postgres
|
||||
@@ -281,6 +886,16 @@ REPMGRD_ENABLED=no
|
||||
Set <varname>REPMGRD_ENABLED</varname> to <literal>yes</literal>, and <varname>REPMGRD_CONF</varname>
|
||||
to the <filename>repmgr.conf</filename> file you are using.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
See <xref linkend="packages-debian-ubuntu"> for details of the Debian/Ubuntu packages and
|
||||
typical file locations (including <filename>repmgr.conf</filename>).
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
From <application>repmgrd</application> 4.1, ensure <varname>REPMGRD_OPTS</varname> includes
|
||||
<option>--daemonize=false</option>, as daemonization is handled by the service command.
|
||||
</para>
|
||||
<para>
|
||||
If using <application>systemd</application>, you may need to execute <command>systemctl daemon-reload</command>.
|
||||
Also, if you attempted to start <application>repmgrd</application> using <command>systemctl start repmgrd</command>,
|
||||
@@ -311,7 +926,7 @@ REPMGRD_ENABLED=no
|
||||
<para>
|
||||
For further details on <varname>conninfo</varname> network connection
|
||||
parameters, see the
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS">PostgreSQL documentation</ulink>.
|
||||
<ulink url="https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
@@ -323,25 +938,34 @@ REPMGRD_ENABLED=no
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>log rotation</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd log rotation</title>
|
||||
<para>
|
||||
To ensure the current <application>repmgrd</application> logfile
|
||||
(specified in <filename>repmgr.conf</filename> with the parameter
|
||||
<option>log_file</option> does not grow indefinitely, configure your
|
||||
<option>log_file</option>) does not grow indefinitely, configure your
|
||||
system's <command>logrotate</command> to regularly rotate it.
|
||||
</para>
|
||||
<para>
|
||||
Sample configuration to rotate logfiles weekly with retention for
|
||||
up to 52 weeks and rotation forced if a file grows beyond 100Mb:
|
||||
<programlisting>
|
||||
/var/log/postgresql/repmgr-9.6.log {
|
||||
/var/log/repmgr/repmgrd.log {
|
||||
missingok
|
||||
compress
|
||||
rotate 52
|
||||
maxsize 100M
|
||||
weekly
|
||||
create 0600 postgres postgres
|
||||
postrotate
|
||||
/usr/bin/killall -HUP repmgrd
|
||||
endscript
|
||||
}</programlisting>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
@@ -1,83 +0,0 @@
|
||||
<chapter id="repmgrd-degraded-monitoring">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>degraded monitoring</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>"degraded monitoring" mode</title>
|
||||
<para>
|
||||
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
||||
of monitoring the nodes' upstream server. In these cases it enters "degraded
|
||||
monitoring" mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
to be resolved.
|
||||
</para>
|
||||
<para>
|
||||
Situations where this happens are:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, no nodes in the primary node's location are visible</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no primary has become available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
|
||||
and the primary node is unavailable (but is later restarted):
|
||||
<programlisting>
|
||||
[2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
|
||||
[2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
(...)
|
||||
[2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
[2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
|
||||
[2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
|
||||
[2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
|
||||
[2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
|
||||
[2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
|
||||
However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
|
||||
after which <application>repmgrd</application> will terminate.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <application>repmgrd</application> is monitoring a primary mode which has been stopped
|
||||
and manually restarted as a standby attached to a new primary, it will automatically detect
|
||||
the status change and update the node record to reflect the node's new status
|
||||
as an active standby. It will then resume monitoring the node as a standby.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</chapter>
|
||||
@@ -1,96 +0,0 @@
|
||||
<chapter id="repmgrd-demonstration">
|
||||
<title>repmgrd demonstration</title>
|
||||
<para>
|
||||
To demonstrate automatic failover, set up a 3-node replication cluster (one primary
|
||||
and two standbys streaming directly from the primary) so that the cluster looks
|
||||
something like this:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show
|
||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
||||
----+-------+---------+-----------+----------+----------+--------------------------------------
|
||||
1 | node1 | primary | * running | | default | host=node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | standby | running | node1 | default | host=node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node1 | default | host=node3 dbname=repmgr user=repmgr</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Start <application>repmgrd</application> on each standby and verify that it's running by examining the
|
||||
log output, which at log level <literal>INFO</literal> will look like this:
|
||||
<programlisting>
|
||||
[2017-08-24 17:31:00] [NOTICE] using configuration file "/etc/repmgr.conf"
|
||||
[2017-08-24 17:31:00] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr"
|
||||
[2017-08-24 17:31:00] [NOTICE] starting monitoring of node <literal>node2</literal> (ID: 2)
|
||||
[2017-08-24 17:31:00] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Each <application>repmgrd</application> should also have recorded its successful startup as an event:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+---------------+----+---------------------+-------------------------------------------------------------
|
||||
3 | node3 | repmgrd_start | t | 2017-08-24 17:35:54 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
2 | node2 | repmgrd_start | t | 2017-08-24 17:35:50 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
1 | node1 | repmgrd_start | t | 2017-08-24 17:35:46 | monitoring cluster primary "node1" (node ID: 1) </programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Now stop the current primary server with e.g.:
|
||||
<programlisting>
|
||||
pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will force the primary to shut down straight away, aborting all processes
|
||||
and transactions. This will cause a flurry of activity in the <application>repmgrd</application> log
|
||||
files as each <application>repmgrd</application> detects the failure of the primary and a failover
|
||||
decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
|
||||
which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
|
||||
<programlisting>
|
||||
[2017-08-24 23:32:01] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state
|
||||
[2017-08-24 23:32:08] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2017-08-24 23:32:08] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2017-08-24 23:32:08] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:09] [INFO] checking state of node 1, 2 of 5 attempts
|
||||
[2017-08-24 23:32:09] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:10] [INFO] checking state of node 1, 3 of 5 attempts
|
||||
[2017-08-24 23:32:10] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:11] [INFO] checking state of node 1, 4 of 5 attempts
|
||||
[2017-08-24 23:32:11] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2017-08-24 23:32:12] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2017-08-24 23:32:12] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
INFO: setting voting term to 1
|
||||
INFO: node 2 is candidate
|
||||
INFO: node 3 has received request from node 2 for electoral term 1 (our term: 0)
|
||||
[2017-08-24 23:32:12] [NOTICE] this node is the winner, will now promote self and inform other nodes
|
||||
INFO: connecting to standby database
|
||||
NOTICE: promoting standby
|
||||
DETAIL: promoting server using 'pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' promote'
|
||||
INFO: reconnecting to promoted server
|
||||
NOTICE: STANDBY PROMOTE successful
|
||||
DETAIL: node 2 was successfully promoted to primary
|
||||
INFO: node 3 received notification to follow node 2
|
||||
[2017-08-24 23:32:13] [INFO] switching to primary monitoring mode</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The cluster status will now look like this, with the original primary (<literal>node1</literal>)
|
||||
marked as inactive, and standby <literal>node3</literal> now following the new primary
|
||||
(<literal>node2</literal>):
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show
|
||||
ID | Name | Role | Status | Upstream | Location | Connection string
|
||||
----+-------+---------+-----------+----------+----------+----------------------------------------------------
|
||||
1 | node1 | primary | - failed | | default | host=node1 dbname=repmgr user=repmgr
|
||||
2 | node2 | primary | * running | | default | host=node2 dbname=repmgr user=repmgr
|
||||
3 | node3 | standby | running | node2 | default | host=node3 dbname=repmgr user=repmgr</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
<command>repmgr cluster event</command> will display a summary of what happened to each server
|
||||
during the failover:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+--------------------------+----+---------------------+-----------------------------------------------------------------------------------
|
||||
3 | node3 | repmgrd_failover_follow | t | 2017-08-24 23:32:16 | node 3 now following new upstream node 2
|
||||
3 | node3 | standby_follow | t | 2017-08-24 23:32:16 | node 3 is now attached to node 2
|
||||
2 | node2 | repmgrd_failover_promote | t | 2017-08-24 23:32:13 | node 2 promoted to primary; old primary 1 marked as failed
|
||||
2 | node2 | standby_promote | t | 2017-08-24 23:32:13 | node 2 was successfully promoted to primary</programlisting>
|
||||
</para>
|
||||
</chapter>
|
||||
@@ -1,80 +0,0 @@
|
||||
<chapter id="repmgrd-monitoring">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>monitoring</primary>
|
||||
<secondary>with repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Monitoring with repmgrd</title>
|
||||
<para>
|
||||
When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
|
||||
it will constantly write standby node status information to the
|
||||
<varname>monitoring_history</varname> table, providing a near-real time
|
||||
overview of replication status on all nodes
|
||||
in the cluster.
|
||||
</para>
|
||||
<para>
|
||||
The view <literal>replication_status</literal> shows the most recent state
|
||||
for each node, e.g.:
|
||||
<programlisting>
|
||||
repmgr=# select * from repmgr.replication_status;
|
||||
-[ RECORD 1 ]-------------+------------------------------
|
||||
primary_node_id | 1
|
||||
standby_node_id | 2
|
||||
standby_name | node2
|
||||
node_type | standby
|
||||
active | t
|
||||
last_monitor_time | 2017-08-24 16:28:41.260478+09
|
||||
last_wal_primary_location | 0/6D57A00
|
||||
last_wal_standby_location | 0/5000000
|
||||
replication_lag | 29 MB
|
||||
replication_time_lag | 00:00:11.736163
|
||||
apply_lag | 15 MB
|
||||
communication_time_lag | 00:00:01.365643</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The interval in which monitoring history is written is controlled by the
|
||||
configuration parameter <varname>monitor_interval_secs</varname>;
|
||||
default is 2.
|
||||
</para>
|
||||
<para>
|
||||
As this can generate a large amount of monitoring data in the table
|
||||
<literal>repmgr.monitoring_history</literal>. it's advisable to regularly
|
||||
purge historical data using the <xref linkend="repmgr-cluster-cleanup">
|
||||
command; use the <literal>-k/--keep-history</literal> option to
|
||||
specify how many day's worth of data should be retained.
|
||||
</para>
|
||||
<para>
|
||||
It's possible to use <application>repmgrd</application> to run in monitoring
|
||||
mode only (without automatic failover capability) for some or all
|
||||
nodes by setting <literal>failover=manual</literal> in the node's
|
||||
<filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
|
||||
no failover action will be taken and the node will require manual intervention to
|
||||
be reattached to replication. If this occurs, an
|
||||
<link linkend="event-notifications">event notification</link>
|
||||
<varname>standby_disconnect_manual</varname> will be created.
|
||||
</para>
|
||||
<para>
|
||||
Note that when a standby node is not streaming directly from its upstream
|
||||
node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
|
||||
<literal>0 bytes</literal>.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
|
||||
table will be replicated to attached standbys. This means there will be a small but
|
||||
constant stream of replication activity which may not be desirable. To prevent
|
||||
this, convert the table to an <literal>UNLOGGED</literal> one with:
|
||||
<programlisting>
|
||||
ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will however mean that monitoring history will not be available on
|
||||
another node following a failover, and the view <literal>repmgr.replication_status</literal>
|
||||
will not work on standbys.
|
||||
</para>
|
||||
</tip>
|
||||
</chapter>
|
||||
@@ -1,48 +0,0 @@
|
||||
<chapter id="repmgrd-network-split" xreflabel="Handling network splits with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>network splits</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Handling network splits with repmgrd</title>
|
||||
<para>
|
||||
A common pattern for replication cluster setups is to spread servers over
|
||||
more than one datacentre. This can provide benefits such as geographically-
|
||||
distributed read replicas and DR (disaster recovery capability). However
|
||||
this also means there is a risk of disconnection at network level between
|
||||
datacentre locations, which would result in a split-brain scenario if
|
||||
servers in a secondary data centre were no longer able to see the primary
|
||||
in the main data centre and promoted a standby among themselves.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; enables provision of "<xref linkend="witness-server">" to
|
||||
artificially create a quorum of servers in a particular location, ensuring
|
||||
that nodes in another location will not elect a new primary if they
|
||||
are unable to see the majority of nodes. However this approach does not
|
||||
scale well, particularly with more complex replication setups, e.g.
|
||||
where the majority of nodes are located outside of the primary datacentre.
|
||||
It also means the <literal>witness</literal> node needs to be managed as an
|
||||
extra PostgreSQL instance outside of the main replication cluster, which
|
||||
adds administrative and programming complexity.
|
||||
</para>
|
||||
<para>
|
||||
<literal>repmgr4</literal> introduces the concept of <literal>location</literal>:
|
||||
each node is associated with an arbitrary location string (default is
|
||||
<literal>default</literal>); this is set in <filename>repmgr.conf</filename>, e.g.:
|
||||
<programlisting>
|
||||
node_id=1
|
||||
node_name=node1
|
||||
conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'
|
||||
data_directory='/var/lib/postgresql/data'
|
||||
location='dc1'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
||||
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
||||
will assume a network interruption and not promote any node in any
|
||||
other location (it will however enter <xref linkend="repmgrd-degraded-monitoring"> mode until
|
||||
a primary becomes visible).
|
||||
</para>
|
||||
|
||||
</chapter>
|
||||
|
||||
386
doc/repmgrd-operation.sgml
Normal file
386
doc/repmgrd-operation.sgml
Normal file
@@ -0,0 +1,386 @@
|
||||
<chapter id="repmgrd-operation" xreflabel="repmgrd operation">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>operation</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd operation</title>
|
||||
|
||||
|
||||
<sect1 id="repmgrd-pausing">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>pausing</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>pausing repmgrd</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>Pausing repmgrd</title>
|
||||
|
||||
<para>
|
||||
In normal operation, <application>repmgrd</application> monitors the state of the
|
||||
PostgreSQL node it is running on, and will take appropriate action if problems
|
||||
are detected, e.g. (if so configured) promote the node to primary, if the existing
|
||||
primary has been determined as failed.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
However, <application>repmgrd</application> is unable to distinguish between
|
||||
planned outages (such as performing a <link linkend="performing-switchover">switchover</link>
|
||||
or installing PostgreSQL maintenance released), and an actual server outage. In versions prior to
|
||||
&repmgr; 4.2 it was necessary to stop <application>repmgrd</application> on all nodes (or at least
|
||||
on all nodes where <application>repmgrd</application> is
|
||||
<link linkend="repmgrd-automatic-failover">configured for automatic failover</link>)
|
||||
to prevent <application>repmgrd</application> from making unintentional changes to the
|
||||
replication cluster.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
From <link linkend="release-4.2">&repmgr; 4.2</link>, <application>repmgrd</application>
|
||||
can now be "paused", i.e. instructed not to take any action such as performing a failover.
|
||||
This can be done from any node in the cluster, removing the need to stop/restart
|
||||
each <application>repmgrd</application> individually.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
For major PostgreSQL upgrades, e.g. from PostgreSQL 10 to PostgreSQL 11,
|
||||
<application>repmgrd</application> should be shut down completely and only started up
|
||||
once the &repmgr; packages for the new PostgreSQL major version have been installed.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2 id="repmgrd-pausing-prerequisites">
|
||||
<title>Prerequisites for pausing <application>repmgrd</application></title>
|
||||
<para>
|
||||
In order to be able to pause/unpause <application>repmgrd</application>, following
|
||||
prerequisites must be met:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><link linkend="release-4.2">&repmgr; 4.2</link> or later must be installed on all nodes.</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>The same major &repmgr; version (e.g. 4.2) must be installed on all nodes (and preferably the same minor version).</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
PostgreSQL on all nodes must be accessible from the node where the
|
||||
<literal>pause</literal>/<literal>unpause</literal> operation is executed, using the
|
||||
<varname>conninfo</varname> string shown by <link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
These conditions are required for normal &repmgr; operation in any case.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-pausing-execution">
|
||||
<title>Pausing/unpausing <application>repmgrd</application></title>
|
||||
<para>
|
||||
To pause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link>, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon pause
|
||||
NOTICE: node 1 (node1) paused
|
||||
NOTICE: node 2 (node2) paused
|
||||
NOTICE: node 3 (node3) paused</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The state of <application>repmgrd</application> on each node can be checked with
|
||||
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>, e.g.:
|
||||
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status
|
||||
ID | Name | Role | Status | repmgrd | PID | Paused?
|
||||
----+-------+---------+---------+---------+------+---------
|
||||
1 | node1 | primary | running | running | 7851 | yes
|
||||
2 | node2 | standby | running | running | 7889 | yes
|
||||
3 | node3 | standby | running | running | 7918 | yes</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If executing a switchover with <link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link>,
|
||||
&repmgr; will automatically pause/unpause <application>repmgrd</application> as part of the switchover process.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
If the primary (in this example, <literal>node1</literal>) is stopped, <application>repmgrd</application>
|
||||
running on one of the standbys (here: <literal>node2</literal>) will react like this:
|
||||
<programlisting>
|
||||
[2018-09-20 12:22:21] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2018-09-20 12:22:21] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2018-09-20 12:22:21] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
...
|
||||
[2018-09-20 12:22:24] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
[2018-09-20 12:22:25] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2018-09-20 12:22:25] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
[2018-09-20 12:22:25] [NOTICE] node is paused
|
||||
[2018-09-20 12:22:33] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state
|
||||
[2018-09-20 12:22:33] [DETAIL] repmgrd paused by administrator
|
||||
[2018-09-20 12:22:33] [HINT] execute "repmgr daemon unpause" to resume normal failover mode</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
If the primary becomes available again (e.g. following a software upgrade), <application>repmgrd</application>
|
||||
will automatically reconnect, e.g.:
|
||||
<programlisting>
|
||||
[2018-09-20 13:12:41] [NOTICE] reconnected to upstream node 1 after 8 seconds, resuming monitoring</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To unpause <application>repmgrd</application>, execute <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>, e.g.:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf daemon unpause
|
||||
NOTICE: node 1 (node1) unpaused
|
||||
NOTICE: node 2 (node2) unpaused
|
||||
NOTICE: node 3 (node3) unpaused</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If the previous primary is no longer accessible when <application>repmgrd</application>
|
||||
is unpaused, no failover action will be taken. Instead, a new primary must be manually promoted using
|
||||
<link linkend="repmgr-standby-promote"><command>repmgr standby promote</command></link>,
|
||||
and any standbys attached to the new primary with
|
||||
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>.
|
||||
</para>
|
||||
<para>
|
||||
This is to prevent <link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||
resulting in the automatic promotion of a new primary, which may be a problem particularly
|
||||
in larger clusters, where <application>repmgrd</application> could select a different promotion
|
||||
candidate to the one intended by the administrator.
|
||||
</para>
|
||||
</note>
|
||||
</sect2>
|
||||
<sect2 id="repmgrd-pausing-details">
|
||||
<title>Details on the <application>repmgrd</application> pausing mechanism</title>
|
||||
|
||||
<para>
|
||||
The pause state of each node will be stored over a PostgreSQL restart.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link> can be
|
||||
executed even if <application>repmgrd</application> is not running; in this case,
|
||||
<application>repmgrd</application> will start up in whichever pause state has been set.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<link linkend="repmgr-daemon-pause"><command>repmgr daemon pause</command></link> and
|
||||
<link linkend="repmgr-daemon-unpause"><command>repmgr daemon unpause</command></link>
|
||||
<emphasis>do not</emphasis> stop/start <application>repmgrd</application>.
|
||||
</para>
|
||||
</note>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-wal-replay-pause">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>paused WAL replay</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd and paused WAL replay</title>
|
||||
<para>
|
||||
If WAL replay has been paused (using <command>pg_wal_replay_pause()</command>,
|
||||
on PostgreSQL 9.6 and earlier <command>pg_xlog_replay_pause()</command>),
|
||||
in a failover situation <application>repmgrd</application> will
|
||||
automatically resume WAL replay.
|
||||
</para>
|
||||
<para>
|
||||
This is because if WAL replay is paused, but WAL is pending replay,
|
||||
PostgreSQL cannot be promoted until WAL replay is resumed.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>
|
||||
will refuse to promote a node in this state, as the PostgreSQL
|
||||
<command>promote</command> command will not be acted on until
|
||||
WAL replay is resumed, leaving the cluster in a potentially
|
||||
unstable state. In this case it is up to the user to
|
||||
decide whether to resume WAL replay.
|
||||
</para>
|
||||
</note>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>degraded monitoring</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>degraded monitoring</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>"degraded monitoring" mode</title>
|
||||
<para>
|
||||
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
||||
of monitoring the node's upstream server. In these cases it enters "degraded monitoring"
|
||||
mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
to be resolved.
|
||||
</para>
|
||||
<para>
|
||||
Situations where this happens are:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, no nodes in the primary node's location are visible</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no promotion candidate is available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the promotion candidate could not be promoted</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but the node was unable to follow the new primary</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but no primary has become available</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>a failover situation has occurred, but automatic failover is not enabled for the node</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Example output in a situation where there is only one standby with <literal>failover=manual</literal>,
|
||||
and the primary node is unavailable (but is later restarted):
|
||||
<programlisting>
|
||||
[2017-08-29 10:59:19] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)
|
||||
[2017-08-29 10:59:33] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2017-08-29 10:59:33] [INFO] checking state of node 1, 1 of 5 attempts
|
||||
[2017-08-29 10:59:33] [INFO] sleeping 1 seconds until next reconnection attempt
|
||||
(...)
|
||||
[2017-08-29 10:59:37] [INFO] checking state of node 1, 5 of 5 attempts
|
||||
[2017-08-29 10:59:37] [WARNING] unable to reconnect to node 1 after 5 attempts
|
||||
[2017-08-29 10:59:37] [NOTICE] this node is not configured for automatic failover so will not be considered as promotion candidate
|
||||
[2017-08-29 10:59:37] [NOTICE] no other nodes are available as promotion candidate
|
||||
[2017-08-29 10:59:37] [HINT] use "repmgr standby promote" to manually promote this node
|
||||
[2017-08-29 10:59:37] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 10:59:53] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in degraded state (automatic failover disabled)
|
||||
[2017-08-29 11:00:45] [NOTICE] reconnected to upstream node 1 after 68 seconds, resuming monitoring
|
||||
[2017-08-29 11:00:57] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state (automatic failover disabled)</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
|
||||
However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
|
||||
after which <application>repmgrd</application> will terminate.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <application>repmgrd</application> is monitoring a primary mode which has been stopped
|
||||
and manually restarted as a standby attached to a new primary, it will automatically detect
|
||||
the status change and update the node record to reflect the node's new status
|
||||
as an active standby. It will then resume monitoring the node as a standby.
|
||||
</para>
|
||||
</note>
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="repmgrd-monitoring" xreflabel="Storing monitoring data">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>monitoring</primary>
|
||||
<secondary>with repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Storing monitoring data</title>
|
||||
<para>
|
||||
When <application>repmgrd</application> is running with the option <literal>monitoring_history=true</literal>,
|
||||
it will constantly write standby node status information to the
|
||||
<varname>monitoring_history</varname> table, providing a near-real time
|
||||
overview of replication status on all nodes
|
||||
in the cluster.
|
||||
</para>
|
||||
<para>
|
||||
The view <literal>replication_status</literal> shows the most recent state
|
||||
for each node, e.g.:
|
||||
<programlisting>
|
||||
repmgr=# select * from repmgr.replication_status;
|
||||
-[ RECORD 1 ]-------------+------------------------------
|
||||
primary_node_id | 1
|
||||
standby_node_id | 2
|
||||
standby_name | node2
|
||||
node_type | standby
|
||||
active | t
|
||||
last_monitor_time | 2017-08-24 16:28:41.260478+09
|
||||
last_wal_primary_location | 0/6D57A00
|
||||
last_wal_standby_location | 0/5000000
|
||||
replication_lag | 29 MB
|
||||
replication_time_lag | 00:00:11.736163
|
||||
apply_lag | 15 MB
|
||||
communication_time_lag | 00:00:01.365643</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The interval in which monitoring history is written is controlled by the
|
||||
configuration parameter <varname>monitor_interval_secs</varname>;
|
||||
default is 2.
|
||||
</para>
|
||||
<para>
|
||||
As this can generate a large amount of monitoring data in the table
|
||||
<literal>repmgr.monitoring_history</literal>. it's advisable to regularly
|
||||
purge historical data using the <xref linkend="repmgr-cluster-cleanup">
|
||||
command; use the <literal>-k/--keep-history</literal> option to
|
||||
specify how many day's worth of data should be retained.
|
||||
</para>
|
||||
<para>
|
||||
It's possible to use <application>repmgrd</application> to run in monitoring
|
||||
mode only (without automatic failover capability) for some or all
|
||||
nodes by setting <literal>failover=manual</literal> in the node's
|
||||
<filename>repmgr.conf</filename> file. In the event of the node's upstream failing,
|
||||
no failover action will be taken and the node will require manual intervention to
|
||||
be reattached to replication. If this occurs, an
|
||||
<link linkend="event-notifications">event notification</link>
|
||||
<varname>standby_disconnect_manual</varname> will be created.
|
||||
</para>
|
||||
<para>
|
||||
Note that when a standby node is not streaming directly from its upstream
|
||||
node, e.g. recovering WAL from an archive, <varname>apply_lag</varname> will always appear as
|
||||
<literal>0 bytes</literal>.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
If monitoring history is enabled, the contents of the <literal>repmgr.monitoring_history</literal>
|
||||
table will be replicated to attached standbys. This means there will be a small but
|
||||
constant stream of replication activity which may not be desirable. To prevent
|
||||
this, convert the table to an <literal>UNLOGGED</literal> one with:
|
||||
<programlisting>
|
||||
ALTER TABLE repmgr.monitoring_history SET UNLOGGED;</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will however mean that monitoring history will not be available on
|
||||
another node following a failover, and the view <literal>repmgr.replication_status</literal>
|
||||
will not work on standbys.
|
||||
</para>
|
||||
</tip>
|
||||
</sect1>
|
||||
|
||||
|
||||
</chapter>
|
||||
187
doc/repmgrd-overview.sgml
Normal file
187
doc/repmgrd-overview.sgml
Normal file
@@ -0,0 +1,187 @@
|
||||
<chapter id="repmgrd-overview" xreflabel="repmgrd overview">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>overview</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd overview</title>
|
||||
|
||||
<para>
|
||||
<application>repmgrd</application> ("<literal>replication manager daemon</literal>")
|
||||
is a management and monitoring daemon which runs
|
||||
on each node in a replication cluster. It can automate actions such as
|
||||
failover and updating standbys to follow the new primary, as well as
|
||||
providing monitoring information about the state of each standby.
|
||||
</para>
|
||||
<para>
|
||||
<application>repmgrd</application> is designed to be straightforward to set up
|
||||
and does not require additional external infrastructure.
|
||||
</para>
|
||||
<para>
|
||||
Functionality provided by <application>repmgrd</application> includes:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
wide range of <link linkend="repmgrd-basic-configuration">configuration options</link>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
option to execute custom scripts ("<link linkend="event-notifications">event notifications</link>
|
||||
at different points in the failover sequence
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
ability to <link linkend="repmgrd-pausing">pause repmgrd</link>
|
||||
operation on all nodes with a
|
||||
<link linkend="repmgr-daemon-pause"><command>single command</command></link>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
optional <link linkend="repmgrd-witness-server">witness server</link>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
"location" configuration option to restrict
|
||||
potential promotion candidates to a single location
|
||||
(e.g. when nodes are spread over multiple data centres)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<link linkend="connection-check-type">choice of method</link> to determine node availability
|
||||
(PostgreSQL ping, query execution or new connection)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
retention of monitoring statistics (optional)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
</para>
|
||||
|
||||
<sect1 id="repmgrd-demonstration">
|
||||
|
||||
<title>repmgrd demonstration</title>
|
||||
<para>
|
||||
To demonstrate automatic failover, set up a 3-node replication cluster (one primary
|
||||
and two standbys streaming directly from the primary) so that the cluster looks
|
||||
something like this:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show --compact
|
||||
ID | Name | Role | Status | Upstream | Location | Prio.
|
||||
----+-------+---------+-----------+----------+----------+-------
|
||||
1 | node1 | primary | * running | | default | 100
|
||||
2 | node2 | standby | running | node1 | default | 100
|
||||
3 | node3 | standby | running | node1 | default | 100</programlisting>
|
||||
</para>
|
||||
|
||||
<tip>
|
||||
<para>
|
||||
See section <link linkend="repmgrd-automatic-failover-configuration">Required configuration for automatic failover</link>
|
||||
for an example of minimal <filename>repmgr.conf</filename> file settings suitable for use with <application>repmgrd</application>.
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
Start <application>repmgrd</application> on each standby and verify that it's running by examining the
|
||||
log output, which at log level <literal>INFO</literal> will look like this:
|
||||
<programlisting>
|
||||
[2019-03-15 06:32:05] [NOTICE] repmgrd (repmgrd 4.3) starting up
|
||||
[2019-03-15 06:32:05] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr connect_timeout=2"
|
||||
INFO: set_repmgrd_pid(): provided pidfile is /var/run/repmgr/repmgrd-11.pid
|
||||
[2019-03-15 06:32:05] [NOTICE] starting monitoring of node "node2" (ID: 2)
|
||||
[2019-03-15 06:32:05] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Each <application>repmgrd</application> should also have recorded its successful startup as an event:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+---------------+----+---------------------+-------------------------------------------------------------
|
||||
3 | node3 | repmgrd_start | t | 2019-03-14 04:17:30 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
2 | node2 | repmgrd_start | t | 2019-03-14 04:11:47 | monitoring connection to upstream node "node1" (node ID: 1)
|
||||
1 | node1 | repmgrd_start | t | 2019-03-14 04:04:31 | monitoring cluster primary "node1" (node ID: 1)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Now stop the current primary server with e.g.:
|
||||
<programlisting>
|
||||
pg_ctl -D /var/lib/postgresql/data -m immediate stop</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This will force the primary to shut down straight away, aborting all processes
|
||||
and transactions. This will cause a flurry of activity in the <application>repmgrd</application> log
|
||||
files as each <application>repmgrd</application> detects the failure of the primary and a failover
|
||||
decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
|
||||
which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
|
||||
<programlisting>
|
||||
[2019-03-15 06:37:50] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
|
||||
[2019-03-15 06:37:50] [INFO] checking state of node 1, 1 of 3 attempts
|
||||
[2019-03-15 06:37:50] [INFO] sleeping 5 seconds until next reconnection attempt
|
||||
[2019-03-15 06:37:55] [INFO] checking state of node 1, 2 of 3 attempts
|
||||
[2019-03-15 06:37:55] [INFO] sleeping 5 seconds until next reconnection attempt
|
||||
[2019-03-15 06:38:00] [INFO] checking state of node 1, 3 of 3 attempts
|
||||
[2019-03-15 06:38:00] [WARNING] unable to reconnect to node 1 after 3 attempts
|
||||
[2019-03-15 06:38:00] [INFO] primary and this node have the same location ("default")
|
||||
[2019-03-15 06:38:00] [INFO] local node's last receive lsn: 0/900CBF8
|
||||
[2019-03-15 06:38:00] [INFO] node 3 last saw primary node 12 second(s) ago
|
||||
[2019-03-15 06:38:00] [INFO] last receive LSN for sibling node "node3" (ID: 3) is: 0/900CBF8
|
||||
[2019-03-15 06:38:00] [INFO] node "node3" (ID: 3) has same LSN as current candidate "node2" (ID: 2)
|
||||
[2019-03-15 06:38:00] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
|
||||
[2019-03-15 06:38:00] [NOTICE] promotion candidate is "node2" (ID: 2)
|
||||
[2019-03-15 06:38:00] [NOTICE] this node is the winner, will now promote itself and inform other nodes
|
||||
[2019-03-15 06:38:00] [INFO] promote_command is:
|
||||
"/usr/pgsql-11/bin/repmgr -f /etc/repmgr/11/repmgr.conf standby promote"
|
||||
NOTICE: promoting standby to primary
|
||||
DETAIL: promoting server "node2" (ID: 2) using "/usr/pgsql-11/bin/pg_ctl -w -D '/var/lib/pgsql/11/data' promote"
|
||||
NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
|
||||
NOTICE: STANDBY PROMOTE successful
|
||||
DETAIL: server "node2" (ID: 2) was successfully promoted to primary
|
||||
[2019-03-15 06:38:01] [INFO] 3 followers to notify
|
||||
[2019-03-15 06:38:01] [NOTICE] notifying node "node3" (node ID: 3) to follow node 2
|
||||
INFO: node 3 received notification to follow node 2
|
||||
[2019-03-15 06:38:01] [INFO] switching to primary monitoring mode
|
||||
[2019-03-15 06:38:01] [NOTICE] monitoring cluster primary "node2" (node ID: 2)</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The cluster status will now look like this, with the original primary (<literal>node1</literal>)
|
||||
marked as inactive, and standby <literal>node3</literal> now following the new primary
|
||||
(<literal>node2</literal>):
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show --compact
|
||||
ID | Name | Role | Status | Upstream | Location | Prio.
|
||||
----+-------+---------+-----------+----------+----------+-------
|
||||
1 | node1 | primary | - failed | | default | 100
|
||||
2 | node2 | primary | * running | | default | 100
|
||||
3 | node3 | standby | running | node2 | default | 100</programlisting>
|
||||
|
||||
</para>
|
||||
<para>
|
||||
<link linkend="repmgr-cluster-event"><command>repmgr cluster event</command></link> will display a summary of
|
||||
what happened to each server during the failover:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster event
|
||||
Node ID | Name | Event | OK | Timestamp | Details
|
||||
---------+-------+----------------------------+----+---------------------+-------------------------------------------------------------
|
||||
3 | node3 | repmgrd_failover_follow | t | 2019-03-15 06:38:03 | node 3 now following new upstream node 2
|
||||
3 | node3 | standby_follow | t | 2019-03-15 06:38:02 | standby attached to upstream node "node2" (node ID: 2)
|
||||
2 | node2 | repmgrd_reload | t | 2019-03-15 06:38:01 | monitoring cluster primary "node2" (node ID: 2)
|
||||
2 | node2 | repmgrd_failover_promote | t | 2019-03-15 06:38:01 | node 2 promoted to primary; old primary 1 marked as failed
|
||||
2 | node2 | standby_promote | t | 2019-03-15 06:38:01 | server "node2" (ID: 2) was successfully promoted to primary</programlisting>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
</chapter>
|
||||
@@ -1,31 +0,0 @@
|
||||
<chapter id="repmgrd-witness-server" xreflabel="Using a witness server with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>witness server</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Using a witness server with repmgrd</title>
|
||||
<para>
|
||||
In a situation caused e.g. by a network interruption between two
|
||||
data centres, it's important to avoid a "split-brain" situation where
|
||||
both sides of the network assume they are the active segment and the
|
||||
side without an active primary unilaterally promotes one of its standbys.
|
||||
</para>
|
||||
<para>
|
||||
To prevent this situation happening, it's essential to ensure that one
|
||||
network segment has a "voting majority", so other segments will know
|
||||
they're in the minority and not attempt to promote a new primary. Where
|
||||
an odd number of servers exists, this is not an issue. However, if each
|
||||
network has an even number of nodes, it's necessary to provide some way
|
||||
of ensuring a majority, which is where the witness server becomes useful.
|
||||
</para>
|
||||
<para>
|
||||
This is not a fully-fledged standby node and is not integrated into
|
||||
replication, but it effectively represents the "casting vote" when
|
||||
deciding which network segment has a majority. A witness server can
|
||||
be set up using <xref linkend="repmgr-witness-register">. Note that it only
|
||||
makes sense to create a witness server in conjunction with running
|
||||
<application>repmgrd</application>; the witness server will require its own
|
||||
<application>repmgrd</application> instance.
|
||||
</para>
|
||||
</chapter>
|
||||
@@ -19,9 +19,10 @@
|
||||
</para>
|
||||
<para>
|
||||
<command>repmgr standby switchover</command> differs from other &repmgr;
|
||||
actions in that it also performs actions on another server (the demotion
|
||||
candidate), which means passwordless SSH access is required to that server
|
||||
from the one where <command>repmgr standby switchover</command> is executed.
|
||||
actions in that it also performs actions on other servers (the demotion
|
||||
candidate, and optionally any other servers which are to follow the new primary),
|
||||
which means passwordless SSH access is required to those servers from the one where
|
||||
<command>repmgr standby switchover</command> is executed.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
@@ -60,6 +61,13 @@
|
||||
&repmgr; being able to shut down the current primary server quickly and cleanly.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Ensure that the promotion candidate has sufficient free walsenders available
|
||||
(PostgreSQL configuration item <varname>max_wal_senders</varname>), and if replication
|
||||
slots are in use, at least one free slot is available for the demotion candidate (
|
||||
PostgreSQL configuration item <varname>max_replication_slots</varname>).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Ensure that a passwordless SSH connection is possible from the promotion candidate
|
||||
(standby) to the demotion candidate (current primary). If <literal>--siblings-follow</literal>
|
||||
@@ -76,11 +84,12 @@
|
||||
|
||||
<para>
|
||||
Double-check which commands will be used to stop/start/restart the current
|
||||
primary; on the current primary execute:
|
||||
primary; this can be done by e.g. executing <command><link linkend="repmgr-node-service">repmgr node service</link></command>
|
||||
on the current primary:
|
||||
<programlisting>
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=stop
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=start
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=restart</programlisting>
|
||||
repmgr -f /etc/repmgr.conf node service --list-actions --action=stop
|
||||
repmgr -f /etc/repmgr.conf node service --list-actions --action=start
|
||||
repmgr -f /etc/repmgr.conf node service --list-actions --action=restart</programlisting>
|
||||
|
||||
</para>
|
||||
|
||||
@@ -128,8 +137,8 @@
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If an exclusive backup is running on the current primary, &repmgr; will not perform the
|
||||
switchover.
|
||||
If an exclusive backup is running on the current primary, or if WAL replay is paused on the standby,
|
||||
&repmgr; will <emphasis>not</emphasis> perform the switchover.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
@@ -146,12 +155,18 @@
|
||||
manually with <command>repmgr node check --archive-ready</command>.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
Ensure that <application>repmgrd</application> is *not* running anywhere to prevent it unintentionally
|
||||
promoting a node. This restriction will be removed in a future &repmgr; version.
|
||||
</para>
|
||||
</note>
|
||||
<note>
|
||||
<para>
|
||||
From <link linkend="release-4.2">repmgr 4.2</link>, &repmgr; will instruct any running
|
||||
<application>repmgrd</application> instances to pause operations while the switchover
|
||||
is being carried out, to prevent <application>repmgrd</application> from
|
||||
unintentionally promoting a node. For more details, see <xref linkend="repmgrd-pausing">.
|
||||
</para>
|
||||
<para>
|
||||
Users of &repmgr; versions prior to 4.2 should ensure that <application>repmgrd</application>
|
||||
is not running on any nodes while a switchover is being executed.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
|
||||
<para>
|
||||
@@ -221,7 +236,7 @@
|
||||
</note>
|
||||
<para>
|
||||
For more details on <application>pg_rewind</application>, see:
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html">https://www.postgresql.org/docs/current/static/app-pgrewind.html</ulink>.
|
||||
<ulink url="https://www.postgresql.org/docs/current/app-pgrewind.html">https://www.postgresql.org/docs/current/app-pgrewind.html</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
<application>pg_rewind</application> has been part of the core PostgreSQL distribution since
|
||||
@@ -296,7 +311,21 @@
|
||||
2 | node2 | primary | * running | | default | host=node2 dbname=repmgr user=repmgr
|
||||
</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
If <application>repmgrd</application> is in use, it's worth double-checking that
|
||||
all nodes are unpaused by executing <command><link linkend="repmgr-daemon-status">repmgr-daemon-status</link></command>.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
Users of &repmgr; versions prior to 4.2 will need to manually restart <application>repmgrd</application>
|
||||
on all nodes after the switchover is completed.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="switchover-caveats" xreflabel="Caveats">
|
||||
<indexterm>
|
||||
<primary>switchover</primary>
|
||||
@@ -318,21 +347,80 @@
|
||||
<simpara>
|
||||
<command>pg_rewind</command> *requires* that either <varname>wal_log_hints</varname> is enabled, or that
|
||||
data checksums were enabled when the cluster was initialized. See the
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html">pg_rewind documentation</ulink>
|
||||
<ulink url="https://www.postgresql.org/docs/current/app-pgrewind.html">pg_rewind documentation</ulink>
|
||||
for details.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> should not be running with setting <varname>failover=automatic</varname>
|
||||
in <filename>repmgr.conf</filename> when a switchover is carried out, otherwise the
|
||||
<application>repmgrd</application> daemon may try and promote a standby by itself.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>
|
||||
We hope to remove some of these restrictions in future versions of &repmgr;.
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="switchover-troubleshooting" xreflabel="Troubleshooting">
|
||||
<indexterm>
|
||||
<primary>switchover</primary>
|
||||
<secondary>troubleshooting</secondary>
|
||||
</indexterm>
|
||||
<title>Troubleshooting switchover issues</title>
|
||||
|
||||
<para>
|
||||
As <link linkend="performing-switchover">emphasised previously</link>, performing a switchover
|
||||
is a non-trivial operation and there are a number of potential issues which can occur.
|
||||
While &repmgr; attempts to perform sanity checks, there's no guaranteed way of determining the success of
|
||||
a switchover without actually carrying it out.
|
||||
</para>
|
||||
|
||||
<sect2 id="switchover-troubleshooting-primary-shutdown">
|
||||
<title>Demotion candidate (old primary) does not shut down</title>
|
||||
<para>
|
||||
&repmgr; may abort a switchover with a message like:
|
||||
<programlisting>
|
||||
ERROR: shutdown of the primary server could not be confirmed
|
||||
HINT: check the primary server status before performing any further actions</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This means the shutdown of the old primary has taken longer than &repmgr; expected,
|
||||
and it has given up waiting.
|
||||
</para>
|
||||
<para>
|
||||
In this case, check the PostgreSQL log on the primary server to see what is going
|
||||
on. It's entirely possible the shutdown process is just taking longer than the
|
||||
timeout set by the configuration parameter <varname>shutdown_check_timeout</varname>
|
||||
(default: 60 seconds), in which case you may need to adjust this parameter.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
Note that <varname>shutdown_check_timeout</varname> is set on the node where
|
||||
<command>repmgr standby switchover</command> is executed (promotion candidate); setting it on the
|
||||
demotion candidate (former primary) will have no effect.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
If the primary server has shut down cleanly, and no other node has been promoted,
|
||||
it is safe to restart it, in which case the replication cluster will be restored
|
||||
to its original configuration.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="switchover-troubleshooting-exclusive-backup">
|
||||
<title>Switchover aborts with an "exclusive backup" error</title>
|
||||
<para>
|
||||
&repmgr; may abort a switchover with a message like:
|
||||
<programlisting>
|
||||
ERROR: unable to perform a switchover while primary server is in exclusive backup mode
|
||||
HINT: stop backup before attempting the switchover</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
This means an exclusive backup is running on the current primary; interrupting this
|
||||
will not only abort the backup, but potentially leave the primary with an ambiguous
|
||||
backup state.
|
||||
</para>
|
||||
<para>
|
||||
To proceed, either wait until the backup has finished, or cancel it with the command
|
||||
<command>SELECT pg_stop_backup()</command>. For more details see the PostgreSQL
|
||||
documentation section
|
||||
<ulink url="https://www.postgresql.org/docs/current/continuous-archiving.html#BACKUP-LOWLEVEL-BASE-BACKUP-EXCLUSIVE">Making an exclusive low level backup</ulink>.
|
||||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
</chapter>
|
||||
|
||||
@@ -4,6 +4,6 @@ Upgrading from repmgr 3
|
||||
This document has been integrated into the main `repmgr` documentation
|
||||
and is now located here:
|
||||
|
||||
> [Upgrading from repmgr 3.x](https://repmgr.org/docs/4.0/upgrading-from-repmgr-3.html)
|
||||
> [Upgrading from repmgr 3.x](https://repmgr.org/docs/current/upgrading-from-repmgr-3.html)
|
||||
|
||||
|
||||
|
||||
@@ -7,9 +7,9 @@
|
||||
<title>Upgrading repmgr</title>
|
||||
|
||||
<para>
|
||||
&repmgr; is updated regularly with point releases (e.g. 4.0.1 to 4.0.2)
|
||||
&repmgr; is updated regularly with minor releases (e.g. 4.0.1 to 4.0.2)
|
||||
containing bugfixes and other minor improvements. Any substantial new
|
||||
functionality will be included in a feature release (e.g. 4.0.x to 4.1.x).
|
||||
functionality will be included in a major release (e.g. 4.0 to 4.1).
|
||||
</para>
|
||||
|
||||
<sect1 id="upgrading-repmgr-extension" xreflabel="Upgrading repmgr 4.x and later">
|
||||
@@ -19,43 +19,202 @@
|
||||
</indexterm>
|
||||
<title>Upgrading repmgr 4.x and later</title>
|
||||
<para>
|
||||
&repmgr; 4.x is implemented as a PostgreSQL extension; normally the upgrade consists
|
||||
of the two following steps:
|
||||
<orderedlist>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Install the updated package (or compile the updated source)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<application>repmgrd</application> (if running) must be restarted.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
For major releases, e.g. from <literal>4.0.x</literal> to <literal>4.1</literal>,
|
||||
execute <command>ALTER EXTENSION repmgr UPDATE</command>
|
||||
on the primary node in the database where the &repmgr; extension is installed.
|
||||
</simpara>
|
||||
<simpara>
|
||||
This will update the extension metadata and, if necessary, apply
|
||||
changes to the &repmgr; extension objects.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
From version 4, &repmgr; consists of three elements:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
the <application>repmgr</application> and <application>repmgrd</application> executables
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
the objects for the &repmgr; PostgreSQL extension (SQL files for creating/updating
|
||||
repmgr metadata, and the extension control file)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
the shared library module used by <application>repmgrd</application> which
|
||||
is resident in the PostgreSQL backend
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>
|
||||
With <emphasis>minor releases</emphasis>, usually changes are only made to the <application>repmgr</application>
|
||||
and <application>repmgrd</application> executables. In this case, the upgrade is quite straightforward,
|
||||
and is simply a case of installing the new version, and restarting <application>repmgrd</application>
|
||||
(if running).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Always check the <link linkend="appendix-release-notes">release notes</link> for every
|
||||
release as they may contain upgrade instructions particular to individual versions.
|
||||
For <emphasis>major releases</emphasis>, the &repmgr; PostgreSQL extension will need to be updated
|
||||
to the latest version. Additionally, if the shared library module has been updated (this is sometimes,
|
||||
but not always the case), PostgreSQL itself will need to be restarted on each node.
|
||||
</para>
|
||||
<important>
|
||||
<para>
|
||||
Always check the <link linkend="appendix-release-notes">release notes</link> for every
|
||||
release as they may contain upgrade instructions particular to individual versions.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<para>
|
||||
Note that it may be necessary to restart the PostgreSQL server if the upgrade contains
|
||||
changes to the shared object file used by <application>repmgrd</application>; check the
|
||||
release notes for details.
|
||||
</para>
|
||||
<sect2 id="upgrading-minor-version" xreflabel="Upgrading a minor version release">
|
||||
<indexterm>
|
||||
<primary>upgrading</primary>
|
||||
<secondary>minor release</secondary>
|
||||
</indexterm>
|
||||
<title>Upgrading a minor version release</title>
|
||||
|
||||
<para>
|
||||
The process for installing minor version upgrades is quite straightforward:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
install the new &repmgr; version
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
restart <application>repmgrd</application> on all nodes where it is running
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
Some packaging systems (e.g. <link linkend="packages-debian-ubuntu">Debian/Ubuntu</link>
|
||||
may restart <application>repmgrd</application> as part of the package upgrade process.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Minor version upgrades can be performed in any order on the nodes in the replication
|
||||
cluster.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A PostgreSQL restart is <emphasis>not</emphasis> required for minor version upgrades.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
The same &repmgr; "major version" (e.g. <literal>4.2</literal>) must be
|
||||
installed on all nodes in the replication cluster. While it's possible to have differing
|
||||
&repmgr; "minor versions" (e.g. <literal>4.2.1</literal>) on different nodes,
|
||||
we strongly recommend updating all nodes to the latest minor version.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="upgrading-major-version" xreflabel="Upgrading a major version release">
|
||||
<indexterm>
|
||||
<primary>upgrading</primary>
|
||||
<secondary>major release</secondary>
|
||||
</indexterm>
|
||||
<title>Upgrading a major version release</title>
|
||||
<para>
|
||||
"major version" upgrades need to be planned more carefully, as they may include
|
||||
changes to the &repmgr; metadata (which need to be propagated from the primary to all
|
||||
standbys) and/or changes to the shared object file used by <application>repmgrd</application>
|
||||
(which require a PostgreSQL restart).
|
||||
</para>
|
||||
<para>
|
||||
With this in mind,
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<orderedlist>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
Stop <application>repmgrd</application> (if in use) on all nodes where it is running.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
Disable the <application>repmgrd</application> service on all nodes where it is in use;
|
||||
this is to prevent packages from prematurely restarting <application>repmgrd</application>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
Install the updated package (or compile the updated source) on all nodes.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
If running a <literal>systemd</literal>-based Linux distribution, execute (as <literal>root</literal>,
|
||||
or with appropriate <literal>sudo</literal> permissions):
|
||||
<programlisting>
|
||||
systemctl daemon-reload</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
If the &repmgr; shared library module has been updated (check the <link linkend="appendix-release-notes">release notes</link>!),
|
||||
restart PostgreSQL, then <application>repmgrd</application> (if in use) on each node,
|
||||
The order in which this is applied to individual nodes is not critical,
|
||||
and it's also fine to restart PostgreSQL on all nodes first before starting <application>repmgrd</application>.
|
||||
</simpara>
|
||||
<simpara>
|
||||
Note that if the upgrade requires a PostgreSQL restart, <application>repmgrd</application>
|
||||
will only function correctly once all nodes have been restarted.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
On the primary node, execute
|
||||
<programlisting>
|
||||
ALTER EXTENSION repmgr UPDATE</programlisting>
|
||||
in the database where &repmgr; is installed.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
Reenable the <application>repmgrd</application> service on all nodes where it is in use, and
|
||||
ensure it is running.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</orderedlist>
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
If the &repmgr; upgrade requires a PostgreSQL restart, combine the &repmgr; upgrade
|
||||
with a PostgreSQL minor version upgrade, which will require a restart in any case.
|
||||
New PostgreSQL minor version are usually released every couple of months.
|
||||
</para>
|
||||
</tip>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="upgrading-check-repmgrd" xreflabel="Checking repmgrd status after an upgrade">
|
||||
<indexterm>
|
||||
<primary>upgrading</primary>
|
||||
<secondary>checking repmgrd status</secondary>
|
||||
</indexterm>
|
||||
<title>Checking repmgrd status after an upgrade</title>
|
||||
<para>
|
||||
From <link linkend="release-4.2">repmgr 4.2</link>, once the upgrade is complete, execute the <command><link linkend="repmgr-daemon-status">repmgr daemon status</link></command>
|
||||
command (on any node) to show an overview of the status of <application>repmgrd</application> on all nodes.
|
||||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="upgrading-and-pg-upgrade" xreflabel="pg_upgrade and repmgr">
|
||||
@@ -88,13 +247,20 @@
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
For further details please see the <ulink url="https://www.postgresql.org/docs/current/static/pgupgrade.html">pg_upgrade documentation</ulink>.
|
||||
For further details please see the <ulink url="https://www.postgresql.org/docs/current/pgupgrade.html">pg_upgrade documentation</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
If replication slots are in use, bear in mind these will <emphasis>not</emphasis>
|
||||
be recreated by <application>pg_upgrade</application>. These will need to
|
||||
be recreated manually.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
Use <command><link linkend="repmgr-node-check">repmgr node check</link></command>
|
||||
to determine which replacation slots need to be recreated.
|
||||
</para>
|
||||
</tip>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
<!ENTITY repmgrversion "4.1dev">
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* errcode.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -47,5 +47,7 @@
|
||||
#define ERR_FOLLOW_FAIL 23
|
||||
#define ERR_REJOIN_FAIL 24
|
||||
#define ERR_NODE_STATUS 25
|
||||
#define ERR_REPMGRD_PAUSE 26
|
||||
#define ERR_REPMGRD_SERVICE 27
|
||||
|
||||
#endif /* _ERRCODE_H_ */
|
||||
|
||||
@@ -47,7 +47,7 @@ SELECT repmgr.am_bdr_failover_handler(NULL);
|
||||
SELECT repmgr.get_new_primary();
|
||||
get_new_primary
|
||||
-----------------
|
||||
|
||||
-1
|
||||
(1 row)
|
||||
|
||||
SELECT repmgr.notify_follow_primary(-1);
|
||||
|
||||
4
log.c
4
log.c
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* log.c - Logging methods
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -85,7 +85,7 @@ _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_li
|
||||
|
||||
time(&t);
|
||||
tm = localtime(&t);
|
||||
strftime(buf, 100, "[%Y-%m-%d %H:%M:%S]", tm);
|
||||
strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", tm);
|
||||
fprintf(stderr, "%s [%s] ", buf, level_name);
|
||||
}
|
||||
else
|
||||
|
||||
2
log.h
2
log.h
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* log.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
||||
32
repmgr--4.1--4.2.sql
Normal file
32
repmgr--4.1--4.2.sql
Normal file
@@ -0,0 +1,32 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION get_repmgrd_pid()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_repmgrd_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_repmgrd_pidfile()
|
||||
RETURNS TEXT
|
||||
AS 'MODULE_PATHNAME', 'get_repmgrd_pidfile'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION set_repmgrd_pid(INT, TEXT)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_repmgrd_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_is_running()
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_is_running'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_pause(BOOL)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_pause'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_is_paused()
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
|
||||
LANGUAGE C STRICT;
|
||||
@@ -145,7 +145,6 @@ CREATE FUNCTION unset_bdr_failover_handler()
|
||||
AS 'MODULE_PATHNAME', 'unset_bdr_failover_handler'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
|
||||
CREATE VIEW repmgr.replication_status AS
|
||||
SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
|
||||
n.type AS node_type, n.active, last_monitor_time,
|
||||
|
||||
17
repmgr--4.2--4.3.sql
Normal file
17
repmgr--4.2--4.3.sql
Normal file
@@ -0,0 +1,17 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION set_upstream_last_seen()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_upstream_last_seen()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_wal_receiver_pid()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
|
||||
LANGUAGE C STRICT;
|
||||
197
repmgr--4.2.sql
Normal file
197
repmgr--4.2.sql
Normal file
@@ -0,0 +1,197 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
|
||||
|
||||
CREATE TABLE repmgr.nodes (
|
||||
node_id INTEGER PRIMARY KEY,
|
||||
upstream_node_id INTEGER NULL REFERENCES nodes (node_id) DEFERRABLE,
|
||||
active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
node_name TEXT NOT NULL,
|
||||
type TEXT NOT NULL CHECK (type IN('primary','standby','witness','bdr')),
|
||||
location TEXT NOT NULL DEFAULT 'default',
|
||||
priority INT NOT NULL DEFAULT 100,
|
||||
conninfo TEXT NOT NULL,
|
||||
repluser VARCHAR(63) NOT NULL,
|
||||
slot_name TEXT NULL,
|
||||
config_file TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE repmgr.events (
|
||||
node_id INTEGER NOT NULL,
|
||||
event TEXT NOT NULL,
|
||||
successful BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
event_timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
details TEXT NULL
|
||||
);
|
||||
|
||||
DO $repmgr$
|
||||
DECLARE
|
||||
DECLARE server_version_num INT;
|
||||
BEGIN
|
||||
SELECT setting
|
||||
FROM pg_catalog.pg_settings
|
||||
WHERE name = 'server_version_num'
|
||||
INTO server_version_num;
|
||||
IF server_version_num >= 90400 THEN
|
||||
EXECUTE $repmgr_func$
|
||||
CREATE TABLE repmgr.monitoring_history (
|
||||
primary_node_id INTEGER NOT NULL,
|
||||
standby_node_id INTEGER NOT NULL,
|
||||
last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL,
|
||||
last_apply_time TIMESTAMP WITH TIME ZONE,
|
||||
last_wal_primary_location PG_LSN NOT NULL,
|
||||
last_wal_standby_location PG_LSN,
|
||||
replication_lag BIGINT NOT NULL,
|
||||
apply_lag BIGINT NOT NULL
|
||||
)
|
||||
$repmgr_func$;
|
||||
ELSE
|
||||
EXECUTE $repmgr_func$
|
||||
CREATE TABLE repmgr.monitoring_history (
|
||||
primary_node_id INTEGER NOT NULL,
|
||||
standby_node_id INTEGER NOT NULL,
|
||||
last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL,
|
||||
last_apply_time TIMESTAMP WITH TIME ZONE,
|
||||
last_wal_primary_location TEXT NOT NULL,
|
||||
last_wal_standby_location TEXT,
|
||||
replication_lag BIGINT NOT NULL,
|
||||
apply_lag BIGINT NOT NULL
|
||||
)
|
||||
$repmgr_func$;
|
||||
END IF;
|
||||
END$repmgr$;
|
||||
|
||||
|
||||
|
||||
CREATE INDEX idx_monitoring_history_time
|
||||
ON repmgr.monitoring_history (last_monitor_time, standby_node_id);
|
||||
|
||||
CREATE VIEW repmgr.show_nodes AS
|
||||
SELECT n.node_id,
|
||||
n.node_name,
|
||||
n.active,
|
||||
n.upstream_node_id,
|
||||
un.node_name AS upstream_node_name,
|
||||
n.type,
|
||||
n.priority,
|
||||
n.conninfo
|
||||
FROM repmgr.nodes n
|
||||
LEFT JOIN repmgr.nodes un
|
||||
ON un.node_id = n.upstream_node_id;
|
||||
|
||||
|
||||
/* XXX update upgrade scripts! */
|
||||
CREATE TABLE repmgr.voting_term (
|
||||
term INT NOT NULL
|
||||
);
|
||||
|
||||
CREATE UNIQUE INDEX voting_term_restrict
|
||||
ON repmgr.voting_term ((TRUE));
|
||||
|
||||
CREATE RULE voting_term_delete AS
|
||||
ON DELETE TO repmgr.voting_term
|
||||
DO INSTEAD NOTHING;
|
||||
|
||||
|
||||
/* ================= */
|
||||
/* repmgrd functions */
|
||||
/* ================= */
|
||||
|
||||
/* monitoring functions */
|
||||
|
||||
CREATE FUNCTION set_local_node_id(INT)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_local_node_id'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_local_node_id()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_local_node_id'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION standby_set_last_updated()
|
||||
RETURNS TIMESTAMP WITH TIME ZONE
|
||||
AS 'MODULE_PATHNAME', 'standby_set_last_updated'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION standby_get_last_updated()
|
||||
RETURNS TIMESTAMP WITH TIME ZONE
|
||||
AS 'MODULE_PATHNAME', 'standby_get_last_updated'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
/* failover functions */
|
||||
|
||||
CREATE FUNCTION notify_follow_primary(INT)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'notify_follow_primary'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_new_primary()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_new_primary'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION reset_voting_status()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'reset_voting_status'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION am_bdr_failover_handler(INT)
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'am_bdr_failover_handler'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION unset_bdr_failover_handler()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'unset_bdr_failover_handler'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_repmgrd_pid()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_repmgrd_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_repmgrd_pidfile()
|
||||
RETURNS TEXT
|
||||
AS 'MODULE_PATHNAME', 'get_repmgrd_pidfile'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION set_repmgrd_pid(INT, TEXT)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_repmgrd_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_is_running()
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_is_running'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_pause(BOOL)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_pause'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_is_paused()
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
|
||||
CREATE VIEW repmgr.replication_status AS
|
||||
SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
|
||||
n.type AS node_type, n.active, last_monitor_time,
|
||||
CASE WHEN n.type='standby' THEN m.last_wal_primary_location ELSE NULL END AS last_wal_primary_location,
|
||||
m.last_wal_standby_location,
|
||||
CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.replication_lag) ELSE NULL END AS replication_lag,
|
||||
CASE WHEN n.type='standby' THEN
|
||||
CASE WHEN replication_lag > 0 THEN age(now(), m.last_apply_time) ELSE '0'::INTERVAL END
|
||||
ELSE NULL
|
||||
END AS replication_time_lag,
|
||||
CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.apply_lag) ELSE NULL END AS apply_lag,
|
||||
AGE(NOW(), CASE WHEN pg_catalog.pg_is_in_recovery() THEN repmgr.standby_get_last_updated() ELSE m.last_monitor_time END) AS communication_time_lag
|
||||
FROM repmgr.monitoring_history m
|
||||
JOIN repmgr.nodes n ON m.standby_node_id = n.node_id
|
||||
WHERE (m.standby_node_id, m.last_monitor_time) IN (
|
||||
SELECT m1.standby_node_id, MAX(m1.last_monitor_time)
|
||||
FROM repmgr.monitoring_history m1 GROUP BY 1
|
||||
);
|
||||
|
||||
217
repmgr--4.3.sql
Normal file
217
repmgr--4.3.sql
Normal file
@@ -0,0 +1,217 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
|
||||
|
||||
CREATE TABLE repmgr.nodes (
|
||||
node_id INTEGER PRIMARY KEY,
|
||||
upstream_node_id INTEGER NULL REFERENCES nodes (node_id) DEFERRABLE,
|
||||
active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
node_name TEXT NOT NULL,
|
||||
type TEXT NOT NULL CHECK (type IN('primary','standby','witness','bdr')),
|
||||
location TEXT NOT NULL DEFAULT 'default',
|
||||
priority INT NOT NULL DEFAULT 100,
|
||||
conninfo TEXT NOT NULL,
|
||||
repluser VARCHAR(63) NOT NULL,
|
||||
slot_name TEXT NULL,
|
||||
config_file TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE repmgr.events (
|
||||
node_id INTEGER NOT NULL,
|
||||
event TEXT NOT NULL,
|
||||
successful BOOLEAN NOT NULL DEFAULT TRUE,
|
||||
event_timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
details TEXT NULL
|
||||
);
|
||||
|
||||
DO $repmgr$
|
||||
DECLARE
|
||||
DECLARE server_version_num INT;
|
||||
BEGIN
|
||||
SELECT setting
|
||||
FROM pg_catalog.pg_settings
|
||||
WHERE name = 'server_version_num'
|
||||
INTO server_version_num;
|
||||
IF server_version_num >= 90400 THEN
|
||||
EXECUTE $repmgr_func$
|
||||
CREATE TABLE repmgr.monitoring_history (
|
||||
primary_node_id INTEGER NOT NULL,
|
||||
standby_node_id INTEGER NOT NULL,
|
||||
last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL,
|
||||
last_apply_time TIMESTAMP WITH TIME ZONE,
|
||||
last_wal_primary_location PG_LSN NOT NULL,
|
||||
last_wal_standby_location PG_LSN,
|
||||
replication_lag BIGINT NOT NULL,
|
||||
apply_lag BIGINT NOT NULL
|
||||
)
|
||||
$repmgr_func$;
|
||||
ELSE
|
||||
EXECUTE $repmgr_func$
|
||||
CREATE TABLE repmgr.monitoring_history (
|
||||
primary_node_id INTEGER NOT NULL,
|
||||
standby_node_id INTEGER NOT NULL,
|
||||
last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL,
|
||||
last_apply_time TIMESTAMP WITH TIME ZONE,
|
||||
last_wal_primary_location TEXT NOT NULL,
|
||||
last_wal_standby_location TEXT,
|
||||
replication_lag BIGINT NOT NULL,
|
||||
apply_lag BIGINT NOT NULL
|
||||
)
|
||||
$repmgr_func$;
|
||||
END IF;
|
||||
END$repmgr$;
|
||||
|
||||
|
||||
|
||||
CREATE INDEX idx_monitoring_history_time
|
||||
ON repmgr.monitoring_history (last_monitor_time, standby_node_id);
|
||||
|
||||
CREATE VIEW repmgr.show_nodes AS
|
||||
SELECT n.node_id,
|
||||
n.node_name,
|
||||
n.active,
|
||||
n.upstream_node_id,
|
||||
un.node_name AS upstream_node_name,
|
||||
n.type,
|
||||
n.priority,
|
||||
n.conninfo
|
||||
FROM repmgr.nodes n
|
||||
LEFT JOIN repmgr.nodes un
|
||||
ON un.node_id = n.upstream_node_id;
|
||||
|
||||
|
||||
/* XXX update upgrade scripts! */
|
||||
CREATE TABLE repmgr.voting_term (
|
||||
term INT NOT NULL
|
||||
);
|
||||
|
||||
CREATE UNIQUE INDEX voting_term_restrict
|
||||
ON repmgr.voting_term ((TRUE));
|
||||
|
||||
CREATE RULE voting_term_delete AS
|
||||
ON DELETE TO repmgr.voting_term
|
||||
DO INSTEAD NOTHING;
|
||||
|
||||
|
||||
/* ================= */
|
||||
/* repmgrd functions */
|
||||
/* ================= */
|
||||
|
||||
/* monitoring functions */
|
||||
|
||||
CREATE FUNCTION set_local_node_id(INT)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_local_node_id'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_local_node_id()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_local_node_id'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION standby_set_last_updated()
|
||||
RETURNS TIMESTAMP WITH TIME ZONE
|
||||
AS 'MODULE_PATHNAME', 'standby_set_last_updated'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION standby_get_last_updated()
|
||||
RETURNS TIMESTAMP WITH TIME ZONE
|
||||
AS 'MODULE_PATHNAME', 'standby_get_last_updated'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION set_upstream_last_seen()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_upstream_last_seen()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_upstream_last_seen'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
|
||||
/* failover functions */
|
||||
|
||||
CREATE FUNCTION notify_follow_primary(INT)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'notify_follow_primary'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_new_primary()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_new_primary'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION reset_voting_status()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'reset_voting_status'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION am_bdr_failover_handler(INT)
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'am_bdr_failover_handler'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION unset_bdr_failover_handler()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'unset_bdr_failover_handler'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_repmgrd_pid()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_repmgrd_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_repmgrd_pidfile()
|
||||
RETURNS TEXT
|
||||
AS 'MODULE_PATHNAME', 'get_repmgrd_pidfile'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION set_repmgrd_pid(INT, TEXT)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'set_repmgrd_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_is_running()
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_is_running'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_pause(BOOL)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_pause'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION repmgrd_is_paused()
|
||||
RETURNS BOOL
|
||||
AS 'MODULE_PATHNAME', 'repmgrd_is_paused'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
CREATE FUNCTION get_wal_receiver_pid()
|
||||
RETURNS INT
|
||||
AS 'MODULE_PATHNAME', 'get_wal_receiver_pid'
|
||||
LANGUAGE C STRICT;
|
||||
|
||||
|
||||
|
||||
|
||||
/* views */
|
||||
|
||||
CREATE VIEW repmgr.replication_status AS
|
||||
SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name,
|
||||
n.type AS node_type, n.active, last_monitor_time,
|
||||
CASE WHEN n.type='standby' THEN m.last_wal_primary_location ELSE NULL END AS last_wal_primary_location,
|
||||
m.last_wal_standby_location,
|
||||
CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.replication_lag) ELSE NULL END AS replication_lag,
|
||||
CASE WHEN n.type='standby' THEN
|
||||
CASE WHEN replication_lag > 0 THEN age(now(), m.last_apply_time) ELSE '0'::INTERVAL END
|
||||
ELSE NULL
|
||||
END AS replication_time_lag,
|
||||
CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.apply_lag) ELSE NULL END AS apply_lag,
|
||||
AGE(NOW(), CASE WHEN pg_catalog.pg_is_in_recovery() THEN repmgr.standby_get_last_updated() ELSE m.last_monitor_time END) AS communication_time_lag
|
||||
FROM repmgr.monitoring_history m
|
||||
JOIN repmgr.nodes n ON m.standby_node_id = n.node_id
|
||||
WHERE (m.standby_node_id, m.last_monitor_time) IN (
|
||||
SELECT m1.standby_node_id, MAX(m1.last_monitor_time)
|
||||
FROM repmgr.monitoring_history m1 GROUP BY 1
|
||||
);
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Implements BDR-related actions for the repmgr command line utility
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -126,7 +126,7 @@ do_bdr_register(void)
|
||||
}
|
||||
|
||||
/* check whether repmgr extension exists, and there are no non-BDR nodes registered */
|
||||
extension_status = get_repmgr_extension_status(conn);
|
||||
extension_status = get_repmgr_extension_status(conn, NULL);
|
||||
|
||||
if (extension_status == REPMGR_UNKNOWN)
|
||||
{
|
||||
@@ -191,7 +191,7 @@ do_bdr_register(void)
|
||||
{
|
||||
NodeInfoList local_node_records = T_NODE_INFO_LIST_INITIALIZER;
|
||||
|
||||
get_all_node_records(conn, &local_node_records);
|
||||
(void) get_all_node_records(conn, &local_node_records);
|
||||
|
||||
if (local_node_records.node_count == 0)
|
||||
{
|
||||
@@ -216,7 +216,7 @@ do_bdr_register(void)
|
||||
ExtensionStatus other_node_extension_status = REPMGR_UNKNOWN;
|
||||
|
||||
/* skip the local node */
|
||||
if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, MAXLEN) == 0)
|
||||
if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, sizeof(node_info.node_name)) == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@@ -232,14 +232,14 @@ do_bdr_register(void)
|
||||
}
|
||||
|
||||
/* check repmgr schema exists, skip if not */
|
||||
other_node_extension_status = get_repmgr_extension_status(bdr_node_conn);
|
||||
other_node_extension_status = get_repmgr_extension_status(bdr_node_conn, NULL);
|
||||
|
||||
if (other_node_extension_status != REPMGR_INSTALLED)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
get_all_node_records(bdr_node_conn, &existing_nodes);
|
||||
(void) get_all_node_records(bdr_node_conn, &existing_nodes);
|
||||
|
||||
for (cell = existing_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
@@ -304,9 +304,9 @@ do_bdr_register(void)
|
||||
node_info.active = true;
|
||||
node_info.priority = config_file_options.priority;
|
||||
|
||||
strncpy(node_info.node_name, config_file_options.node_name, MAXLEN);
|
||||
strncpy(node_info.location, config_file_options.location, MAXLEN);
|
||||
strncpy(node_info.conninfo, config_file_options.conninfo, MAXLEN);
|
||||
strncpy(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name));
|
||||
strncpy(node_info.location, config_file_options.location, sizeof(node_info.location));
|
||||
strncpy(node_info.conninfo, config_file_options.conninfo, sizeof(node_info.conninfo));
|
||||
|
||||
if (record_status == RECORD_FOUND)
|
||||
{
|
||||
@@ -330,7 +330,7 @@ do_bdr_register(void)
|
||||
* name set when the node was registered.
|
||||
*/
|
||||
|
||||
if (strncmp(node_info.node_name, config_file_options.node_name, MAXLEN) != 0)
|
||||
if (strncmp(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name)) != 0)
|
||||
{
|
||||
log_error(_("a record for node %i is already registered with node_name \"%s\""),
|
||||
config_file_options.node_id, node_info.node_name);
|
||||
@@ -442,7 +442,7 @@ do_bdr_unregister(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
extension_status = get_repmgr_extension_status(conn);
|
||||
extension_status = get_repmgr_extension_status(conn, NULL);
|
||||
if (extension_status != REPMGR_INSTALLED)
|
||||
{
|
||||
log_error(_("repmgr is not installed on database \"%s\""), dbname);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr-action-bdr.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Implements cluster information actions for the repmgr command line utility
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -24,8 +24,7 @@
|
||||
#include "repmgr-client-global.h"
|
||||
#include "repmgr-action-cluster.h"
|
||||
|
||||
#define SHOW_HEADER_COUNT 7
|
||||
|
||||
#define SHOW_HEADER_COUNT 8
|
||||
|
||||
typedef enum
|
||||
{
|
||||
@@ -35,6 +34,7 @@ typedef enum
|
||||
SHOW_STATUS,
|
||||
SHOW_UPSTREAM_NAME,
|
||||
SHOW_LOCATION,
|
||||
SHOW_PRIORITY,
|
||||
SHOW_CONNINFO
|
||||
} ShowHeader;
|
||||
|
||||
@@ -51,21 +51,13 @@ typedef enum
|
||||
} EventHeader;
|
||||
|
||||
|
||||
|
||||
struct ColHeader
|
||||
{
|
||||
char title[MAXLEN];
|
||||
int max_length;
|
||||
int cur_length;
|
||||
};
|
||||
|
||||
struct ColHeader headers_show[SHOW_HEADER_COUNT];
|
||||
struct ColHeader headers_event[EVENT_HEADER_COUNT];
|
||||
|
||||
|
||||
|
||||
static int build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length);
|
||||
static int build_cluster_crosscheck(t_node_status_cube ***cube_dest, int *name_length);
|
||||
static int build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, ItemList *warnings, int *error_code);
|
||||
static int build_cluster_crosscheck(t_node_status_cube ***cube_dest, int *name_length, ItemList *warnings, int *error_code);
|
||||
static void cube_set_node_status(t_node_status_cube **cube, int n, int node_id, int matrix_node_id, int connection_node_id, int connection_status);
|
||||
|
||||
/*
|
||||
@@ -84,6 +76,7 @@ do_cluster_show(void)
|
||||
ItemList warnings = {NULL, NULL};
|
||||
bool success = false;
|
||||
bool error_found = false;
|
||||
bool connection_error_found = false;
|
||||
|
||||
/* Connect to local database to obtain cluster connection data */
|
||||
log_verbose(LOG_INFO, _("connecting to database"));
|
||||
@@ -110,12 +103,19 @@ do_cluster_show(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* Initialize column headers */
|
||||
strncpy(headers_show[SHOW_ID].title, _("ID"), MAXLEN);
|
||||
strncpy(headers_show[SHOW_NAME].title, _("Name"), MAXLEN);
|
||||
strncpy(headers_show[SHOW_ROLE].title, _("Role"), MAXLEN);
|
||||
strncpy(headers_show[SHOW_STATUS].title, _("Status"), MAXLEN);
|
||||
strncpy(headers_show[SHOW_UPSTREAM_NAME].title, _("Upstream"), MAXLEN);
|
||||
strncpy(headers_show[SHOW_LOCATION].title, _("Location"), MAXLEN);
|
||||
|
||||
if (runtime_options.compact == true)
|
||||
strncpy(headers_show[SHOW_PRIORITY].title, _("Prio."), MAXLEN);
|
||||
else
|
||||
strncpy(headers_show[SHOW_PRIORITY].title, _("Priority"), MAXLEN);
|
||||
|
||||
strncpy(headers_show[SHOW_CONNINFO].title, _("Connection string"), MAXLEN);
|
||||
|
||||
/*
|
||||
@@ -125,12 +125,26 @@ do_cluster_show(void)
|
||||
|
||||
for (i = 0; i < SHOW_HEADER_COUNT; i++)
|
||||
{
|
||||
headers_show[i].max_length = strlen(headers_show[i].title);
|
||||
headers_show[i].display = true;
|
||||
|
||||
if (runtime_options.compact == true)
|
||||
{
|
||||
if (i == SHOW_CONNINFO)
|
||||
{
|
||||
headers_show[i].display = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (headers_show[i].display == true)
|
||||
{
|
||||
headers_show[i].max_length = strlen(headers_show[i].title);
|
||||
}
|
||||
}
|
||||
|
||||
for (cell = nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
PQExpBufferData details;
|
||||
PQExpBufferData buf;
|
||||
|
||||
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||
|
||||
@@ -141,14 +155,31 @@ do_cluster_show(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
char error[MAXLEN];
|
||||
/* check if node is reachable, but just not letting us in */
|
||||
if (is_server_available_quiet(cell->node_info->conninfo))
|
||||
cell->node_info->node_status = NODE_STATUS_REJECTED;
|
||||
else
|
||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||
|
||||
strncpy(error, PQerrorMessage(cell->node_info->conn), MAXLEN);
|
||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||
cell->node_info->recovery_type = RECTYPE_UNKNOWN;
|
||||
item_list_append_format(&warnings,
|
||||
"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
|
||||
cell->node_info->node_name, cell->node_info->node_id, trim(error));
|
||||
|
||||
connection_error_found = true;
|
||||
|
||||
if (runtime_options.verbose)
|
||||
{
|
||||
char error[MAXLEN];
|
||||
|
||||
strncpy(error, PQerrorMessage(cell->node_info->conn), MAXLEN);
|
||||
item_list_append_format(&warnings,
|
||||
"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
|
||||
cell->node_info->node_name, cell->node_info->node_id, trim(error));
|
||||
}
|
||||
else
|
||||
{
|
||||
item_list_append_format(&warnings,
|
||||
"unable to connect to node \"%s\" (ID: %i)",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
|
||||
initPQExpBuffer(&details);
|
||||
@@ -170,16 +201,16 @@ do_cluster_show(void)
|
||||
switch (cell->node_info->recovery_type)
|
||||
{
|
||||
case RECTYPE_PRIMARY:
|
||||
appendPQExpBuffer(&details, "* running");
|
||||
appendPQExpBufferStr(&details, "* running");
|
||||
break;
|
||||
case RECTYPE_STANDBY:
|
||||
appendPQExpBuffer(&details, "! running as standby");
|
||||
appendPQExpBufferStr(&details, "! running as standby");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is registered as primary but running as standby",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
break;
|
||||
case RECTYPE_UNKNOWN:
|
||||
appendPQExpBuffer(&details, "! unknown");
|
||||
appendPQExpBufferStr(&details, "! unknown");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) has unknown replication status",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
@@ -190,27 +221,40 @@ do_cluster_show(void)
|
||||
{
|
||||
if (cell->node_info->recovery_type == RECTYPE_PRIMARY)
|
||||
{
|
||||
appendPQExpBuffer(&details, "! running");
|
||||
appendPQExpBufferStr(&details, "! running");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is running but the repmgr node record is inactive",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&details, "! running as standby");
|
||||
appendPQExpBufferStr(&details, "! running as standby");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is registered as an inactive primary but running as standby",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* node is up but cannot connect */
|
||||
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBufferStr(&details, "? running");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&details, "! running");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
/* node is unreachable */
|
||||
else
|
||||
{
|
||||
/* node is unreachable but marked active */
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBuffer(&details, "? unreachable");
|
||||
appendPQExpBufferStr(&details, "? unreachable");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is registered as an active primary but is unreachable",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
@@ -218,7 +262,7 @@ do_cluster_show(void)
|
||||
/* node is unreachable and marked as inactive */
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&details, "- failed");
|
||||
appendPQExpBufferStr(&details, "- failed");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
@@ -234,16 +278,16 @@ do_cluster_show(void)
|
||||
switch (cell->node_info->recovery_type)
|
||||
{
|
||||
case RECTYPE_STANDBY:
|
||||
appendPQExpBuffer(&details, " running");
|
||||
appendPQExpBufferStr(&details, " running");
|
||||
break;
|
||||
case RECTYPE_PRIMARY:
|
||||
appendPQExpBuffer(&details, "! running as primary");
|
||||
appendPQExpBufferStr(&details, "! running as primary");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is registered as standby but running as primary",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
break;
|
||||
case RECTYPE_UNKNOWN:
|
||||
appendPQExpBuffer(&details, "! unknown");
|
||||
appendPQExpBufferStr(&details, "! unknown");
|
||||
item_list_append_format(
|
||||
&warnings,
|
||||
"node \"%s\" (ID: %i) has unknown replication status",
|
||||
@@ -255,19 +299,40 @@ do_cluster_show(void)
|
||||
{
|
||||
if (cell->node_info->recovery_type == RECTYPE_STANDBY)
|
||||
{
|
||||
appendPQExpBuffer(&details, "! running");
|
||||
appendPQExpBufferStr(&details, "! running");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is running but the repmgr node record is inactive",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&details, "! running as primary");
|
||||
appendPQExpBufferStr(&details, "! running as primary");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is running as primary but the repmgr node record is inactive",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/* warn about issue with paused WAL replay */
|
||||
if (is_wal_replay_paused(cell->node_info->conn, true))
|
||||
{
|
||||
item_list_append_format(&warnings,
|
||||
_("WAL replay is paused on node \"%s\" (ID: %i) with WAL replay pending; this node cannot be manually promoted until WAL replay is resumed"),
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
/* node is up but cannot connect */
|
||||
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBufferStr(&details, "? running");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&details, "! running");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
/* node is unreachable */
|
||||
else
|
||||
@@ -275,18 +340,19 @@ do_cluster_show(void)
|
||||
/* node is unreachable but marked active */
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBuffer(&details, "? unreachable");
|
||||
appendPQExpBufferStr(&details, "? unreachable");
|
||||
item_list_append_format(&warnings,
|
||||
"node \"%s\" (ID: %i) is registered as an active standby but is unreachable",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&details, "- failed");
|
||||
error_found = true;
|
||||
appendPQExpBufferStr(&details, "- failed");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
case WITNESS:
|
||||
case BDR:
|
||||
@@ -296,24 +362,38 @@ do_cluster_show(void)
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBuffer(&details, "* running");
|
||||
appendPQExpBufferStr(&details, "* running");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&details, "! running");
|
||||
appendPQExpBufferStr(&details, "! running");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
/* node is up but cannot connect */
|
||||
else if (cell->node_info->node_status == NODE_STATUS_REJECTED)
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBufferStr(&details, "? rejected");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBufferStr(&details, "! failed");
|
||||
error_found = true;
|
||||
}
|
||||
|
||||
}
|
||||
/* node is unreachable */
|
||||
else
|
||||
{
|
||||
if (cell->node_info->active == true)
|
||||
{
|
||||
appendPQExpBuffer(&details, "? unreachable");
|
||||
appendPQExpBufferStr(&details, "? unreachable");
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&details, "- failed");
|
||||
appendPQExpBufferStr(&details, "- failed");
|
||||
error_found = true;
|
||||
}
|
||||
}
|
||||
@@ -322,7 +402,7 @@ do_cluster_show(void)
|
||||
case UNKNOWN:
|
||||
{
|
||||
/* this should never happen */
|
||||
appendPQExpBuffer(&details, "? unknown node type");
|
||||
appendPQExpBufferStr(&details, "? unknown node type");
|
||||
error_found = true;
|
||||
}
|
||||
break;
|
||||
@@ -334,15 +414,35 @@ do_cluster_show(void)
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
|
||||
initPQExpBuffer(&buf);
|
||||
appendPQExpBuffer(&buf, "%i", cell->node_info->node_id);
|
||||
headers_show[SHOW_ID].cur_length = strlen(buf.data);
|
||||
termPQExpBuffer(&buf);
|
||||
|
||||
headers_show[SHOW_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type));
|
||||
headers_show[SHOW_NAME].cur_length = strlen(cell->node_info->node_name);
|
||||
headers_show[SHOW_STATUS].cur_length = strlen(cell->node_info->details);
|
||||
headers_show[SHOW_UPSTREAM_NAME].cur_length = strlen(cell->node_info->upstream_node_name);
|
||||
|
||||
initPQExpBuffer(&buf);
|
||||
appendPQExpBuffer(&buf, "%i", cell->node_info->priority);
|
||||
headers_show[SHOW_PRIORITY].cur_length = strlen(buf.data);
|
||||
termPQExpBuffer(&buf);
|
||||
|
||||
headers_show[SHOW_LOCATION].cur_length = strlen(cell->node_info->location);
|
||||
|
||||
|
||||
|
||||
headers_show[SHOW_CONNINFO].cur_length = strlen(cell->node_info->conninfo);
|
||||
|
||||
for (i = 0; i < SHOW_HEADER_COUNT; i++)
|
||||
{
|
||||
if (runtime_options.compact == true)
|
||||
{
|
||||
if (headers_show[i].display == false)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (headers_show[i].cur_length > headers_show[i].max_length)
|
||||
{
|
||||
headers_show[i].max_length = headers_show[i].cur_length;
|
||||
@@ -351,36 +451,10 @@ do_cluster_show(void)
|
||||
|
||||
}
|
||||
|
||||
/* Print column header row (text mode only) */
|
||||
if (runtime_options.output_mode == OM_TEXT)
|
||||
{
|
||||
for (i = 0; i < SHOW_HEADER_COUNT; i++)
|
||||
{
|
||||
if (i == 0)
|
||||
printf(" ");
|
||||
else
|
||||
printf(" | ");
|
||||
|
||||
printf("%-*s",
|
||||
headers_show[i].max_length,
|
||||
headers_show[i].title);
|
||||
}
|
||||
printf("\n");
|
||||
printf("-");
|
||||
|
||||
for (i = 0; i < SHOW_HEADER_COUNT; i++)
|
||||
{
|
||||
int j;
|
||||
|
||||
for (j = 0; j < headers_show[i].max_length; j++)
|
||||
printf("-");
|
||||
|
||||
if (i < (SHOW_HEADER_COUNT - 1))
|
||||
printf("-+-");
|
||||
else
|
||||
printf("-");
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
print_status_header(SHOW_HEADER_COUNT, headers_show);
|
||||
}
|
||||
|
||||
for (cell = nodes.head; cell; cell = cell->next)
|
||||
@@ -420,7 +494,14 @@ do_cluster_show(void)
|
||||
printf("| %-*s ", headers_show[SHOW_STATUS].max_length, cell->node_info->details);
|
||||
printf("| %-*s ", headers_show[SHOW_UPSTREAM_NAME].max_length, cell->node_info->upstream_node_name);
|
||||
printf("| %-*s ", headers_show[SHOW_LOCATION].max_length, cell->node_info->location);
|
||||
printf("| %-*s\n", headers_show[SHOW_CONNINFO].max_length, cell->node_info->conninfo);
|
||||
printf("| %-*i ", headers_show[SHOW_PRIORITY].max_length, cell->node_info->priority);
|
||||
|
||||
if (headers_show[SHOW_CONNINFO].display == true)
|
||||
{
|
||||
printf("| %-*s", headers_show[SHOW_CONNINFO].max_length, cell->node_info->conninfo);
|
||||
}
|
||||
|
||||
puts("");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -437,6 +518,11 @@ do_cluster_show(void)
|
||||
{
|
||||
printf(_(" - %s\n"), cell->string);
|
||||
}
|
||||
|
||||
if (runtime_options.verbose == false && connection_error_found == true)
|
||||
{
|
||||
log_hint(_("execute with --verbose option to see connection error messages"));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -619,9 +705,12 @@ do_cluster_crosscheck(void)
|
||||
|
||||
t_node_status_cube **cube;
|
||||
|
||||
bool error_found = false;
|
||||
bool connection_error_found = false;
|
||||
int error_code = SUCCESS;
|
||||
ItemList warnings = {NULL, NULL};
|
||||
|
||||
n = build_cluster_crosscheck(&cube, &name_length, &warnings, &error_code);
|
||||
|
||||
n = build_cluster_crosscheck(&cube, &name_length);
|
||||
if (runtime_options.output_mode == OM_CSV)
|
||||
{
|
||||
for (i = 0; i < n; i++)
|
||||
@@ -643,6 +732,11 @@ do_cluster_crosscheck(void)
|
||||
cube[i]->node_id,
|
||||
cube[j]->node_id,
|
||||
max_node_status);
|
||||
|
||||
if (max_node_status == -1)
|
||||
{
|
||||
connection_error_found = true;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -700,16 +794,16 @@ do_cluster_crosscheck(void)
|
||||
{
|
||||
case -2:
|
||||
c = '?';
|
||||
error_found = true;
|
||||
break;
|
||||
case -1:
|
||||
c = 'x';
|
||||
error_found = true;
|
||||
connection_error_found = true;
|
||||
break;
|
||||
case 0:
|
||||
c = '*';
|
||||
break;
|
||||
default:
|
||||
log_error("unexpected node status value %i", max_node_status);
|
||||
exit(ERR_INTERNAL);
|
||||
}
|
||||
|
||||
@@ -718,6 +812,13 @@ do_cluster_crosscheck(void)
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if (warnings.head != NULL && runtime_options.terse == false)
|
||||
{
|
||||
log_warning(_("following problems detected:"));
|
||||
print_item_list(&warnings);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* clean up allocated cube array */
|
||||
@@ -744,13 +845,23 @@ do_cluster_crosscheck(void)
|
||||
free(cube);
|
||||
}
|
||||
|
||||
if (error_found == true)
|
||||
/* errors detected by build_cluster_crosscheck() have priority */
|
||||
if (connection_error_found == true)
|
||||
{
|
||||
exit(ERR_NODE_STATUS);
|
||||
error_code = ERR_NODE_STATUS;
|
||||
}
|
||||
|
||||
exit(error_code);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* CLUSTER MATRIX
|
||||
*
|
||||
* Parameters:
|
||||
* --csv
|
||||
*/
|
||||
void
|
||||
do_cluster_matrix()
|
||||
{
|
||||
@@ -763,18 +874,30 @@ do_cluster_matrix()
|
||||
|
||||
t_node_matrix_rec **matrix_rec_list;
|
||||
|
||||
bool error_found = false;
|
||||
bool connection_error_found = false;
|
||||
int error_code = SUCCESS;
|
||||
ItemList warnings = {NULL, NULL};
|
||||
|
||||
n = build_cluster_matrix(&matrix_rec_list, &name_length);
|
||||
n = build_cluster_matrix(&matrix_rec_list, &name_length, &warnings, &error_code);
|
||||
|
||||
if (runtime_options.output_mode == OM_CSV)
|
||||
{
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
for (j = 0; j < n; j++)
|
||||
{
|
||||
printf("%d,%d,%d\n",
|
||||
matrix_rec_list[i]->node_id,
|
||||
matrix_rec_list[i]->node_status_list[j]->node_id,
|
||||
matrix_rec_list[i]->node_status_list[j]->node_status);
|
||||
|
||||
if (matrix_rec_list[i]->node_status_list[j]->node_status == -2
|
||||
|| matrix_rec_list[i]->node_status_list[j]->node_status == -1)
|
||||
{
|
||||
connection_error_found = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -803,16 +926,16 @@ do_cluster_matrix()
|
||||
{
|
||||
case -2:
|
||||
c = '?';
|
||||
error_found = true;
|
||||
break;
|
||||
case -1:
|
||||
c = 'x';
|
||||
error_found = true;
|
||||
connection_error_found = true;
|
||||
break;
|
||||
case 0:
|
||||
c = '*';
|
||||
break;
|
||||
default:
|
||||
log_error("unexpected node status value %i", matrix_rec_list[i]->node_status_list[j]->node_status);
|
||||
exit(ERR_INTERNAL);
|
||||
}
|
||||
|
||||
@@ -820,6 +943,13 @@ do_cluster_matrix()
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
if (warnings.head != NULL && runtime_options.terse == false)
|
||||
{
|
||||
log_warning(_("following problems detected:"));
|
||||
print_item_list(&warnings);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for (i = 0; i < n; i++)
|
||||
@@ -834,10 +964,13 @@ do_cluster_matrix()
|
||||
|
||||
free(matrix_rec_list);
|
||||
|
||||
if (error_found == true)
|
||||
/* actual database connection errors have priority */
|
||||
if (connection_error_found == true)
|
||||
{
|
||||
exit(ERR_NODE_STATUS);
|
||||
error_code = ERR_NODE_STATUS;
|
||||
}
|
||||
|
||||
exit(error_code);
|
||||
}
|
||||
|
||||
|
||||
@@ -866,7 +999,7 @@ matrix_set_node_status(t_node_matrix_rec **matrix_rec_list, int n, int node_id,
|
||||
|
||||
|
||||
static int
|
||||
build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, ItemList *warnings, int *error_code)
|
||||
{
|
||||
PGconn *conn = NULL;
|
||||
int i = 0,
|
||||
@@ -895,7 +1028,12 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
local_node_id = runtime_options.node_id;
|
||||
}
|
||||
|
||||
get_all_node_records(conn, &nodes);
|
||||
if (get_all_node_records(conn, &nodes) == false)
|
||||
{
|
||||
/* get_all_node_records() will display the error */
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
PQfinish(conn);
|
||||
conn = NULL;
|
||||
@@ -909,7 +1047,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
/*
|
||||
* Allocate an empty matrix record list
|
||||
*
|
||||
* -2 == NULL ? -1 == Error x 0 == OK *
|
||||
* -2 == NULL ? -1 == Error x 0 == OK
|
||||
*/
|
||||
|
||||
matrix_rec_list = (t_node_matrix_rec **) pg_malloc0(sizeof(t_node_matrix_rec) * nodes.node_count);
|
||||
@@ -925,7 +1063,9 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
matrix_rec_list[i] = (t_node_matrix_rec *) pg_malloc0(sizeof(t_node_matrix_rec));
|
||||
|
||||
matrix_rec_list[i]->node_id = cell->node_info->node_id;
|
||||
strncpy(matrix_rec_list[i]->node_name, cell->node_info->node_name, MAXLEN);
|
||||
strncpy(matrix_rec_list[i]->node_name,
|
||||
cell->node_info->node_name,
|
||||
sizeof(cell->node_info->node_name));
|
||||
|
||||
/*
|
||||
* Find the maximum length of a node name
|
||||
@@ -972,7 +1112,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
|
||||
host = param_get(&remote_conninfo, "host");
|
||||
|
||||
node_conn = establish_db_connection(cell->node_info->conninfo, false);
|
||||
node_conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||
|
||||
connection_status =
|
||||
(PQstatus(node_conn) == CONNECTION_OK) ? 0 : -1;
|
||||
@@ -1009,24 +1149,12 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
* remote repmgr - those are the only values it needs to work, and
|
||||
* saves us making assumptions about the location of repmgr.conf
|
||||
*/
|
||||
appendPQExpBuffer(&command,
|
||||
"\"%s -d '%s' ",
|
||||
make_pg_path(progname()),
|
||||
cell->node_info->conninfo);
|
||||
appendPQExpBufferChar(&command, '"');
|
||||
|
||||
make_remote_repmgr_path(&command, cell->node_info);
|
||||
|
||||
if (strlen(pg_bindir))
|
||||
{
|
||||
appendPQExpBuffer(&command,
|
||||
"--pg_bindir=");
|
||||
appendShellString(&command,
|
||||
pg_bindir);
|
||||
appendPQExpBuffer(&command,
|
||||
" ");
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&command,
|
||||
" cluster show --csv\"");
|
||||
appendPQExpBufferStr(&command,
|
||||
" cluster show --csv -L NOTICE --terse\"");
|
||||
|
||||
log_verbose(LOG_DEBUG, "build_cluster_matrix(): executing:\n %s", command.data);
|
||||
|
||||
@@ -1035,38 +1163,57 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
(void) remote_command(host,
|
||||
runtime_options.remote_user,
|
||||
command.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
p = command_output.data;
|
||||
|
||||
termPQExpBuffer(&command);
|
||||
|
||||
for (j = 0; j < nodes.node_count; j++)
|
||||
/* no output returned - probably SSH error */
|
||||
if (p[0] == '\0' || p[0] == '\n')
|
||||
{
|
||||
if (sscanf(p, "%d,%d", &x, &y) != 2)
|
||||
item_list_append_format(warnings,
|
||||
"node %i inaccessible via SSH",
|
||||
connection_node_id);
|
||||
*error_code = ERR_BAD_SSH;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (j = 0; j < nodes.node_count; j++)
|
||||
{
|
||||
fprintf(stderr, _("cannot parse --csv output: %s\n"), p);
|
||||
PQfinish(node_conn);
|
||||
exit(ERR_INTERNAL);
|
||||
if (sscanf(p, "%d,%d", &x, &y) != 2)
|
||||
{
|
||||
matrix_set_node_status(matrix_rec_list,
|
||||
nodes.node_count,
|
||||
connection_node_id,
|
||||
x,
|
||||
-2);
|
||||
|
||||
item_list_append_format(warnings,
|
||||
"unable to parse --csv output for node %i; output returned was:\n\"%s\"",
|
||||
connection_node_id, p);
|
||||
*error_code = ERR_INTERNAL;
|
||||
}
|
||||
else
|
||||
{
|
||||
matrix_set_node_status(matrix_rec_list,
|
||||
nodes.node_count,
|
||||
connection_node_id,
|
||||
x,
|
||||
(y == -1) ? -1 : 0);
|
||||
}
|
||||
|
||||
while (*p && (*p != '\n'))
|
||||
p++;
|
||||
if (*p == '\n')
|
||||
p++;
|
||||
}
|
||||
|
||||
matrix_set_node_status(matrix_rec_list,
|
||||
nodes.node_count,
|
||||
connection_node_id,
|
||||
x,
|
||||
(y == -1) ? -1 : 0);
|
||||
|
||||
while (*p && (*p != '\n'))
|
||||
p++;
|
||||
if (*p == '\n')
|
||||
p++;
|
||||
}
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
PQfinish(node_conn);
|
||||
free_conninfo_params(&remote_conninfo);
|
||||
|
||||
node_conn = NULL;
|
||||
}
|
||||
|
||||
*matrix_rec_dest = matrix_rec_list;
|
||||
@@ -1079,7 +1226,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
|
||||
|
||||
static int
|
||||
build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, ItemList *warnings, int *error_code)
|
||||
{
|
||||
PGconn *conn = NULL;
|
||||
int h,
|
||||
@@ -1100,7 +1247,12 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
else
|
||||
conn = establish_db_connection_by_params(&source_conninfo, true);
|
||||
|
||||
get_all_node_records(conn, &nodes);
|
||||
if (get_all_node_records(conn, &nodes) == false)
|
||||
{
|
||||
/* get_all_node_records() will display the error */
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
PQfinish(conn);
|
||||
conn = NULL;
|
||||
@@ -1128,7 +1280,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
|
||||
cube[h] = (t_node_status_cube *) pg_malloc(sizeof(t_node_status_cube));
|
||||
cube[h]->node_id = cell->node_info->node_id;
|
||||
strncpy(cube[h]->node_name, cell->node_info->node_name, MAXLEN);
|
||||
strncpy(cube[h]->node_name, cell->node_info->node_name, sizeof(cell->node_info->node_name));
|
||||
|
||||
/*
|
||||
* Find the maximum length of a node name
|
||||
@@ -1150,7 +1302,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
/* we don't need the name here */
|
||||
cube[h]->matrix_list_rec[i]->node_name[0] = '\0';
|
||||
|
||||
cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec) * nodes.node_count);
|
||||
cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec *) * nodes.node_count);
|
||||
|
||||
j = 0;
|
||||
|
||||
@@ -1187,28 +1339,13 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
|
||||
initPQExpBuffer(&command);
|
||||
|
||||
appendPQExpBuffer(&command,
|
||||
"%s -d '%s' --node-id=%i ",
|
||||
make_pg_path(progname()),
|
||||
cell->node_info->conninfo,
|
||||
remote_node_id);
|
||||
make_remote_repmgr_path(&command, cell->node_info);
|
||||
|
||||
if (strlen(pg_bindir))
|
||||
{
|
||||
appendPQExpBuffer(&command,
|
||||
"--pg_bindir=");
|
||||
appendShellString(&command,
|
||||
pg_bindir);
|
||||
appendPQExpBuffer(&command,
|
||||
" ");
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&command,
|
||||
"cluster matrix --csv 2>/dev/null");
|
||||
appendPQExpBufferStr(&command,
|
||||
" cluster matrix --csv -L NOTICE --terse");
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
/* fix to work with --node-id */
|
||||
if (cube[i]->node_id == config_file_options.node_id)
|
||||
{
|
||||
(void) local_command_simple(command.data,
|
||||
@@ -1239,6 +1376,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
(void) remote_command(host,
|
||||
runtime_options.remote_user,
|
||||
quoted_command.data,
|
||||
config_file_options.ssh_options,
|
||||
&command_output);
|
||||
|
||||
free_conninfo_params(&remote_conninfo);
|
||||
@@ -1249,9 +1387,13 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
|
||||
p = command_output.data;
|
||||
|
||||
if (!strlen(command_output.data))
|
||||
if (p[0] == '\0' || p[0] == '\n')
|
||||
{
|
||||
item_list_append_format(warnings,
|
||||
"node %i inaccessible via SSH",
|
||||
remote_node_id);
|
||||
termPQExpBuffer(&command_output);
|
||||
*error_code = ERR_BAD_SSH;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1263,16 +1405,23 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
|
||||
if (sscanf(p, "%d,%d,%d", &matrix_rec_node_id, &node_status_node_id, &node_status) != 3)
|
||||
{
|
||||
fprintf(stderr, _("cannot parse --csv output: %s\n"), p);
|
||||
exit(ERR_INTERNAL);
|
||||
cube_set_node_status(cube,
|
||||
nodes.node_count,
|
||||
remote_node_id,
|
||||
matrix_rec_node_id,
|
||||
node_status_node_id,
|
||||
-2);
|
||||
*error_code = ERR_INTERNAL;
|
||||
}
|
||||
else
|
||||
{
|
||||
cube_set_node_status(cube,
|
||||
nodes.node_count,
|
||||
remote_node_id,
|
||||
matrix_rec_node_id,
|
||||
node_status_node_id,
|
||||
node_status);
|
||||
}
|
||||
|
||||
cube_set_node_status(cube,
|
||||
nodes.node_count,
|
||||
remote_node_id,
|
||||
matrix_rec_node_id,
|
||||
node_status_node_id,
|
||||
node_status);
|
||||
|
||||
while (*p && (*p != '\n'))
|
||||
p++;
|
||||
@@ -1332,6 +1481,7 @@ do_cluster_cleanup(void)
|
||||
PGconn *conn = NULL;
|
||||
PGconn *primary_conn = NULL;
|
||||
int entries_to_delete = 0;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
@@ -1343,9 +1493,17 @@ do_cluster_cleanup(void)
|
||||
|
||||
log_debug(_("number of days of monitoring history to retain: %i"), runtime_options.keep_history);
|
||||
|
||||
entries_to_delete = get_number_of_monitoring_records_to_delete(primary_conn, runtime_options.keep_history);
|
||||
entries_to_delete = get_number_of_monitoring_records_to_delete(primary_conn,
|
||||
runtime_options.keep_history,
|
||||
runtime_options.node_id);
|
||||
|
||||
if (entries_to_delete == 0)
|
||||
if (entries_to_delete < 0)
|
||||
{
|
||||
log_error(_("unable to query number of monitoring records to clean up"));
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
else if (entries_to_delete == 0)
|
||||
{
|
||||
log_info(_("no monitoring records to delete"));
|
||||
PQfinish(primary_conn);
|
||||
@@ -1355,10 +1513,23 @@ do_cluster_cleanup(void)
|
||||
log_debug("at least %i monitoring records for deletion",
|
||||
entries_to_delete);
|
||||
|
||||
if (delete_monitoring_records(primary_conn, runtime_options.keep_history) == false)
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
if (delete_monitoring_records(primary_conn, runtime_options.keep_history, runtime_options.node_id) == false)
|
||||
{
|
||||
log_error(_("unable to delete monitoring records"));
|
||||
appendPQExpBufferStr(&event_details,
|
||||
_("unable to delete monitoring records"));
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
log_detail("%s", PQerrorMessage(primary_conn));
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"cluster_cleanup",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
@@ -1370,19 +1541,40 @@ do_cluster_cleanup(void)
|
||||
log_detail("%s", PQerrorMessage(primary_conn));
|
||||
}
|
||||
|
||||
|
||||
PQfinish(primary_conn);
|
||||
|
||||
if (runtime_options.keep_history > 0)
|
||||
if (runtime_options.keep_history == 0)
|
||||
{
|
||||
log_notice(_("monitoring records older than %i day(s) deleted"),
|
||||
runtime_options.keep_history);
|
||||
appendPQExpBufferStr(&event_details,
|
||||
_("all monitoring records deleted"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_info(_("all monitoring records deleted"));
|
||||
appendPQExpBufferStr(&event_details,
|
||||
_("monitoring records deleted"));
|
||||
}
|
||||
|
||||
if (runtime_options.node_id != UNKNOWN_NODE_ID)
|
||||
appendPQExpBuffer(&event_details,
|
||||
_(" for node %i"),
|
||||
runtime_options.node_id);
|
||||
|
||||
if (runtime_options.keep_history > 0)
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("; records newer than %i day(s) retained"),
|
||||
runtime_options.keep_history);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"cluster_cleanup",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(primary_conn);
|
||||
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1407,6 +1599,7 @@ do_cluster_help(void)
|
||||
printf(_(" Configuration file or database connection required.\n"));
|
||||
puts("");
|
||||
printf(_(" --csv emit output as CSV (with a subset of fields)\n"));
|
||||
printf(_(" --compact display only a subset of fields\n"));
|
||||
puts("");
|
||||
|
||||
printf(_("CLUSTER MATRIX\n"));
|
||||
@@ -1442,7 +1635,7 @@ do_cluster_help(void)
|
||||
|
||||
printf(_("CLUSTER CLEANUP\n"));
|
||||
puts("");
|
||||
printf(_(" \"cluster cleanup\" purges records from the \"repmgr.monitor\" table.\n"));
|
||||
printf(_(" \"cluster cleanup\" purges records from the \"repmgr.monitoring_history\" table.\n"));
|
||||
puts("");
|
||||
printf(_(" -k, --keep-history=VALUE retain indicated number of days of history (default: 0)\n"));
|
||||
puts("");
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr-action-cluster.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -30,14 +30,14 @@ typedef struct
|
||||
typedef struct
|
||||
{
|
||||
int node_id;
|
||||
char node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
t_node_status_rec **node_status_list;
|
||||
} t_node_matrix_rec;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int node_id;
|
||||
char node_name[MAXLEN];
|
||||
char node_name[NAMEDATALEN];
|
||||
t_node_matrix_rec **matrix_list_rec;
|
||||
} t_node_status_cube;
|
||||
|
||||
|
||||
795
repmgr-action-daemon.c
Normal file
795
repmgr-action-daemon.c
Normal file
@@ -0,0 +1,795 @@
|
||||
/*
|
||||
* repmgr-action-daemon.c
|
||||
*
|
||||
* Implements repmgrd actions for the repmgr command line utility
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <signal.h>
|
||||
#include <sys/stat.h> /* for stat() */
|
||||
|
||||
#include "repmgr.h"
|
||||
|
||||
#include "repmgr-client-global.h"
|
||||
#include "repmgr-action-daemon.h"
|
||||
|
||||
#define REPMGR_DAEMON_STOP_START_WAIT 15
|
||||
#define REPMGR_DAEMON_STATUS_START_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully started")
|
||||
#define REPMGR_DAEMON_STATUS_STOP_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully stopped")
|
||||
|
||||
/*
|
||||
* Possibly also show:
|
||||
* - repmgrd start time?
|
||||
* - repmgrd mode
|
||||
* - priority
|
||||
* - whether promotion candidate (due to zero priority/different location)
|
||||
*/
|
||||
|
||||
typedef enum
|
||||
{
|
||||
STATUS_ID = 0,
|
||||
STATUS_NAME,
|
||||
STATUS_ROLE,
|
||||
STATUS_PRIORITY,
|
||||
STATUS_PG,
|
||||
STATUS_RUNNING,
|
||||
STATUS_PID,
|
||||
STATUS_PAUSED,
|
||||
STATUS_UPSTREAM_LAST_SEEN
|
||||
} StatusHeader;
|
||||
|
||||
#define STATUS_HEADER_COUNT 9
|
||||
|
||||
struct ColHeader headers_status[STATUS_HEADER_COUNT];
|
||||
|
||||
static void fetch_node_records(PGconn *conn, NodeInfoList *node_list);
|
||||
static void _do_repmgr_pause(bool pause);
|
||||
|
||||
|
||||
void
|
||||
do_daemon_status(void)
|
||||
{
|
||||
PGconn *conn = NULL;
|
||||
NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
NodeInfoListCell *cell = NULL;
|
||||
int i;
|
||||
RepmgrdInfo **repmgrd_info;
|
||||
ItemList warnings = {NULL, NULL};
|
||||
bool connection_error_found = false;
|
||||
|
||||
/* Connect to local database to obtain cluster connection data */
|
||||
log_verbose(LOG_INFO, _("connecting to database"));
|
||||
|
||||
if (strlen(config_file_options.conninfo))
|
||||
conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
else
|
||||
conn = establish_db_connection_by_params(&source_conninfo, true);
|
||||
|
||||
fetch_node_records(conn, &nodes);
|
||||
|
||||
repmgrd_info = (RepmgrdInfo **) pg_malloc0(sizeof(RepmgrdInfo *) * nodes.node_count);
|
||||
|
||||
if (repmgrd_info == NULL)
|
||||
{
|
||||
log_error(_("unable to allocate memory"));
|
||||
exit(ERR_OUT_OF_MEMORY);
|
||||
}
|
||||
|
||||
strncpy(headers_status[STATUS_ID].title, _("ID"), MAXLEN);
|
||||
strncpy(headers_status[STATUS_NAME].title, _("Name"), MAXLEN);
|
||||
strncpy(headers_status[STATUS_ROLE].title, _("Role"), MAXLEN);
|
||||
|
||||
if (runtime_options.compact == true)
|
||||
strncpy(headers_status[STATUS_PRIORITY].title, _("Prio."), MAXLEN);
|
||||
else
|
||||
strncpy(headers_status[STATUS_PRIORITY].title, _("Priority"), MAXLEN);
|
||||
|
||||
strncpy(headers_status[STATUS_PG].title, _("Status"), MAXLEN);
|
||||
strncpy(headers_status[STATUS_RUNNING].title, _("repmgrd"), MAXLEN);
|
||||
strncpy(headers_status[STATUS_PID].title, _("PID"), MAXLEN);
|
||||
strncpy(headers_status[STATUS_PAUSED].title, _("Paused?"), MAXLEN);
|
||||
|
||||
if (runtime_options.compact == true)
|
||||
strncpy(headers_status[STATUS_UPSTREAM_LAST_SEEN].title, _("Upstr. last"), MAXLEN);
|
||||
else
|
||||
strncpy(headers_status[STATUS_UPSTREAM_LAST_SEEN].title, _("Upstream last seen"), MAXLEN);
|
||||
|
||||
|
||||
for (i = 0; i < STATUS_HEADER_COUNT; i++)
|
||||
{
|
||||
headers_status[i].max_length = strlen(headers_status[i].title);
|
||||
headers_status[i].display = true;
|
||||
}
|
||||
|
||||
i = 0;
|
||||
|
||||
for (cell = nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
int j;
|
||||
PQExpBufferData buf;
|
||||
|
||||
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
|
||||
repmgrd_info[i]->node_id = cell->node_info->node_id;
|
||||
repmgrd_info[i]->pid = UNKNOWN_PID;
|
||||
repmgrd_info[i]->recovery_type = RECTYPE_UNKNOWN;
|
||||
repmgrd_info[i]->paused = false;
|
||||
repmgrd_info[i]->running = false;
|
||||
repmgrd_info[i]->pg_running = true;
|
||||
repmgrd_info[i]->wal_paused_pending_wal = false;
|
||||
repmgrd_info[i]->upstream_last_seen = -1;
|
||||
|
||||
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||
|
||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||
{
|
||||
connection_error_found = true;
|
||||
|
||||
if (runtime_options.verbose)
|
||||
{
|
||||
char error[MAXLEN];
|
||||
|
||||
strncpy(error, PQerrorMessage(cell->node_info->conn), MAXLEN);
|
||||
|
||||
item_list_append_format(&warnings,
|
||||
"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
|
||||
cell->node_info->node_name, cell->node_info->node_id, trim(error));
|
||||
}
|
||||
else
|
||||
{
|
||||
item_list_append_format(&warnings,
|
||||
"unable to connect to node \"%s\" (ID: %i)",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
|
||||
repmgrd_info[i]->pg_running = false;
|
||||
maxlen_snprintf(repmgrd_info[i]->pg_running_text, "%s", _("not running"));
|
||||
maxlen_snprintf(repmgrd_info[i]->repmgrd_running, "%s", _("n/a"));
|
||||
maxlen_snprintf(repmgrd_info[i]->pid_text, "%s", _("n/a"));
|
||||
}
|
||||
else
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->pg_running_text, "%s", _("running"));
|
||||
|
||||
repmgrd_info[i]->pid = repmgrd_get_pid(cell->node_info->conn);
|
||||
|
||||
repmgrd_info[i]->running = repmgrd_is_running(cell->node_info->conn);
|
||||
|
||||
if (repmgrd_info[i]->running == true)
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->repmgrd_running, "%s", _("running"));
|
||||
}
|
||||
else
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->repmgrd_running, "%s", _("not running"));
|
||||
}
|
||||
|
||||
if (repmgrd_info[i]->pid == UNKNOWN_PID)
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->pid_text, "%s", _("n/a"));
|
||||
}
|
||||
else
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->pid_text, "%i", repmgrd_info[i]->pid);
|
||||
}
|
||||
|
||||
repmgrd_info[i]->paused = repmgrd_is_paused(cell->node_info->conn);
|
||||
|
||||
repmgrd_info[i]->recovery_type = get_recovery_type(cell->node_info->conn);
|
||||
|
||||
if (repmgrd_info[i]->recovery_type == RECTYPE_STANDBY)
|
||||
{
|
||||
repmgrd_info[i]->wal_paused_pending_wal = is_wal_replay_paused(cell->node_info->conn, true);
|
||||
|
||||
if (repmgrd_info[i]->wal_paused_pending_wal == true)
|
||||
{
|
||||
item_list_append_format(&warnings,
|
||||
_("WAL replay is paused on node \"%s\" (ID: %i) with WAL replay pending; this node cannot be manually promoted until WAL replay is resumed"),
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
|
||||
repmgrd_info[i]->upstream_last_seen = get_upstream_last_seen(cell->node_info->conn, cell->node_info->type);
|
||||
if (repmgrd_info[i]->upstream_last_seen < 0)
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, "%s", _("n/a"));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (runtime_options.compact == true)
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, _("%i sec(s) ago"), repmgrd_info[i]->upstream_last_seen);
|
||||
}
|
||||
else
|
||||
{
|
||||
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, _("%i second(s) ago"), repmgrd_info[i]->upstream_last_seen);
|
||||
}
|
||||
}
|
||||
|
||||
PQfinish(cell->node_info->conn);
|
||||
}
|
||||
|
||||
|
||||
headers_status[STATUS_NAME].cur_length = strlen(cell->node_info->node_name);
|
||||
headers_status[STATUS_ROLE].cur_length = strlen(get_node_type_string(cell->node_info->type));
|
||||
|
||||
initPQExpBuffer(&buf);
|
||||
appendPQExpBuffer(&buf, "%i", cell->node_info->priority);
|
||||
headers_status[STATUS_PRIORITY].cur_length = strlen(buf.data);
|
||||
termPQExpBuffer(&buf);
|
||||
|
||||
headers_status[STATUS_PID].cur_length = strlen(repmgrd_info[i]->pid_text);
|
||||
headers_status[STATUS_RUNNING].cur_length = strlen(repmgrd_info[i]->repmgrd_running);
|
||||
headers_status[STATUS_PG].cur_length = strlen(repmgrd_info[i]->pg_running_text);
|
||||
|
||||
headers_status[STATUS_UPSTREAM_LAST_SEEN].cur_length = strlen(repmgrd_info[i]->upstream_last_seen_text);
|
||||
|
||||
for (j = 0; j < STATUS_HEADER_COUNT; j++)
|
||||
{
|
||||
if (headers_status[j].cur_length > headers_status[j].max_length)
|
||||
{
|
||||
headers_status[j].max_length = headers_status[j].cur_length;
|
||||
}
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
/* Print column header row (text mode only) */
|
||||
if (runtime_options.output_mode == OM_TEXT)
|
||||
{
|
||||
print_status_header(STATUS_HEADER_COUNT, headers_status);
|
||||
}
|
||||
|
||||
i = 0;
|
||||
|
||||
for (cell = nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
if (runtime_options.output_mode == OM_CSV)
|
||||
{
|
||||
int running = repmgrd_info[i]->running ? 1 : 0;
|
||||
int paused = repmgrd_info[i]->paused ? 1 : 0;
|
||||
|
||||
/* If PostgreSQL is not running, repmgrd status is unknown */
|
||||
if (repmgrd_info[i]->pg_running == false)
|
||||
{
|
||||
running = -1;
|
||||
paused = -1;
|
||||
}
|
||||
|
||||
printf("%i,%s,%s,%i,%i,%i,%i,%i,%i\n",
|
||||
cell->node_info->node_id,
|
||||
cell->node_info->node_name,
|
||||
get_node_type_string(cell->node_info->type),
|
||||
repmgrd_info[i]->pg_running ? 1 : 0,
|
||||
running,
|
||||
repmgrd_info[i]->pid,
|
||||
paused,
|
||||
cell->node_info->priority,
|
||||
repmgrd_info[i]->pid == UNKNOWN_PID
|
||||
? -1
|
||||
: repmgrd_info[i]->upstream_last_seen);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf(" %-*i ", headers_status[STATUS_ID].max_length, cell->node_info->node_id);
|
||||
printf("| %-*s ", headers_status[STATUS_NAME].max_length, cell->node_info->node_name);
|
||||
printf("| %-*s ", headers_status[STATUS_ROLE].max_length, get_node_type_string(cell->node_info->type));
|
||||
printf("| %-*i ", headers_status[STATUS_PRIORITY].max_length, cell->node_info->priority);
|
||||
|
||||
printf("| %-*s ", headers_status[STATUS_PG].max_length, repmgrd_info[i]->pg_running_text);
|
||||
printf("| %-*s ", headers_status[STATUS_RUNNING].max_length, repmgrd_info[i]->repmgrd_running);
|
||||
printf("| %-*s ", headers_status[STATUS_PID].max_length, repmgrd_info[i]->pid_text);
|
||||
|
||||
if (repmgrd_info[i]->pid == UNKNOWN_PID)
|
||||
{
|
||||
printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, _("n/a"));
|
||||
printf("| %-*s ", headers_status[STATUS_UPSTREAM_LAST_SEEN].max_length, _("n/a"));
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, repmgrd_info[i]->paused ? _("yes") : _("no"));
|
||||
|
||||
printf("| %-*s ", headers_status[STATUS_UPSTREAM_LAST_SEEN].max_length, repmgrd_info[i]->upstream_last_seen_text);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
pfree(repmgrd_info[i]);
|
||||
i++;
|
||||
}
|
||||
|
||||
pfree(repmgrd_info);
|
||||
|
||||
/* emit any warnings */
|
||||
|
||||
if (warnings.head != NULL && runtime_options.terse == false && runtime_options.output_mode != OM_CSV)
|
||||
{
|
||||
ItemListCell *cell = NULL;
|
||||
|
||||
printf(_("\nWARNING: following issues were detected\n"));
|
||||
for (cell = warnings.head; cell; cell = cell->next)
|
||||
{
|
||||
printf(_(" - %s\n"), cell->string);
|
||||
}
|
||||
|
||||
if (runtime_options.verbose == false && connection_error_found == true)
|
||||
{
|
||||
log_hint(_("execute with --verbose option to see connection error messages"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
do_daemon_pause(void)
|
||||
{
|
||||
_do_repmgr_pause(true);
|
||||
}
|
||||
|
||||
void
|
||||
do_daemon_unpause(void)
|
||||
{
|
||||
_do_repmgr_pause(false);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
_do_repmgr_pause(bool pause)
|
||||
{
|
||||
PGconn *conn = NULL;
|
||||
NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
NodeInfoListCell *cell = NULL;
|
||||
int i;
|
||||
int error_nodes = 0;
|
||||
|
||||
/* Connect to local database to obtain cluster connection data */
|
||||
log_verbose(LOG_INFO, _("connecting to database"));
|
||||
|
||||
if (strlen(config_file_options.conninfo))
|
||||
conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
else
|
||||
conn = establish_db_connection_by_params(&source_conninfo, true);
|
||||
|
||||
fetch_node_records(conn, &nodes);
|
||||
|
||||
i = 0;
|
||||
|
||||
for (cell = nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
log_verbose(LOG_DEBUG, "pausing node %i (%s)",
|
||||
cell->node_info->node_id,
|
||||
cell->node_info->node_name);
|
||||
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||
|
||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("unable to connect to node %i"),
|
||||
cell->node_info->node_id);
|
||||
error_nodes++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
if (pause == true)
|
||||
{
|
||||
log_info(_("would pause node %i (%s) "),
|
||||
cell->node_info->node_id,
|
||||
cell->node_info->node_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_info(_("would unpause node %i (%s) "),
|
||||
cell->node_info->node_id,
|
||||
cell->node_info->node_name);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
bool success = repmgrd_pause(cell->node_info->conn, pause);
|
||||
|
||||
if (success == false)
|
||||
error_nodes++;
|
||||
|
||||
log_notice(_("node %i (%s) %s"),
|
||||
cell->node_info->node_id,
|
||||
cell->node_info->node_name,
|
||||
success == true
|
||||
? pause == true ? "paused" : "unpaused"
|
||||
: pause == true ? "not paused" : "not unpaused");
|
||||
}
|
||||
PQfinish(cell->node_info->conn);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if (error_nodes > 0)
|
||||
{
|
||||
if (pause == true)
|
||||
{
|
||||
log_error(_("unable to pause %i node(s)"), error_nodes);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to unpause %i node(s)"), error_nodes);
|
||||
}
|
||||
|
||||
log_hint(_("execute \"repmgr daemon status\" to view current status"));
|
||||
|
||||
exit(ERR_REPMGRD_PAUSE);
|
||||
}
|
||||
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
|
||||
|
||||
void
|
||||
fetch_node_records(PGconn *conn, NodeInfoList *node_list)
|
||||
{
|
||||
bool success = get_all_node_records(conn, node_list);
|
||||
|
||||
if (success == false)
|
||||
{
|
||||
/* get_all_node_records() will display any error message */
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (node_list->node_count == 0)
|
||||
{
|
||||
log_error(_("no node records were found"));
|
||||
log_hint(_("ensure at least one node is registered"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
do_daemon_start(void)
|
||||
{
|
||||
PGconn *conn = NULL;
|
||||
PQExpBufferData repmgrd_command;
|
||||
PQExpBufferData output_buf;
|
||||
bool success;
|
||||
|
||||
if (config_file_options.repmgrd_service_start_command[0] == '\0')
|
||||
{
|
||||
log_error(_("\"repmgrd_service_start_command\" is not set"));
|
||||
log_hint(_("set \"repmgrd_service_start_command\" in \"repmgr.conf\""));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
log_verbose(LOG_INFO, _("connecting to local node"));
|
||||
|
||||
conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(conn) != CONNECTION_OK)
|
||||
{
|
||||
/* TODO: if PostgreSQL is not available, have repmgrd loop and retry connection */
|
||||
log_error(_("unable to connect to local node"));
|
||||
log_detail(_("PostgreSQL must be running before \"repmgrd\" can be started"));
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
/*
|
||||
* if local connection available, check if repmgr.so is installed, and
|
||||
* whether repmgrd is running
|
||||
*/
|
||||
check_shared_library(conn);
|
||||
|
||||
if (is_repmgrd_running(conn) == true)
|
||||
{
|
||||
pid_t pid = UNKNOWN_PID;
|
||||
|
||||
log_error(_("repmgrd appears to be running already"));
|
||||
|
||||
pid = repmgrd_get_pid(conn);
|
||||
|
||||
if (pid != UNKNOWN_PID)
|
||||
log_detail(_("repmgrd PID is %i"), pid);
|
||||
else
|
||||
log_warning(_("unable to determine repmgrd PID"));
|
||||
|
||||
PQfinish(conn);
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
|
||||
PQfinish(conn);
|
||||
|
||||
|
||||
initPQExpBuffer(&repmgrd_command);
|
||||
appendPQExpBufferStr(&repmgrd_command,
|
||||
config_file_options.repmgrd_service_start_command);
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("prerequisites for starting repmgrd met"));
|
||||
log_detail("following command would be executed:\n %s", repmgrd_command.data);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
log_notice(_("executing: \"%s\""), repmgrd_command.data);
|
||||
|
||||
initPQExpBuffer(&output_buf);
|
||||
|
||||
success = local_command(repmgrd_command.data, &output_buf);
|
||||
termPQExpBuffer(&repmgrd_command);
|
||||
|
||||
if (success == false)
|
||||
{
|
||||
log_error(_("unable to start repmgrd"));
|
||||
if (output_buf.data[0] != '\0')
|
||||
log_detail("%s", output_buf.data);
|
||||
termPQExpBuffer(&output_buf);
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
|
||||
termPQExpBuffer(&output_buf);
|
||||
|
||||
if (runtime_options.no_wait == true || runtime_options.wait == 0)
|
||||
{
|
||||
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
|
||||
}
|
||||
else
|
||||
{
|
||||
int i = 0;
|
||||
int timeout = REPMGR_DAEMON_STOP_START_WAIT;
|
||||
|
||||
if (runtime_options.wait_provided)
|
||||
timeout = runtime_options.wait;
|
||||
|
||||
conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(conn) != CONNECTION_OK)
|
||||
{
|
||||
log_notice(_("unable to connect to local node"));
|
||||
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
for (;;)
|
||||
{
|
||||
if (is_repmgrd_running(conn) == true)
|
||||
{
|
||||
log_notice(_("repmgrd was successfully started"));
|
||||
PQfinish(conn);
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == timeout)
|
||||
{
|
||||
PQfinish(conn);
|
||||
log_error(_("repmgrd does not appear to have started after %i seconds"),
|
||||
timeout);
|
||||
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
|
||||
log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd is running",
|
||||
i, runtime_options.wait);
|
||||
sleep(1);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void do_daemon_stop(void)
|
||||
{
|
||||
PGconn *conn = NULL;
|
||||
PQExpBufferData repmgrd_command;
|
||||
PQExpBufferData output_buf;
|
||||
bool success;
|
||||
bool have_db_connection = true;
|
||||
pid_t pid = UNKNOWN_PID;
|
||||
|
||||
if (config_file_options.repmgrd_service_stop_command[0] == '\0')
|
||||
{
|
||||
log_error(_("\"repmgrd_service_stop_command\" is not set"));
|
||||
log_hint(_("set \"repmgrd_service_stop_command\" in \"repmgr.conf\""));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/*
|
||||
* if local connection available, check if repmgr.so is installed, and
|
||||
* whether repmgrd is running
|
||||
*/
|
||||
log_verbose(LOG_INFO, _("connecting to local node"));
|
||||
|
||||
conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(conn) != CONNECTION_OK)
|
||||
{
|
||||
/*
|
||||
* a PostgreSQL connection is not required to stop repmgrd,
|
||||
*/
|
||||
log_warning(_("unable to connect to local node"));
|
||||
have_db_connection = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
check_shared_library(conn);
|
||||
|
||||
if (is_repmgrd_running(conn) == false)
|
||||
{
|
||||
log_error(_("repmgrd appears to be stopped already"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
|
||||
/* Attempt to fetch the PID, in case we need it later */
|
||||
pid = repmgrd_get_pid(conn);
|
||||
log_debug("retrieved pid is %i", pid);
|
||||
}
|
||||
|
||||
PQfinish(conn);
|
||||
|
||||
initPQExpBuffer(&repmgrd_command);
|
||||
|
||||
appendPQExpBufferStr(&repmgrd_command,
|
||||
config_file_options.repmgrd_service_stop_command);
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("prerequisites for stopping repmgrd met"));
|
||||
log_detail("following command would be executed:\n %s", repmgrd_command.data);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
log_notice(_("executing: \"%s\""), repmgrd_command.data);
|
||||
|
||||
initPQExpBuffer(&output_buf);
|
||||
|
||||
success = local_command(repmgrd_command.data, &output_buf);
|
||||
termPQExpBuffer(&repmgrd_command);
|
||||
|
||||
if (success == false)
|
||||
{
|
||||
log_error(_("unable to stop repmgrd"));
|
||||
if (output_buf.data[0] != '\0')
|
||||
log_detail("%s", output_buf.data);
|
||||
termPQExpBuffer(&output_buf);
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
|
||||
termPQExpBuffer(&output_buf);
|
||||
|
||||
if (runtime_options.no_wait == true || runtime_options.wait == 0)
|
||||
{
|
||||
if (have_db_connection == true)
|
||||
log_hint(REPMGR_DAEMON_STATUS_STOP_HINT);
|
||||
}
|
||||
else
|
||||
{
|
||||
int i = 0;
|
||||
int timeout = REPMGR_DAEMON_STOP_START_WAIT;
|
||||
/*
|
||||
*
|
||||
*/
|
||||
if (pid == UNKNOWN_PID)
|
||||
{
|
||||
/*
|
||||
* XXX attempt to get pidfile from config
|
||||
* and get contents
|
||||
* ( see check_and_create_pid_file() )
|
||||
* if PID still unknown, exit here
|
||||
*/
|
||||
log_warning(_("unable to determine repmgrd PID"));
|
||||
|
||||
if (have_db_connection == true)
|
||||
log_hint(REPMGR_DAEMON_STATUS_STOP_HINT);
|
||||
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
|
||||
if (runtime_options.wait_provided)
|
||||
timeout = runtime_options.wait;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
if (kill(pid, 0) == -1)
|
||||
{
|
||||
if (errno == ESRCH)
|
||||
{
|
||||
log_notice(_("repmgrd was successfully stopped"));
|
||||
exit(SUCCESS);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to determine status of process with PID %i"), pid);
|
||||
log_detail("%s", strerror(errno));
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (i == timeout)
|
||||
{
|
||||
log_error(_("repmgrd does not appear to have stopped after %i seconds"),
|
||||
timeout);
|
||||
|
||||
if (have_db_connection == true)
|
||||
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
|
||||
|
||||
exit(ERR_REPMGRD_SERVICE);
|
||||
}
|
||||
|
||||
log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd with PID %i is running",
|
||||
i, timeout, pid);
|
||||
sleep(1);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void do_daemon_help(void)
|
||||
{
|
||||
print_help_header();
|
||||
|
||||
printf(_("Usage:\n"));
|
||||
printf(_(" %s [OPTIONS] daemon status\n"), progname());
|
||||
printf(_(" %s [OPTIONS] daemon pause\n"), progname());
|
||||
printf(_(" %s [OPTIONS] daemon unpause\n"), progname());
|
||||
printf(_(" %s [OPTIONS] daemon start\n"), progname());
|
||||
printf(_(" %s [OPTIONS] daemon stop\n"), progname());
|
||||
puts("");
|
||||
|
||||
printf(_("DAEMON STATUS\n"));
|
||||
puts("");
|
||||
printf(_(" \"daemon status\" shows the status of repmgrd on each node in the cluster\n"));
|
||||
puts("");
|
||||
printf(_(" --csv emit output as CSV\n"));
|
||||
printf(_(" --verbose show text of database connection error messages\n"));
|
||||
puts("");
|
||||
|
||||
printf(_("DAEMON START\n"));
|
||||
puts("");
|
||||
printf(_(" \"daemon start\" attempts to start repmgrd\n"));
|
||||
puts("");
|
||||
printf(_(" --dry-run check prerequisites but don't start repmgrd\n"));
|
||||
printf(_(" -w/--wait wait for repmgrd to start (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT);
|
||||
printf(_(" --no-wait don't wait for repmgrd to start\n"));
|
||||
puts("");
|
||||
|
||||
printf(_("DAEMON STOP\n"));
|
||||
puts("");
|
||||
printf(_(" \"daemon stop\" attempts to stop repmgrd\n"));
|
||||
puts("");
|
||||
printf(_(" --dry-run check prerequisites but don't stop repmgrd\n"));
|
||||
printf(_(" -w/--wait wait for repmgrd to stop (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT);
|
||||
printf(_(" --no-wait don't wait for repmgrd to stop\n"));
|
||||
puts("");
|
||||
|
||||
printf(_("DAEMON PAUSE\n"));
|
||||
puts("");
|
||||
printf(_(" \"daemon pause\" instructs repmgrd on each node to pause failover detection\n"));
|
||||
puts("");
|
||||
printf(_(" --dry-run check if nodes are reachable but don't pause repmgrd\n"));
|
||||
puts("");
|
||||
|
||||
printf(_("DAEMON UNPAUSE\n"));
|
||||
puts("");
|
||||
printf(_(" \"daemon unpause\" instructs repmgrd on each node to resume failover detection\n"));
|
||||
puts("");
|
||||
printf(_(" --dry-run check if nodes are reachable but don't unpause repmgrd\n"));
|
||||
puts("");
|
||||
|
||||
puts("");
|
||||
}
|
||||
30
repmgr-action-daemon.h
Normal file
30
repmgr-action-daemon.h
Normal file
@@ -0,0 +1,30 @@
|
||||
/*
|
||||
* repmgr-action-daemon.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef _REPMGR_ACTION_DAEMON_H_
|
||||
#define _REPMGR_ACTION_DAEMON_H_
|
||||
|
||||
|
||||
extern void do_daemon_status(void);
|
||||
extern void do_daemon_pause(void);
|
||||
extern void do_daemon_unpause(void);
|
||||
extern void do_daemon_start(void);
|
||||
extern void do_daemon_stop(void);
|
||||
|
||||
extern void do_daemon_help(void);
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr-action-node.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -24,6 +24,7 @@ extern void do_node_check(void);
|
||||
|
||||
extern void do_node_rejoin(void);
|
||||
extern void do_node_service(void);
|
||||
extern void do_node_control(void);
|
||||
|
||||
extern void do_node_help(void);
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Implements primary actions for the repmgr command line utility
|
||||
*
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
@@ -64,12 +64,10 @@ do_primary_register(void)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("connection to node lost"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
log_error(_("unable to determine server's recovery type"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
log_verbose(LOG_INFO, _("server is not in recovery"));
|
||||
@@ -172,8 +170,8 @@ do_primary_register(void)
|
||||
&node_info);
|
||||
if (record_created == true)
|
||||
{
|
||||
appendPQExpBuffer(&event_description,
|
||||
"existing primary record updated");
|
||||
appendPQExpBufferStr(&event_description,
|
||||
"existing primary record updated");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -253,6 +251,7 @@ do_primary_unregister(void)
|
||||
PGconn *primary_conn = NULL;
|
||||
PGconn *local_conn = NULL;
|
||||
t_node_info local_node_info = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
t_node_info *target_node_info_ptr = NULL;
|
||||
PGconn *target_node_conn = NULL;
|
||||
@@ -273,8 +272,6 @@ do_primary_unregister(void)
|
||||
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
log_error(_("unable to connect to primary server"));
|
||||
|
||||
if (get_primary_node_record(local_conn, &primary_node_info) == true)
|
||||
@@ -293,10 +290,19 @@ do_primary_unregister(void)
|
||||
/* Local connection no longer required */
|
||||
PQfinish(local_conn);
|
||||
|
||||
if (get_primary_node_record(primary_conn, &primary_node_info) == false)
|
||||
{
|
||||
log_error(_("unable to retrieve record for primary node"));
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* Target node is local node? */
|
||||
if (target_node_info.node_id == UNKNOWN_NODE_ID
|
||||
|| target_node_info.node_id == config_file_options.node_id)
|
||||
if (target_node_info.node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
target_node_info_ptr = &primary_node_info;
|
||||
}
|
||||
else if (target_node_info.node_id == config_file_options.node_id)
|
||||
{
|
||||
target_node_info_ptr = &local_node_info;
|
||||
}
|
||||
@@ -306,6 +312,24 @@ do_primary_unregister(void)
|
||||
target_node_info_ptr = &target_node_info;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sanity-check the target node is not a witness
|
||||
*/
|
||||
|
||||
if (target_node_info_ptr->type == WITNESS)
|
||||
{
|
||||
log_error(_("node %s (id: %i) is a witness server, unable to unregister"),
|
||||
target_node_info_ptr->node_name,
|
||||
target_node_info_ptr->node_id);
|
||||
if (target_node_info_ptr->type == STANDBY)
|
||||
{
|
||||
log_hint(_("the node can be unregistered with \"repmgr witness unregister\""));
|
||||
}
|
||||
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for downstream nodes - if any still defined, we won't be able to
|
||||
* delete the node record due to foreign key constraints.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*
|
||||
* repmgr-action-primary.h
|
||||
* Copyright (c) 2ndQuadrant, 2010-2018
|
||||
* Copyright (c) 2ndQuadrant, 2010-2019
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user