mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
Compare commits
92 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
afdaf9be66 | ||
|
|
8067924c3e | ||
|
|
e94a6eefde | ||
|
|
69d7b6f7eb | ||
|
|
8ec3b2a536 | ||
|
|
68a9745e7e | ||
|
|
20ce53e2d2 | ||
|
|
638a119c85 | ||
|
|
053863cdd0 | ||
|
|
009cc0480c | ||
|
|
63bdc19132 | ||
|
|
fbd389d0b3 | ||
|
|
4aef4ea11e | ||
|
|
0ffaff75df | ||
|
|
c54bb73fb2 | ||
|
|
28ea2e48de | ||
|
|
41274f5525 | ||
|
|
edceb32ccb | ||
|
|
3dba8336e9 | ||
|
|
97d0cee259 | ||
|
|
2dfe1d18e9 | ||
|
|
55bb93bd3f | ||
|
|
4c49954cd4 | ||
|
|
a880b6ce16 | ||
|
|
c51a2283dd | ||
|
|
717828e73e | ||
|
|
c7477d7a9c | ||
|
|
1db8d3904f | ||
|
|
362f478d55 | ||
|
|
cb1bf892e6 | ||
|
|
b1b5fe1193 | ||
|
|
af0e141859 | ||
|
|
580c1a9170 | ||
|
|
b624fc7efa | ||
|
|
67ccd4dcb3 | ||
|
|
6de3a5a997 | ||
|
|
f86e89ba45 | ||
|
|
a6d0ba07ed | ||
|
|
b553a70ad5 | ||
|
|
3364f8bdf0 | ||
|
|
242fa287b4 | ||
|
|
fa908432c8 | ||
|
|
afa942fef6 | ||
|
|
94cfc66b04 | ||
|
|
87eae9a50f | ||
|
|
82a37f4865 | ||
|
|
a38f727b7d | ||
|
|
e6df936c1b | ||
|
|
91ca997d40 | ||
|
|
65c90a2a64 | ||
|
|
90cba78f52 | ||
|
|
f8908d7e31 | ||
|
|
478bbcccbf | ||
|
|
a03d41de28 | ||
|
|
f1e527adcb | ||
|
|
09e597dcdd | ||
|
|
94a7f0c719 | ||
|
|
6ac42f1593 | ||
|
|
94b72382e5 | ||
|
|
18c12f58a4 | ||
|
|
cf3fa18085 | ||
|
|
a5281d93dc | ||
|
|
0d73d3c2b5 | ||
|
|
23c99304a6 | ||
|
|
1ab16bc6c2 | ||
|
|
7f1f04636d | ||
|
|
6a1797cadd | ||
|
|
94d26dbe9f | ||
|
|
ae655eb4fd | ||
|
|
65371489c6 | ||
|
|
28c7737dc0 | ||
|
|
505d72d19c | ||
|
|
b292ac61f8 | ||
|
|
293d66bf71 | ||
|
|
3e1f0ec168 | ||
|
|
6f9a1f975e | ||
|
|
deea4f69f7 | ||
|
|
37e53108a2 | ||
|
|
96cf06204c | ||
|
|
381e22c2c7 | ||
|
|
7e2af17783 | ||
|
|
b4272853e7 | ||
|
|
562b6ddfc2 | ||
|
|
a15e5c9d52 | ||
|
|
d9cc09cee4 | ||
|
|
c4f6abe951 | ||
|
|
e454fb77d3 | ||
|
|
b76e5852d3 | ||
|
|
0674364ffd | ||
|
|
b2eb9b8525 | ||
|
|
71c5d10a8c | ||
|
|
1476b21cd4 |
4
FAQ.md
4
FAQ.md
@@ -1,9 +1,7 @@
|
||||
FAQ - Frequently Asked Questions about repmgr
|
||||
=============================================
|
||||
|
||||
The repmgr 4 FAQ is located here:
|
||||
|
||||
https://repmgr.org/docs/appendix-faq.html
|
||||
The repmgr 4 FAQ is located here: [repmgr FAQ (Frequently Asked Questions)](https://repmgr.org/docs/4.0/appendix-faq.html "repmgr FAQ")
|
||||
|
||||
The repmgr 3.x FAQ can be found here:
|
||||
|
||||
|
||||
47
HISTORY
47
HISTORY
@@ -1,4 +1,49 @@
|
||||
4.0.4 2018-03-08
|
||||
4.0.6 2018-06-14
|
||||
repmgr: (witness register) prevent registration of a witness server with the
|
||||
same name as an existing node (Ian)
|
||||
repmgr: (standby follow) check node has actually connected to new primary
|
||||
before reporting success; GitHub #444 (Ian)
|
||||
repmgr: (standby clone) improve handling of external configuration file copying,
|
||||
including consideration in --dry-run check; GitHub #443 (Ian)
|
||||
repmgr: (standby clone) don't require presence of "user" parameter in
|
||||
conninfo string; GitHub #437 (Ian)
|
||||
repmgr: (standby clone) improve documentation of --recovery-conf-only
|
||||
mode; GitHub #438 (Ian)
|
||||
repmgr: (node rejoin) fix bug when parsing --config-files parameter;
|
||||
GitHub #442 (Ian)
|
||||
repmgr: when using --dry-run, force log level to INFO to ensure output
|
||||
will always be displayed; GitHub #441 (Ian)
|
||||
repmgr: (cluster matrix/crosscheck) return non-zero exit code if node
|
||||
connection issues detected; GitHub #447 (Ian)
|
||||
repmgrd: ensure local node is counted as quorum member; GitHub #439 (Ian)
|
||||
|
||||
4.0.5 2018-05-02
|
||||
repmgr: poll demoted primary after restart as a standby during a
|
||||
switchover operation; GitHub #408 (Ian)
|
||||
repmgr: add configuration parameter "config_directory"; GitHub #424 (Ian)
|
||||
repmgr: add "dbname=replication" to all replication connection strings;
|
||||
GitHub #421 (Ian)
|
||||
repmgr: add sanity check if --upstream-node-id not supplied when executing
|
||||
"standby register"; GitHub #395 (Ian)
|
||||
repmgr: enable provision of "archive_cleanup_command" in recovery.conf;
|
||||
GitHub #416 (Ian)
|
||||
repmgr: actively check for node to rejoin cluster; GitHub #415 (Ian)
|
||||
repmgr: enable pg_rewind to be used with PostgreSQL 9.3/9.4; GitHub #413 (Ian)
|
||||
repmgr: fix minimum accepted value for "degraded_monitoring_timeout";
|
||||
GitHub #411 (Ian)
|
||||
repmgr: fix superuser password handling; GitHub #400 (Ian)
|
||||
repmgr: fix parsing of "archive_ready_critical" configuration file
|
||||
parameter; GitHub #426 (Ian)
|
||||
repmgr: fix display of conninfo parsing error messages (Ian)
|
||||
repmgr: fix "repmgr cluster crosscheck" output; GitHub #389 (Ian)
|
||||
repmgrd: prevent standby connection handle from going stale (Ian)
|
||||
repmgrd: fix memory leaks in witness code; GitHub #402 (AndrzejNowicki, Martín)
|
||||
repmgrd: handle "pg_ctl promote" timeout; GitHub #425 (Ian)
|
||||
repmgrd: handle failover situation with only two nodes in the primary
|
||||
location, and at least one node in another location; GitHub #407 (Ian)
|
||||
repmgrd: set "connect_timeout=2" when pinging a server (Ian)
|
||||
|
||||
4.0.4 2018-03-09
|
||||
repmgr: add "standby clone --recovery-conf-only" option; GitHub #382 (Ian)
|
||||
repmgr: make "standby promote" timeout values configurable; GitHub #387 (Ian)
|
||||
repmgr: improve replication slot warnings generated by "node status";
|
||||
|
||||
20
TODO.md
Normal file
20
TODO.md
Normal file
@@ -0,0 +1,20 @@
|
||||
TODO
|
||||
====
|
||||
|
||||
This file contains a list of improvements which are desirable and/or have
|
||||
been requested, and which we aim to address/implement when time and resources
|
||||
permit.
|
||||
|
||||
It is *not* a roadmap and there's no guarantee of any item being implemented
|
||||
within any given timeframe.
|
||||
|
||||
|
||||
Enable suspension of repmgrd failover
|
||||
-------------------------------------
|
||||
|
||||
When performing maintenance, e.g. a switchover, it's necessary to stop all
|
||||
repmgrd nodes to prevent unintended failover; this is obviously inconvenient.
|
||||
We'll need to implement some way of notifying each repmgrd to suspend automatic
|
||||
failover until further notice.
|
||||
|
||||
Requested in GitHub #410 ( https://github.com/2ndQuadrant/repmgr/issues/410 )
|
||||
31
configfile.c
31
configfile.c
@@ -288,6 +288,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
memset(options->node_name, 0, sizeof(options->node_name));
|
||||
memset(options->conninfo, 0, sizeof(options->conninfo));
|
||||
memset(options->data_directory, 0, sizeof(options->data_directory));
|
||||
memset(options->config_directory, 0, sizeof(options->data_directory));
|
||||
memset(options->pg_bindir, 0, sizeof(options->pg_bindir));
|
||||
options->replication_type = REPLICATION_TYPE_PHYSICAL;
|
||||
|
||||
@@ -314,16 +315,24 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->tablespace_mapping.tail = NULL;
|
||||
memset(options->recovery_min_apply_delay, 0, sizeof(options->recovery_min_apply_delay));
|
||||
options->recovery_min_apply_delay_provided = false;
|
||||
memset(options->archive_cleanup_command, 0, sizeof(options->archive_cleanup_command));
|
||||
options->use_primary_conninfo_password = false;
|
||||
memset(options->passfile, 0, sizeof(options->passfile));
|
||||
|
||||
/*-----------------------
|
||||
/*-------------------------
|
||||
* standby promote settings
|
||||
*------------------------
|
||||
*-------------------------
|
||||
*/
|
||||
options->promote_check_timeout = DEFAULT_PROMOTE_CHECK_TIMEOUT;
|
||||
options->promote_check_interval = DEFAULT_PROMOTE_CHECK_INTERVAL;
|
||||
|
||||
/*------------------------
|
||||
* standby follow settings
|
||||
*------------------------
|
||||
*/
|
||||
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
||||
options->standby_follow_timeout = DEFAULT_STANDBY_FOLLOW_TIMEOUT;
|
||||
|
||||
/*-----------------
|
||||
* repmgrd settings
|
||||
*-----------------
|
||||
@@ -343,7 +352,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->degraded_monitoring_timeout = -1;
|
||||
options->async_query_timeout = DEFAULT_ASYNC_QUERY_TIMEOUT;
|
||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||
options->primary_follow_timeout = DEFAULT_PRIMARY_FOLLOW_TIMEOUT;
|
||||
options->standby_reconnect_timeout = DEFAULT_STANDBY_RECONNECT_TIMEOUT;
|
||||
|
||||
/*-------------
|
||||
@@ -463,6 +471,9 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
strncpy(options->conninfo, value, MAXLEN);
|
||||
else if (strcmp(name, "data_directory") == 0)
|
||||
strncpy(options->data_directory, value, MAXPGPATH);
|
||||
else if (strcmp(name, "config_directory") == 0)
|
||||
strncpy(options->config_directory, value, MAXPGPATH);
|
||||
|
||||
else if (strcmp(name, "replication_user") == 0)
|
||||
{
|
||||
if (strlen(value) < NAMEDATALEN)
|
||||
@@ -508,6 +519,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
parse_time_unit_parameter(name, value, options->recovery_min_apply_delay, error_list);
|
||||
options->recovery_min_apply_delay_provided = true;
|
||||
}
|
||||
else if (strcmp(name, "archive_cleanup_command") == 0)
|
||||
strncpy(options->archive_cleanup_command, value, MAXLEN);
|
||||
else if (strcmp(name, "use_primary_conninfo_password") == 0)
|
||||
options->use_primary_conninfo_password = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "passfile") == 0)
|
||||
@@ -520,10 +533,16 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
else if (strcmp(name, "promote_check_interval") == 0)
|
||||
options->promote_check_interval = repmgr_atoi(value, name, error_list, 1);
|
||||
|
||||
/* standby follow settings */
|
||||
else if (strcmp(name, "primary_follow_timeout") == 0)
|
||||
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "standby_follow_timeout") == 0)
|
||||
options->standby_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
/* node check settings */
|
||||
else if (strcmp(name, "archive_ready_warning") == 0)
|
||||
options->archive_ready_warning = repmgr_atoi(value, name, error_list, 1);
|
||||
else if (strcmp(name, "archive_ready_critcial") == 0)
|
||||
else if (strcmp(name, "archive_ready_critical") == 0)
|
||||
options->archive_ready_critical = repmgr_atoi(value, name, error_list, 1);
|
||||
else if (strcmp(name, "replication_lag_warning") == 0)
|
||||
options->replication_lag_warning = repmgr_atoi(value, name, error_list, 1);
|
||||
@@ -564,13 +583,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
else if (strcmp(name, "monitoring_history") == 0)
|
||||
options->monitoring_history = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "degraded_monitoring_timeout") == 0)
|
||||
options->degraded_monitoring_timeout = repmgr_atoi(value, name, error_list, 1);
|
||||
options->degraded_monitoring_timeout = repmgr_atoi(value, name, error_list, -1);
|
||||
else if (strcmp(name, "async_query_timeout") == 0)
|
||||
options->async_query_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "primary_notification_timeout") == 0)
|
||||
options->primary_notification_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "primary_follow_timeout") == 0)
|
||||
options->primary_follow_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "standby_reconnect_timeout") == 0)
|
||||
options->standby_reconnect_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
|
||||
17
configfile.h
17
configfile.h
@@ -73,6 +73,7 @@ typedef struct
|
||||
char conninfo[MAXLEN];
|
||||
char replication_user[NAMEDATALEN];
|
||||
char data_directory[MAXPGPATH];
|
||||
char config_directory[MAXPGPATH];
|
||||
char pg_bindir[MAXPGPATH];
|
||||
int replication_type;
|
||||
|
||||
@@ -89,6 +90,7 @@ typedef struct
|
||||
TablespaceList tablespace_mapping;
|
||||
char recovery_min_apply_delay[MAXLEN];
|
||||
bool recovery_min_apply_delay_provided;
|
||||
char archive_cleanup_command[MAXLEN];
|
||||
bool use_primary_conninfo_password;
|
||||
char passfile[MAXPGPATH];
|
||||
|
||||
@@ -96,6 +98,10 @@ typedef struct
|
||||
int promote_check_timeout;
|
||||
int promote_check_interval;
|
||||
|
||||
/* standby follow settings */
|
||||
int primary_follow_timeout;
|
||||
int standby_follow_timeout;
|
||||
|
||||
/* node check settings */
|
||||
int archive_ready_warning;
|
||||
int archive_ready_critical;
|
||||
@@ -118,7 +124,6 @@ typedef struct
|
||||
int degraded_monitoring_timeout;
|
||||
int async_query_timeout;
|
||||
int primary_notification_timeout;
|
||||
int primary_follow_timeout;
|
||||
int standby_reconnect_timeout;
|
||||
|
||||
/* BDR settings */
|
||||
@@ -158,13 +163,16 @@ typedef struct
|
||||
|
||||
#define T_CONFIGURATION_OPTIONS_INITIALIZER { \
|
||||
/* node information */ \
|
||||
UNKNOWN_NODE_ID, "", "", "", "", "", REPLICATION_TYPE_PHYSICAL, \
|
||||
UNKNOWN_NODE_ID, "", "", "", "", "", "", REPLICATION_TYPE_PHYSICAL, \
|
||||
/* log settings */ \
|
||||
"", "", "", DEFAULT_LOG_STATUS_INTERVAL, \
|
||||
/* standby action settings */ \
|
||||
false, "", "", { NULL, NULL }, "", false, false, "", \
|
||||
/* standby clone settings */ \
|
||||
false, "", "", { NULL, NULL }, "", false, "", false, "", \
|
||||
/* standby promote settings */ \
|
||||
DEFAULT_PROMOTE_CHECK_TIMEOUT, DEFAULT_PROMOTE_CHECK_INTERVAL, \
|
||||
/* standby follow settings */ \
|
||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||
DEFAULT_STANDBY_FOLLOW_TIMEOUT, \
|
||||
/* node check settings */ \
|
||||
DEFAULT_ARCHIVE_READY_WARNING, DEFAULT_ARCHIVE_READY_CRITICAL, \
|
||||
DEFAULT_REPLICATION_LAG_WARNING, DEFAULT_REPLICATION_LAG_CRITICAL, \
|
||||
@@ -178,7 +186,6 @@ typedef struct
|
||||
false, -1, \
|
||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||
DEFAULT_STANDBY_RECONNECT_TIMEOUT, \
|
||||
/* BDR settings */ \
|
||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||
|
||||
18
configure
vendored
18
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.0.4.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.0.5.
|
||||
#
|
||||
# Report bugs to <pgsql-bugs@postgresql.org>.
|
||||
#
|
||||
@@ -582,8 +582,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='repmgr'
|
||||
PACKAGE_TARNAME='repmgr'
|
||||
PACKAGE_VERSION='4.0.4'
|
||||
PACKAGE_STRING='repmgr 4.0.4'
|
||||
PACKAGE_VERSION='4.0.5'
|
||||
PACKAGE_STRING='repmgr 4.0.5'
|
||||
PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org'
|
||||
PACKAGE_URL='https://2ndquadrant.com/en/resources/repmgr/'
|
||||
|
||||
@@ -1178,7 +1178,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures repmgr 4.0.4 to adapt to many kinds of systems.
|
||||
\`configure' configures repmgr 4.0.5 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1239,7 +1239,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of repmgr 4.0.4:";;
|
||||
short | recursive ) echo "Configuration of repmgr 4.0.5:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1313,7 +1313,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
repmgr configure 4.0.4
|
||||
repmgr configure 4.0.5
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -1332,7 +1332,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by repmgr $as_me 4.0.4, which was
|
||||
It was created by repmgr $as_me 4.0.5, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2359,7 +2359,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by repmgr $as_me 4.0.4, which was
|
||||
This file was extended by repmgr $as_me 4.0.5, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -2422,7 +2422,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
repmgr config.status 4.0.4
|
||||
repmgr config.status 4.0.5
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
AC_INIT([repmgr], [4.0.4], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
|
||||
AC_INIT([repmgr], [4.0.6], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
|
||||
|
||||
AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.])
|
||||
|
||||
|
||||
149
controldata.c
149
controldata.c
@@ -37,13 +37,8 @@ get_system_identifier(const char *data_directory)
|
||||
uint64 system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
system_identifier = control_file_info->system_identifier;
|
||||
|
||||
if (control_file_info->control_file_processed == true)
|
||||
system_identifier = control_file_info->control_file->system_identifier;
|
||||
else
|
||||
system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
||||
|
||||
pfree(control_file_info->control_file);
|
||||
pfree(control_file_info);
|
||||
|
||||
return system_identifier;
|
||||
@@ -57,13 +52,8 @@ get_db_state(const char *data_directory)
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
|
||||
if (control_file_info->control_file_processed == true)
|
||||
state = control_file_info->control_file->state;
|
||||
else
|
||||
/* if we were unable to parse the control file, assume DB is shut down */
|
||||
state = DB_SHUTDOWNED;
|
||||
state = control_file_info->state;
|
||||
|
||||
pfree(control_file_info->control_file);
|
||||
pfree(control_file_info);
|
||||
|
||||
return state;
|
||||
@@ -78,12 +68,8 @@ get_latest_checkpoint_location(const char *data_directory)
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
|
||||
if (control_file_info->control_file_processed == false)
|
||||
return InvalidXLogRecPtr;
|
||||
checkPoint = control_file_info->checkPoint;
|
||||
|
||||
checkPoint = control_file_info->control_file->checkPoint;
|
||||
|
||||
pfree(control_file_info->control_file);
|
||||
pfree(control_file_info);
|
||||
|
||||
return checkPoint;
|
||||
@@ -98,16 +84,8 @@ get_data_checksum_version(const char *data_directory)
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
|
||||
if (control_file_info->control_file_processed == false)
|
||||
{
|
||||
data_checksum_version = -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
data_checksum_version = (int) control_file_info->control_file->data_checksum_version;
|
||||
}
|
||||
data_checksum_version = (int) control_file_info->data_checksum_version;
|
||||
|
||||
pfree(control_file_info->control_file);
|
||||
pfree(control_file_info);
|
||||
|
||||
return data_checksum_version;
|
||||
@@ -139,33 +117,109 @@ describe_db_state(DBState state)
|
||||
|
||||
|
||||
/*
|
||||
* we maintain our own version of get_controlfile() as we need cross-version
|
||||
* We maintain our own version of get_controlfile() as we need cross-version
|
||||
* compatibility, and also don't care if the file isn't readable.
|
||||
*/
|
||||
static ControlFileInfo *
|
||||
get_controlfile(const char *DataDir)
|
||||
{
|
||||
ControlFileInfo *control_file_info;
|
||||
int fd;
|
||||
FILE *fp = NULL;
|
||||
int fd, ret, version_num;
|
||||
char PgVersionPath[MAXPGPATH] = "";
|
||||
char ControlFilePath[MAXPGPATH] = "";
|
||||
char file_version_string[64] = "";
|
||||
long file_major, file_minor;
|
||||
char *endptr = NULL;
|
||||
void *ControlFileDataPtr = NULL;
|
||||
int expected_size = 0;
|
||||
|
||||
control_file_info = palloc0(sizeof(ControlFileInfo));
|
||||
|
||||
/* set default values */
|
||||
control_file_info->control_file_processed = false;
|
||||
control_file_info->control_file = palloc0(sizeof(ControlFileData));
|
||||
control_file_info->system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
||||
control_file_info->state = DB_SHUTDOWNED;
|
||||
control_file_info->checkPoint = InvalidXLogRecPtr;
|
||||
control_file_info->data_checksum_version = -1;
|
||||
|
||||
/*
|
||||
* Read PG_VERSION, as we'll need to determine which struct to read
|
||||
* the control file contents into
|
||||
*/
|
||||
snprintf(PgVersionPath, MAXPGPATH, "%s/PG_VERSION", DataDir);
|
||||
|
||||
fp = fopen(PgVersionPath, "r");
|
||||
|
||||
if (fp == NULL)
|
||||
{
|
||||
log_warning(_("could not open file \"%s\" for reading"),
|
||||
PgVersionPath);
|
||||
log_detail("%s", strerror(errno));
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
file_version_string[0] = '\0';
|
||||
|
||||
ret = fscanf(fp, "%63s", file_version_string);
|
||||
fclose(fp);
|
||||
|
||||
if (ret != 1 || endptr == file_version_string)
|
||||
{
|
||||
log_warning(_("unable to determine major version number from PG_VERSION"));
|
||||
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
file_major = strtol(file_version_string, &endptr, 10);
|
||||
file_minor = 0;
|
||||
|
||||
if (*endptr == '.')
|
||||
file_minor = strtol(endptr + 1, NULL, 10);
|
||||
|
||||
version_num = ((int) file_major * 10000) + ((int) file_minor * 100);
|
||||
|
||||
if (version_num < 90300)
|
||||
{
|
||||
log_warning(_("Data directory appears to be initialised for %s"), file_version_string);
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
|
||||
snprintf(ControlFilePath, MAXPGPATH, "%s/global/pg_control", DataDir);
|
||||
|
||||
if ((fd = open(ControlFilePath, O_RDONLY | PG_BINARY, 0)) == -1)
|
||||
{
|
||||
log_debug("could not open file \"%s\" for reading: %s",
|
||||
ControlFilePath, strerror(errno));
|
||||
log_warning(_("could not open file \"%s\" for reading"),
|
||||
ControlFilePath);
|
||||
log_detail("%s", strerror(errno));
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
if (read(fd, control_file_info->control_file, sizeof(ControlFileData)) != sizeof(ControlFileData))
|
||||
|
||||
if (version_num >= 90500)
|
||||
{
|
||||
log_debug("could not read file \"%s\": %s",
|
||||
ControlFilePath, strerror(errno));
|
||||
expected_size = sizeof(ControlFileData95);
|
||||
ControlFileDataPtr = palloc0(expected_size);
|
||||
}
|
||||
else if (version_num >= 90400)
|
||||
{
|
||||
expected_size = sizeof(ControlFileData94);
|
||||
ControlFileDataPtr = palloc0(expected_size);
|
||||
}
|
||||
else if (version_num >= 90300)
|
||||
{
|
||||
expected_size = sizeof(ControlFileData93);
|
||||
ControlFileDataPtr = palloc0(expected_size);
|
||||
}
|
||||
|
||||
|
||||
if (read(fd, ControlFileDataPtr, expected_size) != expected_size)
|
||||
{
|
||||
log_warning(_("could not read file \"%s\""),
|
||||
ControlFilePath);
|
||||
log_detail("%s", strerror(errno));
|
||||
|
||||
return control_file_info;
|
||||
}
|
||||
|
||||
@@ -173,6 +227,33 @@ get_controlfile(const char *DataDir)
|
||||
|
||||
control_file_info->control_file_processed = true;
|
||||
|
||||
if (version_num >= 90500)
|
||||
{
|
||||
ControlFileData95 *ptr = (struct ControlFileData95 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
}
|
||||
else if (version_num >= 90400)
|
||||
{
|
||||
ControlFileData94 *ptr = (struct ControlFileData94 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
}
|
||||
else if (version_num >= 90300)
|
||||
{
|
||||
ControlFileData93 *ptr = (struct ControlFileData93 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
}
|
||||
|
||||
pfree(ControlFileDataPtr);
|
||||
|
||||
/*
|
||||
* We don't check the CRC here as we're potentially checking a pg_control
|
||||
* file from a different PostgreSQL version to the one repmgr was compiled
|
||||
|
||||
251
controldata.h
251
controldata.h
@@ -12,12 +12,261 @@
|
||||
#include "postgres_fe.h"
|
||||
#include "catalog/pg_control.h"
|
||||
|
||||
/*
|
||||
* A simplified representation of pg_control containing only those fields
|
||||
* required by repmgr.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
bool control_file_processed;
|
||||
ControlFileData *control_file;
|
||||
uint64 system_identifier;
|
||||
DBState state;
|
||||
XLogRecPtr checkPoint;
|
||||
uint32 data_checksum_version;
|
||||
} ControlFileInfo;
|
||||
|
||||
|
||||
|
||||
/* Same for 9.3, 9.4 */
|
||||
typedef struct CheckPoint93
|
||||
{
|
||||
XLogRecPtr redo; /* next RecPtr available when we began to
|
||||
* create CheckPoint (i.e. REDO start point) */
|
||||
TimeLineID ThisTimeLineID; /* current TLI */
|
||||
TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new
|
||||
* timeline (equals ThisTimeLineID otherwise) */
|
||||
bool fullPageWrites; /* current full_page_writes */
|
||||
uint32 nextXidEpoch; /* higher-order bits of nextXid */
|
||||
TransactionId nextXid; /* next free XID */
|
||||
Oid nextOid; /* next free OID */
|
||||
MultiXactId nextMulti; /* next free MultiXactId */
|
||||
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
|
||||
TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
|
||||
Oid oldestXidDB; /* database with minimum datfrozenxid */
|
||||
MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */
|
||||
Oid oldestMultiDB; /* database with minimum datminmxid */
|
||||
pg_time_t time; /* time stamp of checkpoint */
|
||||
|
||||
TransactionId oldestActiveXid;
|
||||
} CheckPoint93;
|
||||
|
||||
|
||||
/* Same for 9.5, 9.6, 10, HEAD */
|
||||
typedef struct CheckPoint95
|
||||
{
|
||||
XLogRecPtr redo; /* next RecPtr available when we began to
|
||||
* create CheckPoint (i.e. REDO start point) */
|
||||
TimeLineID ThisTimeLineID; /* current TLI */
|
||||
TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new
|
||||
* timeline (equals ThisTimeLineID otherwise) */
|
||||
bool fullPageWrites; /* current full_page_writes */
|
||||
uint32 nextXidEpoch; /* higher-order bits of nextXid */
|
||||
TransactionId nextXid; /* next free XID */
|
||||
Oid nextOid; /* next free OID */
|
||||
MultiXactId nextMulti; /* next free MultiXactId */
|
||||
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
|
||||
TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */
|
||||
Oid oldestXidDB; /* database with minimum datfrozenxid */
|
||||
MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */
|
||||
Oid oldestMultiDB; /* database with minimum datminmxid */
|
||||
pg_time_t time; /* time stamp of checkpoint */
|
||||
TransactionId oldestCommitTsXid; /* oldest Xid with valid commit
|
||||
* timestamp */
|
||||
TransactionId newestCommitTsXid; /* newest Xid with valid commit
|
||||
* timestamp */
|
||||
|
||||
TransactionId oldestActiveXid;
|
||||
} CheckPoint95;
|
||||
|
||||
|
||||
typedef struct ControlFileData93
|
||||
{
|
||||
uint64 system_identifier;
|
||||
|
||||
uint32 pg_control_version; /* PG_CONTROL_VERSION */
|
||||
uint32 catalog_version_no; /* see catversion.h */
|
||||
|
||||
DBState state; /* see enum above */
|
||||
pg_time_t time; /* time stamp of last pg_control update */
|
||||
XLogRecPtr checkPoint; /* last check point record ptr */
|
||||
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
|
||||
|
||||
CheckPoint93 checkPointCopy; /* copy of last check point record */
|
||||
|
||||
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
|
||||
|
||||
XLogRecPtr minRecoveryPoint;
|
||||
TimeLineID minRecoveryPointTLI;
|
||||
XLogRecPtr backupStartPoint;
|
||||
XLogRecPtr backupEndPoint;
|
||||
bool backupEndRequired;
|
||||
|
||||
int wal_level;
|
||||
int MaxConnections;
|
||||
int max_prepared_xacts;
|
||||
int max_locks_per_xact;
|
||||
|
||||
uint32 maxAlign; /* alignment requirement for tuples */
|
||||
double floatFormat; /* constant 1234567.0 */
|
||||
|
||||
uint32 blcksz; /* data block size for this DB */
|
||||
uint32 relseg_size; /* blocks per segment of large relation */
|
||||
|
||||
uint32 xlog_blcksz; /* block size within WAL files */
|
||||
uint32 xlog_seg_size; /* size of each WAL segment */
|
||||
|
||||
uint32 nameDataLen; /* catalog name field width */
|
||||
uint32 indexMaxKeys; /* max number of columns in an index */
|
||||
|
||||
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
|
||||
|
||||
/* flag indicating internal format of timestamp, interval, time */
|
||||
bool enableIntTimes; /* int64 storage enabled? */
|
||||
|
||||
/* flags indicating pass-by-value status of various types */
|
||||
bool float4ByVal; /* float4 pass-by-value? */
|
||||
bool float8ByVal; /* float8, int8, etc pass-by-value? */
|
||||
|
||||
/* Are data pages protected by checksums? Zero if no checksum version */
|
||||
uint32 data_checksum_version;
|
||||
|
||||
} ControlFileData93;
|
||||
|
||||
|
||||
/*
|
||||
* Following fields added since 9.3:
|
||||
*
|
||||
* bool wal_log_hints;
|
||||
* int max_worker_processes;
|
||||
* uint32 loblksize;
|
||||
*
|
||||
*/
|
||||
typedef struct ControlFileData94
|
||||
{
|
||||
uint64 system_identifier;
|
||||
|
||||
uint32 pg_control_version; /* PG_CONTROL_VERSION */
|
||||
uint32 catalog_version_no; /* see catversion.h */
|
||||
|
||||
DBState state; /* see enum above */
|
||||
pg_time_t time; /* time stamp of last pg_control update */
|
||||
XLogRecPtr checkPoint; /* last check point record ptr */
|
||||
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
|
||||
|
||||
CheckPoint93 checkPointCopy; /* copy of last check point record */
|
||||
|
||||
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
|
||||
|
||||
XLogRecPtr minRecoveryPoint;
|
||||
TimeLineID minRecoveryPointTLI;
|
||||
XLogRecPtr backupStartPoint;
|
||||
XLogRecPtr backupEndPoint;
|
||||
bool backupEndRequired;
|
||||
|
||||
int wal_level;
|
||||
bool wal_log_hints;
|
||||
int MaxConnections;
|
||||
int max_worker_processes;
|
||||
int max_prepared_xacts;
|
||||
int max_locks_per_xact;
|
||||
|
||||
uint32 maxAlign; /* alignment requirement for tuples */
|
||||
double floatFormat; /* constant 1234567.0 */
|
||||
|
||||
uint32 blcksz; /* data block size for this DB */
|
||||
uint32 relseg_size; /* blocks per segment of large relation */
|
||||
|
||||
uint32 xlog_blcksz; /* block size within WAL files */
|
||||
uint32 xlog_seg_size; /* size of each WAL segment */
|
||||
|
||||
uint32 nameDataLen; /* catalog name field width */
|
||||
uint32 indexMaxKeys; /* max number of columns in an index */
|
||||
|
||||
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
|
||||
uint32 loblksize; /* chunk size in pg_largeobject */
|
||||
|
||||
bool enableIntTimes; /* int64 storage enabled? */
|
||||
|
||||
bool float4ByVal; /* float4 pass-by-value? */
|
||||
bool float8ByVal; /* float8, int8, etc pass-by-value? */
|
||||
|
||||
/* Are data pages protected by checksums? Zero if no checksum version */
|
||||
uint32 data_checksum_version;
|
||||
|
||||
} ControlFileData94;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Following field added since 9.4:
|
||||
*
|
||||
* bool track_commit_timestamp;
|
||||
*
|
||||
* Unchanged in 9.6
|
||||
*
|
||||
* In 10, following field appended *after* "data_checksum_version":
|
||||
*
|
||||
* char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN];
|
||||
*
|
||||
* (but we don't care about that)
|
||||
*/
|
||||
|
||||
typedef struct ControlFileData95
|
||||
{
|
||||
uint64 system_identifier;
|
||||
|
||||
uint32 pg_control_version; /* PG_CONTROL_VERSION */
|
||||
uint32 catalog_version_no; /* see catversion.h */
|
||||
|
||||
DBState state; /* see enum above */
|
||||
pg_time_t time; /* time stamp of last pg_control update */
|
||||
XLogRecPtr checkPoint; /* last check point record ptr */
|
||||
XLogRecPtr prevCheckPoint; /* previous check point record ptr */
|
||||
|
||||
CheckPoint95 checkPointCopy; /* copy of last check point record */
|
||||
|
||||
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
|
||||
|
||||
XLogRecPtr minRecoveryPoint;
|
||||
TimeLineID minRecoveryPointTLI;
|
||||
XLogRecPtr backupStartPoint;
|
||||
XLogRecPtr backupEndPoint;
|
||||
bool backupEndRequired;
|
||||
|
||||
int wal_level;
|
||||
bool wal_log_hints;
|
||||
int MaxConnections;
|
||||
int max_worker_processes;
|
||||
int max_prepared_xacts;
|
||||
int max_locks_per_xact;
|
||||
bool track_commit_timestamp;
|
||||
|
||||
uint32 maxAlign; /* alignment requirement for tuples */
|
||||
double floatFormat; /* constant 1234567.0 */
|
||||
|
||||
uint32 blcksz; /* data block size for this DB */
|
||||
uint32 relseg_size; /* blocks per segment of large relation */
|
||||
|
||||
uint32 xlog_blcksz; /* block size within WAL files */
|
||||
uint32 xlog_seg_size; /* size of each WAL segment */
|
||||
|
||||
uint32 nameDataLen; /* catalog name field width */
|
||||
uint32 indexMaxKeys; /* max number of columns in an index */
|
||||
|
||||
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
|
||||
uint32 loblksize; /* chunk size in pg_largeobject */
|
||||
|
||||
bool enableIntTimes; /* int64 storage enabled? */
|
||||
|
||||
bool float4ByVal; /* float4 pass-by-value? */
|
||||
bool float8ByVal; /* float8, int8, etc pass-by-value? */
|
||||
|
||||
uint32 data_checksum_version;
|
||||
|
||||
} ControlFileData95;
|
||||
|
||||
|
||||
|
||||
/*
 * Accessors taking the path of a data directory.
 *
 * NOTE(review): presumably these read the control file located in
 * "data_directory" - confirm against the implementation.
 */

/* state recorded for the given data directory */
extern DBState get_db_state(const char *data_directory);

/* printable description of a DBState value */
extern const char *describe_db_state(DBState state);

/*
 * checksum version for the given data directory
 *
 * NOTE(review): looks like a negative value means "could not be determined"
 * and zero means "checksums disabled" - confirm against the implementation.
 */
extern int	get_data_checksum_version(const char *data_directory);
|
||||
|
||||
251
dbutils.c
251
dbutils.c
@@ -23,6 +23,7 @@
|
||||
#include <sys/time.h>
|
||||
#include <sys/stat.h>
|
||||
#include <dirent.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include "repmgr.h"
|
||||
#include "dbutils.h"
|
||||
@@ -124,7 +125,7 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
|
||||
|
||||
initialize_conninfo_params(&conninfo_params, false);
|
||||
|
||||
parse_success = parse_conninfo_string(conninfo, &conninfo_params, errmsg, false);
|
||||
parse_success = parse_conninfo_string(conninfo, &conninfo_params, &errmsg, false);
|
||||
|
||||
if (parse_success == false)
|
||||
{
|
||||
@@ -311,6 +312,18 @@ is_superuser_connection(PGconn *conn, t_connection_user *userinfo)
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
close_connection(PGconn **conn)
|
||||
{
|
||||
if (*conn == NULL)
|
||||
return;
|
||||
|
||||
PQfinish(*conn);
|
||||
|
||||
*conn = NULL;
|
||||
}
|
||||
|
||||
|
||||
/* =============================== */
|
||||
/* conninfo manipulation functions */
|
||||
/* =============================== */
|
||||
@@ -358,6 +371,37 @@ get_conninfo_value(const char *conninfo, const char *keyword, char *output)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Get a default conninfo value for the provided parameter, and copy
|
||||
* it to the 'output' buffer.
|
||||
*
|
||||
* Returns true on success, or false on failure (provided keyword not found).
|
||||
*
|
||||
*/
|
||||
bool
|
||||
get_conninfo_default_value(const char *param, char *output, int maxlen)
|
||||
{
|
||||
PQconninfoOption *defs = NULL;
|
||||
PQconninfoOption *def = NULL;
|
||||
bool found = false;
|
||||
|
||||
defs = PQconndefaults();
|
||||
|
||||
for (def = defs; def->keyword; def++)
|
||||
{
|
||||
if (strncmp(def->keyword, param, maxlen) == 0)
|
||||
{
|
||||
strncpy(output, def->val, maxlen);
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
|
||||
PQconninfoFree(defs);
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults)
|
||||
{
|
||||
@@ -541,7 +585,7 @@ param_get(t_conninfo_param_list *param_list, const char *param)
|
||||
/*
|
||||
* Parse a conninfo string into a t_conninfo_param_list
|
||||
*
|
||||
* See conn_to_param_list() to do the same for a PQconn
|
||||
* See conn_to_param_list() to do the same for a PGconn
|
||||
*
|
||||
* "ignore_local_params": ignores those parameters specific
|
||||
* to a local installation, i.e. when parsing an upstream
|
||||
@@ -549,12 +593,12 @@ param_get(t_conninfo_param_list *param_list, const char *param)
|
||||
* don't copy that node's values
|
||||
*/
|
||||
bool
|
||||
parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char *errmsg, bool ignore_local_params)
|
||||
parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char **errmsg, bool ignore_local_params)
|
||||
{
|
||||
PQconninfoOption *connOptions = NULL;
|
||||
PQconninfoOption *option = NULL;
|
||||
|
||||
connOptions = PQconninfoParse(conninfo_str, &errmsg);
|
||||
connOptions = PQconninfoParse(conninfo_str, errmsg);
|
||||
|
||||
if (connOptions == NULL)
|
||||
return false;
|
||||
@@ -585,10 +629,19 @@ parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_lis
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Parse a PQconn into a t_conninfo_param_list
|
||||
* Parse a PGconn into a t_conninfo_param_list
|
||||
*
|
||||
* See parse_conninfo_string() to do the same for a conninfo string
|
||||
*
|
||||
* NOTE: the current use case for this is to take an active connection,
|
||||
* replace the existing username (typically replacing it with the superuser
|
||||
* or replication user name), and make a new connection as that user.
|
||||
* If the "password" field is set, it will cause any connection made with
|
||||
* these parameters to fail (unless of course the password happens to be the
|
||||
* same). Therefore we remove the password altogether, and rely on it being
|
||||
* available via .pgpass.
|
||||
*/
|
||||
void
|
||||
conn_to_param_list(PGconn *conn, t_conninfo_param_list *param_list)
|
||||
@@ -604,6 +657,10 @@ conn_to_param_list(PGconn *conn, t_conninfo_param_list *param_list)
|
||||
(option->val != NULL && option->val[0] == '\0'))
|
||||
continue;
|
||||
|
||||
/* Ignore "password" */
|
||||
if (strcmp(option->keyword, "password") == 0)
|
||||
continue;
|
||||
|
||||
param_set(param_list, option->keyword, option->val);
|
||||
}
|
||||
|
||||
@@ -984,7 +1041,7 @@ get_cluster_size(PGconn *conn, char *size)
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
appendPQExpBuffer(&query,
|
||||
"SELECT pg_catalog.pg_size_pretty(SUM(pg_catalog.pg_database_size(oid))::bigint) "
|
||||
"SELECT pg_catalog.pg_size_pretty(pg_catalog.sum(pg_catalog.pg_database_size(oid))::bigint) "
|
||||
" FROM pg_catalog.pg_database ");
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_cluster_size():\n%s", query.data);
|
||||
@@ -1327,67 +1384,6 @@ get_replication_info(PGconn *conn, ReplInfo *replication_info)
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason)
|
||||
{
|
||||
bool can_use = true;
|
||||
|
||||
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
|
||||
server_version_num = get_server_version(conn, NULL);
|
||||
|
||||
if (server_version_num < 90500)
|
||||
{
|
||||
appendPQExpBuffer(reason,
|
||||
_("pg_rewind available from PostgreSQL 9.5"));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (guc_set(conn, "full_page_writes", "=", "off"))
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
|
||||
appendPQExpBuffer(reason,
|
||||
_("\"full_page_writes\" must be set to \"on\""));
|
||||
|
||||
can_use = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* "wal_log_hints" off - are data checksums available? Note: we're
|
||||
* checking the local pg_control file here as the value will be the same
|
||||
* throughout the cluster and saves a round-trip to the demotion
|
||||
* candidate.
|
||||
*/
|
||||
if (guc_set(conn, "wal_log_hints", "=", "on") == false)
|
||||
{
|
||||
int data_checksum_version = get_data_checksum_version(data_directory);
|
||||
|
||||
if (data_checksum_version < 0)
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
|
||||
appendPQExpBuffer(reason,
|
||||
_("\"wal_log_hints\" is set to \"off\" but unable to determine checksum version"));
|
||||
can_use = false;
|
||||
}
|
||||
else if (data_checksum_version == 0)
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
|
||||
appendPQExpBuffer(reason,
|
||||
_("\"wal_log_hints\" is set to \"off\" and checksums are disabled"));
|
||||
|
||||
can_use = false;
|
||||
}
|
||||
}
|
||||
|
||||
return can_use;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
get_ready_archive_files(PGconn *conn, const char *data_directory)
|
||||
{
|
||||
@@ -1769,7 +1765,7 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row)
|
||||
strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN);
|
||||
node_info->priority = atoi(PQgetvalue(res, row, 8));
|
||||
node_info->active = atobool(PQgetvalue(res, row, 9));
|
||||
strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXLEN);
|
||||
strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXPGPATH);
|
||||
|
||||
/* This won't normally be set */
|
||||
strncpy(node_info->upstream_node_name, PQgetvalue(res, row, 11), MAXLEN);
|
||||
@@ -2423,7 +2419,9 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
|
||||
" UPDATE repmgr.nodes "
|
||||
" SET active = FALSE "
|
||||
" WHERE type = 'primary' "
|
||||
" AND active IS TRUE ");
|
||||
" AND active IS TRUE "
|
||||
" AND node_id != %i ",
|
||||
this_node_id);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
@@ -2445,7 +2443,8 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
|
||||
appendPQExpBuffer(&query,
|
||||
" UPDATE repmgr.nodes"
|
||||
" SET type = 'primary', "
|
||||
" upstream_node_id = NULL "
|
||||
" upstream_node_id = NULL, "
|
||||
" active = TRUE "
|
||||
" WHERE node_id = %i ",
|
||||
this_node_id);
|
||||
|
||||
@@ -2608,9 +2607,11 @@ witness_copy_node_records(PGconn *primary_conn, PGconn *witness_conn)
|
||||
log_error(_("unable to defer constraints:\n %s"),
|
||||
PQerrorMessage(witness_conn));
|
||||
rollback_transaction(witness_conn);
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
PQclear(res);
|
||||
|
||||
/* truncate existing records */
|
||||
|
||||
@@ -2631,6 +2632,8 @@ witness_copy_node_records(PGconn *primary_conn, PGconn *witness_conn)
|
||||
/* and done */
|
||||
commit_transaction(witness_conn);
|
||||
|
||||
clear_node_info_list(&nodes);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -2645,7 +2648,7 @@ delete_node_record(PGconn *conn, int node)
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
"DELETE FROM repmgr.nodes "
|
||||
" WHERE node_id = %d",
|
||||
" WHERE node_id = %i",
|
||||
node);
|
||||
|
||||
log_verbose(LOG_DEBUG, "delete_node_record():\n %s", query.data);
|
||||
@@ -2715,6 +2718,7 @@ update_node_record_slot_name(PGconn *primary_conn, int node_id, char *slot_name)
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *node_info)
|
||||
{
|
||||
@@ -2729,14 +2733,14 @@ get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *no
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
" SELECT current_setting('max_wal_senders')::INT AS max_wal_senders, "
|
||||
" (SELECT COUNT(*) FROM pg_catalog.pg_stat_replication) AS attached_wal_receivers, ");
|
||||
" SELECT pg_catalog.current_setting('max_wal_senders')::INT AS max_wal_senders, "
|
||||
" (SELECT pg_catalog.count(*) FROM pg_catalog.pg_stat_replication) AS attached_wal_receivers, ");
|
||||
|
||||
/* no replication slots in PostgreSQL 9.3 */
|
||||
if (server_version_num < 90400)
|
||||
{
|
||||
appendPQExpBuffer(&query,
|
||||
" 0 AS max_replication_slots, "
|
||||
" 0 AS max_replication_slots, "
|
||||
" 0 AS total_replication_slots, "
|
||||
" 0 AS active_replication_slots, "
|
||||
" 0 AS inactive_replication_slots, ");
|
||||
@@ -2745,16 +2749,16 @@ get_node_replication_stats(PGconn *conn, int server_version_num, t_node_info *no
|
||||
{
|
||||
appendPQExpBuffer(&query,
|
||||
" current_setting('max_replication_slots')::INT AS max_replication_slots, "
|
||||
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, "
|
||||
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE) AS active_replication_slots, "
|
||||
" (SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE) AS inactive_replication_slots, ");
|
||||
" (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, "
|
||||
" (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE) AS active_replication_slots, "
|
||||
" (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE) AS inactive_replication_slots, ");
|
||||
}
|
||||
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
" pg_catalog.pg_is_in_recovery() AS in_recovery");
|
||||
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_node_replication_stats():\n%s", query.data);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
@@ -2791,7 +2795,7 @@ is_downstream_node_attached(PGconn *conn, char *node_name)
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
" SELECT COUNT(*) FROM pg_catalog.pg_stat_replication "
|
||||
" SELECT pg_catalog.count(*) FROM pg_catalog.pg_stat_replication "
|
||||
" WHERE application_name = '%s'",
|
||||
node_name);
|
||||
res = PQexec(conn, query.data);
|
||||
@@ -2881,21 +2885,21 @@ get_datadir_configuration_files(PGconn *conn, KeyValueList *list)
|
||||
appendPQExpBuffer(&query,
|
||||
"WITH files AS ( "
|
||||
" WITH dd AS ( "
|
||||
" SELECT setting "
|
||||
" SELECT setting "
|
||||
" FROM pg_catalog.pg_settings "
|
||||
" WHERE name = 'data_directory') "
|
||||
" SELECT distinct(sourcefile) AS config_file"
|
||||
" FROM dd, pg_catalog.pg_settings ps "
|
||||
" WHERE ps.sourcefile IS NOT NULL "
|
||||
" AND ps.sourcefile ~ ('^' || dd.setting) "
|
||||
" UNION "
|
||||
" SELECT ps.setting AS config_file"
|
||||
" FROM dd, pg_catalog.pg_settings ps "
|
||||
" WHERE ps.name IN ( 'config_file', 'hba_file', 'ident_file') "
|
||||
" AND ps.setting ~ ('^' || dd.setting) "
|
||||
" SELECT distinct(sourcefile) AS config_file"
|
||||
" FROM dd, pg_catalog.pg_settings ps "
|
||||
" WHERE ps.sourcefile IS NOT NULL "
|
||||
" AND ps.sourcefile ~ ('^' || dd.setting) "
|
||||
" UNION "
|
||||
" SELECT ps.setting AS config_file"
|
||||
" FROM dd, pg_catalog.pg_settings ps "
|
||||
" WHERE ps.name IN ('config_file', 'hba_file', 'ident_file') "
|
||||
" AND ps.setting ~ ('^' || dd.setting) "
|
||||
") "
|
||||
" SELECT config_file, "
|
||||
" regexp_replace(config_file, '^.*\\/','') AS filename "
|
||||
" pg_catalog.regexp_replace(config_file, '^.*\\/','') AS filename "
|
||||
" FROM files "
|
||||
"ORDER BY config_file");
|
||||
|
||||
@@ -2988,7 +2992,7 @@ get_configuration_file_locations(PGconn *conn, t_configfile_list *list)
|
||||
" WHERE name = 'data_directory' "
|
||||
" ) "
|
||||
" SELECT ps.setting, "
|
||||
" regexp_replace(setting, '^.*\\/', '') AS filename, "
|
||||
" pg_catalog.regexp_replace(setting, '^.*\\/', '') AS filename, "
|
||||
" ps.setting ~ ('^' || dd.data_directory) AS in_data_dir "
|
||||
" FROM dd, pg_catalog.pg_settings ps "
|
||||
" WHERE ps.name IN ('hba_file', 'ident_file') "
|
||||
@@ -3138,6 +3142,8 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
|
||||
char event_timestamp[MAXLEN] = "";
|
||||
bool success = true;
|
||||
|
||||
log_verbose(LOG_DEBUG, "_create_event(): event is \"%s\" for node %i", event, node_id);
|
||||
|
||||
/*
|
||||
* Only attempt to write a record if a connection handle was provided.
|
||||
* Also check that the repmgr schema has been properly initialised - if
|
||||
@@ -3405,7 +3411,7 @@ get_event_records(PGconn *conn, int node_id, const char *node_name, const char *
|
||||
/* LEFT JOIN used here as a node record may have been removed */
|
||||
appendPQExpBuffer(&query,
|
||||
" SELECT e.node_id, n.node_name, e.event, e.successful, "
|
||||
" TO_CHAR(e.event_timestamp, 'YYYY-MM-DD HH24:MI:SS') AS timestamp, "
|
||||
" pg_catalog.to_char(e.event_timestamp, 'YYYY-MM-DD HH24:MI:SS') AS timestamp, "
|
||||
" e.details "
|
||||
" FROM repmgr.events e "
|
||||
"LEFT JOIN repmgr.nodes n ON e.node_id = n.node_id ");
|
||||
@@ -3658,7 +3664,7 @@ get_free_replication_slots(PGconn *conn)
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
" SELECT pg_catalog.current_setting('max_replication_slots')::INT - "
|
||||
" COUNT(*) AS free_slots"
|
||||
" pg_catalog.count(*) AS free_slots"
|
||||
" FROM pg_catalog.pg_replication_slots");
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
@@ -3853,6 +3859,45 @@ is_server_available(const char *conninfo)
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
is_server_available_params(t_conninfo_param_list *param_list)
|
||||
{
|
||||
PGPing status = PQpingParams((const char **) param_list->keywords,
|
||||
(const char **) param_list->values,
|
||||
false);
|
||||
|
||||
/* deparsing the param_list adds overhead, so only do it if needed */
|
||||
if (log_level == LOG_DEBUG)
|
||||
{
|
||||
char *conninfo_str = param_list_to_string(param_list);
|
||||
log_verbose(LOG_DEBUG, "ping status for %s is %i", conninfo_str, (int)status);
|
||||
pfree(conninfo_str);
|
||||
}
|
||||
|
||||
if (status == PQPING_OK)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Simple throw-away query to stop a connection handle going stale
|
||||
*/
|
||||
void
|
||||
connection_ping(PGconn *conn)
|
||||
{
|
||||
PGresult *res = PQexec(conn, "SELECT TRUE");
|
||||
|
||||
log_verbose(LOG_DEBUG, "connection_ping(): result is %s", PQresStatus(PQresultStatus(res)));
|
||||
|
||||
PQclear(res);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* ==================== */
|
||||
/* monitoring functions */
|
||||
/* ==================== */
|
||||
@@ -3937,9 +3982,9 @@ get_number_of_monitoring_records_to_delete(PGconn *primary_conn, int keep_histor
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
"SELECT COUNT(*) "
|
||||
"SELECT pg_catalog.count(*) "
|
||||
" FROM repmgr.monitoring_history "
|
||||
" WHERE age(now(), last_monitor_time) >= '%d days'::interval",
|
||||
" WHERE pg_catalog.age(pg_catalog.now(), last_monitor_time) >= '%d days'::interval",
|
||||
keep_history);
|
||||
|
||||
res = PQexec(primary_conn, query.data);
|
||||
@@ -3978,7 +4023,7 @@ delete_monitoring_records(PGconn *primary_conn, int keep_history)
|
||||
{
|
||||
appendPQExpBuffer(&query,
|
||||
"DELETE FROM repmgr.monitoring_history "
|
||||
" WHERE age(now(), last_monitor_time) >= '%d days'::interval ",
|
||||
" WHERE pg_catalog.age(pg_catalog.now(), last_monitor_time) >= '%d days'::interval ",
|
||||
keep_history);
|
||||
}
|
||||
else
|
||||
@@ -4276,7 +4321,7 @@ _is_bdr_db(PGconn *conn, PQExpBufferData *output, bool quiet)
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
"SELECT COUNT(*) FROM pg_catalog.pg_extension WHERE extname='bdr'");
|
||||
"SELECT pg_catalog.count(*) FROM pg_catalog.pg_extension WHERE extname='bdr'");
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
@@ -4389,7 +4434,7 @@ is_bdr_repmgr(PGconn *conn)
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
"SELECT COUNT(*)"
|
||||
"SELECT pg_catalog.count(*)"
|
||||
" FROM repmgr.nodes n"
|
||||
" WHERE n.type != 'bdr' ");
|
||||
|
||||
@@ -4420,8 +4465,8 @@ is_table_in_bdr_replication_set(PGconn *conn, const char *tablename, const char
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
"SELECT COUNT(*) "
|
||||
" FROM UNNEST(bdr.table_get_replication_sets('repmgr.%s')) AS repset "
|
||||
"SELECT pg_catalog.count(*) "
|
||||
" FROM pg_catalog.unnest(bdr.table_get_replication_sets('repmgr.%s')) AS repset "
|
||||
" WHERE repset='%s' ",
|
||||
tablename,
|
||||
set);
|
||||
@@ -4799,8 +4844,8 @@ bdr_node_has_repmgr_set(PGconn *conn, const char *node_name)
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
" SELECT COUNT(*) "
|
||||
" FROM UNNEST(bdr.connection_get_replication_sets('%s') AS repset "
|
||||
" SELECT pg_catalog.count(*) "
|
||||
" FROM pg_catalog.unnest(bdr.connection_get_replication_sets('%s') AS repset "
|
||||
" WHERE repset = 'repmgr'",
|
||||
node_name);
|
||||
|
||||
@@ -4835,7 +4880,7 @@ bdr_node_set_repmgr_set(PGconn *conn, const char *node_name)
|
||||
" SELECT bdr.connection_set_replication_sets( "
|
||||
" ARRAY( "
|
||||
" SELECT repset::TEXT "
|
||||
" FROM UNNEST(bdr.connection_get_replication_sets('%s')) AS repset "
|
||||
" FROM pg_catalog.unnest(bdr.connection_get_replication_sets('%s')) AS repset "
|
||||
" UNION "
|
||||
" SELECT 'repmgr'::TEXT "
|
||||
" ), "
|
||||
|
||||
@@ -353,10 +353,11 @@ PGconn *get_primary_connection(PGconn *standby_conn, int *primary_id, char *p
|
||||
PGconn *get_primary_connection_quiet(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
||||
|
||||
bool is_superuser_connection(PGconn *conn, t_connection_user *userinfo);
|
||||
void close_connection(PGconn **conn);
|
||||
|
||||
/* conninfo manipulation functions */
|
||||
bool get_conninfo_value(const char *conninfo, const char *keyword, char *output);
|
||||
|
||||
bool get_conninfo_default_value(const char *param, char *output, int maxlen);
|
||||
void initialize_conninfo_params(t_conninfo_param_list *param_list, bool set_defaults);
|
||||
void free_conninfo_params(t_conninfo_param_list *param_list);
|
||||
void copy_conninfo_params(t_conninfo_param_list *dest_list, t_conninfo_param_list *source_list);
|
||||
@@ -364,10 +365,11 @@ void conn_to_param_list(PGconn *conn, t_conninfo_param_list *param_list);
|
||||
void param_set(t_conninfo_param_list *param_list, const char *param, const char *value);
|
||||
void param_set_ine(t_conninfo_param_list *param_list, const char *param, const char *value);
|
||||
char *param_get(t_conninfo_param_list *param_list, const char *param);
|
||||
bool parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char *errmsg, bool ignore_local_params);
|
||||
bool parse_conninfo_string(const char *conninfo_str, t_conninfo_param_list *param_list, char **errmsg, bool ignore_local_params);
|
||||
char *param_list_to_string(t_conninfo_param_list *param_list);
|
||||
bool has_passfile(void);
|
||||
|
||||
|
||||
/* transaction functions */
|
||||
bool begin_transaction(PGconn *conn);
|
||||
bool commit_transaction(PGconn *conn);
|
||||
@@ -386,7 +388,6 @@ bool get_cluster_size(PGconn *conn, char *size);
|
||||
int get_server_version(PGconn *conn, char *server_version);
|
||||
RecoveryType get_recovery_type(PGconn *conn);
|
||||
int get_primary_node_id(PGconn *conn);
|
||||
bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
||||
int get_ready_archive_files(PGconn *conn, const char *data_directory);
|
||||
bool identify_system(PGconn *repl_conn, t_system_identification *identification);
|
||||
bool repmgrd_set_local_node_id(PGconn *conn, int local_node_id);
|
||||
@@ -465,6 +466,8 @@ int wait_connection_availability(PGconn *conn, long long timeout);
|
||||
|
||||
/* node availability functions */
|
||||
bool is_server_available(const char *conninfo);
|
||||
bool is_server_available_params(t_conninfo_param_list *param_list);
|
||||
void connection_ping(PGconn *conn);
|
||||
|
||||
/* monitoring functions */
|
||||
void
|
||||
|
||||
@@ -24,8 +24,9 @@
|
||||
series will no longer be actively maintained.
|
||||
</para>
|
||||
<para>
|
||||
repmgr 2.x supports PostgreSQL 9.0 ~ 9.3. While it is compatible
|
||||
with PostgreSQL 9.3, we recommend using repmgr 4.x.
|
||||
&repmgr; 2.x supports PostgreSQL 9.0 ~ 9.3. While it is compatible
|
||||
with PostgreSQL 9.3, we recommend using repmgr 4.x. &repmgr; 2.x is
|
||||
no longer maintained.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
@@ -35,7 +36,7 @@
|
||||
Replication slots, introduced in PostgreSQL 9.4, ensure that the
|
||||
primary server will retain WAL files until they have been consumed
|
||||
by all standby servers. This makes WAL file management much easier,
|
||||
and if used `repmgr` will no longer insist on a fixed minimum number
|
||||
and if used &repmgr; will no longer insist on a fixed minimum number
|
||||
(default: 5000) of WAL files being retained.
|
||||
</para>
|
||||
<para>
|
||||
@@ -86,12 +87,27 @@
|
||||
</para>
|
||||
<para>
|
||||
To minimize downtime during major upgrades, for more recent PostgreSQL
|
||||
versions <ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
|
||||
versions (PostgreSQL 9.4 and later),
|
||||
<ulink url="https://www.2ndquadrant.com/en/resources/pglogical/">pglogical</ulink>
|
||||
can be used to set up a parallel cluster using the newer PostgreSQL version,
|
||||
which can be kept in sync with the existing production cluster until the
|
||||
new cluster is ready to be put into production.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-libdir-repmgr-error">
|
||||
<title>What does this error mean: <literal>ERROR: could not access file "$libdir/repmgr"</literal>?</title>
|
||||
<para>
|
||||
It means the &repmgr; extension code is not installed in the
|
||||
PostgreSQL application directory. This typically happens when using PostgreSQL
|
||||
packages provided by a third-party vendor, which often have different
|
||||
filesystem layouts.
|
||||
</para>
|
||||
<para>
|
||||
Either use PostgreSQL packages provided by the community or 2ndQuadrant; if this
|
||||
is not possible, contact your vendor for assistance.
|
||||
</para>
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="faq-repmgr" xreflabel="repmgr">
|
||||
@@ -105,6 +121,7 @@
|
||||
standby to have been cloned using &repmgr;.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgr-clone-other-source" >
|
||||
<title>Can I use a standby not cloned by &repmgr; as a &repmgr; node?</title>
|
||||
|
||||
@@ -118,6 +135,13 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgr-recovery-conf" >
|
||||
<title>What does &repmgr; write in <filename>recovery.conf</filename>, and what options can be set there?</title>
|
||||
<para>
|
||||
See section <link linkend="repmgr-standby-clone-recovery-conf">Customising recovery.conf</link>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgr-failed-primary-standby" xreflabel="Reintegrate a failed primary as a standby">
|
||||
<title>How can a failed primary be re-added as a standby?</title>
|
||||
<para>
|
||||
@@ -126,19 +150,23 @@
|
||||
needs to be re-registered as a standby.
|
||||
</para>
|
||||
<para>
|
||||
In PostgreSQL 9.5 and later, it's possible to use <command>pg_rewind</command>
|
||||
to re-synchronise the existing data directory, which will usually be much
|
||||
It's possible to use <command>pg_rewind</command> to re-synchronise the existing data
|
||||
directory, which will usually be much
|
||||
faster than re-cloning the server. However <command>pg_rewind</command> can only
|
||||
be used if PostgreSQL either has <varname>wal_log_hints</varname> enabled, or
|
||||
data checksums were enabled when the cluster was initialized.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; provides the command <command>repmgr node rejoin</command> which can
|
||||
optionally execute <command>pg_rewind</command>; see the <xref linkend="repmgr-node-rejoin">
|
||||
documentation for details.
|
||||
Note that <command>pg_rewind</command> is available as part of the core PostgreSQL
|
||||
distribution from PostgreSQL 9.5, and as a third-party utility for PostgreSQL 9.3 and 9.4.
|
||||
</para>
|
||||
<para>
|
||||
If <command>pg_rewind</command> cannot be used, then the data directory will have
|
||||
&repmgr; provides the command <command>repmgr node rejoin</command> which can
|
||||
optionally execute <command>pg_rewind</command>; see the <xref linkend="repmgr-node-rejoin">
|
||||
documentation for details, in particular the section <xref linkend="repmgr-node-rejoin-pg-rewind">.
|
||||
</para>
|
||||
<para>
|
||||
If <command>pg_rewind</command> cannot be used, then the data directory will need
|
||||
to be re-cloned from scratch.
|
||||
</para>
|
||||
|
||||
|
||||
@@ -1,48 +1,119 @@
|
||||
<appendix id="appendix-packages" xreflabel="Package details">
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
</indexterm>
|
||||
|
||||
<title>&repmgr; package details</title>
|
||||
<para>
|
||||
This section provides technical details about various &repmgr; binary
|
||||
packages, such as location of the installed binaries and
|
||||
configuration files.
|
||||
</para>
|
||||
|
||||
<sect1 id="packages-centos" xreflabel="CentOS packages">
|
||||
<title>CentOS, RHEL, Scientific Linux etc.</title>
|
||||
<title>&repmgr; package details</title>
|
||||
<para>
|
||||
Currently packages are provided for versions 6.x and 7.x of CentOS et al.
|
||||
This section provides technical details about various &repmgr; binary
|
||||
packages, such as location of the installed binaries and
|
||||
configuration files.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<sect1 id="packages-centos" xreflabel="CentOS packages">
|
||||
<title>CentOS Packages</title>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>CentOS packages</secondary>
|
||||
</indexterm>
|
||||
<para>
|
||||
For PostgreSQL 9.6 and lower, the CentOS packages use a mixture of <literal>9.6</literal>
|
||||
and <literal>96</literal> in various places to designate the major version;
|
||||
from PostgreSQL 10, the first part of the version number (e.g. <literal>10</literal>) is
|
||||
the major version, so there is more consistency in file/path/package naming.
|
||||
Currently, &repmgr; RPM packages are provided for versions 6.x and 7.x of CentOS. These should also
|
||||
work on matching versions of Red Hat Enterprise Linux, Scientific Linux and Oracle Enterprise Linux;
|
||||
together with CentOS, these are the same RedHat-based distributions for which the main community project
|
||||
(PGDG) provides packages (see the <ulink url="https://yum.postgresql.org/">PostgreSQL RPM Building Project</ulink>
|
||||
page for details).
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Note these &repmgr; RPM packages are not designed to work with SuSE/OpenSuSE.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; packages are designed to be compatible with community-provided PostgreSQL packages.
|
||||
They may not work with vendor-specific packages such as those provided by RedHat for RHEL
|
||||
customers, as the filesystem layout may be different to the community RPMs.
|
||||
Please contact your support vendor for assistance.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2 id="packages-centos-repositories">
|
||||
<title>CentOS repositories</title>
|
||||
|
||||
<para>
|
||||
&repmgr; packages are available from the public 2ndQuadrant repository, and also the
|
||||
PostgreSQL community repository. The 2ndQuadrant repository is updated immediately
|
||||
after each
|
||||
&repmgr; release.
|
||||
</para>
|
||||
|
||||
<table id="centos-2ndquadrant-repository">
|
||||
<title>2ndQuadrant public repository</title>
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
<table id="centos-pgdg-repository">
|
||||
<title>PostgreSQL community repository (PGDG)</title>
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://yum.postgresql.org/repopackages.php">https://yum.postgresql.org/repopackages.php</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://yum.postgresql.org/">https://yum.postgresql.org/</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="packages-centos-details">
|
||||
<title>CentOS package details</title>
|
||||
|
||||
<para>
|
||||
The two tables below list relevant information, paths, commands etc. for the &repmgr; packages on
|
||||
CentOS 7 (with systemd) and CentOS 6 (no systemd). Substitute the appropriate PostgreSQL major
|
||||
version number for your installation.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
For PostgreSQL 9.6 and lower, the CentOS packages use a mixture of <literal>9.6</literal>
|
||||
and <literal>96</literal> in various places to designate the major version; e.g. the
|
||||
package name is <literal>repmgr96</literal>, but the data directory is
|
||||
<filename>/var/lib/pgsql/9.6/data</filename>.
|
||||
</para>
|
||||
<para>
|
||||
From PostgreSQL 10, the first part of the version number (e.g. <literal>10</literal>) is
|
||||
the major version, so there is more consistency in file/path/package naming
|
||||
(package <literal>repmgr10</literal>, data directory <filename>/var/lib/pgsql/10/data</filename>).
|
||||
</para>
|
||||
</note>
|
||||
|
||||
|
||||
<table id="centos-7-packages">
|
||||
<title>CentOS 7 packages</title>
|
||||
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://yum.postgresql.org/repopackages.php">https://yum.postgresql.org/repopackages.php</ulink></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://yum.postgresql.org/">https://yum.postgresql.org/</ulink></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Package name example:</entry>
|
||||
<entry><filename>repmgr10-4.0.0-1.rhel7.x86_64</filename></entry>
|
||||
<entry><filename>repmgr10-4.0.4-1.rhel7.x86_64</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
@@ -52,7 +123,7 @@
|
||||
|
||||
<row>
|
||||
<entry>Installation command:</entry>
|
||||
<entry><literal>yum install -y repmgr10</literal></entry>
|
||||
<entry><literal>yum install repmgr10</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
@@ -61,7 +132,7 @@
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>In default path:</entry>
|
||||
<entry>repmgr in default path:</entry>
|
||||
<entry>NO</entry>
|
||||
</row>
|
||||
|
||||
@@ -70,9 +141,14 @@
|
||||
<entry><filename>/etc/repmgr/10/repmgr.conf</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Data directory:</entry>
|
||||
<entry><filename>/var/lib/pgsql/10/data</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>repmgrd service command:</entry>
|
||||
<entry><literal>service repmgr10</literal></entry>
|
||||
<entry><command>systemctl [start|stop|restart|reload] repmgr10</command></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
@@ -82,7 +158,7 @@
|
||||
|
||||
<row>
|
||||
<entry>repmgrd log file location:</entry>
|
||||
<entry>(not specified)</entry>
|
||||
<entry>(not specified by package; set in <filename>repmgr.conf</filename>)</entry>
|
||||
</row>
|
||||
|
||||
</tbody>
|
||||
@@ -94,29 +170,20 @@
|
||||
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://yum.postgresql.org/repopackages.php">https://yum.postgresql.org/repopackages.php</ulink></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://yum.postgresql.org/">https://yum.postgresql.org/</ulink></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Package name example:</entry>
|
||||
<entry><filename>repmgr96-4.0.0-1.rhel6.x86_64</filename></entry>
|
||||
<entry><filename>repmgr96-4.0.4-1.rhel6.x86_64</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Metapackage:</entry>
|
||||
<entry>NO</entry>
|
||||
<entry>(none)</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Installation command:</entry>
|
||||
<entry><literal>yum install -y repmgr96</literal></entry>
|
||||
<entry><literal>yum install repmgr96</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
@@ -125,7 +192,7 @@
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>In default path:</entry>
|
||||
<entry>repmgr in default path:</entry>
|
||||
<entry>NO</entry>
|
||||
</row>
|
||||
|
||||
@@ -134,9 +201,14 @@
|
||||
<entry><filename>/etc/repmgr/9.6/repmgr.conf</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Data directory:</entry>
|
||||
<entry><filename>/var/lib/pgsql/9.6/data</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>repmgrd service command:</entry>
|
||||
<entry>service repmgr-9.6</entry>
|
||||
<entry><literal>service repmgr-9.6 [start|stop|restart|reload]</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
@@ -153,6 +225,143 @@
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
|
||||
|
||||
|
||||
<sect1 id="packages-debian-ubuntu" xreflabel="Debian/Ubuntu packages">
|
||||
<title>Debian/Ubuntu Packages</title>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>Debian/Ubuntu packages</secondary>
|
||||
</indexterm>
|
||||
<para>
|
||||
&repmgr; <literal>.deb</literal> packages are provided via the
|
||||
PostgreSQL Community APT repository, and are available for each community-supported
|
||||
PostgreSQL version, currently supported Debian releases, and currently supported
|
||||
Ubuntu LTS releases.
|
||||
</para>
|
||||
|
||||
<sect2 id="packages-apt-repository">
|
||||
<title>APT repository</title>
|
||||
|
||||
<para>
|
||||
&repmgr; packages are available from the PostgreSQL Community APT repository,
|
||||
which is updated immediately after each &repmgr; release.
|
||||
</para>
|
||||
|
||||
|
||||
<table id="apt-repository">
|
||||
<title>PostgreSQL Community APT repository (PGDG)</title>
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="http://apt.postgresql.org/">http://apt.postgresql.org/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://wiki.postgresql.org/wiki/Apt">https://wiki.postgresql.org/wiki/Apt</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="packages-debian-details">
|
||||
<title>Debian/Ubuntu package details</title>
|
||||
|
||||
<para>
|
||||
The table below lists relevant information, paths, commands etc. for the &repmgr; packages on
|
||||
Debian 9.x ("Stretch"). Substitute the appropriate PostgreSQL major
|
||||
version number for your installation.
|
||||
</para>
|
||||
<para>
|
||||
See also <xref linkend="repmgrd-configuration-debian-ubuntu"> for some specifics related
|
||||
to configuring the <application>repmgrd</application> daemon.
|
||||
</para>
|
||||
|
||||
<table id="debian-9-packages">
|
||||
<title>Debian 9.x packages</title>
|
||||
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
|
||||
<row>
|
||||
<entry>Package name example:</entry>
|
||||
<entry><filename>postgresql-10-repmgr</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Metapackage:</entry>
|
||||
<entry><filename>repmgr-common</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Installation command:</entry>
|
||||
<entry><literal>apt-get install postgresql-10-repmgr</literal></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Binary location:</entry>
|
||||
<entry><filename>/usr/lib/postgresql/10/bin</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>repmgr in default path:</entry>
|
||||
<entry>Yes (via wrapper script <filename>/usr/bin/repmgr</filename>)</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Configuration file location:</entry>
|
||||
<entry>(not set by package)</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>Data directory:</entry>
|
||||
<entry><filename>/var/lib/postgresql/10/main</filename></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>PostgreSQL service command:</entry>
|
||||
<entry><command>systemctl [start|stop|restart|reload] postgresql@10-main</command></entry>
|
||||
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>repmgrd service command:</entry>
|
||||
<entry><command>systemctl [start|stop|restart|reload] repmgrd</command></entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>repmgrd service file location:</entry>
|
||||
<entry><filename>/etc/init.d/repmgrd</filename> (defaults in: <filename>/etc/default/repmgrd</filename>)</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry>repmgrd log file location:</entry>
|
||||
<entry>(not specified by package; set in <filename>repmgr.conf</filename>)</entry>
|
||||
</row>
|
||||
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
<note>
|
||||
<para>
|
||||
Instead of using the <application>systemd</application> service command directly,
|
||||
it's recommended to execute <command>pg_ctlcluster</command> (as <literal>root</literal>,
|
||||
either directly or via <command>sudo</command>), e.g.:
|
||||
<programlisting>
|
||||
<command>pg_ctlcluster 10 main [start|stop|restart|reload]</command></programlisting>
|
||||
</para>
|
||||
<para>
|
||||
For pre-<application>systemd</application> systems, <command>pg_ctlcluster</command>
|
||||
can be executed directly by the <literal>postgres</literal> user.
|
||||
</para>
|
||||
</note>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
</appendix>
|
||||
|
||||
@@ -15,9 +15,269 @@
|
||||
See also: <xref linkend="upgrading-repmgr">
|
||||
</para>
|
||||
|
||||
<sect1 id="release-4.0.6">
|
||||
<title>Release 4.0.6</title>
|
||||
<para><emphasis>June ??, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.0.6 contains a number of bug fixes and usability enhancements.
|
||||
</para>
|
||||
<para>
|
||||
We recommend upgrading to this version as soon as possible.
|
||||
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.0.5;
|
||||
<application>repmgrd</application> (if running) should be restarted. See <xref linkend="upgrading-repmgr">
|
||||
for more details.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>Usability enhancements</title>
|
||||
|
||||
<para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command> and
|
||||
<command><link linkend="repmgr-cluster-matrix">repmgr cluster matrix</link></command>:
|
||||
return non-zero exit code if node connection issues detected (GitHub #447)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
|
||||
Improve handling of external configuration file copying, including consideration in
|
||||
<option>--dry-run</option> check
|
||||
(GitHub #443)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
When using <option>--dry-run</option>, force log level to <literal>INFO</literal>
|
||||
to ensure output will always be displayed
|
||||
(GitHub #441)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-witness-register">repmgr witness register</link></command>:
|
||||
prevent registration of a witness server with the same name as an existing node.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-follow">repmgr standby follow</link></command>:
|
||||
check node has actually connected to new primary before reporting success
|
||||
(GitHub #444)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
|
||||
Don't require presence of <varname>user</varname> parameter in conninfo string
|
||||
(GitHub #437)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-clone">repmgr standby clone</link></command>:
|
||||
Improve documentation of <option>--recovery-conf-only</option> mode
|
||||
(GitHub #438)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>:
|
||||
Fix bug when parsing <option>--config-files</option> parameter
|
||||
(GitHub #442)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: ensure local node is counted as quorum member
|
||||
(GitHub #439)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="release-4.0.5">
|
||||
<title>Release 4.0.5</title>
|
||||
<para><emphasis>Wed May 2, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.0.5 contains a number of usability enhancements related to
|
||||
<application>pg_rewind</application> usage, <filename>recovery.conf</filename>
|
||||
generation and (in <application>repmgrd</application>) handling of various
|
||||
corner-case situations, as well as a number of bug fixes.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>Usability enhancements</title>
|
||||
|
||||
<para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Various documentation improvements, with particular emphasis on
|
||||
the importance of setting appropriate <link linkend="configuration-service-commands">service commands</link>
|
||||
instead of relying on <application>pg_ctl</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Poll demoted primary after restart as a standby during a switchover operation (GitHub #408).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add configuration parameter <option>config_directory</option> (GitHub #424).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add sanity check if <option>--upstream-node-id</option> not supplied when executing
|
||||
<xref linkend="repmgr-standby-register"> (GitHub #395).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Enable <link linkend="repmgr-node-rejoin-pg-rewind">pg_rewind</link> to be used with
|
||||
PostgreSQL 9.3/9.4 (GitHub #413).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
When generating replication connection strings, set <literal>dbname=replication</literal>
|
||||
if appropriate (GitHub #421).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Enable provision of <option>archive_cleanup_command</option> in <filename>recovery.conf</filename>
|
||||
(GitHub #416).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Actively check for node to <link linkend="repmgr-node-rejoin">rejoin</link> cluster (GitHub #415).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: set <literal>connect_timeout=2</literal> (if not explicitly set)
|
||||
when pinging a server.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Fix display of conninfo parsing error messages.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Fix minimum accepted value for <varname>degraded_monitoring_timeout</varname> (GitHub #411).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Fix superuser password handling (GitHub #400)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Fix parsing of <varname>archive_ready_critical</varname> configuration file parameter (GitHub #426).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Fix <command><link linkend="repmgr-cluster-crosscheck">repmgr cluster crosscheck</link></command>
|
||||
output (GitHub #389)
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Fix memory leaks in witness code (GitHub #402).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: handle <command>pg_ctl promote</command> timeout (GitHub #425).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: handle failover situation with only two nodes in the primary
|
||||
location, and at least one node in another location (GitHub #407).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: prevent standby connection handle from going stale.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="release-4.0.4">
|
||||
<title>Release 4.0.4</title>
|
||||
<para><emphasis>Thu Mar 8, 2018</emphasis></para>
|
||||
<para><emphasis>Fri Mar 9, 2018</emphasis></para>
|
||||
|
||||
<para>
|
||||
&repmgr; 4.0.4 contains some bug fixes and a number of
|
||||
@@ -30,6 +290,15 @@
|
||||
for more details.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
It is not possible to perform a switchover where the demotion candidate is
|
||||
running &repmgr; 4.0.2 or lower; all nodes should be upgraded to the latest version (4.0.4).
|
||||
This is due to additional checks introduced in 4.0.3 which require the presence of
|
||||
4.0.3 or later versions on all nodes.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2>
|
||||
<title>Usability enhancements</title>
|
||||
|
||||
@@ -144,6 +413,14 @@
|
||||
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.0.2;
|
||||
repmgrd (if running) should be restarted.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
It is not possible to perform a switchover where the demotion candidate is
|
||||
running &repmgr; 4.0.2 or lower; all nodes should be upgraded to 4.0.3. This is due
|
||||
to additional checks introduced in 4.0.3 which require the presence of
|
||||
4.0.3 or later versions on all nodes.
|
||||
</para>
|
||||
</note>
|
||||
<sect2>
|
||||
<title>Usability enhancements</title>
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<sect2 id="cloning-from-barman-prerequisites" xreflabel="Prerequisites for cloning from Barman">
|
||||
<sect2 id="cloning-from-barman-prerequisites">
|
||||
<title>Prerequisites for cloning from Barman</title>
|
||||
<para>
|
||||
In order to enable Barman support for <command>repmgr standby clone</command>, following
|
||||
@@ -356,7 +356,7 @@
|
||||
By default, <command>pg_basebackup</command> performs a checkpoint before beginning the backup
|
||||
process. However, a normal checkpoint may take some time to complete;
|
||||
a fast checkpoint can be forced with the <literal>-c/--fast-checkpoint</literal> option.
|
||||
However this may impact performance of the server being cloned from (typically the primary)
|
||||
Note that this may impact performance of the server being cloned from (typically the primary)
|
||||
so should be used with care.
|
||||
</para>
|
||||
<tip>
|
||||
@@ -384,11 +384,16 @@
|
||||
|
||||
<sect2 id="cloning-advanced-managing-passwords" xreflabel="Managing passwords">
|
||||
<title>Managing passwords</title>
|
||||
<indexterm>
|
||||
<primary>cloning</primary>
|
||||
<secondary>using passwords</secondary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
If replication connections to a standby's upstream server are password-protected,
|
||||
the standby must be able to provide the password so it can begin streaming
|
||||
replication.
|
||||
the standby must be able to provide the password so it can begin streaming replication.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The recommended way to do this is to store the password in the <literal>postgres</literal> system
|
||||
user's <filename>~/.pgpass</filename> file. It's also possible to store the password in the
|
||||
@@ -396,6 +401,17 @@
|
||||
security reasons. For more details see the
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/libpq-pgpass.html">PostgreSQL password file documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If using a <filename>pgpass</filename> file, an entry for the replication user (by default the
|
||||
user who connects to the <literal>repmgr</literal> database) <emphasis>must</emphasis>
|
||||
be provided, with database name set to <literal>replication</literal>, e.g.:
|
||||
<programlisting>
|
||||
node1:5432:replication:repmgr:12345</programlisting>
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
If, for whatever reason, you wish to include the password in <filename>recovery.conf</filename>,
|
||||
set <varname>use_primary_conninfo_password</varname> to <literal>true</literal> in
|
||||
@@ -407,8 +423,7 @@
|
||||
</para>
|
||||
<para>
|
||||
It is of course also possible to include the password value in the <varname>conninfo</varname>
|
||||
string for each node, but this is obviously a security risk and should be
|
||||
avoided.
|
||||
string for each node, but this is obviously a security risk and should be avoided.
|
||||
</para>
|
||||
<para>
|
||||
From PostgreSQL 9.6, <application>libpq</application> supports the <varname>passfile</varname>
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
<sect1 id="configuration-file-settings" xreflabel="configuration file settings">
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>settings</secondary>
|
||||
<secondary>basic settings</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Configuration file settings</title>
|
||||
<title>Basic configuration file settings</title>
|
||||
<para>
|
||||
Each <filename>repmgr.conf</filename> file must contain the following parameters:
|
||||
</para>
|
||||
@@ -92,7 +92,10 @@
|
||||
|
||||
<para>
|
||||
For a full list of annotated configuration items, see the file
|
||||
<ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</>.
|
||||
<ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
For <application>repmgrd</application>-specific settings, see <xref linkend="repmgrd-configuration">.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
|
||||
115
doc/configuration-service-commands.sgml
Normal file
115
doc/configuration-service-commands.sgml
Normal file
@@ -0,0 +1,115 @@
|
||||
<sect1 id="configuration-service-commands" xreflabel="service command settings">
|
||||
<indexterm>
|
||||
<primary>repmgr.conf</primary>
|
||||
<secondary>service command settings</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>service command settings</primary>
|
||||
<secondary>configuration in repmgr.conf</secondary>
|
||||
</indexterm>
|
||||
<title>Service command settings</title>
|
||||
|
||||
<para>
|
||||
In some circumstances, &repmgr; (and <application>repmgrd</application>) need to
|
||||
be able to stop, start or restart PostgreSQL. &repmgr; commands which need to do this
|
||||
include <link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>,
|
||||
<link linkend="repmgr-standby-switchover"><command>repmgr standby switchover</command></link> and
|
||||
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
|
||||
</para>
|
||||
<para>
|
||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> to control the PostgreSQL
|
||||
server. However this can lead to various problems, particularly when PostgreSQL has been
|
||||
installed from packages, and especially so if <application>systemd</application> is in use.
|
||||
</para>
|
||||
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
|
||||
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
|
||||
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
|
||||
<para>
|
||||
With this in mind, we recommend <emphasis>always</emphasis> configuring &repmgr; to use the
|
||||
available system service commands.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To do this, specify the appropriate command for each action
|
||||
in <filename>repmgr.conf</filename> using the following configuration
|
||||
parameters:
|
||||
<programlisting>
|
||||
service_start_command
|
||||
service_stop_command
|
||||
service_restart_command
|
||||
service_reload_command</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
It's also possible to specify a <varname>service_promote_command</varname>;
|
||||
this overrides any value contained in the setting <varname>promote_command</varname>.
|
||||
This is intended for systems which provide a package-level promote command,
|
||||
such as Debian's <application>pg_ctlcluster</application>.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
To confirm which command &repmgr; will execute for each action, use
|
||||
<command>repmgr node service --list --action=...</command>, e.g.:
|
||||
<programlisting>
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=stop
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=start
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=restart
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=reload</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
These commands will be executed by the system user which &repmgr; runs as (usually <literal>postgres</literal>)
|
||||
and will probably require passwordless sudo access to be able to execute the command.
|
||||
</para>
|
||||
<para>
|
||||
For example, using <application>systemd</application> on CentOS 7, the service commands can be
|
||||
set as follows:
|
||||
<programlisting>
|
||||
service_start_command = 'sudo systemctl start postgresql-9.6'
|
||||
service_stop_command = 'sudo systemctl stop postgresql-9.6'
|
||||
service_restart_command = 'sudo systemctl restart postgresql-9.6'
|
||||
service_reload_command = 'sudo systemctl reload postgresql-9.6'</programlisting>
|
||||
and <filename>/etc/sudoers</filename> should be set as follows:
|
||||
<programlisting>
|
||||
Defaults:postgres !requiretty
|
||||
postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-9.6, \
|
||||
/usr/bin/systemctl start postgresql-9.6, \
|
||||
/usr/bin/systemctl restart postgresql-9.6, \
|
||||
/usr/bin/systemctl reload postgresql-9.6</programlisting>
|
||||
</para>
|
||||
|
||||
<important>
|
||||
<indexterm>
|
||||
<primary>pg_ctlcluster</primary>
|
||||
<secondary>service command settings</secondary>
|
||||
</indexterm>
|
||||
<para>
|
||||
Debian/Ubuntu users: instead of calling <command>sudo systemctl</command> directly, use
|
||||
<command>sudo pg_ctlcluster</command>, e.g.:
|
||||
<programlisting>
|
||||
service_start_command = 'sudo pg_ctlcluster 9.6 main start'
|
||||
service_stop_command = 'sudo pg_ctlcluster 9.6 main stop'
|
||||
service_restart_command = 'sudo pg_ctlcluster 9.6 main restart'
|
||||
service_reload_command = 'sudo pg_ctlcluster 9.6 main reload'</programlisting>
|
||||
and set <filename>/etc/sudoers</filename> accordingly.
|
||||
</para>
|
||||
<para>
|
||||
While <command>pg_ctlcluster</command> will work when executed as user <literal>postgres</literal>,
|
||||
it's strongly recommended to use <command>sudo pg_ctlcluster</command> on <application>systemd</application>
|
||||
systems, to ensure <application>systemd</application> has a correct picture of
|
||||
the PostgreSQL application state.
|
||||
</para>
|
||||
|
||||
</important>
|
||||
|
||||
</sect1>
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
&configuration-file;
|
||||
&configuration-file-settings;
|
||||
&configuration-service-commands;
|
||||
|
||||
<sect1 id="configuration-permissions" xreflabel="User permissions">
|
||||
<indexterm>
|
||||
|
||||
@@ -205,6 +205,9 @@
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_follow</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_aborted</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_upstream_disconnect</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
<!ENTITY configuration SYSTEM "configuration.sgml">
|
||||
<!ENTITY configuration-file SYSTEM "configuration-file.sgml">
|
||||
<!ENTITY configuration-file-settings SYSTEM "configuration-file-settings.sgml">
|
||||
<!ENTITY configuration-service-commands SYSTEM "configuration-service-commands.sgml">
|
||||
<!ENTITY cloning-standbys SYSTEM "cloning-standbys.sgml">
|
||||
<!ENTITY promoting-standby SYSTEM "promoting-standby.sgml">
|
||||
<!ENTITY follow-new-primary SYSTEM "follow-new-primary.sgml">
|
||||
|
||||
@@ -5,83 +5,107 @@
|
||||
system.
|
||||
</para>
|
||||
|
||||
<sect2 id="installation-packages-redhat" xreflabel="Installing from packages on RHEL, Fedora and CentOS">
|
||||
<sect2 id="installation-packages-redhat" xreflabel="Installing from packages on RHEL, CentOS and Fedora">
|
||||
|
||||
<indexterm>
|
||||
<primary>installation</primary>
|
||||
<secondary>on Redhat/CentOS/Fedora etc.</secondary>
|
||||
<secondary>on Red Hat/CentOS/Fedora etc.</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>RedHat/Fedora/CentOS</title>
|
||||
<title>RedHat/CentOS/Fedora</title>
|
||||
<para>
|
||||
RPM packages for &repmgr; are available via Yum through
|
||||
&repmgr; RPM packages for RedHat/CentOS variants and Fedora are available from the
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>; see following
|
||||
section for details.
|
||||
</para>
|
||||
<para>
|
||||
RPM packages for &repmgr; are also available via Yum through
|
||||
the PostgreSQL Global Development Group RPM repository
|
||||
(<ulink url="https://yum.postgresql.org/">https://yum.postgresql.org/</ulink>).
|
||||
Follow the instructions for your distribution (RedHat, CentOS,
|
||||
Fedora, etc.) and architecture as detailed there.
|
||||
Fedora, etc.) and architecture as detailed there. Note that it can take some days
|
||||
for new &repmgr; packages to become available via this repository.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; packages are designed to be compatible with the community-provided PostgreSQL packages.
|
||||
They may not work with vendor-specific packages such as those provided by RedHat for RHEL
|
||||
customers, as the filesystem layout may be different to the community RPMs.
|
||||
Please contact your support vendor for assistance.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> also provides its
|
||||
own RPM packages which are made available
|
||||
at the same time as each &repmgr; release, as it can take some days for
|
||||
them to become available via the main PGDG repository. See following section for details:
|
||||
For more information on the package contents, including details of installation
|
||||
paths and relevant <link linkend="configuration-service-commands">service commands</link>,
|
||||
see the appendix section <xref linkend="packages-centos">.
|
||||
</para>
|
||||
|
||||
|
||||
<sect3 id="installation-packages-redhat-2ndq">
|
||||
<title>2ndQuadrant repmgr yum repository</title>
|
||||
<title>2ndQuadrant public RPM yum repository</title>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> previously provided a dedicated
|
||||
&repmgr; repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/">http://packages.2ndquadrant.com/repmgr/</ulink>.
|
||||
This repository will be deprecated in a future release as it is now replaced by
|
||||
the <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>
|
||||
documented below.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Beginning with <ulink url="http://repmgr.org/release-notes-3.1.3.html">repmgr 3.1.3</ulink>,
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a dedicated <literal>yum</literal>
|
||||
repository for &repmgr; releases. This repository complements the main
|
||||
<ulink url="https://yum.postgresql.org/repopackages.php">PGDG community repository</ulink>,
|
||||
but enables repmgr users to access the latest &repmgr; packages before they are
|
||||
available via the PGDG repository, which can take several days to be updated following
|
||||
a fresh &repmgr; release.
|
||||
</para>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;. We recommend using this for all future &repmgr; releases.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://rpm.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
<para>
|
||||
<emphasis>Installation</emphasis>
|
||||
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Import the repository public key (optional but recommended):
|
||||
<programlisting>
|
||||
rpm --import http://packages.2ndquadrant.com/repmgr/RPM-GPG-KEY-repmgr</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Locate the repository RPM for your PostgreSQL version from the list at:
|
||||
<ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the repository RPM for your distribution (this enables the 2ndQuadrant
|
||||
repository as a source of repmgr packages):
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<emphasis>Fedora:</emphasis>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<emphasis>RHEL, CentOS etc:</emphasis>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<para>
|
||||
e.g.:
|
||||
<programlisting>
|
||||
$ yum install http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
Install the repository RPM for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages).
|
||||
</para>
|
||||
<para>
|
||||
For example, for PostgreSQL 10 on CentOS, execute:
|
||||
<programlisting>
|
||||
sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-1.el7.noarch.rpm
|
||||
</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Verify that the repository is installed with:
|
||||
<programlisting>
|
||||
sudo yum repolist</programlisting>
|
||||
The output should contain two entries like this:
|
||||
<programlisting>
|
||||
2ndquadrant-repo-10/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 1
|
||||
2ndquadrant-repo-10-debug/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 - Debug 1</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the repmgr version appropriate for your PostgreSQL version (e.g. <literal>repmgr96</literal>), e.g.:
|
||||
Install the &repmgr; version appropriate for your PostgreSQL version (e.g. <literal>repmgr10</literal>):
|
||||
<programlisting>
|
||||
$ yum install repmgr96</programlisting>
|
||||
$ yum install repmgr10</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
@@ -91,13 +115,13 @@
|
||||
<emphasis>Compatibility with PGDG Repositories</emphasis>
|
||||
</para>
|
||||
<para>
|
||||
The 2ndQuadrant &repmgr; yum repository uses exactly the same package definitions as the
|
||||
main PGDG repository and is effectively a selective mirror for &repmgr; packages only.
|
||||
The 2ndQuadrant &repmgr; yum repository packages use the same definitions and file system layout as the
|
||||
main PGDG repository.
|
||||
</para>
|
||||
<para>
|
||||
Normally yum should prioritize the repository with the most recent &repmgr; version.
|
||||
Once the PGDG repository has been updated, it doesn't matter which repository
|
||||
the packages are installed from.
|
||||
Normally <application>yum</application> will prioritize the repository with the most recent &repmgr; version.
|
||||
Once the PGDG repository has been updated, it doesn't matter which repository
|
||||
the packages are installed from.
|
||||
</para>
|
||||
<para>
|
||||
To ensure the 2ndQuadrant repository is always prioritised, install <literal>yum-plugin-priorities</literal>
|
||||
@@ -111,30 +135,23 @@
|
||||
To install a specific package version, execute <command>yum --showduplicates list</command>
|
||||
for the package in question:
|
||||
<programlisting>
|
||||
[root@localhost ~]# yum --showduplicates list repmgr96
|
||||
[root@localhost ~]# yum --showduplicates list repmgr10
|
||||
Loaded plugins: fastestmirror
|
||||
Loading mirror speeds from cached hostfile
|
||||
* base: ftp.iij.ad.jp
|
||||
* extras: ftp.iij.ad.jp
|
||||
* updates: ftp.iij.ad.jp
|
||||
Available Packages
|
||||
repmgr96.x86_64 3.2-1.el6 2ndquadrant-repmgr
|
||||
repmgr96.x86_64 3.2.1-1.el6 2ndquadrant-repmgr
|
||||
repmgr96.x86_64 3.3-1.el6 2ndquadrant-repmgr
|
||||
repmgr96.x86_64 3.3.1-1.el6 2ndquadrant-repmgr
|
||||
repmgr96.x86_64 3.3.2-1.el6 2ndquadrant-repmgr
|
||||
repmgr96.x86_64 3.3.2-1.rhel6 pgdg96
|
||||
repmgr96.x86_64 4.0.0-1.el6 2ndquadrant-repmgr
|
||||
repmgr96.x86_64 4.0.0-1.rhel6 pgdg96</programlisting>
|
||||
repmgr10.x86_64 4.0.3-1.rhel7 pgdg10
|
||||
repmgr10.x86_64 4.0.4-1.rhel7 pgdg10
|
||||
repmgr10.x86_64 4.0.5-1.el7 2ndquadrant-repo-10</programlisting>
|
||||
then append the appropriate version number to the package name with a hyphen, e.g.:
|
||||
<programlisting>
|
||||
[root@localhost ~]# yum install repmgr96-3.3.2-1.el6</programlisting>
|
||||
[root@localhost ~]# yum install repmgr10-4.0.3-1.rhel7</programlisting>
|
||||
</para>
|
||||
</sect3>
|
||||
</sect2>
|
||||
|
||||
|
||||
|
||||
<sect2 id="installation-packages-debian" xreflabel="Installing from packages on Debian or Ubuntu">
|
||||
|
||||
<indexterm>
|
||||
@@ -148,6 +165,85 @@
|
||||
Instructions can be found in the APT section of the PostgreSQL Wiki
|
||||
(<ulink url="https://wiki.postgresql.org/wiki/Apt">https://wiki.postgresql.org/wiki/Apt</ulink>).
|
||||
</para>
|
||||
<para>
|
||||
For more information on the package contents, including details of installation
|
||||
paths and relevant <link linkend="configuration-service-commands">service commands</link>,
|
||||
see the appendix section <xref linkend="packages-debian-ubuntu">.
|
||||
</para>
|
||||
|
||||
<sect3 id="installation-packages-debian-ubuntu-2ndq">
|
||||
<title>2ndQuadrant public apt repository for Debian/Ubuntu</title>
|
||||
|
||||
<para>
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a
|
||||
<ulink url="https://apt.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://apt.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<emphasis>Installation</emphasis>
|
||||
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
If not already present, install the <application>apt-transport-https</application> package:
|
||||
<programlisting>
|
||||
sudo apt-get install apt-transport-https</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Create <filename>/etc/apt/sources.list.d/2ndquadrant.list</filename> as follows:
|
||||
<programlisting>
|
||||
sudo sh -c 'echo "deb https://apt.2ndquadrant.com/ $(lsb_release -cs)-2ndquadrant main" > /etc/apt/sources.list.d/2ndquadrant.list'</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the 2ndQuadrant <ulink url="https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc">repository key</ulink>:
|
||||
<programlisting>
|
||||
sudo apt-get install curl ca-certificates
|
||||
curl https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc | sudo apt-key add -</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Update the package list:
|
||||
<programlisting>
|
||||
sudo apt-get update</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the &repmgr; version appropriate for your PostgreSQL version (e.g. <literal>postgresql-10-repmgr</literal>):
|
||||
<programlisting>
|
||||
$ apt-get install postgresql-10-repmgr</programlisting>
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
For packages for PostgreSQL 9.6 and earlier, the package name includes
|
||||
a period between major and minor version numbers, e.g.
|
||||
<literal>postgresql-9.6-repmgr</literal>.
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
</para>
|
||||
|
||||
</sect3>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
</para>
|
||||
|
||||
<para>
|
||||
There are also tags for each &repmgr; release, e.g. <filename>REL4_0_STABLE</filename>.
|
||||
There are also tags for each &repmgr; release, e.g. <filename>4.0.5</filename>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
<title>repmgr overview</title>
|
||||
|
||||
<para>
|
||||
This chapter provides a high-level overview of repmgr's components and functionality.
|
||||
This chapter provides a high-level overview of &repmgr;'s components and
|
||||
functionality.
|
||||
</para>
|
||||
<sect1 id="repmgr-concepts" xreflabel="Concepts">
|
||||
|
||||
|
||||
@@ -38,5 +38,34 @@
|
||||
and therefore determine the state of outbound connections from that node.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
The following exit codes can be emitted by <command>repmgr cluster crosscheck</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The check completed successfully and all nodes are reachable.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_CLUSTER_CHECK (25)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more nodes could not be reached.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
|
||||
|
||||
@@ -97,5 +97,35 @@
|
||||
useful result.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
The following exit codes can be emitted by <command>repmgr cluster matrix</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>SUCCESS (0)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The check completed successfully and all nodes are reachable.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>ERR_CLUSTER_CHECK (25)</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
One or more nodes could not be reached.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
|
||||
|
||||
@@ -45,6 +45,77 @@
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
|
||||
<title>Options</title>
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--dry-run</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Check prerequisites but don't actually execute the rejoin.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--force-rewind[=/path/to/pg_rewind]</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Execute <application>pg_rewind</application> if necessary.
|
||||
</para>
|
||||
<para>
|
||||
It is only necessary to provide the <application>pg_rewind</application> path
|
||||
if using PostgreSQL 9.3 or 9.4, and <application>pg_rewind</application>
|
||||
is not installed in the PostgreSQL <filename>bin</filename> directory.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--config-files</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Comma-separated list of configuration files to retain after
|
||||
executing <application>pg_rewind</application>.
|
||||
</para>
|
||||
<para>
|
||||
Currently <application>pg_rewind</application> will overwrite
|
||||
the local node's configuration files with the files from the source node,
|
||||
so it's advisable to use this option to ensure they are kept.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--config-archive-dir</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Directory to temporarily store configuration files specified with
|
||||
<option>--config-files</option>; default: <filename>/tmp</filename>.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-W/--no-wait</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Don't wait for the node to rejoin the cluster.
|
||||
</para>
|
||||
<para>
|
||||
If this option is supplied, &repmgr; will restart the node but
|
||||
not wait for it to connect to the primary.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
@@ -77,11 +148,18 @@
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-node-rejoin-pg-rewind" xreflabel="Using pg_rewind">
|
||||
|
||||
<indexterm>
|
||||
<primary>pg_rewind</primary>
|
||||
<secondary>using with "repmgr node rejoin"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Using <command>pg_rewind</command></title>
|
||||
<para>
|
||||
<command>repmgr node rejoin</command> can optionally use <command>pg_rewind</command> to re-integrate a
|
||||
node which has diverged from the rest of the cluster, typically a failed primary.
|
||||
<command>pg_rewind</command> is available in PostgreSQL 9.5 and later.
|
||||
<command>pg_rewind</command> is available in PostgreSQL 9.5 and later as part of the core distribution,
|
||||
and can be installed from external sources for PostgreSQL 9.3 and 9.4.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-F</option><option>--force</option></term>
|
||||
<term><option>-F</option>, <option>--force</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Overwrite an existing node record
|
||||
|
||||
@@ -25,9 +25,11 @@
|
||||
<note>
|
||||
<simpara>
|
||||
<command>repmgr standby clone</command> does not start the standby, and after cloning
|
||||
<command>repmgr standby register</command> must be executed to notify &repmgr; of its presence.
|
||||
a standby, the command <command>repmgr standby register</command> must be executed to
|
||||
notify &repmgr; of its existence.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -65,7 +67,71 @@
|
||||
</tip>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-standby-clone-wal-management" xreflabel="Managing WAL during the cloning process">
|
||||
<refsect1 id="repmgr-standby-clone-recovery-conf">
|
||||
<indexterm>
|
||||
<primary>recovery.conf</primary>
|
||||
<secondary>customising with "repmgr standby clone"</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Customising recovery.conf</title>
|
||||
<para>
|
||||
By default, &repmgr; will create a minimal <filename>recovery.conf</filename>
|
||||
containing the following parameters:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>standby_mode</varname> (always <literal>'on'</literal>)</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>recovery_target_timeline</varname> (always <literal>'latest'</literal>)</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>primary_conninfo</varname></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>primary_slot_name</varname> (if replication slots in use)</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
The following additional parameters can be specified in <filename>repmgr.conf</filename>
|
||||
for inclusion in <filename>recovery.conf</filename>:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>restore_command</varname></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>archive_cleanup_command</varname></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><varname>recovery_min_apply_delay</varname></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
We recommend using <ulink url="https://www.pgbarman.org/">Barman</ulink> to manage
|
||||
WAL file archiving. For more details on combining &repmgr; and <application>Barman</application>,
|
||||
in particular using <varname>restore_command</varname> to configure Barman as a backup source of
|
||||
WAL files, see <xref linkend="cloning-from-barman">.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-standby-clone-wal-management">
|
||||
<title>Managing WAL during the cloning process</title>
|
||||
<para>
|
||||
When initially cloning a standby, you will need to ensure
|
||||
@@ -102,15 +168,22 @@
|
||||
|
||||
|
||||
<refsect1 id="repmgr-standby-create-recovery-conf">
|
||||
|
||||
<indexterm>
|
||||
<primary>recovery.conf</primary>
|
||||
<secondary>generating for a standby cloned by another method</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Using a standby cloned by another method</title>
|
||||
<para>
|
||||
&repmgr; supports standbys cloned by another method (e.g. using <application>barman</application>'s
|
||||
<command>barman recover</command> command).
|
||||
<command><ulink url="http://docs.pgbarman.org/release/2.4/#recover">barman recover</ulink></command> command).
|
||||
</para>
|
||||
<para>
|
||||
To integrate the standby as a &repmgr; node, ensure the <filename>repmgr.conf</filename>
|
||||
file is created for the node, then execute the command
|
||||
<command>repmgr standby clone --recovery-conf-only</command>.
|
||||
file is created for the node, and that it has been registered using
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register</link></command>.
|
||||
Then execute the command <command>repmgr standby clone --recovery-conf-only</command>.
|
||||
This will create the <filename>recovery.conf</filename> file needed to attach
|
||||
the node to its upstream, and will also create a replication slot on the
|
||||
upstream node if required.
|
||||
@@ -125,6 +198,13 @@
|
||||
to check the prerequisites for creating the <filename>recovery.conf</filename> file,
|
||||
and display the contents of the file without actually creating it.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<option>--recovery-conf-only</option> was introduced in &repmgr; <link linkend="release-4.0.4">4.0.4</link>.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
@@ -151,7 +231,7 @@
|
||||
<term><option>-c, --fast-checkpoint</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
force fast checkpoint (not effective when cloning from Barman
|
||||
Force fast checkpoint (not effective when cloning from Barman).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -160,7 +240,7 @@
|
||||
<term><option>--copy-external-config-files[={samepath|pgdata}]</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
copy configuration files located outside the data directory on the source
|
||||
Copy configuration files located outside the data directory on the source
|
||||
node to the same path on the standby (default) or to the
|
||||
PostgreSQL data directory.
|
||||
</para>
|
||||
@@ -171,7 +251,7 @@
|
||||
<term><option>--no-upstream-connection</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
when using Barman, do not connect to upstream node
|
||||
When using Barman, do not connect to upstream node.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -180,7 +260,7 @@
|
||||
<term><option>-R, --remote-user=USERNAME</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
remote system username for SSH operations (default: current local system username)
|
||||
Remote system username for SSH operations (default: current local system username).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -189,7 +269,7 @@
|
||||
<term><option>--recovery-conf-only</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
create <filename>recovery.conf</filename> file for a previously cloned instance
|
||||
Create <filename>recovery.conf</filename> file for a previously cloned instance. &repmgr; 4.0.4 and later.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -198,7 +278,7 @@
|
||||
<term><option>--replication-user</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
user to make replication connections with (optional, not usually required)
|
||||
User to make replication connections with (optional, not usually required).
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -207,8 +287,8 @@
|
||||
<term><option>--superuser</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
if the &repmgr; user is not a superuser, the name of a valid superuser must
|
||||
be provided with this option
|
||||
If the &repmgr; user is not a superuser, the name of a valid superuser must
|
||||
be provided with this option.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -219,7 +299,7 @@
|
||||
<listitem>
|
||||
<para>
|
||||
<literal>primary_conninfo</literal> value to write in recovery.conf
|
||||
when the intended upstream server does not yet exist
|
||||
when the intended upstream server does not yet exist.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -236,7 +316,7 @@
|
||||
<term><option>--without-barman </option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
do not use Barman even if configured
|
||||
Do not use Barman even if configured.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -251,5 +331,11 @@
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
See <xref linkend="cloning-standbys"> for details about various aspects of cloning.
|
||||
</para>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
||||
|
||||
@@ -26,10 +26,18 @@
|
||||
running. It can only be used to attach an active standby to the current primary node
|
||||
(and not to another standby).
|
||||
</para>
|
||||
<para>
|
||||
To re-add an inactive node to the replication cluster, see
|
||||
<xref linkend="repmgr-node-rejoin">
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
To re-add an inactive node to the replication cluster, use
|
||||
<xref linkend="repmgr-node-rejoin">.
|
||||
</para>
|
||||
</tip>
|
||||
|
||||
<para>
|
||||
<command>repmgr standby follow</command> will wait up to
|
||||
<varname>standby_follow_timeout</varname> seconds (default: <literal>30</literal>)
|
||||
to verify the standby has actually connected to the new primary.
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
||||
@@ -71,7 +79,7 @@
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>-W</option></term>
|
||||
<term><option>-w</option></term>
|
||||
<term><option>--wait</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
@@ -92,7 +100,7 @@
|
||||
A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
</para>
|
||||
<para>
|
||||
If provided, &repmgr; will subsitute the placeholders <literal>%p</literal> with the node ID of the primary
|
||||
If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the primary
|
||||
being followed, <literal>%c</literal> with its <literal>conninfo</literal> string, and
|
||||
<literal>%a</literal> with its node name.
|
||||
</para>
|
||||
|
||||
@@ -173,7 +173,7 @@
|
||||
</para>
|
||||
|
||||
<para>
|
||||
If provided, &repmgr; will subsitute the placeholders <literal>%p</literal> with the node ID of the
|
||||
If provided, &repmgr; will substitute the placeholders <literal>%p</literal> with the node ID of the
|
||||
primary node, <literal>%c</literal> with its <literal>conninfo</literal> string, and
|
||||
<literal>%a</literal> with its node name.
|
||||
</para>
|
||||
|
||||
@@ -35,6 +35,10 @@
|
||||
a successful switchover.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
For more details on performing a switchover, including preparation and configuration,
|
||||
see section <xref linkend="performing-switchover">.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
@@ -84,11 +88,14 @@
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--force-rewind</option></term>
|
||||
<term><option>--force-rewind[=/path/to/pg_rewind]</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Use <application>pg_rewind</application> to reintegrate the old primary if necessary
|
||||
(PostgreSQL 9.5 and later).
|
||||
(and the prerequisites for using <application>pg_rewind</application> are met).
|
||||
If using PostgreSQL 9.3 or 9.4, and the <application>pg_rewind</application>
|
||||
binary is not installed in the PostgreSQL <filename>bin</filename> directory,
|
||||
provide its full path. For more details see also <xref linkend="switchover-pg-rewind">.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@@ -115,6 +122,48 @@
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Configuration file settings</title>
|
||||
|
||||
<para>
|
||||
Note that the following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||
switchover operation:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_attempts</literal>: number of times to check the original primary
|
||||
for a clean shutdown after executing the shutdown command, before aborting
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
|
||||
primary for a clean shutdown after executing the shutdown command (up to a maximum
|
||||
of <literal>reconnect_attempts</literal> tries)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>replication_lag_critical</literal>:
|
||||
if replication lag (in seconds) on the standby exceeds this value, the
|
||||
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
||||
is provided)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>standby_reconnect_timeout</literal>:
|
||||
Number of seconds to attempt to reconnect to the demoted primary
|
||||
once it has been restarted.
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>Execution</title>
|
||||
|
||||
@@ -150,7 +199,7 @@
|
||||
<refsect1>
|
||||
<title>Exit codes</title>
|
||||
<para>
|
||||
Following exit codes can be emitted by <literal>repmgr standby switchover</literal>:
|
||||
The following exit codes can be emitted by <command>repmgr standby switchover</command>:
|
||||
</para>
|
||||
<variablelist>
|
||||
|
||||
@@ -178,7 +227,7 @@
|
||||
<para>
|
||||
The switchover was executed but a problem was encountered.
|
||||
Typically this means the former primary could not be reattached
|
||||
as a standby.
|
||||
as a standby. Check preceding log messages for more information.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
@@ -1,60 +1,233 @@
|
||||
<chapter id="repmgrd-configuration">
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>configuration</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd configuration</title>
|
||||
<para>
|
||||
To use <application>repmgrd</application>, its associated function library must be
|
||||
included in <filename>postgresql.conf</filename> with:
|
||||
<title>repmgrd configuration</title>
|
||||
|
||||
<programlisting>
|
||||
shared_preload_libraries = 'repmgr'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Changing this setting requires a restart of PostgreSQL; for more details see
|
||||
the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
Additionally the following <application>repmgrd</application> options *must* be set in
|
||||
<filename>repmgr.conf</filename> (adjust configuration file locations as appropriate):
|
||||
<programlisting>
|
||||
failover=automatic
|
||||
promote_command='repmgr standby promote -f /etc/repmgr.conf --log-to-file'
|
||||
follow_command='repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
See <filename>repmgr.conf.sample</filename> for further <application>repmgrd</application>-specific settings.
|
||||
</para>
|
||||
<para>
|
||||
When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure
|
||||
of the current primary, <application>repmgrd</application> will execute one of
|
||||
<varname>promote_command</varname> or <varname>follow_command</varname>,
|
||||
depending on whether the current server is to become the new primary, or
|
||||
needs to follow another server which has become the new primary. Note that
|
||||
these commands can be any valid shell script which results in one of these
|
||||
two actions happening, but if &repmgr;'s <command>standby follow</command> or
|
||||
<command>standby promote</command>
|
||||
commands are not executed (either directly as shown here, or from a script which
|
||||
performs other actions), the &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
<para>
|
||||
The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal>
|
||||
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
||||
<application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr;
|
||||
will attempt to determine the new primary by itself, but if the
|
||||
original primary comes back online after the new primary is promoted, there is a risk that
|
||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||
the original primary.
|
||||
</para>
|
||||
<sect1 id="repmgrd-connection-settings">
|
||||
<title>repmgrd connection settings</title>
|
||||
<para>
|
||||
<application>repmgrd</application> is a daemon which runs on each PostgreSQL node,
|
||||
monitoring the local node, and (unless it's the primary node) the upstream server
|
||||
(the primary server or with cascading replication, another standby) which it's
|
||||
connected to.
|
||||
</para>
|
||||
<para>
|
||||
<application>repmgrd</application> can be configured to provide failover
|
||||
capability in case the primary upstream node becomes unreachable, and/or
|
||||
provide monitoring data to the &repmgr; metadatabase.
|
||||
</para>
|
||||
|
||||
<sect1 id="repmgrd-basic-configuration">
|
||||
<title>repmgrd basic configuration</title>
|
||||
|
||||
<para>
|
||||
To use <application>repmgrd</application>, its associated function library <emphasis>must</emphasis> be
|
||||
included in <filename>postgresql.conf</filename> with:
|
||||
|
||||
<programlisting>
|
||||
shared_preload_libraries = 'repmgr'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Changing this setting requires a restart of PostgreSQL; for more details see
|
||||
the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<sect2 id="repmgrd-automatic-failover-configuration">
|
||||
<title>automatic failover configuration</title>
|
||||
<para>
|
||||
If using automatic failover, the following <application>repmgrd</application> options <emphasis>must</emphasis> be set in
|
||||
<filename>repmgr.conf</filename> :
|
||||
<programlisting>
|
||||
failover=automatic
|
||||
promote_command='/usr/bin/repmgr standby promote -f /etc/repmgr.conf --log-to-file'
|
||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Adjust file paths as appropriate; we recommend specifying the full path to the &repmgr; binary.
|
||||
</para>
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
to be logged to the same destination configured to receive log output for <application>repmgrd</application>.
|
||||
See <filename><ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</ulink></filename>
|
||||
for further <application>repmgrd</application>-specific settings.
|
||||
</para>
|
||||
<para>
|
||||
When <varname>failover</varname> is set to <literal>automatic</literal>, upon detecting failure
|
||||
of the current primary, <application>repmgrd</application> will execute one of:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>promote_command</varname> (if the current server is to become the new primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>follow_command</varname> (if the current server needs to follow another server which has
|
||||
become the new primary)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<note>
|
||||
<para>
|
||||
These commands can be any valid shell script which results in one of these
|
||||
two actions happening, but if &repmgr;'s <command>standby follow</command> or
|
||||
<command>standby promote</command>
|
||||
commands are not executed (either directly as shown here, or from a script which
|
||||
performs other actions), the &repmgr; metadata will not be updated and
|
||||
&repmgr; will no longer function reliably.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
The <varname>follow_command</varname> should provide the <literal>--upstream-node-id=%n</literal>
|
||||
option to <command>repmgr standby follow</command>; the <literal>%n</literal> will be replaced by
|
||||
<application>repmgrd</application> with the ID of the new primary node. If this is not provided, &repmgr;
|
||||
will attempt to determine the new primary by itself, but if the
|
||||
original primary comes back online after the new primary is promoted, there is a risk that
|
||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||
the original primary.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-service-configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>PostgreSQL service configuration</secondary>
|
||||
</indexterm>
|
||||
<title>PostgreSQL service configuration</title>
|
||||
<para>
|
||||
If using automatic failover, currently <application>repmgrd</application> will need to execute
|
||||
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>
|
||||
to restart PostgreSQL on standbys to have them follow a new primary.
|
||||
</para>
|
||||
<para>
|
||||
To ensure this happens smoothly, it's essential to provide the system/service restart
|
||||
command appropriate to your operating system via <varname>service_restart_command</varname>
|
||||
in <filename>repmgr.conf</filename>. If you don't do this, <application>repmgrd</application>
|
||||
will default to using <command>pg_ctl</command>, which can result in unexpected problems,
|
||||
particularly on <application>systemd</application>-based systems.
|
||||
</para>
|
||||
<para>
|
||||
For more details, see <xref linkend="configuration-service-commands">.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-monitoring-configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring configuration</secondary>
|
||||
</indexterm>
|
||||
<title>Monitoring configuration</title>
|
||||
<para>
|
||||
To enable monitoring, set:
|
||||
<programlisting>
|
||||
monitoring_history=yes</programlisting>
|
||||
in <filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
<para>
|
||||
The default monitoring interval is 2 seconds; this value can be explicitly set using:
|
||||
<programlisting>
|
||||
monitor_interval_secs=<seconds></programlisting>
|
||||
in <filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
<para>
|
||||
For more details on monitoring, see <xref linkend="repmgrd-monitoring">.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-daemon">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>starting and stopping</secondary>
|
||||
</indexterm>
|
||||
<title>repmgrd daemon</title>
|
||||
<para>
|
||||
If installed from a package, <application>repmgrd</application> can be started
|
||||
via the operating system's service command, e.g. in <application>systemd</application>
|
||||
using <command>systemctl</command>.
|
||||
</para>
|
||||
<para>
|
||||
See appendix <xref linkend="appendix-packages"> for details of service commands
|
||||
for different distributions.
|
||||
</para>
|
||||
<para>
|
||||
<application>repmgrd</application> can be started manually like this:
|
||||
<programlisting>
|
||||
repmgrd -f /etc/repmgr.conf --pid-file /tmp/repmgrd.pid --daemonize</programlisting>
|
||||
and stopped with <command>kill `cat /tmp/repmgrd.pid`</command>. Adjust paths as appropriate.
|
||||
</para>
|
||||
<para>
|
||||
To apply configuration file changes to a running <application>repmgrd</application>
|
||||
daemon, execute the operating system's service reload command (for manually started
|
||||
instances, execute <command>kill -HUP `cat /tmp/repmgrd.pid`</command>).
|
||||
Note that only a subset of configuration file parameters can be changed on a
|
||||
running <application>repmgrd</application> daemon.
|
||||
</para>
|
||||
|
||||
<sect2 id="repmgrd-configuration-debian-ubuntu">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>Debian/Ubuntu and daemon configuration</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>Debian/Ubuntu</primary>
|
||||
<secondary>repmgrd daemon configuration</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd daemon configuration on Debian/Ubuntu</title>
|
||||
|
||||
<para>
|
||||
If &repmgr; was installed from Debian/Ubuntu packages, additional configuration
|
||||
is required before <application>repmgrd</application> is started as a daemon.
|
||||
</para>
|
||||
<para>
|
||||
This is done via the file <filename>/etc/default/repmgrd</filename>, which by default
|
||||
looks like this:
|
||||
<programlisting>
|
||||
# default settings for repmgrd. This file is source by /bin/sh from
|
||||
# /etc/init.d/repmgrd
|
||||
|
||||
# disable repmgrd by default so it won't get started upon installation
|
||||
# valid values: yes/no
|
||||
REPMGRD_ENABLED=no
|
||||
|
||||
# configuration file (required)
|
||||
#REPMGRD_CONF="/path/to/repmgr.conf"
|
||||
|
||||
# additional options
|
||||
#REPMGRD_OPTS=""
|
||||
|
||||
# user to run repmgrd as
|
||||
#REPMGRD_USER=postgres
|
||||
|
||||
# repmgrd binary
|
||||
#REPMGRD_BIN=/usr/bin/repmgrd
|
||||
|
||||
# pid file
|
||||
#REPMGRD_PIDFILE=/var/run/repmgrd.pid</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Set <varname>REPMGRD_ENABLED</varname> to <literal>yes</literal>, and <varname>REPMGRD_CONF</varname>
|
||||
to the <filename>repmgr.conf</filename> file you are using.
|
||||
</para>
|
||||
<para>
|
||||
If using <application>systemd</application>, you may need to execute <command>systemctl daemon-reload</command>.
|
||||
Also, if you attempted to start <application>repmgrd</application> using <command>systemctl start repmgrd</command>,
|
||||
you'll need to execute <command>systemctl stop repmgrd</command>. Because that's how <application>systemd</application>
|
||||
rolls.
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-connection-settings">
|
||||
<title>repmgrd connection settings</title>
|
||||
<para>
|
||||
In addition to the &repmgr; configuration settings, parameters in the
|
||||
<varname>conninfo</varname> string influence how &repmgr; makes a network connection to
|
||||
@@ -76,12 +249,21 @@
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
|
||||
|
||||
<sect1 id="repmgrd-log-rotation">
|
||||
<indexterm>
|
||||
<primary>log rotation</primary>
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd log rotation</title>
|
||||
<para>
|
||||
To ensure the current <application>repmgrd</application> logfile does not grow
|
||||
indefinitely, configure your system's <command>logrotate</command> to
|
||||
regularly rotate it.
|
||||
To ensure the current <application>repmgrd</application> logfile
|
||||
(specified in <filename>repmgr.conf</filename> with the parameter
|
||||
<option>log_file</option>) does not grow indefinitely, configure your
|
||||
system's <command>logrotate</command> to regularly rotate it.
|
||||
</para>
|
||||
<para>
|
||||
Sample configuration to rotate logfiles weekly with retention for
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring</secondary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>monitoring</primary>
|
||||
<secondary>with repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>Monitoring with repmgrd</title>
|
||||
<para>
|
||||
|
||||
@@ -67,13 +67,21 @@
|
||||
promotion candidate to all standbys attached to the demotion candidate.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<simpara>
|
||||
&repmgr; expects to find the &repmgr; binary in the same path on the remote
|
||||
server as on the local server.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Double-check which commands will be used to stop/start/restart the current
|
||||
primary; on the primary execute:
|
||||
primary; on the current primary execute:
|
||||
<programlisting>
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=stop
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=start
|
||||
repmgr -f /etc/repmgr.conf node service --list --action=restart</programlisting>
|
||||
|
||||
</para>
|
||||
|
||||
<para>
|
||||
@@ -92,7 +100,11 @@
|
||||
<para>
|
||||
If the <option>service_*_command</option> options aren't defined, &repmgr; will
|
||||
fall back to using <application>pg_ctl</application> to stop/start/restart
|
||||
PostgreSQL, which may not work properly.
|
||||
PostgreSQL, which may not work properly, particularly when executed on a remote
|
||||
server.
|
||||
</para>
|
||||
<para>
|
||||
For more details, see <xref linkend="configuration-service-commands">.
|
||||
</para>
|
||||
</important>
|
||||
|
||||
@@ -109,6 +121,7 @@
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
|
||||
<para>
|
||||
Check that access from applications is minimized or preferably blocked
|
||||
completely, so applications are not unexpectedly interrupted.
|
||||
@@ -163,34 +176,60 @@
|
||||
</para>
|
||||
</important>
|
||||
|
||||
<para>
|
||||
Note that the following parameters in <filename>repmgr.conf</filename> are relevant to the
|
||||
switchover operation:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_attempts</literal>: number of times to check the original primary
|
||||
for a clean shutdown after executing the shutdown command, before aborting
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>reconnect_interval</literal>: interval (in seconds) to check the original
|
||||
primary for a clean shutdown after executing the shutdown command (up to a maximum
|
||||
of <literal>reconnect_attempts</literal> tries)
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>replication_lag_critical</literal>:
|
||||
if replication lag (in seconds) on the standby exceeds this value, the
|
||||
switchover will be aborted (unless the <literal>-F/--force</literal> option
|
||||
is provided)
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
See <xref linkend="repmgr-standby-switchover"> for a full list of available
|
||||
command line options and <filename>repmgr.conf</filename> settings relevant
|
||||
to performing a switchover.
|
||||
</simpara>
|
||||
</note>
|
||||
|
||||
<sect2 id="switchover-pg-rewind" xreflabel="Switchover and pg_rewind">
|
||||
<indexterm>
|
||||
<primary>pg_rewind</primary>
|
||||
<secondary>using with "repmgr standby switchover"</secondary>
|
||||
</indexterm>
|
||||
<title>Switchover and pg_rewind</title>
|
||||
<para>
|
||||
If the demotion candidate does not shut down smoothly or cleanly, there's a risk it
|
||||
will have a slightly divergent timeline and will not be able to attach to the new
|
||||
primary. To fix this situation without needing to reclone the old primary, it's
|
||||
possible to use the <application>pg_rewind</application> utility, which will usually be
|
||||
able to resync the two servers.
|
||||
</para>
|
||||
<para>
|
||||
To have &repmgr; execute <application>pg_rewind</application> if it detects this
|
||||
situation after promoting the new primary, add the <option>--force-rewind</option>
|
||||
option.
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
If &repmgr; detects a situation where it needs to execute <application>pg_rewind</application>,
|
||||
it will execute a <literal>CHECKPOINT</literal> on the new primary before executing
|
||||
<application>pg_rewind</application>.
|
||||
</simpara>
|
||||
</note>
|
||||
<para>
|
||||
For more details on <application>pg_rewind</application>, see:
|
||||
<ulink url="https://www.postgresql.org/docs/current/static/app-pgrewind.html">https://www.postgresql.org/docs/current/static/app-pgrewind.html</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
<application>pg_rewind</application> has been part of the core PostgreSQL distribution since
|
||||
version 9.5. Users of versions 9.3 and 9.4 will need to manually install it; the source code is available here:
|
||||
<ulink url="https://github.com/vmware/pg_rewind">https://github.com/vmware/pg_rewind</ulink>.
|
||||
If the <application>pg_rewind</application>
|
||||
binary is not installed in the PostgreSQL <filename>bin</filename> directory, provide
|
||||
its full path on the demotion candidate with <option>--force-rewind</option>.
|
||||
</para>
|
||||
<para>
|
||||
Note that building the 9.3/9.4 version of <application>pg_rewind</application> requires the PostgreSQL
|
||||
source code. Also, PostgreSQL 9.3 does not provide <varname>wal_log_hints</varname>,
|
||||
meaning data checksums must have been enabled when the database was initialized.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="switchover-execution" xreflabel="Executing the switchover command">
|
||||
|
||||
@@ -1 +1 @@
|
||||
<!ENTITY repmgrversion "4.0.4">
|
||||
<!ENTITY repmgrversion "4.0.6">
|
||||
|
||||
@@ -44,5 +44,8 @@
|
||||
#define ERR_REGISTRATION_SYNC 20
|
||||
#define ERR_OUT_OF_MEMORY 21
|
||||
#define ERR_SWITCHOVER_INCOMPLETE 22
|
||||
#define ERR_FOLLOW_FAIL 23
|
||||
#define ERR_REJOIN_FAIL 24
|
||||
#define ERR_CLUSTER_CHECK 25
|
||||
|
||||
#endif /* _ERRCODE_H_ */
|
||||
|
||||
7
log.c
7
log.c
@@ -329,6 +329,13 @@ logger_set_terse(void)
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
logger_set_min_level(int min_log_level)
|
||||
{
|
||||
if (min_log_level > log_level)
|
||||
log_level = min_log_level;
|
||||
}
|
||||
|
||||
int
|
||||
detect_log_level(const char *level)
|
||||
{
|
||||
|
||||
1
log.h
1
log.h
@@ -128,6 +128,7 @@ bool logger_shutdown(void);
|
||||
|
||||
void logger_set_verbose(void);
|
||||
void logger_set_terse(void);
|
||||
void logger_set_min_level(int min_log_level);
|
||||
|
||||
void
|
||||
log_detail(const char *fmt,...)
|
||||
|
||||
@@ -569,6 +569,8 @@ do_cluster_crosscheck(void)
|
||||
|
||||
t_node_status_cube **cube;
|
||||
|
||||
bool error_found = false;
|
||||
|
||||
n = build_cluster_crosscheck(&cube, &name_length);
|
||||
if (runtime_options.output_mode == OM_CSV)
|
||||
{
|
||||
@@ -648,9 +650,11 @@ do_cluster_crosscheck(void)
|
||||
{
|
||||
case -2:
|
||||
c = '?';
|
||||
error_found = true;
|
||||
break;
|
||||
case -1:
|
||||
c = 'x';
|
||||
error_found = true;
|
||||
break;
|
||||
case 0:
|
||||
c = '*';
|
||||
@@ -689,6 +693,11 @@ do_cluster_crosscheck(void)
|
||||
|
||||
free(cube);
|
||||
}
|
||||
|
||||
if (error_found == true)
|
||||
{
|
||||
exit(ERR_CLUSTER_CHECK);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -704,6 +713,8 @@ do_cluster_matrix()
|
||||
|
||||
t_node_matrix_rec **matrix_rec_list;
|
||||
|
||||
bool error_found = false;
|
||||
|
||||
n = build_cluster_matrix(&matrix_rec_list, &name_length);
|
||||
|
||||
if (runtime_options.output_mode == OM_CSV)
|
||||
@@ -742,9 +753,11 @@ do_cluster_matrix()
|
||||
{
|
||||
case -2:
|
||||
c = '?';
|
||||
error_found = true;
|
||||
break;
|
||||
case -1:
|
||||
c = 'x';
|
||||
error_found = true;
|
||||
break;
|
||||
case 0:
|
||||
c = '*';
|
||||
@@ -770,6 +783,11 @@ do_cluster_matrix()
|
||||
}
|
||||
|
||||
free(matrix_rec_list);
|
||||
|
||||
if (error_found == true)
|
||||
{
|
||||
exit(ERR_CLUSTER_CHECK);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -964,8 +982,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length)
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
(void) remote_command(
|
||||
host,
|
||||
(void) remote_command(host,
|
||||
runtime_options.remote_user,
|
||||
command.data,
|
||||
&command_output);
|
||||
@@ -1144,9 +1161,8 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
/* fix to work with --node-id */
|
||||
if (cube[i]->node_id == config_file_options.node_id)
|
||||
{
|
||||
(void) local_command(
|
||||
command.data,
|
||||
&command_output);
|
||||
(void) local_command_simple(command.data,
|
||||
&command_output);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1170,8 +1186,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length)
|
||||
|
||||
log_verbose(LOG_DEBUG, "build_cluster_crosscheck(): executing\n %s", quoted_command.data);
|
||||
|
||||
(void) remote_command(
|
||||
host,
|
||||
(void) remote_command(host,
|
||||
runtime_options.remote_user,
|
||||
quoted_command.data,
|
||||
&command_output);
|
||||
|
||||
@@ -940,6 +940,7 @@ do_node_check_replication_connection(void)
|
||||
return;
|
||||
}
|
||||
|
||||
/* retrieve remote node record from local database */
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record);
|
||||
@@ -956,8 +957,12 @@ do_node_check_replication_connection(void)
|
||||
initialize_conninfo_params(&remote_conninfo, false);
|
||||
parse_conninfo_string(node_record.conninfo, &remote_conninfo, NULL, false);
|
||||
|
||||
if (strcmp(param_get(&remote_conninfo, "user"), node_record.repluser) != 0)
|
||||
{
|
||||
param_set(&remote_conninfo, "user", node_record.repluser);
|
||||
param_set(&remote_conninfo, "dbname", "replication");
|
||||
}
|
||||
param_set(&remote_conninfo, "replication", "1");
|
||||
param_set(&remote_conninfo, "user", node_record.repluser);
|
||||
|
||||
repl_conn = establish_db_connection_by_params(&remote_conninfo, false);
|
||||
|
||||
@@ -1513,7 +1518,7 @@ do_node_service(void)
|
||||
|
||||
if (data_dir_required_for_action(action))
|
||||
{
|
||||
get_node_data_directory(data_dir);
|
||||
get_node_config_directory(data_dir);
|
||||
|
||||
if (data_dir[0] == '\0')
|
||||
{
|
||||
@@ -1601,7 +1606,7 @@ _do_node_service_list_actions(t_server_action action)
|
||||
|
||||
if (data_dir_required == true)
|
||||
{
|
||||
get_node_data_directory(data_dir);
|
||||
get_node_config_directory(data_dir);
|
||||
}
|
||||
|
||||
/* show command for specific action only */
|
||||
@@ -1667,6 +1672,13 @@ parse_server_action(const char *action_name)
|
||||
*
|
||||
* Note that "repmgr node rejoin" is also executed by
|
||||
* "repmgr standby switchover" after promoting the new primary.
|
||||
*
|
||||
* Parameters:
|
||||
* --dry-run
|
||||
* --force-rewind[=VALUE]
|
||||
* --config-files
|
||||
* --config-archive-dir
|
||||
* -W/--no-wait
|
||||
*/
|
||||
void
|
||||
do_node_rejoin(void)
|
||||
@@ -1728,7 +1740,7 @@ do_node_rejoin(void)
|
||||
{
|
||||
log_error(_("database is not shut down cleanly"));
|
||||
|
||||
if (runtime_options.force_rewind == true)
|
||||
if (runtime_options.force_rewind_used == true)
|
||||
{
|
||||
log_detail(_("pg_rewind will not be able to run"));
|
||||
}
|
||||
@@ -1758,7 +1770,17 @@ do_node_rejoin(void)
|
||||
PQfinish(upstream_conn);
|
||||
|
||||
/* connect to registered primary and check it's not in recovery */
|
||||
upstream_conn = establish_db_connection(primary_node_record.conninfo, true);
|
||||
upstream_conn = establish_db_connection(primary_node_record.conninfo, false);
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_error(_("unable to connect to current primary \"%s\" (node ID: %i)"),
|
||||
primary_node_record.node_name,
|
||||
primary_node_record.node_id);
|
||||
log_detail(_("primay node conninfo is: \"%s\""),
|
||||
primary_node_record.conninfo);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
upstream_recovery_type = get_recovery_type(upstream_conn);
|
||||
|
||||
@@ -1774,30 +1796,33 @@ do_node_rejoin(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* If --force-rewind specified, check pg_rewind can be used, and
|
||||
* pre-emptively fetch the list of configuration files which should be
|
||||
* archived
|
||||
* --force-rewind specified - check prerequisites, and attempt to execute
|
||||
* (if --dry-run provided, just output the command which would be executed)
|
||||
*/
|
||||
|
||||
if (runtime_options.force_rewind == true)
|
||||
|
||||
if (runtime_options.force_rewind_used == true)
|
||||
{
|
||||
PQExpBufferData reason;
|
||||
PQExpBufferData msg;
|
||||
PQExpBufferData filebuf;
|
||||
int ret;
|
||||
|
||||
initPQExpBuffer(&reason);
|
||||
/*
|
||||
* Check that pg_rewind can be used
|
||||
*/
|
||||
|
||||
if (can_use_pg_rewind(upstream_conn, config_file_options.data_directory, &reason) == false)
|
||||
initPQExpBuffer(&msg);
|
||||
|
||||
if (can_use_pg_rewind(upstream_conn, config_file_options.data_directory, &msg) == false)
|
||||
{
|
||||
log_error(_("--force-rewind specified but pg_rewind cannot be used"));
|
||||
log_detail("%s", reason.data);
|
||||
termPQExpBuffer(&reason);
|
||||
log_detail("%s", msg.data);
|
||||
termPQExpBuffer(&msg);
|
||||
PQfinish(upstream_conn);
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
termPQExpBuffer(&reason);
|
||||
|
||||
initPQExpBuffer(&msg);
|
||||
appendPQExpBuffer(&msg,
|
||||
_("prerequisites for using pg_rewind are met"));
|
||||
|
||||
@@ -1810,25 +1835,31 @@ do_node_rejoin(void)
|
||||
log_verbose(LOG_INFO, "%s", msg.data);
|
||||
}
|
||||
termPQExpBuffer(&msg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Forcibly rewind node if requested (this is mainly for use when this
|
||||
* action is being executed by "repmgr standby switchover")
|
||||
*/
|
||||
if (runtime_options.force_rewind == true && runtime_options.dry_run == false)
|
||||
{
|
||||
int ret;
|
||||
PQExpBufferData filebuf;
|
||||
|
||||
/*
|
||||
* Archive requested configuration files.
|
||||
*
|
||||
* In --dry-run mode this acts as a check that the files can be archived, though
|
||||
* errors will only be logged; any copied files will be deleted and --dry-run
|
||||
* execution will continue.
|
||||
*/
|
||||
_do_node_archive_config();
|
||||
|
||||
/* execute pg_rewind */
|
||||
initPQExpBuffer(&command);
|
||||
|
||||
appendPQExpBuffer(&command,
|
||||
"%s -D ",
|
||||
make_pg_path("pg_rewind"));
|
||||
if (runtime_options.force_rewind_path[0] != '\0')
|
||||
{
|
||||
appendPQExpBuffer(&command,
|
||||
"%s -D ",
|
||||
runtime_options.force_rewind_path);
|
||||
}
|
||||
else
|
||||
{
|
||||
appendPQExpBuffer(&command,
|
||||
"%s -D ",
|
||||
make_pg_path("pg_rewind"));
|
||||
}
|
||||
|
||||
appendShellString(&command,
|
||||
config_file_options.data_directory);
|
||||
@@ -1842,112 +1873,119 @@ do_node_rejoin(void)
|
||||
log_info(_("pg_rewind would now be executed"));
|
||||
log_detail(_("pg_rewind command is:\n %s"),
|
||||
command.data);
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
log_notice(_("executing pg_rewind"));
|
||||
log_debug("pg_rewind command is:\n %s",
|
||||
command.data);
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
ret = local_command(
|
||||
command.data,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&command);
|
||||
|
||||
if (ret == false)
|
||||
else
|
||||
{
|
||||
log_error(_("unable to execute pg_rewind"));
|
||||
log_detail("%s", command_output.data);
|
||||
log_notice(_("executing pg_rewind"));
|
||||
log_debug("pg_rewind command is:\n %s",
|
||||
command.data);
|
||||
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
ret = local_command(command.data,
|
||||
&command_output);
|
||||
|
||||
termPQExpBuffer(&command);
|
||||
|
||||
if (ret == false)
|
||||
{
|
||||
log_error(_("unable to execute pg_rewind"));
|
||||
log_detail("%s", command_output.data);
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
/* Restore any previously archived config files */
|
||||
_do_node_restore_config();
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
initPQExpBuffer(&filebuf);
|
||||
|
||||
/* Restore any previously archived config files */
|
||||
_do_node_restore_config();
|
||||
|
||||
initPQExpBuffer(&filebuf);
|
||||
|
||||
/* remove any recovery.done file copied in by pg_rewind */
|
||||
appendPQExpBuffer(&filebuf,
|
||||
"%s/recovery.done",
|
||||
config_file_options.data_directory);
|
||||
|
||||
if (stat(filebuf.data, &statbuf) == 0)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
|
||||
|
||||
if (unlink(filebuf.data) == -1)
|
||||
{
|
||||
log_warning(_("unable to delete \"%s\""),
|
||||
filebuf.data);
|
||||
log_detail("%s", strerror(errno));
|
||||
}
|
||||
}
|
||||
termPQExpBuffer(&filebuf);
|
||||
|
||||
/* delete any replication slots copied in by pg_rewind */
|
||||
{
|
||||
PQExpBufferData slotdir_path;
|
||||
DIR *slotdir;
|
||||
struct dirent *slotdir_ent;
|
||||
|
||||
initPQExpBuffer(&slotdir_path);
|
||||
|
||||
appendPQExpBuffer(&slotdir_path,
|
||||
"%s/pg_replslot",
|
||||
/* remove any recovery.done file copied in by pg_rewind */
|
||||
appendPQExpBuffer(&filebuf,
|
||||
"%s/recovery.done",
|
||||
config_file_options.data_directory);
|
||||
|
||||
slotdir = opendir(slotdir_path.data);
|
||||
|
||||
if (slotdir == NULL)
|
||||
if (stat(filebuf.data, &statbuf) == 0)
|
||||
{
|
||||
log_warning(_("unable to open replication slot directory \"%s\""),
|
||||
slotdir_path.data);
|
||||
log_detail("%s", strerror(errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
while ((slotdir_ent = readdir(slotdir)) != NULL) {
|
||||
struct stat statbuf;
|
||||
PQExpBufferData slotdir_ent_path;
|
||||
log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
|
||||
|
||||
if(strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0)
|
||||
continue;
|
||||
|
||||
initPQExpBuffer(&slotdir_ent_path);
|
||||
|
||||
appendPQExpBuffer(&slotdir_ent_path,
|
||||
"%s/%s",
|
||||
slotdir_path.data,
|
||||
slotdir_ent->d_name);
|
||||
|
||||
if (stat(slotdir_ent_path.data, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
|
||||
{
|
||||
termPQExpBuffer(&slotdir_ent_path);
|
||||
continue;
|
||||
}
|
||||
|
||||
log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
|
||||
if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
|
||||
{
|
||||
log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
|
||||
log_detail("%s", strerror(errno));
|
||||
log_hint(_("directory may need to be manually removed"));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&slotdir_ent_path);
|
||||
if (unlink(filebuf.data) == -1)
|
||||
{
|
||||
log_warning(_("unable to delete \"%s\""),
|
||||
filebuf.data);
|
||||
log_detail("%s", strerror(errno));
|
||||
}
|
||||
}
|
||||
termPQExpBuffer(&slotdir_path);
|
||||
termPQExpBuffer(&filebuf);
|
||||
|
||||
/*
|
||||
* Delete any replication slots copied in by pg_rewind.
|
||||
*
|
||||
* TODO:
|
||||
* - from PostgreSQL 11, this will be handled by pg_rewind, so
|
||||
* we can skip this step from that version; see commit
|
||||
* 266b6acb312fc440c1c1a2036aa9da94916beac6
|
||||
* - possibly delete contents of various other directories
|
||||
* as per the above commit for pre-PostgreSQL 11
|
||||
*/
|
||||
{
|
||||
PQExpBufferData slotdir_path;
|
||||
DIR *slotdir;
|
||||
struct dirent *slotdir_ent;
|
||||
|
||||
initPQExpBuffer(&slotdir_path);
|
||||
|
||||
appendPQExpBuffer(&slotdir_path,
|
||||
"%s/pg_replslot",
|
||||
config_file_options.data_directory);
|
||||
|
||||
slotdir = opendir(slotdir_path.data);
|
||||
|
||||
if (slotdir == NULL)
|
||||
{
|
||||
log_warning(_("unable to open replication slot directory \"%s\""),
|
||||
slotdir_path.data);
|
||||
log_detail("%s", strerror(errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
while ((slotdir_ent = readdir(slotdir)) != NULL) {
|
||||
struct stat statbuf;
|
||||
PQExpBufferData slotdir_ent_path;
|
||||
|
||||
if(strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0)
|
||||
continue;
|
||||
|
||||
initPQExpBuffer(&slotdir_ent_path);
|
||||
|
||||
appendPQExpBuffer(&slotdir_ent_path,
|
||||
"%s/%s",
|
||||
slotdir_path.data,
|
||||
slotdir_ent->d_name);
|
||||
|
||||
if (stat(slotdir_ent_path.data, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
|
||||
{
|
||||
termPQExpBuffer(&slotdir_ent_path);
|
||||
continue;
|
||||
}
|
||||
|
||||
log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
|
||||
if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
|
||||
{
|
||||
log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
|
||||
log_detail("%s", strerror(errno));
|
||||
log_hint(_("directory may need to be manually removed"));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&slotdir_ent_path);
|
||||
}
|
||||
}
|
||||
termPQExpBuffer(&slotdir_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1967,7 +2005,9 @@ do_node_rejoin(void)
|
||||
if (success == false)
|
||||
{
|
||||
log_notice(_("NODE REJOIN failed"));
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
if (strlen(follow_output.data))
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
&config_file_options,
|
||||
@@ -1983,22 +2023,99 @@ do_node_rejoin(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* XXX add checks that node actually started and connected to primary,
|
||||
* if not exit with ERR_REJOIN_FAIL
|
||||
* Actively check that node actually started and connected to primary,
|
||||
* if not exit with ERR_REJOIN_FAIL.
|
||||
*
|
||||
* This check can be overridden with -W/--no-wait, in which case a one-time
|
||||
* check will be carried out.
|
||||
*/
|
||||
if (runtime_options.no_wait == false)
|
||||
{
|
||||
int i;
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"node_rejoin",
|
||||
success,
|
||||
follow_output.data);
|
||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
||||
{
|
||||
if (is_server_available(config_file_options.conninfo))
|
||||
{
|
||||
log_verbose(LOG_INFO, _("demoted primary is pingable"));
|
||||
break;
|
||||
}
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
if (i % 5 == 0)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.standby_reconnect_timeout);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.standby_reconnect_timeout);
|
||||
}
|
||||
|
||||
log_notice(_("NODE REJOIN successful"));
|
||||
log_detail("%s", follow_output.data);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
for (; i < config_file_options.standby_reconnect_timeout; i++)
|
||||
{
|
||||
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
|
||||
|
||||
if (success == true)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("node %i has attached to its upstream node"),
|
||||
config_file_options.node_id);
|
||||
break;
|
||||
}
|
||||
|
||||
if (i % 5 == 0)
|
||||
{
|
||||
log_info(_("waiting for node %i to connect to new primary; %i of max %i attempts"),
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.standby_reconnect_timeout);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.standby_reconnect_timeout);
|
||||
}
|
||||
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"node_rejoin",
|
||||
success,
|
||||
follow_output.data);
|
||||
|
||||
if (success == false)
|
||||
{
|
||||
termPQExpBuffer(&follow_output);
|
||||
log_notice(_("NODE REJOIN failed"));
|
||||
exit(ERR_REJOIN_FAIL);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
|
||||
}
|
||||
|
||||
|
||||
if (success == true)
|
||||
{
|
||||
log_notice(_("NODE REJOIN successful"));
|
||||
log_detail("%s", follow_output.data);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* if we reach here, no record found in upstream node's pg_stat_replication */
|
||||
log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
|
||||
log_hint(_("you will need to manually check the node's replication status"));
|
||||
}
|
||||
termPQExpBuffer(&follow_output);
|
||||
|
||||
return;
|
||||
@@ -2050,6 +2167,11 @@ _do_node_archive_config(void)
|
||||
termPQExpBuffer(&archive_dir);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_verbose(LOG_INFO, "temporary archive directory \"%s\" created", archive_dir.data);
|
||||
}
|
||||
}
|
||||
else if (!S_ISDIR(statbuf.st_mode))
|
||||
{
|
||||
@@ -2074,8 +2196,8 @@ _do_node_archive_config(void)
|
||||
{
|
||||
|
||||
/*
|
||||
* attempt to remove any existing files in the directory TODO: collate
|
||||
* problem files into list
|
||||
* attempt to remove any existing files in the directory
|
||||
* TODO: collate problem files into list
|
||||
*/
|
||||
while ((arcdir_ent = readdir(arcdir)) != NULL)
|
||||
{
|
||||
@@ -2151,7 +2273,11 @@ _do_node_archive_config(void)
|
||||
|
||||
if (i < config_file_len)
|
||||
{
|
||||
strncpy(filenamebuf, runtime_options.config_files + i, config_file_len - i);
|
||||
int filename_len = config_file_len - i;
|
||||
|
||||
strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
|
||||
|
||||
filenamebuf[filename_len] = '\0';
|
||||
|
||||
initPQExpBuffer(&pathbuf);
|
||||
appendPQExpBuffer(&pathbuf,
|
||||
@@ -2229,7 +2355,7 @@ _do_node_archive_config(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data);
|
||||
log_verbose(LOG_INFO, "temporary archive directory \"%s\" deleted", archive_dir.data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2438,15 +2564,15 @@ do_node_help(void)
|
||||
puts("");
|
||||
printf(_(" Configuration file required, runs on local node only.\n"));
|
||||
puts("");
|
||||
printf(_(" --csv emit output as CSV\n"));
|
||||
printf(_(" --nagios emit output in Nagios format (individual status output only)\n"));
|
||||
printf(_(" --csv emit output as CSV\n"));
|
||||
printf(_(" --nagios emit output in Nagios format (individual status output only)\n"));
|
||||
puts("");
|
||||
printf(_(" Following options check an individual status:\n"));
|
||||
printf(_(" --archive-ready number of WAL files ready for archiving\n"));
|
||||
printf(_(" --downstream whether all downstream nodes are connected\n"));
|
||||
printf(_(" --replication-lag replication lag in seconds (standbys only)\n"));
|
||||
printf(_(" --role check node has expected role\n"));
|
||||
printf(_(" --slots check for inactive replication slots\n"));
|
||||
printf(_(" --archive-ready number of WAL files ready for archiving\n"));
|
||||
printf(_(" --downstream whether all downstream nodes are connected\n"));
|
||||
printf(_(" --replication-lag replication lag in seconds (standbys only)\n"));
|
||||
printf(_(" --role check node has expected role\n"));
|
||||
printf(_(" --slots check for inactive replication slots\n"));
|
||||
|
||||
puts("");
|
||||
|
||||
@@ -2456,13 +2582,16 @@ do_node_help(void)
|
||||
puts("");
|
||||
printf(_(" Configuration file required, runs on local node only.\n"));
|
||||
puts("");
|
||||
printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \
|
||||
" (including usability of \"pg_rewind\" if requested)\n"));
|
||||
printf(_(" --force-rewind execute \"pg_rewind\" if necessary\n"));
|
||||
printf(_(" --config-files comma-separated list of configuration files to retain\n" \
|
||||
" after executing \"pg_rewind\"\n"));
|
||||
printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \
|
||||
" (default: /tmp)\n"));
|
||||
printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \
|
||||
" (including usability of \"pg_rewind\" if requested)\n"));
|
||||
printf(_(" --force-rewind[=VALUE] execute \"pg_rewind\" if necessary\n"));
|
||||
printf(_(" (9.3 and 9.4 - provide full \"pg_rewind\" path)\n"));
|
||||
|
||||
printf(_(" --config-files comma-separated list of configuration files to retain\n" \
|
||||
" after executing \"pg_rewind\"\n"));
|
||||
printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \
|
||||
" (default: /tmp)\n"));
|
||||
printf(_(" -W/--no-wait don't wait for the node to rejoin cluster\n"));
|
||||
puts("");
|
||||
|
||||
printf(_("NODE SERVICE\n"));
|
||||
|
||||
@@ -60,6 +60,7 @@ static char upstream_data_directory[MAXPGPATH];
|
||||
static t_conninfo_param_list recovery_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||
static char recovery_conninfo_str[MAXLEN] = "";
|
||||
static char upstream_repluser[NAMEDATALEN] = "";
|
||||
static char upstream_user[NAMEDATALEN] = "";
|
||||
|
||||
static int source_server_version_num = UNKNOWN_SERVER_VERSION_NUM;
|
||||
|
||||
@@ -72,7 +73,7 @@ static char local_repmgr_tmp_directory[MAXPGPATH];
|
||||
static char datadir_list_filename[MAXLEN];
|
||||
static char barman_command_buf[MAXLEN] = "";
|
||||
|
||||
static void _do_standby_promote_internal(PGconn *conn, const char *data_dir);
|
||||
static void _do_standby_promote_internal(PGconn *conn);
|
||||
static void _do_create_recovery_conf(void);
|
||||
|
||||
static void check_barman_config(void);
|
||||
@@ -86,7 +87,7 @@ static void initialise_direct_clone(t_node_info *node_record);
|
||||
static int run_basebackup(t_node_info *node_record);
|
||||
static int run_file_backup(t_node_info *node_record);
|
||||
|
||||
static void copy_configuration_files(void);
|
||||
static void copy_configuration_files(bool delete_after_copy);
|
||||
|
||||
static void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||
|
||||
@@ -282,7 +283,7 @@ do_standby_clone(void)
|
||||
{
|
||||
/*
|
||||
* This connects to the source node and performs sanity checks, also
|
||||
* sets "recovery_conninfo_str", "upstream_repluser" and
|
||||
* sets "recovery_conninfo_str", "upstream_repluser", "upstream_user" and
|
||||
* "upstream_node_id".
|
||||
*
|
||||
* Will error out if source connection not possible and not in
|
||||
@@ -349,7 +350,7 @@ do_standby_clone(void)
|
||||
* `application_name`, if set
|
||||
*/
|
||||
|
||||
parse_success = parse_conninfo_string(recovery_conninfo_str, &recovery_conninfo, errmsg, true);
|
||||
parse_success = parse_conninfo_string(recovery_conninfo_str, &recovery_conninfo, &errmsg, true);
|
||||
|
||||
if (parse_success == false)
|
||||
{
|
||||
@@ -497,7 +498,33 @@ do_standby_clone(void)
|
||||
|
||||
termPQExpBuffer(&msg);
|
||||
|
||||
/* TODO: check all files are readable */
|
||||
/*
|
||||
* Here we'll attempt an initial test copy of the detected external
|
||||
* files, to detect any issues before we run the base backup.
|
||||
*
|
||||
* Note this will exit with an error, unless -F/--force supplied.
|
||||
*
|
||||
* TODO: put the files in a temporary directory and move to their final
|
||||
* destination once the database has been cloned.
|
||||
*/
|
||||
|
||||
if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_SAMEPATH)
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the same path as on the source server;
|
||||
* don't delete after copying.
|
||||
*/
|
||||
copy_configuration_files(false);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the data directory - delete after copying.
|
||||
* They'll be copied again later; see TODO above.
|
||||
*/
|
||||
copy_configuration_files(true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -596,7 +623,12 @@ do_standby_clone(void)
|
||||
*/
|
||||
if (runtime_options.copy_external_config_files == true && config_files.entries > 0)
|
||||
{
|
||||
copy_configuration_files();
|
||||
/*
|
||||
* If "--copy-external-config-files=samepath" was used, the files will already
|
||||
* have been copied.
|
||||
*/
|
||||
if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_PGDATA)
|
||||
copy_configuration_files(false);
|
||||
}
|
||||
|
||||
/* Write the recovery.conf file */
|
||||
@@ -937,7 +969,6 @@ _do_create_recovery_conf(void)
|
||||
log_detail("%s", PQerrorMessage(source_conn));
|
||||
}
|
||||
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -954,7 +985,10 @@ _do_create_recovery_conf(void)
|
||||
{
|
||||
log_detail("%s", PQerrorMessage(source_conn));
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
log_hint(_("standby must be registered before a new recovery.conf file can be created"));
|
||||
}
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
@@ -1197,6 +1231,8 @@ do_standby_register(void)
|
||||
t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
|
||||
int primary_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
bool dry_run_ok = true;
|
||||
|
||||
log_info(_("connecting to local node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
@@ -1204,8 +1240,11 @@ do_standby_register(void)
|
||||
conn = establish_db_connection_quiet(config_file_options.conninfo);
|
||||
|
||||
/*
|
||||
* if --force provided, don't wait for the node to start, as the
|
||||
* normal use case will be re-registering an existing node, or
|
||||
* If unable to connect, and --force not provided, wait up to --wait-start
|
||||
* seconds (default: 0) for the node to become reachable.
|
||||
*
|
||||
* Note that if --force provided, we don't wait for the node to start, as
|
||||
* the normal use case will be re-registering an existing node, or
|
||||
* registering an inactive/not-yet-extant one; we'll do the
|
||||
* error handling for those cases in the next code block
|
||||
*/
|
||||
@@ -1243,9 +1282,12 @@ do_standby_register(void)
|
||||
config_file_options.node_id,
|
||||
timer);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* If still unable to connect, continue only if -F/--force provided,
|
||||
* and primary connection parameters provided.
|
||||
*/
|
||||
if (PQstatus(conn) != CONNECTION_OK)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
@@ -1265,12 +1307,12 @@ do_standby_register(void)
|
||||
log_error(_("unable to connect to local node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
log_hint(_("to register an inactive standby, additionally provide the primary connection parameters"));
|
||||
log_hint(_("to register a standby which is not running, additionally provide the primary connection parameters"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
if (PQstatus(conn) == CONNECTION_OK)
|
||||
/* connection OK - check this is actually a standby */
|
||||
else
|
||||
{
|
||||
check_recovery_type(conn);
|
||||
}
|
||||
@@ -1293,7 +1335,6 @@ do_standby_register(void)
|
||||
primary_conn = establish_db_connection_by_params(&source_conninfo, false);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* no amount of --force will make it possible to register the standby
|
||||
* without a primary server to connect to
|
||||
@@ -1368,8 +1409,12 @@ do_standby_register(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* If an upstream node is defined, check if that node exists and is active
|
||||
* If it doesn't exist, and --force set, create a minimal inactive record
|
||||
* If an upstream node is defined, check if that node exists and is active.
|
||||
*
|
||||
* If it doesn't exist, and --force set, create a minimal inactive record,
|
||||
* in the assumption that the user knows what they are doing (usually some kind
|
||||
* of provisioning where multiple servers are created in parallel) and will
|
||||
* create the active record later.
|
||||
*/
|
||||
if (runtime_options.upstream_node_id != NO_UPSTREAM_NODE)
|
||||
{
|
||||
@@ -1493,15 +1538,15 @@ do_standby_register(void)
|
||||
/* check our standby is connected */
|
||||
if (is_downstream_node_attached(upstream_conn, config_file_options.node_name) == true)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("local node is attached to upstream"));
|
||||
log_verbose(LOG_INFO, _("local node is attached to specified upstream node %i"), runtime_options.upstream_node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!runtime_options.force)
|
||||
{
|
||||
log_error(_("this node does not appear to be attached to upstream node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
upstream_node_record.node_name,
|
||||
upstream_node_record.node_id);
|
||||
|
||||
log_detail(_("no record for application name \"%s\" found in \"pg_stat_replication\""),
|
||||
config_file_options.node_name);
|
||||
@@ -1520,24 +1565,82 @@ do_standby_register(void)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("all prerequisites for \"standby register\" are met"));
|
||||
|
||||
PQfinish(primary_conn);
|
||||
if (PQstatus(conn) == CONNECTION_OK)
|
||||
PQfinish(conn);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
/*
|
||||
* populate node record structure with current values (this will overwrite
|
||||
* any existing values, which is what we want when updating the record
|
||||
* populate node record structure with current values set in repmgr.conf
|
||||
* and/or the command line (this will overwrite any existing values, which
|
||||
* is what we want when updating the record)
|
||||
*/
|
||||
init_node_record(&node_record);
|
||||
node_record.type = STANDBY;
|
||||
|
||||
/* if --upstream-node-id not provided, set to primary node id */
|
||||
if (node_record.upstream_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
node_record.upstream_node_id = primary_node_id;
|
||||
}
|
||||
|
||||
/*
|
||||
* If --upstream-node-id not provided, we're defaulting to the primary as
|
||||
* upstream node. If local node is available, double-check that it's attached
|
||||
* to the primary, in case --upstream-node-id was an accidental omission.
|
||||
*
|
||||
* Currently we'll only do this for newly registered nodes.
|
||||
*/
|
||||
if (runtime_options.upstream_node_id == NO_UPSTREAM_NODE && PQstatus(conn) == CONNECTION_OK)
|
||||
{
|
||||
/* only do this if record does not exist */
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
log_warning(_("--upstream-node-id not supplied, assuming upstream node is primary (node ID %i)"),
|
||||
primary_node_id);
|
||||
|
||||
/* check our standby is connected */
|
||||
if (is_downstream_node_attached(primary_conn, config_file_options.node_name) == true)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("local node is attached to primary"));
|
||||
}
|
||||
else if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("local node not attached to primary node %i"), primary_node_id);
|
||||
/* TODO: 9.6 and later, display detail from pg_stat_wal_receiver */
|
||||
log_hint(_("specify the actual upstream node id with --upstream-node-id, or use -F/--force to continue anyway"));
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
dry_run_ok = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
PQfinish(primary_conn);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("local node not attached to primary node %i"), primary_node_id);
|
||||
log_notice(_("-F/--force supplied, continuing anyway"));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
PQfinish(primary_conn);
|
||||
if (PQstatus(conn) == CONNECTION_OK)
|
||||
PQfinish(conn);
|
||||
|
||||
if (dry_run_ok == false)
|
||||
{
|
||||
log_warning(_("issue(s) encountered; see preceding log messages"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
log_info(_("all prerequisites for \"standby register\" are met"));
|
||||
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
/*
|
||||
* node record exists - update it (at this point we have already
|
||||
@@ -1560,13 +1663,11 @@ do_standby_register(void)
|
||||
|
||||
if (record_created == false)
|
||||
{
|
||||
appendPQExpBuffer(
|
||||
&details,
|
||||
appendPQExpBuffer(&details,
|
||||
"standby registration failed");
|
||||
|
||||
if (runtime_options.force == true)
|
||||
appendPQExpBuffer(
|
||||
&details,
|
||||
appendPQExpBuffer(&details,
|
||||
" (-F/--force option was used)");
|
||||
|
||||
create_event_notification_extended(
|
||||
@@ -1889,13 +1990,12 @@ do_standby_promote(void)
|
||||
|
||||
PQfinish(current_primary_conn);
|
||||
|
||||
|
||||
_do_standby_promote_internal(conn, config_file_options.data_directory);
|
||||
_do_standby_promote_internal(conn);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
_do_standby_promote_internal(PGconn *conn, const char *data_dir)
|
||||
_do_standby_promote_internal(PGconn *conn)
|
||||
{
|
||||
char script[MAXLEN];
|
||||
int r,
|
||||
@@ -1907,7 +2007,9 @@ _do_standby_promote_internal(PGconn *conn, const char *data_dir)
|
||||
|
||||
t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
char data_dir[MAXPGPATH];
|
||||
|
||||
get_node_config_directory(data_dir);
|
||||
|
||||
/* fetch local node record so we can add detail in log messages */
|
||||
record_status = get_node_record(conn,
|
||||
@@ -1976,13 +2078,6 @@ _do_standby_promote_internal(PGconn *conn, const char *data_dir)
|
||||
|
||||
log_verbose(LOG_INFO, _("standby promoted to primary after %i second(s)"), i);
|
||||
|
||||
/*
|
||||
* Execute a CHECKPOINT as soon as possible after promotion. The primary
|
||||
* reason for this is to ensure that "pg_control" has the latest timeline
|
||||
* before it's read by "pg_rewind", typically during a switchover operation.
|
||||
*/
|
||||
checkpoint(conn);
|
||||
|
||||
/* update node information to reflect new status */
|
||||
if (update_node_record_set_primary(conn, config_file_options.node_id) == false)
|
||||
{
|
||||
@@ -2064,7 +2159,13 @@ do_standby_follow(void)
|
||||
|
||||
log_verbose(LOG_DEBUG, "do_standby_follow()");
|
||||
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_hint(_("use \"repmgr node rejoin\" to re-add an inactive node to the replication cluster"));
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
log_verbose(LOG_INFO, _("connected to local node"));
|
||||
|
||||
@@ -2109,7 +2210,7 @@ do_standby_follow(void)
|
||||
log_hint(_("alter \"primary_follow_timeout\" in \"repmgr.conf\" to change this value"));
|
||||
}
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
@@ -2128,7 +2229,7 @@ do_standby_follow(void)
|
||||
log_error(_("unable to find record for new upstream node %i"),
|
||||
runtime_options.upstream_node_id);
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2161,7 +2262,7 @@ do_standby_follow(void)
|
||||
{
|
||||
log_error(_("unable to determine number of free replication slots on the primary"));
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
}
|
||||
|
||||
if (free_slots == 0)
|
||||
@@ -2169,7 +2270,7 @@ do_standby_follow(void)
|
||||
log_error(_("no free replication slots available on the primary"));
|
||||
log_hint(_("consider increasing \"max_replication_slots\""));
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
}
|
||||
else if (runtime_options.dry_run == true)
|
||||
{
|
||||
@@ -2185,15 +2286,19 @@ do_standby_follow(void)
|
||||
|
||||
conn_to_param_list(primary_conn, &repl_conninfo);
|
||||
|
||||
if (strcmp(param_get(&repl_conninfo, "user"), primary_node_record.repluser) != 0)
|
||||
{
|
||||
param_set(&repl_conninfo, "user", primary_node_record.repluser);
|
||||
param_set(&repl_conninfo, "dbname", "replication");
|
||||
}
|
||||
param_set(&repl_conninfo, "replication", "1");
|
||||
param_set(&repl_conninfo, "user", primary_node_record.repluser);
|
||||
|
||||
repl_conn = establish_db_connection_by_params(&repl_conninfo, false);
|
||||
if (PQstatus(repl_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_error(_("unable to establish a replication connection to the primary node"));
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
}
|
||||
else if (runtime_options.dry_run == true)
|
||||
{
|
||||
@@ -2210,7 +2315,7 @@ do_standby_follow(void)
|
||||
log_error(_("unable to query the primary node's system identification"));
|
||||
PQfinish(primary_conn);
|
||||
PQfinish(repl_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
}
|
||||
|
||||
if (primary_identification.system_identifier != local_system_identifier)
|
||||
@@ -2221,7 +2326,7 @@ do_standby_follow(void)
|
||||
primary_identification.system_identifier);
|
||||
PQfinish(primary_conn);
|
||||
PQfinish(repl_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
}
|
||||
else if (runtime_options.dry_run == true)
|
||||
{
|
||||
@@ -2247,6 +2352,74 @@ do_standby_follow(void)
|
||||
&follow_output,
|
||||
&follow_error_code);
|
||||
|
||||
/* unable to restart the standby */
|
||||
if (success == false)
|
||||
{
|
||||
create_event_notification_extended(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"standby_follow",
|
||||
success,
|
||||
follow_output.data,
|
||||
&event_info);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
|
||||
log_notice(_("STANDBY FOLLOW failed"));
|
||||
if (strlen( follow_output.data ))
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
exit(follow_error_code);
|
||||
}
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
|
||||
initPQExpBuffer(&follow_output);
|
||||
|
||||
/*
|
||||
* Wait up to "standby_follow_timeout" seconds for standby to connect to
|
||||
* upstream.
|
||||
* For 9.6 and later, we could check pg_stat_wal_receiver on the local node.
|
||||
*/
|
||||
|
||||
/* assume success, necessary if standby_follow_timeout is zero */
|
||||
success = true;
|
||||
|
||||
for (timer = 0; timer < config_file_options.standby_follow_timeout; timer++)
|
||||
{
|
||||
success = is_downstream_node_attached(primary_conn, config_file_options.node_name);
|
||||
if (success == true)
|
||||
break;
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i of max %i seconds waiting for standby to attach to primary",
|
||||
timer + 1,
|
||||
config_file_options.standby_follow_timeout);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
if (success == true)
|
||||
{
|
||||
log_notice(_("STANDBY FOLLOW successful"));
|
||||
appendPQExpBuffer(&follow_output,
|
||||
"standby attached to upstream node \"%s\" (node ID: %i)",
|
||||
primary_node_record.node_name,
|
||||
primary_node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("STANDBY FOLLOW failed"));
|
||||
appendPQExpBuffer(&follow_output,
|
||||
"standby did not attach to upstream node \"%s\" (node ID: %i) after %i seconds",
|
||||
primary_node_record.node_name,
|
||||
primary_node_id,
|
||||
config_file_options.standby_follow_timeout);
|
||||
|
||||
}
|
||||
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
create_event_notification_extended(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
@@ -2258,20 +2431,11 @@ do_standby_follow(void)
|
||||
|
||||
PQfinish(primary_conn);
|
||||
|
||||
if (success == false)
|
||||
{
|
||||
log_notice(_("STANDBY FOLLOW failed"));
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
exit(follow_error_code);
|
||||
}
|
||||
|
||||
log_notice(_("STANDBY FOLLOW successful"));
|
||||
log_detail("%s", follow_output.data);
|
||||
|
||||
termPQExpBuffer(&follow_output);
|
||||
|
||||
if (success == false)
|
||||
exit(ERR_FOLLOW_FAIL);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -2308,7 +2472,7 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
|
||||
log_error(_("unable to retrieve record for node %i"),
|
||||
config_file_options.node_id);
|
||||
|
||||
*error_code = ERR_BAD_CONFIG;
|
||||
*error_code = ERR_FOLLOW_FAIL;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2356,7 +2520,7 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
|
||||
initialize_conninfo_params(&recovery_conninfo, false);
|
||||
|
||||
/* We ignore any application_name set in the primary's conninfo */
|
||||
parse_conninfo_string(primary_node_record->conninfo, &recovery_conninfo, errmsg, true);
|
||||
parse_conninfo_string(primary_node_record->conninfo, &recovery_conninfo, &errmsg, true);
|
||||
|
||||
{
|
||||
t_conninfo_param_list local_node_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||
@@ -2364,7 +2528,7 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
|
||||
|
||||
initialize_conninfo_params(&local_node_conninfo, false);
|
||||
|
||||
parse_success = parse_conninfo_string(local_node_record.conninfo, &local_node_conninfo, errmsg, false);
|
||||
parse_success = parse_conninfo_string(local_node_record.conninfo, &local_node_conninfo, &errmsg, false);
|
||||
|
||||
if (parse_success == false)
|
||||
{
|
||||
@@ -2438,8 +2602,7 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
|
||||
|
||||
if (!create_recovery_file(&local_node_record, &recovery_conninfo, config_file_options.data_directory, true))
|
||||
{
|
||||
/* XXX ERR_RECOVERY_FILE ??? */
|
||||
*error_code = ERR_BAD_CONFIG;
|
||||
*error_code = ERR_FOLLOW_FAIL;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -2772,7 +2935,7 @@ do_standby_switchover(void)
|
||||
* archived
|
||||
*/
|
||||
|
||||
if (runtime_options.force_rewind == true)
|
||||
if (runtime_options.force_rewind_used == true)
|
||||
{
|
||||
PQExpBufferData reason;
|
||||
PQExpBufferData msg;
|
||||
@@ -2938,7 +3101,7 @@ do_standby_switchover(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
break;
|
||||
default:
|
||||
log_error(_("unable to deterimine whether candidate is able to make replication connection to promotion candidate"));
|
||||
log_error(_("unable to determine whether demotion candidate is able to make replication connection to promotion candidate"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
break;
|
||||
}
|
||||
@@ -3381,6 +3544,8 @@ do_standby_switchover(void)
|
||||
remote_node_record.node_name,
|
||||
remote_node_record.node_id);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Stop the remote primary
|
||||
*
|
||||
@@ -3570,7 +3735,20 @@ do_standby_switchover(void)
|
||||
}
|
||||
|
||||
/* promote standby (local node) */
|
||||
_do_standby_promote_internal(local_conn, config_file_options.data_directory);
|
||||
_do_standby_promote_internal(local_conn);
|
||||
|
||||
|
||||
/*
|
||||
* if pg_rewind is requested, issue a checkpoint immediately after promoting
|
||||
* the local node, as pg_rewind compares timelines on the basis of the value
|
||||
* in pg_control, which is written at the first checkpoint, which might not
|
||||
* occur immediately.
|
||||
*/
|
||||
if (runtime_options.force_rewind_used == true)
|
||||
{
|
||||
log_notice(_("issuing CHECKPOINT"));
|
||||
checkpoint(local_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* Execute `repmgr node rejoin` to create recovery.conf and start the
|
||||
@@ -3583,18 +3761,26 @@ do_standby_switchover(void)
|
||||
KeyValueListCell *cell = NULL;
|
||||
bool first_entry = true;
|
||||
|
||||
if (runtime_options.force_rewind == false)
|
||||
if (runtime_options.force_rewind_used == false)
|
||||
{
|
||||
log_error(_("new primary diverges from former primary and --force-rewind not provided"));
|
||||
/* TODO: "repmgr node rejoin" example, when available */
|
||||
log_hint(_("the former primary will need to be restored manually"));
|
||||
log_hint(_("the former primary will need to be restored manually, or use \"repmgr node rejoin\""));
|
||||
termPQExpBuffer(&node_rejoin_options);
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_SWITCHOVER_FAIL);
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&node_rejoin_options,
|
||||
" --force-rewind --config-files=");
|
||||
" --force-rewind");
|
||||
|
||||
if (runtime_options.force_rewind_path[0] != '\0')
|
||||
{
|
||||
appendPQExpBuffer(&node_rejoin_options,
|
||||
"=%s",
|
||||
runtime_options.force_rewind_path);
|
||||
}
|
||||
appendPQExpBuffer(&node_rejoin_options,
|
||||
" --config-files=");
|
||||
|
||||
for (cell = remote_config_files.head; cell; cell = cell->next)
|
||||
{
|
||||
@@ -3668,41 +3854,6 @@ do_standby_switchover(void)
|
||||
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
/* clean up remote node */
|
||||
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
|
||||
|
||||
/* check new standby (old primary) is reachable */
|
||||
if (PQstatus(remote_conn) != CONNECTION_OK)
|
||||
{
|
||||
switchover_success = false;
|
||||
|
||||
/* TODO: double-check whether new standby has attached */
|
||||
|
||||
log_warning(_("switchover did not fully complete"));
|
||||
log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"),
|
||||
local_node_record.node_name,
|
||||
remote_node_record.node_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (config_file_options.use_replication_slots == true)
|
||||
{
|
||||
drop_replication_slot_if_exists(remote_conn,
|
||||
remote_node_record.node_id,
|
||||
local_node_record.slot_name);
|
||||
}
|
||||
/* TODO warn about any inactive replication slots */
|
||||
|
||||
log_notice(_("switchover was successful"));
|
||||
log_detail(_("node \"%s\" is now primary and node \"%s\" is attached as standby"),
|
||||
local_node_record.node_name,
|
||||
remote_node_record.node_name);
|
||||
|
||||
}
|
||||
|
||||
PQfinish(remote_conn);
|
||||
|
||||
|
||||
/*
|
||||
* If --siblings-follow specified, attempt to make them follow the new
|
||||
* primary
|
||||
@@ -3778,6 +3929,61 @@ do_standby_switchover(void)
|
||||
|
||||
PQfinish(local_conn);
|
||||
|
||||
/*
|
||||
* Clean up remote node. It's possible that the standby is still starting up,
|
||||
* so poll for a while until we get a connection.
|
||||
*/
|
||||
|
||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
||||
{
|
||||
remote_conn = establish_db_connection(remote_node_record.conninfo, false);
|
||||
|
||||
if (PQstatus(remote_conn) == CONNECTION_OK)
|
||||
break;
|
||||
|
||||
log_info(_("sleeping 1 second; %i of %i attempts (\"standby_reconnect_timeout\") to reconnect to demoted primary"),
|
||||
i + 1,
|
||||
config_file_options.standby_reconnect_timeout);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
/* check new standby (old primary) is reachable */
|
||||
if (PQstatus(remote_conn) != CONNECTION_OK)
|
||||
{
|
||||
switchover_success = false;
|
||||
|
||||
/* TODO: double-check whether new standby has attached */
|
||||
|
||||
log_warning(_("switchover did not fully complete"));
|
||||
log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"),
|
||||
local_node_record.node_name,
|
||||
remote_node_record.node_name);
|
||||
|
||||
if (config_file_options.use_replication_slots == true)
|
||||
{
|
||||
log_hint(_("any inactive replication slots on the old primary will need to be dropped manually"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (config_file_options.use_replication_slots == true)
|
||||
{
|
||||
drop_replication_slot_if_exists(remote_conn,
|
||||
remote_node_record.node_id,
|
||||
local_node_record.slot_name);
|
||||
}
|
||||
/* TODO warn about any inactive replication slots */
|
||||
|
||||
log_notice(_("switchover was successful"));
|
||||
log_detail(_("node \"%s\" is now primary and node \"%s\" is attached as standby"),
|
||||
local_node_record.node_name,
|
||||
remote_node_record.node_name);
|
||||
|
||||
}
|
||||
|
||||
PQfinish(remote_conn);
|
||||
|
||||
|
||||
if (switchover_success == true)
|
||||
{
|
||||
log_notice(_("STANDBY SWITCHOVER has completed successfully"));
|
||||
@@ -3801,6 +4007,8 @@ check_source_server()
|
||||
PGconn *privileged_conn = NULL;
|
||||
|
||||
char cluster_size[MAXLEN];
|
||||
char *connstr = NULL;
|
||||
|
||||
t_node_info node_record = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
ExtensionStatus extension_status = REPMGR_UNKNOWN;
|
||||
@@ -3809,8 +4017,11 @@ check_source_server()
|
||||
log_verbose(LOG_DEBUG, "check_source_server()");
|
||||
log_info(_("connecting to source node"));
|
||||
|
||||
source_conn = establish_db_connection_by_params(&source_conninfo, false);
|
||||
connstr = param_list_to_string(&source_conninfo);
|
||||
log_detail(_("connection string is: %s"), connstr);
|
||||
pfree(connstr);
|
||||
|
||||
source_conn = establish_db_connection_by_params(&source_conninfo, false);
|
||||
/*
|
||||
* Unless in barman mode, exit with an error;
|
||||
* establish_db_connection_by_params() will have already logged an error
|
||||
@@ -3964,9 +4175,28 @@ check_source_server()
|
||||
record_status = get_node_record(source_conn, upstream_node_id, &node_record);
|
||||
if (record_status == RECORD_FOUND)
|
||||
{
|
||||
upstream_conninfo_found = true;
|
||||
t_conninfo_param_list upstream_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||
char *upstream_conninfo_user;
|
||||
|
||||
initialize_conninfo_params(&upstream_conninfo, false);
|
||||
parse_conninfo_string(node_record.conninfo, &upstream_conninfo, NULL, false);
|
||||
|
||||
strncpy(recovery_conninfo_str, node_record.conninfo, MAXLEN);
|
||||
strncpy(upstream_repluser, node_record.repluser, NAMEDATALEN);
|
||||
|
||||
upstream_conninfo_user = param_get(&upstream_conninfo, "user");
|
||||
if (upstream_conninfo_user != NULL)
|
||||
{
|
||||
strncpy(upstream_user, upstream_conninfo_user, NAMEDATALEN);
|
||||
}
|
||||
else
|
||||
{
|
||||
get_conninfo_default_value("user", upstream_user, NAMEDATALEN);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "upstream_user is \"%s\"", upstream_user);
|
||||
|
||||
upstream_conninfo_found = true;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -4016,7 +4246,7 @@ check_source_server_via_barman()
|
||||
* parse_conninfo_string() here will remove the upstream's
|
||||
* `application_name`, if set
|
||||
*/
|
||||
parse_success = parse_conninfo_string(barman_conninfo_str, &barman_conninfo, errmsg, true);
|
||||
parse_success = parse_conninfo_string(barman_conninfo_str, &barman_conninfo, &errmsg, true);
|
||||
|
||||
if (parse_success == false)
|
||||
{
|
||||
@@ -4398,6 +4628,11 @@ check_upstream_config(PGconn *conn, int server_version_num, t_node_info *node_in
|
||||
param_set(&repl_conninfo, "user", node_info->repluser);
|
||||
}
|
||||
|
||||
if (strcmp(param_get(&repl_conninfo, "user"), upstream_user) != 0)
|
||||
{
|
||||
param_set(&repl_conninfo, "dbname", "replication");
|
||||
}
|
||||
|
||||
/*
|
||||
* work out how many replication connections are required (1 or 2)
|
||||
*/
|
||||
@@ -4512,7 +4747,7 @@ initialise_direct_clone(t_node_info *node_record)
|
||||
}
|
||||
else
|
||||
{
|
||||
TablespaceListCell *cell = false;
|
||||
TablespaceListCell *cell;
|
||||
KeyValueList not_found = {NULL, NULL};
|
||||
int total = 0,
|
||||
matched = 0;
|
||||
@@ -5570,7 +5805,7 @@ get_barman_property(char *dst, char *name, char *local_repmgr_directory)
|
||||
|
||||
|
||||
static void
|
||||
copy_configuration_files(void)
|
||||
copy_configuration_files(bool delete_after_copy)
|
||||
{
|
||||
int i,
|
||||
r;
|
||||
@@ -5615,13 +5850,35 @@ copy_configuration_files(void)
|
||||
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
|
||||
file->filepath, dest_path.data, false, source_server_version_num);
|
||||
|
||||
termPQExpBuffer(&dest_path);
|
||||
/*
|
||||
* TODO: collate errors into list
|
||||
*/
|
||||
|
||||
if (WEXITSTATUS(r))
|
||||
{
|
||||
log_error(_("standby clone: unable to copy config file \"%s\""),
|
||||
file->filename);
|
||||
log_hint(_("see preceding messages for details"));
|
||||
|
||||
if (runtime_options.force == false)
|
||||
exit(ERR_BAD_RSYNC);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is to check we can actually copy the files before running the
|
||||
* main clone operation
|
||||
*/
|
||||
if (delete_after_copy == true)
|
||||
{
|
||||
/* this is very unlikely to happen, but log in case it does */
|
||||
if (unlink(dest_path.data) < 0 && errno != ENOENT)
|
||||
{
|
||||
log_warning(_("unable to delete %s"), dest_path.data);
|
||||
log_detail("%s", strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
termPQExpBuffer(&dest_path);
|
||||
}
|
||||
|
||||
return;
|
||||
@@ -5822,7 +6079,6 @@ create_recovery_file(t_node_info *node_record, t_conninfo_param_list *recovery_c
|
||||
appendPQExpBuffer(&recovery_file_buf,
|
||||
"recovery_min_apply_delay = %s\n",
|
||||
config_file_options.recovery_min_apply_delay);
|
||||
|
||||
}
|
||||
|
||||
/* primary_slot_name = '...' (optional, for 9.4 and later) */
|
||||
@@ -5847,6 +6103,16 @@ create_recovery_file(t_node_info *node_record, t_conninfo_param_list *recovery_c
|
||||
free(escaped);
|
||||
}
|
||||
|
||||
/* archive_cleanup_command (optional) */
|
||||
if (config_file_options.archive_cleanup_command[0] != '\0')
|
||||
{
|
||||
char *escaped = escape_recovery_conf_value(config_file_options.archive_cleanup_command);
|
||||
appendPQExpBuffer(&recovery_file_buf,
|
||||
"archive_cleanup_command = '%s'\n",
|
||||
escaped);
|
||||
free(escaped);
|
||||
}
|
||||
|
||||
if (as_file == true)
|
||||
{
|
||||
maxpath_snprintf(recovery_file_path, "%s/%s", dest, RECOVERY_COMMAND_FILE);
|
||||
@@ -6284,7 +6550,9 @@ do_standby_help(void)
|
||||
printf(_(" --always-promote promote standby even if behind original primary\n"));
|
||||
printf(_(" --dry-run perform checks etc. but don't actually execute switchover\n"));
|
||||
printf(_(" -F, --force ignore warnings and continue anyway\n"));
|
||||
printf(_(" --force-rewind 9.5 and later - use pg_rewind to reintegrate the old primary if necessary\n"));
|
||||
printf(_(" --force-rewind[=VALUE] use \"pg_rewind\" to reintegrate the old primary if necessary\n"));
|
||||
printf(_(" (9.3 and 9.4 - provide \"pg_rewind\" path)\n"));
|
||||
|
||||
printf(_(" -R, --remote-user=USERNAME database server username for SSH operations (default: \"%s\")\n"), runtime_options.username);
|
||||
printf(_(" --siblings-follow have other standbys follow new primary\n"));
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ do_witness_register(void)
|
||||
if (recovery_type == RECTYPE_STANDBY)
|
||||
{
|
||||
log_error(_("provided node is a standby"));
|
||||
log_error(_("a witness node must run on an independent primary server"));
|
||||
log_hint(_("a witness node must run on an independent primary server"));
|
||||
|
||||
PQfinish(witness_conn);
|
||||
|
||||
@@ -86,6 +86,7 @@ do_witness_register(void)
|
||||
|
||||
/* connect to primary with provided parameters */
|
||||
log_info(_("connecting to primary node"));
|
||||
|
||||
/*
|
||||
* Extract the repmgr user and database names from the conninfo string
|
||||
* provided in repmgr.conf
|
||||
@@ -135,8 +136,11 @@ do_witness_register(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* XXX sanity check witness node is not part of main cluster */
|
||||
|
||||
/*
|
||||
* TODO: sanity check witness node is not part of main cluster; we could
|
||||
* add a random application_name to the respective connections,
|
||||
* and do a simple check of pg_stat_activity
|
||||
*/
|
||||
|
||||
/* create repmgr extension, if does not exist */
|
||||
if (runtime_options.dry_run == false && !create_repmgr_extension(witness_conn))
|
||||
@@ -182,7 +186,6 @@ do_witness_register(void)
|
||||
log_error(_("witness node is already registered"));
|
||||
log_hint(_("use option -F/--force to reregister the node"));
|
||||
|
||||
|
||||
PQfinish(witness_conn);
|
||||
PQfinish(primary_conn);
|
||||
|
||||
@@ -190,8 +193,26 @@ do_witness_register(void)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that an active node with the same node_name doesn't exist already
|
||||
*/
|
||||
|
||||
// XXX check other node with same name does not exist
|
||||
record_status = get_node_record_by_name(primary_conn,
|
||||
config_file_options.node_name,
|
||||
&node_record);
|
||||
|
||||
|
||||
if (record_status == RECORD_FOUND)
|
||||
{
|
||||
if (node_record.active == true && node_record.node_id != config_file_options.node_id)
|
||||
{
|
||||
log_error(_("node %i exists already with node_name \"%s\""),
|
||||
node_record.node_id,
|
||||
config_file_options.node_name);
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if repmgr.nodes contains entries, delete if -F/--force provided,
|
||||
@@ -222,6 +243,7 @@ do_witness_register(void)
|
||||
PQfinish(witness_conn);
|
||||
exit(SUCCESS);
|
||||
}
|
||||
|
||||
/* create record on primary */
|
||||
|
||||
/*
|
||||
|
||||
@@ -42,6 +42,7 @@ typedef struct
|
||||
bool force;
|
||||
char pg_bindir[MAXLEN]; /* overrides setting in repmgr.conf */
|
||||
bool wait;
|
||||
bool no_wait;
|
||||
|
||||
/* logging options */
|
||||
char log_level[MAXLEN]; /* overrides setting in repmgr.conf */
|
||||
@@ -92,7 +93,8 @@ typedef struct
|
||||
|
||||
/* "standby switchover" options */
|
||||
bool always_promote;
|
||||
bool force_rewind;
|
||||
bool force_rewind_used;
|
||||
char force_rewind_path[MAXPGPATH];
|
||||
bool siblings_follow;
|
||||
|
||||
/* "node status" options */
|
||||
@@ -133,7 +135,7 @@ typedef struct
|
||||
/* configuration metadata */ \
|
||||
false, false, false, false, \
|
||||
/* general configuration options */ \
|
||||
"", false, false, "", false, \
|
||||
"", false, false, "", false, false, \
|
||||
/* logging options */ \
|
||||
"", false, false, false, \
|
||||
/* output options */ \
|
||||
@@ -152,7 +154,7 @@ typedef struct
|
||||
/* "standby register" options */ \
|
||||
false, 0, DEFAULT_WAIT_START, \
|
||||
/* "standby switchover" options */ \
|
||||
false, false, false, \
|
||||
false, false, "", false, \
|
||||
/* "node status" options */ \
|
||||
false, \
|
||||
/* "node check" options */ \
|
||||
@@ -208,6 +210,7 @@ extern void check_93_config(void);
|
||||
extern bool create_repmgr_extension(PGconn *conn);
|
||||
extern int test_ssh_connection(char *host, char *remote_user);
|
||||
extern bool local_command(const char *command, PQExpBufferData *outputbuf);
|
||||
extern bool local_command_simple(const char *command, PQExpBufferData *outputbuf);
|
||||
|
||||
extern standy_clone_mode get_standby_clone_mode(void);
|
||||
|
||||
@@ -228,7 +231,9 @@ extern void print_help_header(void);
|
||||
/* server control functions */
|
||||
extern void get_server_action(t_server_action action, char *script, char *data_dir);
|
||||
extern bool data_dir_required_for_action(t_server_action action);
|
||||
extern void get_node_config_directory(char *config_dir_buf);
|
||||
extern void get_node_data_directory(char *data_dir_buf);
|
||||
extern void init_node_record(t_node_info *node_record);
|
||||
extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
||||
|
||||
#endif /* _REPMGR_CLIENT_GLOBAL_H_ */
|
||||
|
||||
209
repmgr-client.c
209
repmgr-client.c
@@ -53,6 +53,7 @@
|
||||
|
||||
#include "repmgr.h"
|
||||
#include "compat.h"
|
||||
#include "controldata.h"
|
||||
#include "repmgr-client.h"
|
||||
#include "repmgr-client-global.h"
|
||||
#include "repmgr-action-primary.h"
|
||||
@@ -90,6 +91,7 @@ t_node_info target_node_info = T_NODE_INFO_INITIALIZER;
|
||||
static ItemList cli_errors = {NULL, NULL};
|
||||
static ItemList cli_warnings = {NULL, NULL};
|
||||
|
||||
static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple);
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
@@ -176,7 +178,7 @@ main(int argc, char **argv)
|
||||
strncpy(runtime_options.username, pw->pw_name, MAXLEN);
|
||||
}
|
||||
|
||||
while ((c = getopt_long(argc, argv, "?Vb:f:FWd:h:p:U:R:S:D:ck:L:tvC:", long_options,
|
||||
while ((c = getopt_long(argc, argv, "?Vb:f:FwWd:h:p:U:R:S:D:ck:L:tvC:", long_options,
|
||||
&optindex)) != -1)
|
||||
{
|
||||
/*
|
||||
@@ -241,11 +243,16 @@ main(int argc, char **argv)
|
||||
strncpy(runtime_options.replication_user, optarg, MAXLEN);
|
||||
break;
|
||||
|
||||
/* -W/--wait */
|
||||
case 'W':
|
||||
/* -w/--wait */
|
||||
case 'w':
|
||||
runtime_options.wait = true;
|
||||
break;
|
||||
|
||||
/* -W/--no-wait */
|
||||
case 'W':
|
||||
runtime_options.no_wait = true;
|
||||
break;
|
||||
|
||||
/*----------------------------
|
||||
* database connection options
|
||||
*----------------------------
|
||||
@@ -420,7 +427,13 @@ main(int argc, char **argv)
|
||||
break;
|
||||
|
||||
case OPT_FORCE_REWIND:
|
||||
runtime_options.force_rewind = true;
|
||||
runtime_options.force_rewind_used = true;
|
||||
|
||||
if (optarg != NULL)
|
||||
{
|
||||
strncpy(runtime_options.force_rewind_path, optarg, MAXPGPATH);
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case OPT_SIBLINGS_FOLLOW:
|
||||
@@ -621,7 +634,7 @@ main(int argc, char **argv)
|
||||
* If -d/--dbname appears to be a conninfo string, validate by attempting
|
||||
* to parse it (and if successful, store the parsed parameters)
|
||||
*/
|
||||
if (runtime_options.dbname)
|
||||
if (runtime_options.dbname[0])
|
||||
{
|
||||
if (strncmp(runtime_options.dbname, "postgresql://", 13) == 0 ||
|
||||
strncmp(runtime_options.dbname, "postgres://", 11) == 0 ||
|
||||
@@ -997,7 +1010,6 @@ main(int argc, char **argv)
|
||||
runtime_options.output_mode = OM_OPTFORMAT;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Check for configuration file items which can be overridden by runtime
|
||||
* options
|
||||
@@ -1055,6 +1067,17 @@ main(int argc, char **argv)
|
||||
if (runtime_options.terse)
|
||||
logger_set_terse();
|
||||
|
||||
/*
|
||||
* If --dry-run specified, ensure log_level is at least LOG_INFO, regardless
|
||||
* of what's in the configuration file or -L/--log-level parameter, otherwise
|
||||
* some output might not be displayed.
|
||||
*/
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
logger_set_min_level(LOG_INFO);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Node configuration information is not needed for all actions, with
|
||||
* STANDBY CLONE being the main exception.
|
||||
@@ -1315,6 +1338,15 @@ check_cli_parameters(const int action)
|
||||
_("--no-upstream-connection only effective in Barman mode"));
|
||||
}
|
||||
}
|
||||
|
||||
if (strlen(config_file_options.config_directory))
|
||||
{
|
||||
if (runtime_options.copy_external_config_files == false)
|
||||
{
|
||||
item_list_append(&cli_warnings,
|
||||
_("\"config_directory\" set in repmgr.conf, but --copy-external-config-files not provided"));
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -1563,6 +1595,41 @@ check_cli_parameters(const int action)
|
||||
}
|
||||
}
|
||||
|
||||
/* --wait/--no-wait */
|
||||
|
||||
if (runtime_options.wait == true && runtime_options.no_wait == true)
|
||||
{
|
||||
item_list_append_format(&cli_errors,
|
||||
_("both --wait and --no-wait options provided"));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (runtime_options.wait)
|
||||
{
|
||||
switch (action)
|
||||
{
|
||||
case STANDBY_FOLLOW:
|
||||
break;
|
||||
default:
|
||||
item_list_append_format(&cli_warnings,
|
||||
_("--wait will be ignored when executing %s"),
|
||||
action_name(action));
|
||||
}
|
||||
}
|
||||
else if (runtime_options.wait)
|
||||
{
|
||||
switch (action)
|
||||
{
|
||||
case NODE_REJOIN:
|
||||
break;
|
||||
default:
|
||||
item_list_append_format(&cli_warnings,
|
||||
_("--no-wait will be ignored when executing %s"),
|
||||
action_name(action));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* repmgr node service --action */
|
||||
if (runtime_options.action[0] != '\0')
|
||||
{
|
||||
@@ -1604,7 +1671,7 @@ check_cli_parameters(const int action)
|
||||
}
|
||||
}
|
||||
|
||||
if (runtime_options.force_rewind == true)
|
||||
if (runtime_options.force_rewind_used == true)
|
||||
{
|
||||
switch (action)
|
||||
{
|
||||
@@ -2096,12 +2163,28 @@ test_ssh_connection(char *host, char *remote_user)
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Execute a command locally. "outputbuf" should either be an
|
||||
* initialised PQexpbuffer, or NULL
|
||||
*/
|
||||
bool
|
||||
local_command(const char *command, PQExpBufferData *outputbuf)
|
||||
{
|
||||
return _local_command(command, outputbuf, false);
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
local_command_simple(const char *command, PQExpBufferData *outputbuf)
|
||||
{
|
||||
return _local_command(command, outputbuf, true);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
_local_command(const char *command, PQExpBufferData *outputbuf, bool simple)
|
||||
{
|
||||
FILE *fp = NULL;
|
||||
char output[MAXLEN];
|
||||
@@ -2128,7 +2211,8 @@ local_command(const char *command, PQExpBufferData *outputbuf)
|
||||
while (fgets(output, MAXLEN, fp) != NULL)
|
||||
{
|
||||
appendPQExpBuffer(outputbuf, "%s", output);
|
||||
if (!feof(fp))
|
||||
|
||||
if (!feof(fp) && simple == false)
|
||||
{
|
||||
break;
|
||||
}
|
||||
@@ -2171,6 +2255,7 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
|
||||
log_error(_("no database connection available"));
|
||||
exit(ERR_INTERNAL);
|
||||
}
|
||||
|
||||
is_superuser = is_superuser_connection(*conn, &userinfo);
|
||||
|
||||
if (is_superuser == true)
|
||||
@@ -2212,6 +2297,8 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
log_debug("established superuser connection as \"%s\"", runtime_options.superuser);
|
||||
|
||||
*privileged_conn = *superuser_conn;
|
||||
return;
|
||||
}
|
||||
@@ -2353,9 +2440,6 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Execute a command via ssh on the remote host.
|
||||
*
|
||||
@@ -2421,7 +2505,7 @@ remote_command(const char *host, const char *user, const char *command, PQExpBuf
|
||||
if (outputbuf != NULL)
|
||||
{
|
||||
if (strlen(outputbuf->data))
|
||||
log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n %s", outputbuf->data);
|
||||
log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
|
||||
else
|
||||
log_verbose(LOG_DEBUG, "remote_command(): no output returned");
|
||||
}
|
||||
@@ -2670,6 +2754,33 @@ data_dir_required_for_action(t_server_action action)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Copy the location of the configuration file directory into the
|
||||
* provided buffer; if "config_directory" provided, use that, otherwise
|
||||
* default to the data directory.
|
||||
*
|
||||
* This is primarily intended for use with "pg_ctl" (which itself shouldn't
|
||||
* be used outside of development environments).
|
||||
*/
|
||||
void
|
||||
get_node_config_directory(char *config_dir_buf)
|
||||
{
|
||||
if (config_file_options.config_directory[0] != '\0')
|
||||
{
|
||||
strncpy(config_dir_buf, config_file_options.config_directory, MAXPGPATH);
|
||||
return;
|
||||
}
|
||||
|
||||
if (config_file_options.data_directory[0] != '\0')
|
||||
{
|
||||
strncpy(config_dir_buf, config_file_options.data_directory, MAXPGPATH);
|
||||
return;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
get_node_data_directory(char *data_dir_buf)
|
||||
{
|
||||
@@ -2734,3 +2845,77 @@ init_node_record(t_node_info *node_record)
|
||||
create_slot_name(node_record->slot_name, config_file_options.node_id);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason)
|
||||
{
|
||||
bool can_use = true;
|
||||
int server_version_num = get_server_version(conn, NULL);
|
||||
|
||||
/* wal_log_hints not available in 9.3, so just determine if data checksums enabled */
|
||||
if (server_version_num < 90400)
|
||||
{
|
||||
int data_checksum_version = get_data_checksum_version(data_directory);
|
||||
|
||||
if (data_checksum_version < 0)
|
||||
{
|
||||
appendPQExpBuffer(reason,
|
||||
_("unable to determine data checksum version"));
|
||||
can_use = false;
|
||||
}
|
||||
else if (data_checksum_version == 0)
|
||||
{
|
||||
appendPQExpBuffer(reason,
|
||||
_("this cluster was initialised without data checksums"));
|
||||
can_use = false;
|
||||
}
|
||||
|
||||
return can_use;
|
||||
}
|
||||
|
||||
/* "full_page_writes" must be on in any case */
|
||||
if (guc_set(conn, "full_page_writes", "=", "off"))
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
|
||||
appendPQExpBuffer(reason,
|
||||
_("\"full_page_writes\" must be set to \"on\""));
|
||||
|
||||
can_use = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* "wal_log_hints" off - are data checksums available? Note: we're
|
||||
* checking the local pg_control file here as the value will be the same
|
||||
* throughout the cluster and saves a round-trip to the demotion
|
||||
* candidate.
|
||||
*/
|
||||
if (guc_set(conn, "wal_log_hints", "=", "on") == false)
|
||||
{
|
||||
int data_checksum_version = get_data_checksum_version(data_directory);
|
||||
|
||||
if (data_checksum_version < 0)
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
|
||||
appendPQExpBuffer(reason,
|
||||
_("\"wal_log_hints\" is set to \"off\" but unable to determine data checksum version"));
|
||||
can_use = false;
|
||||
}
|
||||
else if (data_checksum_version == 0)
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
|
||||
appendPQExpBuffer(reason,
|
||||
_("\"wal_log_hints\" is set to \"off\" and data checksums are disabled"));
|
||||
|
||||
can_use = false;
|
||||
}
|
||||
}
|
||||
|
||||
return can_use;
|
||||
}
|
||||
|
||||
@@ -86,6 +86,7 @@
|
||||
#define OPT_REPL_CONN 1037
|
||||
#define OPT_REMOTE_NODE_ID 1038
|
||||
#define OPT_RECOVERY_CONF_ONLY 1039
|
||||
#define OPT_NO_WAIT 1040
|
||||
|
||||
/* deprecated since 3.3 */
|
||||
#define OPT_DATA_DIR 999
|
||||
@@ -104,7 +105,8 @@ static struct option long_options[] =
|
||||
{"dry-run", no_argument, NULL, OPT_DRY_RUN},
|
||||
{"force", no_argument, NULL, 'F'},
|
||||
{"pg_bindir", required_argument, NULL, 'b'},
|
||||
{"wait", no_argument, NULL, 'W'},
|
||||
{"wait", no_argument, NULL, 'w'},
|
||||
{"no-wait", no_argument, NULL, 'W'},
|
||||
|
||||
/* connection options */
|
||||
{"dbname", required_argument, NULL, 'd'},
|
||||
@@ -168,7 +170,7 @@ static struct option long_options[] =
|
||||
/* "node rejoin" options */
|
||||
{"config-files", required_argument, NULL, OPT_CONFIG_FILES},
|
||||
{"config-archive-dir", required_argument, NULL, OPT_CONFIG_ARCHIVE_DIR},
|
||||
{"force-rewind", no_argument, NULL, OPT_FORCE_REWIND},
|
||||
{"force-rewind", optional_argument, NULL, OPT_FORCE_REWIND},
|
||||
|
||||
/* "node service" options */
|
||||
{"action", required_argument, NULL, OPT_ACTION},
|
||||
|
||||
@@ -40,18 +40,28 @@
|
||||
# is not running and there's no other way of determining
|
||||
# the data directory.
|
||||
|
||||
#replication_user='repmgr' # User to make replication connections with, if not set defaults
|
||||
# to the user defined in "conninfo".
|
||||
|
||||
# =============================================================================
|
||||
|
||||
# Optional configuration items
|
||||
# =============================================================================
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Server settings
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
#config_directory='' # If configuration files are located outside the data
|
||||
# directory, specify the directory where the main
|
||||
# postgresql.conf file is located.
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Replication settings
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
#replication_user='repmgr' # User to make replication connections with, if not set defaults
|
||||
# to the user defined in "conninfo".
|
||||
|
||||
#replication_type=physical # Must be one of 'physical' or 'bdr'.
|
||||
|
||||
#location=default # arbitrary string defining the location of the node; this
|
||||
@@ -65,9 +75,6 @@
|
||||
# at least the number of standbys which will connect
|
||||
# to the primary.
|
||||
|
||||
#recovery_min_apply_delay= # If provided, "recovery_min_apply_delay" in recovery.conf
|
||||
# will be set to this value.
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Witness server settings
|
||||
#------------------------------------------------------------------------------
|
||||
@@ -91,7 +98,7 @@
|
||||
#log_facility=STDERR # Logging facility: possible values are STDERR, or for
|
||||
# syslog integration, one of LOCAL0, LOCAL1, ..., LOCAL7, USER
|
||||
|
||||
#log_file='' # stderr can be redirected to an arbitrary file:
|
||||
#log_file='' # stderr can be redirected to an arbitrary file
|
||||
#log_status_interval=300 # interval (in seconds) for repmgrd to log a status message
|
||||
|
||||
|
||||
@@ -175,8 +182,15 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# file system location to another. This
|
||||
# parameter can be provided multiple times.
|
||||
|
||||
#restore_command='' # This will be placed in the recovery.conf
|
||||
# file generated by repmgr
|
||||
#restore_command='' # This will be placed in the recovery.conf file generated
|
||||
# by repmgr.
|
||||
|
||||
#archive_cleanup_command='' # This will be placed in the recovery.conf file generated
|
||||
# by repmgr. Note we recommend using Barman for managing
|
||||
# WAL archives (see: https://www.pgbarman.org )
|
||||
|
||||
#recovery_min_apply_delay= # If provided, "recovery_min_apply_delay" in recovery.conf
|
||||
# will be set to this value (PostgreSQL 9.4 and later).
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
@@ -199,8 +213,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# These settings apply when instructing a standby to follow the new primary
|
||||
# ("repmgr standby follow").
|
||||
|
||||
#primary_follow_timeout=60 # The length of time (in seconds) to wait
|
||||
#primary_follow_timeout=60 # The max length of time (in seconds) to wait
|
||||
# for the new primary to become available
|
||||
#standby_follow_timeout=15 # The max length of time (in seconds) to wait
|
||||
# for the standby to connect to the primary
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
@@ -286,6 +302,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# /usr/bin/systemctl start postgresql-9.6, \
|
||||
# /usr/bin/systemctl restart postgresql-9.6
|
||||
#
|
||||
# Debian/Ubuntu users: use "sudo pg_ctlcluster" to execute service control commands.
|
||||
#
|
||||
# For more details, see: https://repmgr.org/docs/4.0/configuration-service-commands.html
|
||||
|
||||
#service_start_command = ''
|
||||
#service_stop_command = ''
|
||||
|
||||
1
repmgr.h
1
repmgr.h
@@ -70,6 +70,7 @@
|
||||
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_STANDBY_FOLLOW_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_ARCHIVE_READY_WARNING 16 /* WAL files */
|
||||
#define DEFAULT_ARCHIVE_READY_CRITICAL 128 /* WAL files */
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
#define REPMGR_VERSION_DATE ""
|
||||
#define REPMGR_VERSION "4.0.4"
|
||||
#define REPMGR_VERSION "4.0.6"
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ static FailoverState failover_state = FAILOVER_STATE_UNKNOWN;
|
||||
|
||||
static int primary_node_id = UNKNOWN_NODE_ID;
|
||||
static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
|
||||
static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
static NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
|
||||
|
||||
static ElectionResult do_election(void);
|
||||
@@ -74,7 +74,6 @@ static FailoverState follow_new_primary(int new_primary_id);
|
||||
static FailoverState witness_follow_new_primary(int new_primary_id);
|
||||
|
||||
static void reset_node_voting_status(void);
|
||||
void close_connections_physical();
|
||||
|
||||
static bool do_primary_failover(void);
|
||||
static bool do_upstream_standby_failover(void);
|
||||
@@ -142,7 +141,7 @@ do_physical_node_check(void)
|
||||
case FAILOVER_AUTOMATIC:
|
||||
log_error(_("this node is marked as inactive and cannot be used as a failover target"));
|
||||
log_hint(_("%s"), hint);
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
@@ -193,7 +192,7 @@ do_physical_node_check(void)
|
||||
if (required_param_missing == true)
|
||||
{
|
||||
log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
@@ -275,7 +274,7 @@ monitor_streaming_primary(void)
|
||||
|
||||
local_node_info.node_status = NODE_STATUS_UNKNOWN;
|
||||
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
|
||||
/*
|
||||
* as we're monitoring the primary, no point in trying to
|
||||
@@ -359,7 +358,7 @@ monitor_streaming_primary(void)
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("node appears to be up but no connection could be made"));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -384,7 +383,7 @@ monitor_streaming_primary(void)
|
||||
|
||||
if (PQstatus(new_primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
PQfinish(new_primary_conn);
|
||||
close_connection(&new_primary_conn);
|
||||
log_warning(_("unable to connect to new primary node %i"), primary_node_id);
|
||||
}
|
||||
else
|
||||
@@ -450,7 +449,7 @@ monitor_streaming_primary(void)
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
PQfinish(new_primary_conn);
|
||||
close_connection(&new_primary_conn);
|
||||
|
||||
/* restart monitoring as standby */
|
||||
return;
|
||||
@@ -468,7 +467,7 @@ monitor_streaming_primary(void)
|
||||
log_error("%s", event_details.data);
|
||||
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
|
||||
|
||||
PQfinish(new_primary_conn);
|
||||
close_connection(&new_primary_conn);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
@@ -540,7 +539,7 @@ loop:
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
@@ -595,7 +594,7 @@ monitor_streaming_standby(void)
|
||||
if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
|
||||
{
|
||||
log_error(_("unable to determine an active primary for this cluster, terminating"));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
@@ -612,14 +611,14 @@ monitor_streaming_standby(void)
|
||||
log_error(_("no record found for upstream node (ID: %i), terminating"),
|
||||
local_node_info.upstream_node_id);
|
||||
log_hint(_("ensure the upstream node is registered correctly"));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
else if (record_status == RECORD_ERROR)
|
||||
{
|
||||
log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
|
||||
local_node_info.upstream_node_id);
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
@@ -639,7 +638,7 @@ monitor_streaming_standby(void)
|
||||
local_node_info.upstream_node_id);
|
||||
log_hint(_("upstream node must be running before repmgrd can start"));
|
||||
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
@@ -749,7 +748,18 @@ monitor_streaming_standby(void)
|
||||
log_warning("%s", event_details.data);
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
close_connection(&upstream_conn);
|
||||
|
||||
/*
|
||||
* if local node is unreachable, make a last-minute attempt to reconnect
|
||||
* before continuing with the failover process
|
||||
*/
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
}
|
||||
|
||||
upstream_conn = try_reconnect(&upstream_node_info);
|
||||
|
||||
/* Node has recovered - log and continue */
|
||||
@@ -806,6 +816,29 @@ monitor_streaming_standby(void)
|
||||
{
|
||||
int degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||
|
||||
if (config_file_options.degraded_monitoring_timeout > 0
|
||||
&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
|
||||
{
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("degraded monitoring timeout (%i seconds) exceeded, terminating"),
|
||||
degraded_monitoring_elapsed);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"repmgrd_shutdown",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
terminate(ERR_MONITORING_TIMEOUT);
|
||||
}
|
||||
|
||||
|
||||
log_debug("monitoring node %i in degraded state for %i seconds",
|
||||
upstream_node_info.node_id,
|
||||
degraded_monitoring_elapsed);
|
||||
@@ -872,7 +905,49 @@ monitor_streaming_standby(void)
|
||||
/* local node has been promoted */
|
||||
if (get_recovery_type(local_conn) == RECTYPE_PRIMARY)
|
||||
{
|
||||
log_notice(_("local node is primary, checking local node record"));
|
||||
log_notice(_("local node is primary, checking local node state"));
|
||||
|
||||
/*
|
||||
* It's possible the promote command timed out, but the promotion itself
|
||||
* succeeded. In this case failover state will be FAILOVER_STATE_PROMOTION_FAILED;
|
||||
* we can update the node record ourselves and resume primary monitoring.
|
||||
*/
|
||||
if (failover_state == FAILOVER_STATE_PROMOTION_FAILED)
|
||||
{
|
||||
int degraded_monitoring_elapsed;
|
||||
int former_upstream_node_id = local_node_info.upstream_node_id;
|
||||
|
||||
update_node_record_set_primary(local_conn, local_node_info.node_id);
|
||||
record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
|
||||
|
||||
degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||
|
||||
log_notice(_("resuming monitoring as primary node after %i seconds"),
|
||||
degraded_monitoring_elapsed);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
appendPQExpBuffer(&event_details,
|
||||
"promotion command failed but promotion completed successfully");
|
||||
create_event_notification(local_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_promote",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
/* notify former siblings that they should now follow this node */
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_info.node_id,
|
||||
former_upstream_node_id,
|
||||
&sibling_nodes);
|
||||
notify_followers(&sibling_nodes, local_node_info.node_id);
|
||||
|
||||
/* this will restart monitoring in primary mode */
|
||||
monitoring_state = MS_NORMAL;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* There may be a delay between the node being promoted
|
||||
@@ -906,12 +981,12 @@ monitor_streaming_standby(void)
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_info.node_id,
|
||||
local_node_info.upstream_node_id,
|
||||
&standby_nodes);
|
||||
&sibling_nodes);
|
||||
|
||||
if (standby_nodes.node_count > 0)
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
|
||||
for (cell = standby_nodes.head; cell; cell = cell->next)
|
||||
log_debug("scanning %i node records to detect new primary...", sibling_nodes.node_count);
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
/* skip local node check, we did that above */
|
||||
if (cell->node_info->node_id == local_node_info.node_id)
|
||||
@@ -930,12 +1005,10 @@ monitor_streaming_standby(void)
|
||||
if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
|
||||
{
|
||||
follow_node_id = cell->node_info->node_id;
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
close_connection(&cell->node_info->conn);
|
||||
break;
|
||||
}
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
close_connection(&cell->node_info->conn);
|
||||
}
|
||||
|
||||
if (follow_node_id != UNKNOWN_NODE_ID)
|
||||
@@ -943,7 +1016,7 @@ monitor_streaming_standby(void)
|
||||
follow_new_primary(follow_node_id);
|
||||
}
|
||||
}
|
||||
clear_node_info_list(&standby_nodes);
|
||||
clear_node_info_list(&sibling_nodes);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -987,6 +1060,15 @@ loop:
|
||||
}
|
||||
}
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
|
||||
{
|
||||
update_monitoring_history();
|
||||
}
|
||||
else
|
||||
{
|
||||
connection_ping(local_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* handle local node failure
|
||||
*
|
||||
@@ -1001,33 +1083,38 @@ loop:
|
||||
{
|
||||
if (local_node_info.active == true)
|
||||
{
|
||||
bool success = true;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
local_node_info.active = false;
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to connect to local node \"%s\" (ID: %i), marking inactive"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, false) == true)
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, false) == false)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
local_node_info.active = false;
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to connect to local node \"%s\" (ID: %i), marking inactive"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
|
||||
log_warning("%s", event_details.data);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_failure",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
success = false;
|
||||
log_warning(_("unable to mark node \"%s\" (ID: %i) as inactive"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
}
|
||||
}
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_failure",
|
||||
success,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -1066,16 +1153,13 @@ loop:
|
||||
}
|
||||
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK && config_file_options.monitoring_history == true)
|
||||
update_monitoring_history();
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
@@ -1125,7 +1209,7 @@ monitor_streaming_witness(void)
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
log_hint(_("execute \"repmgr witness register --force\" to update the witness node "));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
@@ -1153,7 +1237,7 @@ monitor_streaming_witness(void)
|
||||
upstream_node_info.node_id);
|
||||
log_hint(_("primary node must be running before repmgrd can start"));
|
||||
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
@@ -1236,7 +1320,7 @@ monitor_streaming_witness(void)
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
close_connection(&primary_conn);
|
||||
primary_conn = try_reconnect(&upstream_node_info);
|
||||
|
||||
/* Node has recovered - log and continue */
|
||||
@@ -1334,12 +1418,12 @@ monitor_streaming_witness(void)
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_info.node_id,
|
||||
local_node_info.upstream_node_id,
|
||||
&standby_nodes);
|
||||
&sibling_nodes);
|
||||
|
||||
if (standby_nodes.node_count > 0)
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
|
||||
for (cell = standby_nodes.head; cell; cell = cell->next)
|
||||
log_debug("scanning %i node records to detect new primary...", sibling_nodes.node_count);
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
/* skip local node check, we did that above */
|
||||
if (cell->node_info->node_id == local_node_info.node_id)
|
||||
@@ -1358,12 +1442,10 @@ monitor_streaming_witness(void)
|
||||
if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
|
||||
{
|
||||
follow_node_id = cell->node_info->node_id;
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
close_connection(&cell->node_info->conn);
|
||||
break;
|
||||
}
|
||||
PQfinish(cell->node_info->conn);
|
||||
cell->node_info->conn = NULL;
|
||||
close_connection(&cell->node_info->conn);
|
||||
}
|
||||
|
||||
if (follow_node_id != UNKNOWN_NODE_ID)
|
||||
@@ -1371,7 +1453,7 @@ monitor_streaming_witness(void)
|
||||
witness_follow_new_primary(follow_node_id);
|
||||
}
|
||||
}
|
||||
clear_node_info_list(&standby_nodes);
|
||||
clear_node_info_list(&sibling_nodes);
|
||||
}
|
||||
}
|
||||
loop:
|
||||
@@ -1425,7 +1507,7 @@ loop:
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
@@ -1472,7 +1554,7 @@ do_primary_failover(void)
|
||||
}
|
||||
else if (election_result == ELECTION_WON)
|
||||
{
|
||||
if (standby_nodes.node_count > 0)
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
log_notice("this node is the winner, will now promote itself and inform other nodes");
|
||||
}
|
||||
@@ -1517,7 +1599,7 @@ do_primary_failover(void)
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_info.node_id,
|
||||
upstream_node_info.node_id,
|
||||
&standby_nodes);
|
||||
&sibling_nodes);
|
||||
|
||||
}
|
||||
else if (config_file_options.failover == FAILOVER_MANUAL)
|
||||
@@ -1555,7 +1637,7 @@ do_primary_failover(void)
|
||||
*/
|
||||
true,
|
||||
event_details.data);
|
||||
PQfinish(new_primary_conn);
|
||||
close_connection(&new_primary_conn);
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
}
|
||||
@@ -1579,10 +1661,10 @@ do_primary_failover(void)
|
||||
{
|
||||
case FAILOVER_STATE_PROMOTED:
|
||||
/* notify former siblings that they should now follow this node */
|
||||
notify_followers(&standby_nodes, local_node_info.node_id);
|
||||
notify_followers(&sibling_nodes, local_node_info.node_id);
|
||||
|
||||
/* we no longer care about our former siblings */
|
||||
clear_node_info_list(&standby_nodes);
|
||||
clear_node_info_list(&sibling_nodes);
|
||||
|
||||
/* pass control back down to start_monitoring() */
|
||||
log_info(_("switching to primary monitoring mode"));
|
||||
@@ -1596,10 +1678,10 @@ do_primary_failover(void)
|
||||
* notify siblings that they should resume following the original
|
||||
* primary
|
||||
*/
|
||||
notify_followers(&standby_nodes, upstream_node_info.node_id);
|
||||
notify_followers(&sibling_nodes, upstream_node_info.node_id);
|
||||
|
||||
/* we no longer care about our former siblings */
|
||||
clear_node_info_list(&standby_nodes);
|
||||
clear_node_info_list(&sibling_nodes);
|
||||
|
||||
/* pass control back down to start_monitoring() */
|
||||
log_info(_("resuming standby monitoring mode"));
|
||||
@@ -1774,8 +1856,7 @@ do_upstream_standby_failover(void)
|
||||
int i, r;
|
||||
char parsed_follow_command[MAXPGPATH] = "";
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
upstream_conn = NULL;
|
||||
close_connection(&upstream_conn);
|
||||
|
||||
if (get_primary_node_record(local_conn, &primary_node_info) == false)
|
||||
{
|
||||
@@ -1796,7 +1877,7 @@ do_upstream_standby_failover(void)
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
close_connection(&primary_conn);
|
||||
monitoring_state = MS_DEGRADED;
|
||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||
return false;
|
||||
@@ -1810,15 +1891,14 @@ do_upstream_standby_failover(void)
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
close_connection(&primary_conn);
|
||||
monitoring_state = MS_DEGRADED;
|
||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
close_connection(&local_conn);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
@@ -2137,7 +2217,7 @@ wait_primary_notification(int *new_primary_id)
|
||||
return true;
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "waiting for new primary notification, %i of max %i seconds",
|
||||
log_verbose(LOG_DEBUG, "waiting for new primary notification, %i of max %i seconds (\"primary_notification_timeout\")",
|
||||
i, config_file_options.primary_notification_timeout);
|
||||
|
||||
sleep(1);
|
||||
@@ -2206,7 +2286,7 @@ follow_new_primary(int new_primary_id)
|
||||
{
|
||||
new_primary_ok = false;
|
||||
log_warning(_("new primary is not in recovery"));
|
||||
PQfinish(upstream_conn);
|
||||
close_connection(&upstream_conn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2220,8 +2300,7 @@ follow_new_primary(int new_primary_id)
|
||||
* restart
|
||||
*/
|
||||
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
close_connection(&local_conn);
|
||||
|
||||
/*
|
||||
* replace %n in "config_file_options.follow_command" with ID of primary
|
||||
@@ -2240,32 +2319,46 @@ follow_new_primary(int new_primary_id)
|
||||
PGconn *old_primary_conn;
|
||||
|
||||
/*
|
||||
* The follow action could still fail due to the original primary
|
||||
* The "standby follow" command could still fail due to the original primary
|
||||
* reappearing before the candidate could promote itself ("repmgr
|
||||
* standby follow" will refuse to promote another node if the primary
|
||||
* is available). However the new primary will only instruct use to
|
||||
* follow it after it's successfully promoted itself, so that very
|
||||
* likely won't be the reason for the failure.
|
||||
*
|
||||
*
|
||||
* TODO: check the new primary too - we could have a split-brain
|
||||
* situation where the old primary reappeared just after the new one
|
||||
* promoted itself.
|
||||
* is available). However the new primary will only instruct the other
|
||||
* nodes to follow it after it's successfully promoted itself, so this
|
||||
* case is highly unlikely. A slightly more likely scenario would
|
||||
* be the new primary becoming unavailable just after it's sent notifications
|
||||
* to its follower nodes, and the old primary becoming available again.
|
||||
*/
|
||||
old_primary_conn = establish_db_connection(failed_primary.conninfo, false);
|
||||
|
||||
if (PQstatus(old_primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
/* XXX add event notifications */
|
||||
RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
|
||||
|
||||
PQfinish(old_primary_conn);
|
||||
|
||||
if (upstream_recovery_type == RECTYPE_PRIMARY)
|
||||
{
|
||||
log_notice(_("original primary reappeared - no action taken"));
|
||||
initPQExpBuffer(&event_details);
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("original primary reappeared - no action taken"));
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(old_primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_aborted",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
close_connection(&old_primary_conn);
|
||||
|
||||
return FAILOVER_STATE_PRIMARY_REAPPEARED;
|
||||
}
|
||||
|
||||
log_notice(_("original primary reappeared as standby"));
|
||||
|
||||
close_connection(&old_primary_conn);
|
||||
}
|
||||
|
||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||
@@ -2381,7 +2474,7 @@ witness_follow_new_primary(int new_primary_id)
|
||||
{
|
||||
new_primary_ok = false;
|
||||
log_warning(_("new primary is not in recovery"));
|
||||
PQfinish(upstream_conn);
|
||||
close_connection(&upstream_conn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2473,6 +2566,7 @@ do_election(void)
|
||||
|
||||
/* we're visible */
|
||||
int visible_nodes = 1;
|
||||
int total_nodes = 0;
|
||||
|
||||
NodeInfoListCell *cell = NULL;
|
||||
|
||||
@@ -2523,14 +2617,16 @@ do_election(void)
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_info.node_id,
|
||||
upstream_node_info.node_id,
|
||||
&standby_nodes);
|
||||
&sibling_nodes);
|
||||
|
||||
total_nodes = sibling_nodes.node_count + 1;
|
||||
|
||||
log_debug("do_election(): primary location is %s", upstream_node_info.location);
|
||||
|
||||
local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;
|
||||
|
||||
/* fast path if no other standbys (or witness) exist - normally win by default */
|
||||
if (standby_nodes.node_count == 0)
|
||||
if (sibling_nodes.node_count == 0)
|
||||
{
|
||||
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
|
||||
{
|
||||
@@ -2556,6 +2652,14 @@ do_election(void)
|
||||
return ELECTION_NOT_CANDIDATE;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* standby nodes found - check if we're in the primary location before checking theirs */
|
||||
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
|
||||
{
|
||||
primary_location_seen = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* get our lsn */
|
||||
local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);
|
||||
@@ -2565,7 +2669,7 @@ do_election(void)
|
||||
/* pointer to "winning" node, initially self */
|
||||
candidate_node = &local_node_info;
|
||||
|
||||
for (cell = standby_nodes.head; cell; cell = cell->next)
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
/* assume the worst case */
|
||||
cell->node_info->node_status = NODE_STATUS_UNKNOWN;
|
||||
@@ -2620,7 +2724,7 @@ do_election(void)
|
||||
candidate_node = cell->node_info;
|
||||
}
|
||||
/* LSN is same - tiebreak on priority, then node_id */
|
||||
else if(cell->node_info->last_wal_receive_lsn == candidate_node->last_wal_receive_lsn)
|
||||
else if (cell->node_info->last_wal_receive_lsn == candidate_node->last_wal_receive_lsn)
|
||||
{
|
||||
log_verbose(LOG_DEBUG, "node %i has same LSN as current candidate %i",
|
||||
cell->node_info->node_id,
|
||||
@@ -2672,9 +2776,9 @@ do_election(void)
|
||||
|
||||
log_debug("visible nodes: %i; total nodes: %i",
|
||||
visible_nodes,
|
||||
standby_nodes.node_count);
|
||||
total_nodes);
|
||||
|
||||
if (visible_nodes <= (standby_nodes.node_count / 2.0))
|
||||
if (visible_nodes <= (total_nodes / 2.0))
|
||||
{
|
||||
log_notice(_("unable to reach a qualified majority of nodes"));
|
||||
log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
|
||||
@@ -2853,7 +2957,7 @@ format_failover_state(FailoverState failover_state)
|
||||
case FAILOVER_STATE_FOLLOW_FAIL:
|
||||
return "FOLLOW_FAIL";
|
||||
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
|
||||
return "ODE_NOTIFICATION_ERROR";
|
||||
return "NODE_NOTIFICATION_ERROR";
|
||||
}
|
||||
|
||||
/* should never reach here */
|
||||
@@ -2861,22 +2965,3 @@ format_failover_state(FailoverState failover_state)
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
close_connections_physical()
|
||||
{
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
/* cancel any pending queries to the primary */
|
||||
if (PQisBusy(primary_conn) == 1)
|
||||
cancel_query(primary_conn, config_file_options.async_query_timeout);
|
||||
PQfinish(primary_conn);
|
||||
primary_conn = NULL;
|
||||
}
|
||||
|
||||
if (upstream_conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
|
||||
{
|
||||
PQfinish(upstream_conn);
|
||||
upstream_conn = NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -24,7 +24,6 @@ void do_physical_node_check(void);
|
||||
void monitor_streaming_primary(void);
|
||||
void monitor_streaming_standby(void);
|
||||
void monitor_streaming_witness(void);
|
||||
void close_connections_physical(void);
|
||||
|
||||
void handle_sigint_physical(SIGNAL_ARGS);
|
||||
|
||||
|
||||
52
repmgrd.c
52
repmgrd.c
@@ -53,9 +53,6 @@ bool startup_event_logged = false;
|
||||
MonitoringState monitoring_state = MS_NORMAL;
|
||||
instr_time degraded_monitoring_start;
|
||||
|
||||
static void close_connections(void);
|
||||
void (*_close_connections) (void) = NULL;
|
||||
|
||||
/*
|
||||
* Record receipt of SIGHUP; will cause configuration file to be reread
|
||||
* at the appropriate point in the main loop.
|
||||
@@ -330,7 +327,7 @@ main(int argc, char **argv)
|
||||
{
|
||||
log_error(_("unable to determine status of \"repmgr\" extension"));
|
||||
log_detail("%s", PQerrorMessage(local_conn));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
@@ -347,7 +344,7 @@ main(int argc, char **argv)
|
||||
}
|
||||
|
||||
log_hint(_("check that this node is part of a repmgr cluster"));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -373,7 +370,7 @@ main(int argc, char **argv)
|
||||
break;
|
||||
}
|
||||
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -392,7 +389,7 @@ main(int argc, char **argv)
|
||||
{
|
||||
log_error(_("unable to write to shared memory"));
|
||||
log_hint(_("ensure \"shared_preload_libraries\" includes \"repmgr\""));
|
||||
PQfinish(local_conn);
|
||||
close_connection(&local_conn);
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
@@ -404,7 +401,6 @@ main(int argc, char **argv)
|
||||
}
|
||||
else
|
||||
{
|
||||
_close_connections = close_connections_physical;
|
||||
log_debug("node id is %i, upstream node id is %i",
|
||||
local_node_info.node_id,
|
||||
local_node_info.upstream_node_id);
|
||||
@@ -705,17 +701,29 @@ PGconn *
|
||||
try_reconnect(t_node_info *node_info)
|
||||
{
|
||||
PGconn *conn;
|
||||
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||
|
||||
int i;
|
||||
|
||||
int max_attempts = config_file_options.reconnect_attempts;
|
||||
|
||||
initialize_conninfo_params(&conninfo_params, false);
|
||||
|
||||
|
||||
/* we assume by now the conninfo string is parseable */
|
||||
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
|
||||
|
||||
/* set some default values if not explicitly provided */
|
||||
param_set_ine(&conninfo_params, "connect_timeout", "2");
|
||||
param_set_ine(&conninfo_params, "fallback_application_name", "repmgr");
|
||||
|
||||
for (i = 0; i < max_attempts; i++)
|
||||
{
|
||||
log_info(_("checking state of node %i, %i of %i attempts"),
|
||||
node_info->node_id, i + 1, max_attempts);
|
||||
if (is_server_available(node_info->conninfo) == true)
|
||||
if (is_server_available_params(&conninfo_params) == true)
|
||||
{
|
||||
|
||||
log_notice(_("node has recovered, reconnecting"));
|
||||
|
||||
/*
|
||||
@@ -723,14 +731,18 @@ try_reconnect(t_node_info *node_info)
|
||||
* connection denied due to connection exhaustion - fall back to
|
||||
* degraded monitoring? - make that configurable
|
||||
*/
|
||||
conn = establish_db_connection(node_info->conninfo, false);
|
||||
|
||||
conn = establish_db_connection_by_params(&conninfo_params, false);
|
||||
|
||||
if (PQstatus(conn) == CONNECTION_OK)
|
||||
{
|
||||
free_conninfo_params(&conninfo_params);
|
||||
|
||||
node_info->node_status = NODE_STATUS_UP;
|
||||
return conn;
|
||||
}
|
||||
|
||||
PQfinish(conn);
|
||||
close_connection(&conn);
|
||||
log_notice(_("unable to reconnect to node"));
|
||||
}
|
||||
|
||||
@@ -742,13 +754,14 @@ try_reconnect(t_node_info *node_info)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
log_warning(_("unable to reconnect to node %i after %i attempts"),
|
||||
node_info->node_id,
|
||||
max_attempts);
|
||||
|
||||
node_info->node_status = NODE_STATUS_DOWN;
|
||||
|
||||
free_conninfo_params(&conninfo_params);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -784,24 +797,9 @@ print_monitoring_state(MonitoringState monitoring_state)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
close_connections()
|
||||
{
|
||||
if (_close_connections != NULL)
|
||||
_close_connections();
|
||||
|
||||
if (local_conn != NULL && PQstatus(local_conn) == CONNECTION_OK)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
terminate(int retval)
|
||||
{
|
||||
close_connections();
|
||||
logger_shutdown();
|
||||
|
||||
if (pid_file)
|
||||
|
||||
Reference in New Issue
Block a user