mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 15:16:29 +00:00
Compare commits
73 Commits
v4.4.0beta
...
REL4_1_STA
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
222f7e6080 | ||
|
|
446695e328 | ||
|
|
ec3da13e22 | ||
|
|
1488c014ff | ||
|
|
f471316504 | ||
|
|
726299f7ef | ||
|
|
7fda2a1bcf | ||
|
|
d26141b8ab | ||
|
|
4a6b5fe913 | ||
|
|
a71e644255 | ||
|
|
8646fd6004 | ||
|
|
3e1bb1a523 | ||
|
|
f5e58fc062 | ||
|
|
6b95a96f3a | ||
|
|
bd146ae9ac | ||
|
|
c7f8e48d12 | ||
|
|
322190516c | ||
|
|
31a49ff781 | ||
|
|
a6f99b58dd | ||
|
|
09b041433e | ||
|
|
058c8168e1 | ||
|
|
0468e47ef3 | ||
|
|
216326f316 | ||
|
|
3fb20ce774 | ||
|
|
e468ca859e | ||
|
|
623c84c022 | ||
|
|
c2dded1d7b | ||
|
|
457dbbd267 | ||
|
|
5485c06bc1 | ||
|
|
00ae42eb07 | ||
|
|
33525491ae | ||
|
|
8c84f7a214 | ||
|
|
efe4bed88e | ||
|
|
9ba8dcbac3 | ||
|
|
a8996a5bfa | ||
|
|
4cbba98193 | ||
|
|
23e6b85de3 | ||
|
|
d5ecb09f22 | ||
|
|
719dd93676 | ||
|
|
5747f1d446 | ||
|
|
9313b43cb1 | ||
|
|
5aeb1b0589 | ||
|
|
6c93388848 | ||
|
|
d4ad8ce20c | ||
|
|
bacab8d31c | ||
|
|
14856e3a4d | ||
|
|
ca9242badb | ||
|
|
ff0929e882 | ||
|
|
8cd1811edb | ||
|
|
bf15c0d40f | ||
|
|
9ae9d31165 | ||
|
|
d5064bdc02 | ||
|
|
9d0524a008 | ||
|
|
5398fd2d22 | ||
|
|
4c44c01380 | ||
|
|
5113ab0274 | ||
|
|
25f68bb283 | ||
|
|
730f67258c | ||
|
|
ca0e4de1ee | ||
|
|
2fb0f056fe | ||
|
|
3a789d53e0 | ||
|
|
fb67b2cd4f | ||
|
|
9f07804b6a | ||
|
|
d5b2fa2309 | ||
|
|
d696c4019e | ||
|
|
e6ffbcc67a | ||
|
|
e1410831e0 | ||
|
|
cb4f6f6e3f | ||
|
|
75e5d79654 | ||
|
|
55fbe12971 | ||
|
|
db4199e08f | ||
|
|
0d9ed02729 | ||
|
|
8e9f0b802b |
20
HISTORY
20
HISTORY
@@ -1,4 +1,22 @@
|
||||
4.1.0 2018-??-??
|
||||
4.1.1 2018-09-05
|
||||
logging: explicitly log the text of failed queries as ERRORs to
|
||||
assist logfile analysis; GitHub #498
|
||||
repmgr: truncate version string, if necessary; GitHub #490 (Ian)
|
||||
repmgr: improve messages emitted during "standby promote" (Ian)
|
||||
repmgr: "standby clone" - don't copy external config files in --dry-run
|
||||
mode; GitHub #491 (Ian)
|
||||
repmgr: add "cluster_cleanup" event; GitHub #492 (Ian)
|
||||
repmgr: (standby switchover) improve detection of free walsenders;
|
||||
GitHub #495 (Ian)
|
||||
repmgr: (node rejoin) improve replication slot handling; GitHub #499 (Ian)
|
||||
repmgrd: ensure that sending SIGHUP always results in the log file
|
||||
being reopened; GitHub #485 (Ian)
|
||||
repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian)
|
||||
repmgrd: fix startup on witness node when local data is stale; GitHub #488/#489 (Ian)
|
||||
repmgrd: improve cascaded standby failover handling; GitHub #480 (Ian)
|
||||
repmgrd: improve reconnection handling (Ian)
|
||||
|
||||
4.1.0 2018-07-31
|
||||
repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian)
|
||||
repmgr: add "--missing-slots" check to "repmgr node check" (Ian)
|
||||
repmgr: improve command line error handling; GitHub #464 (Ian)
|
||||
|
||||
26
configfile.c
26
configfile.c
@@ -28,6 +28,7 @@ char config_file_path[MAXPGPATH] = "";
|
||||
static bool config_file_provided = false;
|
||||
bool config_file_found = false;
|
||||
|
||||
static void parse_config(t_configuration_options *options, bool terse);
|
||||
static void _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *warning_list);
|
||||
|
||||
static void _parse_line(char *buf, char *name, char *value);
|
||||
@@ -238,7 +239,7 @@ end_search:
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
static void
|
||||
parse_config(t_configuration_options *options, bool terse)
|
||||
{
|
||||
/* Collate configuration file errors here for friendlier reporting */
|
||||
@@ -785,7 +786,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
PQconninfoFree(conninfo_options);
|
||||
}
|
||||
|
||||
|
||||
/* set values for parameters which default to other parameters */
|
||||
|
||||
/*
|
||||
@@ -1052,11 +1052,13 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
* - repmgrd_standby_startup_timeout
|
||||
* - retry_promote_interval_secs
|
||||
*
|
||||
* non-changeable options
|
||||
* non-changeable options (repmgrd references these from the "repmgr.nodes"
|
||||
* table, not the configuration file)
|
||||
*
|
||||
* - node_id
|
||||
* - node_name
|
||||
* - data_directory
|
||||
* - location
|
||||
* - priority
|
||||
* - replication_type
|
||||
*
|
||||
@@ -1065,7 +1067,7 @@ parse_time_unit_parameter(const char *name, const char *value, char *dest, ItemL
|
||||
|
||||
*/
|
||||
bool
|
||||
reload_config(t_configuration_options *orig_options)
|
||||
reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
{
|
||||
PGconn *conn;
|
||||
t_configuration_options new_options = T_CONFIGURATION_OPTIONS_INITIALIZER;
|
||||
@@ -1081,6 +1083,20 @@ reload_config(t_configuration_options *orig_options)
|
||||
|
||||
_parse_config(&new_options, &config_errors, &config_warnings);
|
||||
|
||||
|
||||
if (server_type == PRIMARY || server_type == STANDBY)
|
||||
{
|
||||
if (new_options.promote_command[0] == '\0')
|
||||
{
|
||||
item_list_append(&config_errors, _("\"promote_command\": required parameter was not found"));
|
||||
}
|
||||
|
||||
if (new_options.follow_command[0] == '\0')
|
||||
{
|
||||
item_list_append(&config_errors, _("\"follow_command\": required parameter was not found"));
|
||||
}
|
||||
}
|
||||
|
||||
if (config_errors.head != NULL)
|
||||
{
|
||||
ItemListCell *cell = NULL;
|
||||
@@ -1258,7 +1274,7 @@ reload_config(t_configuration_options *orig_options)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/* promote_delay */
|
||||
/* promote_delay (for testing use only; not documented */
|
||||
if (orig_options->promote_delay != new_options.promote_delay)
|
||||
{
|
||||
orig_options->promote_delay = new_options.promote_delay;
|
||||
|
||||
@@ -273,13 +273,13 @@ typedef struct
|
||||
"", "", "", "" \
|
||||
}
|
||||
|
||||
#include "dbutils.h"
|
||||
|
||||
void set_progname(const char *argv0);
|
||||
const char *progname(void);
|
||||
|
||||
void load_config(const char *config_file, bool verbose, bool terse, t_configuration_options *options, char *argv0);
|
||||
void parse_config(t_configuration_options *options, bool terse);
|
||||
bool reload_config(t_configuration_options *orig_options);
|
||||
bool reload_config(t_configuration_options *orig_options, t_server_type server_type);
|
||||
|
||||
bool parse_recovery_conf(const char *data_dir, t_recovery_conf *conf);
|
||||
|
||||
|
||||
18
configure
vendored
18
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.1.
|
||||
# Generated by GNU Autoconf 2.69 for repmgr 4.1.2.
|
||||
#
|
||||
# Report bugs to <pgsql-bugs@postgresql.org>.
|
||||
#
|
||||
@@ -582,8 +582,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='repmgr'
|
||||
PACKAGE_TARNAME='repmgr'
|
||||
PACKAGE_VERSION='4.1'
|
||||
PACKAGE_STRING='repmgr 4.1'
|
||||
PACKAGE_VERSION='4.1.2'
|
||||
PACKAGE_STRING='repmgr 4.1.2'
|
||||
PACKAGE_BUGREPORT='pgsql-bugs@postgresql.org'
|
||||
PACKAGE_URL='https://2ndquadrant.com/en/resources/repmgr/'
|
||||
|
||||
@@ -1178,7 +1178,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures repmgr 4.1 to adapt to many kinds of systems.
|
||||
\`configure' configures repmgr 4.1.2 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1239,7 +1239,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of repmgr 4.1:";;
|
||||
short | recursive ) echo "Configuration of repmgr 4.1.2:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1313,7 +1313,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
repmgr configure 4.1
|
||||
repmgr configure 4.1.2
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -1332,7 +1332,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by repmgr $as_me 4.1, which was
|
||||
It was created by repmgr $as_me 4.1.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2359,7 +2359,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by repmgr $as_me 4.1, which was
|
||||
This file was extended by repmgr $as_me 4.1.2, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -2422,7 +2422,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
repmgr config.status 4.1
|
||||
repmgr config.status 4.1.2
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
AC_INIT([repmgr], [4.1], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
|
||||
AC_INIT([repmgr], [4.1.2], [pgsql-bugs@postgresql.org], [repmgr], [https://2ndquadrant.com/en/resources/repmgr/])
|
||||
|
||||
AC_COPYRIGHT([Copyright (c) 2010-2018, 2ndQuadrant Ltd.])
|
||||
|
||||
|
||||
@@ -227,7 +227,15 @@ get_controlfile(const char *DataDir)
|
||||
|
||||
control_file_info->control_file_processed = true;
|
||||
|
||||
if (version_num >= 90500)
|
||||
if (version_num >= 110000)
|
||||
{
|
||||
ControlFileData11 *ptr = (struct ControlFileData11 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
control_file_info->state = ptr->state;
|
||||
control_file_info->checkPoint = ptr->checkPoint;
|
||||
control_file_info->data_checksum_version = ptr->data_checksum_version;
|
||||
}
|
||||
else if (version_num >= 90500)
|
||||
{
|
||||
ControlFileData95 *ptr = (struct ControlFileData95 *)ControlFileDataPtr;
|
||||
control_file_info->system_identifier = ptr->system_identifier;
|
||||
|
||||
@@ -265,6 +265,71 @@ typedef struct ControlFileData95
|
||||
|
||||
} ControlFileData95;
|
||||
|
||||
/*
|
||||
* Following field removed in 11:
|
||||
*
|
||||
* XLogRecPtr prevCheckPoint;
|
||||
*
|
||||
* In 10, following field appended *after* "data_checksum_version":
|
||||
*
|
||||
* char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN];
|
||||
*
|
||||
* (but we don't care about that)
|
||||
*/
|
||||
|
||||
typedef struct ControlFileData11
|
||||
{
|
||||
uint64 system_identifier;
|
||||
|
||||
uint32 pg_control_version; /* PG_CONTROL_VERSION */
|
||||
uint32 catalog_version_no; /* see catversion.h */
|
||||
|
||||
DBState state; /* see enum above */
|
||||
pg_time_t time; /* time stamp of last pg_control update */
|
||||
XLogRecPtr checkPoint; /* last check point record ptr */
|
||||
|
||||
CheckPoint95 checkPointCopy; /* copy of last check point record */
|
||||
|
||||
XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */
|
||||
|
||||
XLogRecPtr minRecoveryPoint;
|
||||
TimeLineID minRecoveryPointTLI;
|
||||
XLogRecPtr backupStartPoint;
|
||||
XLogRecPtr backupEndPoint;
|
||||
bool backupEndRequired;
|
||||
|
||||
int wal_level;
|
||||
bool wal_log_hints;
|
||||
int MaxConnections;
|
||||
int max_worker_processes;
|
||||
int max_prepared_xacts;
|
||||
int max_locks_per_xact;
|
||||
bool track_commit_timestamp;
|
||||
|
||||
uint32 maxAlign; /* alignment requirement for tuples */
|
||||
double floatFormat; /* constant 1234567.0 */
|
||||
|
||||
uint32 blcksz; /* data block size for this DB */
|
||||
uint32 relseg_size; /* blocks per segment of large relation */
|
||||
|
||||
uint32 xlog_blcksz; /* block size within WAL files */
|
||||
uint32 xlog_seg_size; /* size of each WAL segment */
|
||||
|
||||
uint32 nameDataLen; /* catalog name field width */
|
||||
uint32 indexMaxKeys; /* max number of columns in an index */
|
||||
|
||||
uint32 toast_max_chunk_size; /* chunk size in TOAST tables */
|
||||
uint32 loblksize; /* chunk size in pg_largeobject */
|
||||
|
||||
bool enableIntTimes; /* int64 storage enabled? */
|
||||
|
||||
bool float4ByVal; /* float4 pass-by-value? */
|
||||
bool float8ByVal; /* float8, int8, etc pass-by-value? */
|
||||
|
||||
uint32 data_checksum_version;
|
||||
|
||||
} ControlFileData11;
|
||||
|
||||
|
||||
|
||||
extern DBState get_db_state(const char *data_directory);
|
||||
|
||||
@@ -475,7 +475,7 @@ int wait_connection_availability(PGconn *conn, long long timeout);
|
||||
/* node availability functions */
|
||||
bool is_server_available(const char *conninfo);
|
||||
bool is_server_available_params(t_conninfo_param_list *param_list);
|
||||
void connection_ping(PGconn *conn);
|
||||
ExecStatusType connection_ping(PGconn *conn);
|
||||
|
||||
/* monitoring functions */
|
||||
void
|
||||
|
||||
@@ -108,6 +108,14 @@
|
||||
is not possible, contact your vendor for assistance.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-old-packages">
|
||||
<title>How can I obtain old versions of &repmgr; packages?</title>
|
||||
<para>
|
||||
See appendix <xref linkend="packages-old-versions"> for details.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="faq-repmgr" xreflabel="repmgr">
|
||||
@@ -239,11 +247,22 @@
|
||||
Under some circumstances event notifications can be generated for servers
|
||||
which have not yet been registered; it's also useful to retain a record
|
||||
of events which includes servers removed from the replication cluster
|
||||
which no longer have an entry in the <literal>repmrg.nodes</literal> table.
|
||||
which no longer have an entry in the <literal>repmgr.nodes</literal> table.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2 id="faq-repmgr-recovery-conf-quoted-values" xreflabel="Quoted values in recovery.conf">
|
||||
<title>Why are some values in <filename>recovery.conf</filename> surrounded by pairs of single quotes?</title>
|
||||
<para>
|
||||
This is to ensure that user-supplied values which are written as parameter values in <filename>recovery.conf</filename>
|
||||
are escaped correctly and do not cause errors when <filename>recovery.conf</filename> is parsed.
|
||||
</para>
|
||||
<para>
|
||||
The escaping is performed by an internal PostgreSQL routine, which leaves strings consisting
|
||||
of digits and alphabetical characters only as-is, but wraps everything else in pairs of single quotes,
|
||||
even if the string does not contain any characters which need escaping.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
@@ -255,7 +274,7 @@
|
||||
<sect2 id="faq-repmgrd-prevent-promotion" xreflabel="Prevent standby from being promoted to primary">
|
||||
<title>How can I prevent a node from ever being promoted to primary?</title>
|
||||
<para>
|
||||
In `repmgr.conf`, set its priority to a value of 0 or less; apply the changed setting with
|
||||
In <filename>repmgr.conf</filename>, set its priority to a value of <literal>0</literal>; apply the changed setting with
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>.
|
||||
</para>
|
||||
<para>
|
||||
@@ -303,5 +322,36 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgrd-pg-bindir" xreflabel="repmgrd does not apply pg_bindir to promote_command or follow_command">
|
||||
<title>
|
||||
<application>repmgrd</application> ignores pg_bindir when executing <varname>promote_command</varname> or <varname>follow_command</varname>
|
||||
</title>
|
||||
<para>
|
||||
<varname>promote_command</varname> or <varname>follow_command</varname> can be user-defined scripts,
|
||||
so &repmgr; will not apply <option>pg_bindir</option> even if excuting &repmgr;. Always provide the full
|
||||
path; see <xref linkend="repmgrd-automatic-failover-configuration"> for more details.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="faq-repmgrd-startup-no-upstream" xreflabel="repmgrd does not start if upstream node is not running">
|
||||
<title>
|
||||
<application>repmgrd</application> aborts startup with the error "<literal>upstream node must be running before repmgrd can start</literal>"
|
||||
</title>
|
||||
<para>
|
||||
<application>repmgrd</application> does this to avoid starting up on a replication cluster
|
||||
which is not in a healthy state. If the upstream is unavailable, <application>repmgrd</application>
|
||||
may initiate a failover immediately after starting up, which could have unintended side-effects,
|
||||
particularly if <application>repmgrd</application> is not running on other nodes.
|
||||
</para>
|
||||
<para>
|
||||
In particular, it's possible that the node's local copy of the <literal>repmgr.nodes</literal> copy
|
||||
is out-of-date, which may lead to incorrect failover behaviour.
|
||||
</para>
|
||||
<para>
|
||||
The onus is therefore on the adminstrator to manually set the cluster to a stable, healthy state before
|
||||
starting <application>repmgrd</application>.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
</appendix>
|
||||
|
||||
@@ -53,11 +53,11 @@
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink></entry>
|
||||
<entry><ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/4.0/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ">https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-REDHAT-2NDQ</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
@@ -253,6 +253,23 @@
|
||||
</para>
|
||||
|
||||
|
||||
<table id="apt-2ndquadrant-repository">
|
||||
<title>2ndQuadrant public repository</title>
|
||||
<tgroup cols="2">
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>Repository URL:</entry>
|
||||
<entry><ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink></entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>Repository documentation:</entry>
|
||||
<entry><ulink url="https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-DEBIAN">https://repmgr.org/docs/4.1/installation-packages.html#INSTALLATION-PACKAGES-DEBIAN</ulink></entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
|
||||
<table id="apt-repository">
|
||||
<title>PostgreSQL Community APT repository (PGDG)</title>
|
||||
<tgroup cols="2">
|
||||
@@ -365,6 +382,127 @@
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="packages-snapshot" xreflabel="Snapshot packages">
|
||||
<title>Snapshot packages</title>
|
||||
<indexterm>
|
||||
<primary>snapshot packages</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>snaphots</secondary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
For testing new features and bug fixes, from time to time 2ndQuadrant provides
|
||||
so-called "snapshot packages" via its public repository. These packages
|
||||
are built from the &repmgr; source at a particular point in time, and are not formal
|
||||
releases.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
We do not recommend installing these packages in a production environment
|
||||
unless specifically advised.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
To install a snapshot package, it's necessary to install the 2ndQuadrant public snapshot repository,
|
||||
following the instructions here: <ulink url="https://dl.2ndquadrant.com/default/release/site/">https://dl.2ndquadrant.com/default/release/site/</ulink> but replace <literal>release</literal> with <literal>snapshot</literal>
|
||||
in the appropriate URL.
|
||||
</para>
|
||||
<para>
|
||||
For example, to install the snapshot RPM repository for PostgreSQL 9.6, execute (as <literal>root</literal>):
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/snapshot/get/9.6/rpm | bash</programlisting>
|
||||
|
||||
or as a normal user with root sudo access:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/snapshot/get/9.6/rpm | sudo bash</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Alternatively you can browse the repository here:
|
||||
<ulink url="https://dl.2ndquadrant.com/default/snapshot/browse/">https://dl.2ndquadrant.com/default/snapshot/browse/</ulink>.
|
||||
</para>
|
||||
<para>
|
||||
Once the repository is installed, installing or updating &repmgr; will result in the latest snapshot
|
||||
package being installed.
|
||||
</para>
|
||||
<para>
|
||||
The package name will be formatted like this:
|
||||
<programlisting>
|
||||
repmgr96-4.1.1-0.0git320.g5113ab0.1.el7.x86_64.rpm</programlisting>
|
||||
containg the snapshot build number (here: <literal>320</literal>) and the hash
|
||||
of the <application>git</application> commit it was built from (here: <literal>g5113ab0</literal>).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Note that the next formal release (in the above example <literal>4.1.1</literal>), once available,
|
||||
will install in place of any snapshot builds.
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="packages-old-versions" xreflabel="Installing old package versions">
|
||||
<title>Installing old package versions</title>
|
||||
<indexterm>
|
||||
<primary>old packages</primary>
|
||||
</indexterm>
|
||||
<indexterm>
|
||||
<primary>packages</primary>
|
||||
<secondary>old versions</secondary>
|
||||
</indexterm>
|
||||
|
||||
<sect2 id="packages-old-versions-debian" xreflabel="old Debian package versions">
|
||||
<title>Debian/Ubuntu</title>
|
||||
<para>
|
||||
An archive of old packages (<literal>3.3.2</literal> and later) for Debian/Ubuntu-based systems is available here:
|
||||
<ulink url="http://atalia.postgresql.org/morgue/r/repmgr/">http://atalia.postgresql.org/morgue/r/repmgr/</ulink>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="packages-old-versions-rhel-centos" xreflabel="old RHEL/CentOS package versions">
|
||||
<title>RHEL/CentOS</title>
|
||||
<para>
|
||||
Old RPM packages (<literal>3.2</literal> and later) can be retrieved from the
|
||||
(deprecated) 2ndQuadrant repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/">http://packages.2ndquadrant.com/</ulink>
|
||||
by installing the appropriate repository RPM:
|
||||
</para>
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-fedora-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm">http://packages.2ndquadrant.com/repmgr/yum-repo-rpms/repmgr-rhel-1.0-1.noarch.rpm</ulink>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
Old versions can be located with e.g.:
|
||||
<programlisting>
|
||||
yum --showduplicates list repmgr96</programlisting>
|
||||
(substitute the appropriate package name; see <xref linkend="packages-centos">) and installed with:
|
||||
<programlisting>
|
||||
yum install {package_name}-{version}</programlisting>
|
||||
where <literal>{package_name}</literal> is the base package name (e.g. <literal>repmgr96</literal>)
|
||||
and <literal>{version}</literal> is the version listed by the
|
||||
<command> yum --showduplicates list ...</command> command, e.g. <literal>4.0.6-1.rhel6</literal>.
|
||||
</para>
|
||||
<para>For example:
|
||||
<programlisting>
|
||||
yum install repmgr96-4.0.6-1.rhel6</programlisting>
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="packages-packager-info" xreflabel="Information for packagers">
|
||||
<title>Information for packagers</title>
|
||||
@@ -373,7 +511,7 @@
|
||||
<secondary>information for packagers</secondary>
|
||||
</indexterm>
|
||||
<para>
|
||||
We recommend patching the following parameters when
|
||||
We recommend patching the following parameters when
|
||||
building the package as built-in default values for user convenience.
|
||||
These values can nevertheless be overridden by the user, if desired.
|
||||
</para>
|
||||
|
||||
@@ -15,9 +15,164 @@
|
||||
See also: <xref linkend="upgrading-repmgr">
|
||||
</para>
|
||||
|
||||
<sect1 id="release-4.1.1">
|
||||
<title>Release 4.1.1</title>
|
||||
<para><emphasis>Wed September 5, 2018</emphasis></para>
|
||||
<para>
|
||||
repmgr 4.1.1 contains a number of usability enhancements and bug fixes.
|
||||
</para>
|
||||
<para>
|
||||
We recommend upgrading to this version as soon as possible.
|
||||
This release can be installed as a simple package upgrade from repmgr 4.0 ~ 4.1.0;
|
||||
<application>repmgrd</application> (if running) should be restarted.
|
||||
See <xref linkend="upgrading-repmgr"> for more details.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>repmgr enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-switchover">repmgr standby switchover --dry-run</link></command>
|
||||
no longer copies external configuration files to test they can be copied; this avoids making
|
||||
any changes to the target system. (GitHub #491).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-cluster-cleanup">repmgr cluster cleanup</link></command>:
|
||||
add <literal>cluster_cleanup</literal> event. (GitHub #492).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-standby-switchover">repmgr standby switchover</link></command>:
|
||||
improve detection of free walsenders. (GitHub #495).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve messages emitted during
|
||||
<command><link linkend="repmgr-standby-promote">repmgr standby promote</link></command>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
<sect2>
|
||||
<title>repmgrd enhancements</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Always reopen the log file after
|
||||
receiving <literal>SIGHUP</literal>. Previously this only happened if
|
||||
a configuration file change was detected.
|
||||
(GitHub #485).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Report version number <emphasis>after</emphasis>
|
||||
logger initialisation. (GitHub #487).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve cascaded standby failover handling. (GitHub #480).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve reconnection handling after brief network outages; if
|
||||
monitoring data being collected, this could lead to orphaned
|
||||
sessions on the primary. (GitHub #480).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Check <varname>promote_command</varname> and <varname>follow_command</varname>
|
||||
are defined when reloading configuration. These were checked on startup but
|
||||
not reload by <application>repmgrd</application>, which made it possible to
|
||||
make <application>repmgrd</application> with invalid values. It's unlikely
|
||||
anyone would want to do this, but we should make it impossible anyway.
|
||||
(GitHub #486).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Other</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Text of any failed queries will now be logged as <literal>ERROR</literal> to assist
|
||||
logfile analysis at log levels higher than <literal>DEBUG</literal>.
|
||||
(GitHub #498).
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Bug fixes</title>
|
||||
<para>
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<command><link linkend="repmgr-node-rejoin">repmgr node rejoin</link></command>:
|
||||
remove new upstream's replication slot if it still exists on the rejoined
|
||||
standby. (GitHub #499).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application>: fix startup on witness node when local data is stale. (GitHub #488, #489).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Truncate version string reported by PostgreSQL if necessary; some
|
||||
distributions insert additional detail after the actual version.
|
||||
(GitHub #490).
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
|
||||
</sect1>
|
||||
|
||||
|
||||
|
||||
<sect1 id="release-4.1.0">
|
||||
<title>Release 4.1.0</title>
|
||||
<para><emphasis>???? ??, 2018</emphasis></para>
|
||||
<para><emphasis>Tue July 31, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.1.0 introduces some changes to <application>repmgrd</application>
|
||||
behaviour and some additional configuration parameters.
|
||||
@@ -29,19 +184,20 @@
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<application>repmgrd</application> (if running) must be restarted.
|
||||
Execute <command>ALTER EXTENSION repmgr UPDATE</command>
|
||||
on the primary server in the database where &repmgr; is installed.
|
||||
</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>
|
||||
Execute <command>ALTER EXTENSION repmgr UPGRADE</command>
|
||||
on the primary server in the database where &repmgr; is installed.
|
||||
<application>repmgrd</application> must be restarted on all nodes where it is running.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
A restart of the PostgreSQL server is <emphasis>not</emphasis> required
|
||||
for this release.
|
||||
for this release (unless upgrading from repmgr 3.x).
|
||||
</para>
|
||||
<para>
|
||||
See <xref linkend="upgrading-repmgr-extension"> for more details.
|
||||
@@ -53,6 +209,17 @@
|
||||
review the changes listed below.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Repository changes</emphasis>
|
||||
</para>
|
||||
<para>
|
||||
Coinciding with this release, the 2ndQuadrant repository structure has changed.
|
||||
See section <xref linkend="installation-packages"> for details, particularly
|
||||
if you are using a RPM-based system.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<sect2>
|
||||
<title>Configuration file changes</title>
|
||||
|
||||
@@ -214,7 +381,7 @@
|
||||
|
||||
<sect1 id="release-4.0.6">
|
||||
<title>Release 4.0.6</title>
|
||||
<para><emphasis>June 14, 2018</emphasis></para>
|
||||
<para><emphasis>Thu June 14, 2018</emphasis></para>
|
||||
<para>
|
||||
&repmgr; 4.0.6 contains a number of bug fixes and usability enhancements.
|
||||
</para>
|
||||
|
||||
@@ -5,14 +5,14 @@
|
||||
<title>repmgr source code signing key</title>
|
||||
<para>
|
||||
The signing key ID used for <application>repmgr</application> source code bundles is:
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/SOURCE-GPG-KEY-repmgr">
|
||||
<ulink url="https://repmgr.org/download/SOURCE-GPG-KEY-repmgr">
|
||||
<literal>0x297F1DCC</literal></ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To download the <application>repmgr</application> source key to your computer:
|
||||
<programlisting>
|
||||
curl -s http://packages.2ndquadrant.com/repmgr/SOURCE-GPG-KEY-repmgr | gpg --import
|
||||
curl -s https://repmgr.org/download/SOURCE-GPG-KEY-repmgr | gpg --import
|
||||
gpg --fingerprint 0x297F1DCC
|
||||
</programlisting>
|
||||
then verify that the fingerprint is the expected value:
|
||||
|
||||
@@ -17,15 +17,15 @@
|
||||
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
|
||||
</para>
|
||||
<para>
|
||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> to control the PostgreSQL
|
||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> utility to control the PostgreSQL
|
||||
server. However this can lead to various problems, particularly when PostgreSQL has been
|
||||
installed from packages, and expecially so if <application>systemd</application> is in use.
|
||||
installed from packages, and especially so if <application>systemd</application> is in use.
|
||||
</para>
|
||||
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If using <application>systemd</application>, ensure you have <varname>RemoteIPC</varname> set to <literal>off</literal>.
|
||||
If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
|
||||
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
|
||||
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
|
||||
</para>
|
||||
@@ -48,6 +48,13 @@
|
||||
service_reload_command</programlisting>
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing any of these commands;
|
||||
these can be user-defined scripts so must always be specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
It's also possible to specify a <varname>service_promote_command</varname>.
|
||||
@@ -92,7 +99,7 @@
|
||||
Defaults:postgres !requiretty
|
||||
postgres ALL = NOPASSWD: /usr/bin/systemctl stop postgresql-9.6, \
|
||||
/usr/bin/systemctl start postgresql-9.6, \
|
||||
/usr/bin/systemctl restart postgresql-9.6 \
|
||||
/usr/bin/systemctl restart postgresql-9.6, \
|
||||
/usr/bin/systemctl reload postgresql-9.6</programlisting>
|
||||
</para>
|
||||
|
||||
|
||||
@@ -16,15 +16,22 @@
|
||||
<para>
|
||||
A typical use case for a witness server is a two-node streaming replication
|
||||
setup, where the primary and standby are in different locations (data centres).
|
||||
By creating a witness server in the same location as the primary, if the primary
|
||||
becomes unavailable it's possible for the standby to decide whether it can
|
||||
promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
By creating a witness server in the same location (data centre) as the primary,
|
||||
if the primary becomes unavailable it's possible for the standby to decide whether
|
||||
it can promote itself without risking a "split brain" scenario: if it can't see either the
|
||||
witness or the primary server, it's likely there's a network-level interruption
|
||||
and it should not promote itself. If it can see the witness but not the primary,
|
||||
this proves there is no network interruption and the primary itself is unavailable,
|
||||
and it can therefore promote itself (and ideally take action to fence the
|
||||
former primary).
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
<emphasis>Never</emphasis> install a witness server on the same physical host
|
||||
as another node in the replication cluster managed by &repmgr; - it's essential
|
||||
the witness is not affected in any way by failure of another node.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
For more complex replication scenarios, e.g. with multiple data centres, it may
|
||||
be preferable to use location-based failover, which ensures that only nodes
|
||||
|
||||
@@ -147,58 +147,76 @@
|
||||
<para>
|
||||
By default, all notification types will be passed to the designated script;
|
||||
the notification types can be filtered to explicitly named ones using the
|
||||
<varname>event_notifications</varname> parameter:
|
||||
<varname>event_notifications</varname> parameter.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by the &repmgr; command:
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>primary_register</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-register-events">cluster_created</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>primary_unregister</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-register-events">primary_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_register</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-primary-unregister-events">primary_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-standby-clone-events">standby_clone</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_register_sync</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-register-events">standby_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_unregister</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-register-events">standby_register_sync</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_clone</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-unregister-events">standby_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-standby-promote-events">standby_promote</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_promote</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-follow-events">standby_follow</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_follow</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-standby-switchover-events">standby_switchover</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal><link linkend="repmgr-witness-register-events">witness_register</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_disconnect_manual</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-witness-unregister-events">witness_unregister</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_failure</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-node-rejoin-events">node_rejoin</link></literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_recovery</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>witness_register</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>witness_unregister</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>node_rejoin</literal></simpara>
|
||||
<simpara><literal><link linkend="repmgr-cluster-cleanup-events">cluster_cleanup</link></literal></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by <application>repmgrd</application> (streaming replication mode):
|
||||
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_start</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_shutdown</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_reload</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_promote</literal></simpara>
|
||||
</listitem>
|
||||
@@ -208,15 +226,41 @@
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_failover_aborted</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_standby_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_promote_error</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_local_disconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_local_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_upstream_disconnect</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_upstream_reconnect</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara><literal>repmgrd_promote_error</literal></simpara>
|
||||
<simpara><literal>standby_disconnect_manual</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_failure</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>standby_recovery</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Events generated by <application>repmgrd</application> (BDR mode):
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>bdr_failover</literal></simpara>
|
||||
</listitem>
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
<para>
|
||||
&repmgr; RPM packages for RedHat/CentOS variants and Fedora are available from the
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>; see following
|
||||
<ulink url="https://dl.2ndquadrant.com/">public repository</ulink>; see following
|
||||
section for details.
|
||||
</para>
|
||||
<para>
|
||||
@@ -46,26 +46,15 @@
|
||||
<sect3 id="installation-packages-redhat-2ndq">
|
||||
<title>2ndQuadrant public RPM yum repository</title>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
<ulink url="https://2ndquadrant.com">2ndQuadrant</ulink> previously provided a dedicated
|
||||
&repmgr; repository at
|
||||
<ulink url="http://packages.2ndquadrant.com/repmgr/">http://packages.2ndquadrant.com/repmgr/</ulink>.
|
||||
This repository will be deprecated in a future release as it is now replaced by
|
||||
the <ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink>
|
||||
documented below.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.1/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a dedicated <literal>yum</literal>
|
||||
<ulink url="https://rpm.2ndquadrant.com/">public RPM repository</ulink> for 2ndQuadrant software,
|
||||
<ulink url="https://dl.2ndquadrant.com/">public repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;. We recommend using this for all future &repmgr; releases.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://rpm.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
<ulink url="https://dl.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
<para>
|
||||
@@ -75,20 +64,19 @@
|
||||
<listitem>
|
||||
<para>
|
||||
Locate the repository RPM for your PostgreSQL version from the list at:
|
||||
<ulink url="https://rpm.2ndquadrant.com/">https://rpm.2ndquadrant.com/</ulink>
|
||||
<ulink url="https://dl.2ndquadrant.com/">https://dl.2ndquadrant.com/</ulink>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the repository RPM for your distribution and PostgreSQL version
|
||||
Install the repository definition for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages).
|
||||
</para>
|
||||
<para>
|
||||
For example, for PostgreSQL 10 on CentOS, execute:
|
||||
<programlisting>
|
||||
sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-1.el7.noarch.rpm
|
||||
</programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/release/get/10/rpm | sudo bash</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Verify that the repository is installed with:
|
||||
@@ -96,8 +84,8 @@ sudo yum install https://rpm.2ndquadrant.com/site/content/2ndquadrant-repo-10-1-
|
||||
sudo yum repolist</programlisting>
|
||||
The output should contain two entries like this:
|
||||
<programlisting>
|
||||
2ndquadrant-repo-10/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 1
|
||||
2ndquadrant-repo-10-debug/7/x86_64 2ndQuadrant packages for PG10 for rhel 7 - x86_64 - Debug 1</programlisting>
|
||||
2ndquadrant-dl-default-release-pg10/7/x86_64 2ndQuadrant packages (PG10) for 7 - x86_64 4
|
||||
2ndquadrant-dl-default-release-pg10-debug/7/x86_64 2ndQuadrant packages (PG10) for 7 - x86_64 - Debug 3</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
@@ -177,52 +165,43 @@ $ yum install repmgr10</programlisting>
|
||||
<para>
|
||||
Beginning with <ulink url="https://repmgr.org/docs/4.0/release-4.0.5.html">repmgr 4.0.5</ulink>,
|
||||
<ulink url="https://2ndquadrant.com/">2ndQuadrant</ulink> provides a
|
||||
<ulink url="https://apt.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
|
||||
<ulink url="https://dl.2ndquadrant.com/">public apt repository</ulink> for 2ndQuadrant software,
|
||||
including &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
General instructions for using this repository can be found on its
|
||||
<ulink url="https://apt.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
<ulink url="https://dl.2ndquadrant.com/">homepage</ulink>. Specific instructions
|
||||
for installing &repmgr; follow below.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
<emphasis>Installation</emphasis>
|
||||
|
||||
<itemizedlist>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
If not already present, install the <application>apt-transport-https</application> package:
|
||||
<programlisting>
|
||||
sudo apt-get install apt-transport-https</programlisting>
|
||||
Install the repository definition for your distribution and PostgreSQL version
|
||||
(this enables the 2ndQuadrant repository as a source of &repmgr; packages) by executing:
|
||||
<programlisting>
|
||||
curl https://dl.2ndquadrant.com/default/release/get/deb | sudo bash</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
<note>
|
||||
<para>
|
||||
This will automatically install the following additional packages, if not already present:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>lsb-release</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>apt-transport-https</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</note>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Create <filename>/etc/apt/sources.list.d/2ndquadrant.list</filename> as follows:
|
||||
<programlisting>
|
||||
sudo sh -c 'echo "deb https://apt.2ndquadrant.com/ $(lsb_release -cs)-2ndquadrant main" > /etc/apt/sources.list.d/2ndquadrant.list'</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Install the 2ndQuadrant <ulink url="https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc">repository key</ulink>:
|
||||
<programlisting>
|
||||
sudo apt-get install curl ca-certificates
|
||||
curl https://apt.2ndquadrant.com/site/keys/9904CD4BD6BAF0C3.asc | sudo apt-key add -</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Update the package list
|
||||
<programlisting>
|
||||
sudo apt-get update</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
|
||||
@@ -12,8 +12,8 @@
|
||||
To install &repmgr; the prerequisites for compiling
|
||||
&postgres; must be installed. These are described in &postgres;'s
|
||||
documentation
|
||||
on <ulink url="https://www.postgresql.org/docs/current/install-requirements.html">build requirements</ulink>
|
||||
and <ulink url="https://www.postgresql.org/docs/current/docguide-toolsets.html">build requirements for documentation</ulink>.
|
||||
on <ulink url="https://www.postgresql.org/docs/current/static/install-requirements.html">build requirements</ulink>
|
||||
and <ulink url="https://www.postgresql.org/docs/current/static/docguide-toolsets.html">build requirements for documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
@@ -240,11 +240,28 @@
|
||||
<tip>
|
||||
<simpara>
|
||||
For Debian-based distributions we recommend explicitly setting
|
||||
<literal>pg_bindir</literal> to the directory where <command>pg_ctl</command> and other binaries
|
||||
<option>pg_bindir</option> to the directory where <command>pg_ctl</command> and other binaries
|
||||
not in the standard path are located. For PostgreSQL 9.6 this would be <filename>/usr/lib/postgresql/9.6/bin/</filename>.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; only uses <option>pg_bindir</option> when it executes
|
||||
PostgreSQL binaries directly.
|
||||
</para>
|
||||
<para>
|
||||
For user-defined scripts such as <option>promote_command</option> and the
|
||||
various <option>service_*_command</option>s, you <emphasis>must</emphasis>
|
||||
always explicitly provide the full path to the binary or script being
|
||||
executed, even if it is &repmgr; itself.
|
||||
</para>
|
||||
<para>
|
||||
This is because these options can contain user-defined scripts in arbitrary
|
||||
locations, so prepending <option>pg_bindir</option> may break them.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
See the file
|
||||
<ulink url="https://raw.githubusercontent.com/2ndQuadrant/repmgr/master/repmgr.conf.sample">repmgr.conf.sample</>
|
||||
|
||||
@@ -15,9 +15,14 @@
|
||||
<title>Description</title>
|
||||
<para>
|
||||
Purges monitoring history from the <literal>repmgr.monitoring_history</literal> table to
|
||||
prevent excessive table growth. Use the <literal>-k/--keep-history</literal> to specify the
|
||||
number of days of monitoring history to retain. This command can be used
|
||||
manually or as a cronjob.
|
||||
prevent excessive table growth.
|
||||
</para>
|
||||
<para>
|
||||
By default <emphasis>all</emphasis> data will be removed; Use the <option>-k/--keep-history</option>
|
||||
option to specify the number of days of monitoring history to retain.
|
||||
</para>
|
||||
<para>
|
||||
This command can be executed manually or as a cronjob.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
@@ -38,4 +43,21 @@
|
||||
<filename>repmgr.conf</filename>.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-cluster-cleanup-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>cluster_cleanup</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<title>See also</title>
|
||||
<para>
|
||||
For more details see the sections <xref linkend="repmgrd-monitoring"> and
|
||||
<xref linkend="repmgrd-monitoring-configuration">.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
||||
|
||||
@@ -81,36 +81,56 @@
|
||||
|
||||
<refsect1>
|
||||
<title>Options</title>
|
||||
<para>
|
||||
<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts:
|
||||
<programlisting>
|
||||
|
||||
<variablelist>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--csv</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
<command>repmgr cluster show</command> accepts an optional parameter <literal>--csv</literal>, which
|
||||
outputs the replication cluster's status in a simple CSV format, suitable for
|
||||
parsing by scripts:
|
||||
<programlisting>
|
||||
$ repmgr -f /etc/repmgr.conf cluster show --csv
|
||||
1,-1,-1
|
||||
2,0,0
|
||||
3,0,1</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
</para>
|
||||
<para>
|
||||
The columns have following meanings:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
node ID
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
availability (0 = available, -1 = unavailable)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
recovery state (0 = not in recovery, 1 = in recovery, -1 = unknown)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--verbose</option></term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
recovery state (0 = not in recovery, 1 = in recovery, -1 = unknown)
|
||||
</simpara>
|
||||
<para>
|
||||
Display the full text of any database connection error messages
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
</refsect1>
|
||||
|
||||
|
||||
|
||||
@@ -28,6 +28,10 @@
|
||||
If the node is running and needs to be attached to the current primary, use
|
||||
<xref linkend="repmgr-standby-follow">.
|
||||
</para>
|
||||
<para>
|
||||
Note <xref linkend="repmgr-standby-follow"> can only be used for standbys which have not diverged
|
||||
from the rest of the cluster.
|
||||
</para>
|
||||
</tip>
|
||||
</refsect1>
|
||||
|
||||
@@ -63,10 +67,10 @@
|
||||
<term><option>--force-rewind[=/path/to/pg_rewind]</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Execute <application>pg_rewind</application> if necessary.
|
||||
Execute <application>pg_rewind</application>.
|
||||
</para>
|
||||
<para>
|
||||
It is only necessary to provide the <application>pg_rewind</application>
|
||||
It is only necessary to provide the <application>pg_rewind</application> path
|
||||
if using PostgreSQL 9.3 or 9.4, and <application>pg_rewind</application>
|
||||
is not installed in the PostgreSQL <filename>bin</filename> directory.
|
||||
</para>
|
||||
@@ -115,7 +119,7 @@
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
<refsect1>
|
||||
<title>Configuration file settings</title>
|
||||
|
||||
<para>
|
||||
@@ -132,8 +136,9 @@
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
<refsect1>
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-node-rejoin-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>node_rejoin</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
@@ -188,7 +193,7 @@
|
||||
</note>
|
||||
|
||||
<para>
|
||||
To have <command>repmgr node rejoin</command> use <command>pg_rewind</command> if required,
|
||||
To have <command>repmgr node rejoin</command> use <command>pg_rewind</command>,
|
||||
pass the command line option <literal>--force-rewind</literal>, which will tell &repmgr;
|
||||
to execute <command>pg_rewind</command> to ensure the node can be rejoined successfully.
|
||||
</para>
|
||||
@@ -221,6 +226,15 @@
|
||||
INFO: pg_rewind would now be executed
|
||||
DETAIL: pg_rewind command is:
|
||||
pg_rewind -D '/var/lib/postgresql/data' --source-server='host=node1 dbname=repmgr user=repmgr'</programlisting>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
If <option>--force-rewind</option> is used with the <option>--dry-run</option> option,
|
||||
this checks the prerequisites for using <application>pg_rewind</application>, but cannot
|
||||
predict the outcome of actually executing <application>pg_rewind</application>.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<programlisting>
|
||||
$ repmgr node rejoin -f /etc/repmgr.conf -d 'host=node1 dbname=repmgr user=repmgr' \
|
||||
--force-rewind --config-files=postgresql.local.conf,postgresql.conf --verbose
|
||||
|
||||
@@ -75,10 +75,18 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-primary-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>primary_register</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
Following <link linkend="event-notifications">event notifications</link> will be generated:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara><literal>cluster_created</literal></simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara><literal>primary_register</literal></simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-primary-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>primary_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -49,7 +49,7 @@
|
||||
not be copied by default. &repmgr; can copy these files, either to the same
|
||||
location on the standby server (provided appropriate directory and file permissions
|
||||
are available), or into the standby's data directory. This requires passwordless
|
||||
SSH access to the primary server. Add the option <literal>--copy-external-config-files</literal>
|
||||
SSH access to the primary server. Add the option <option>--copy-external-config-files</option>
|
||||
to the <command>repmgr standby clone</command> command; by default files will be copied to
|
||||
the same path as on the upstream server. Note that the user executing <command>repmgr</command>
|
||||
must have write access to those directories.
|
||||
@@ -59,12 +59,29 @@
|
||||
<literal>--copy-external-config-files=pgdata</literal>, but note that
|
||||
any include directives in the copied files may need to be updated.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
When executing <command>repmgr standby clone</command> with the
|
||||
<option>--copy-external-config-files</option> and <option>--dry-run</option>
|
||||
options, &repmgr; will check the SSH connection to the source node, but
|
||||
will not verify whether the files can actually be copied.
|
||||
</para>
|
||||
<para>
|
||||
During the actual clone operation, a check will be made before the database itself
|
||||
is cloned to determine whether the files can actually be copied; if any problems are
|
||||
encountered, the clone operation will be aborted, enabling the user to fix
|
||||
any issues before retrying the clone operation.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<tip>
|
||||
<simpara>
|
||||
For reliable configuration file management we recommend using a
|
||||
configuration management tool such as Ansible, Chef, Puppet or Salt.
|
||||
</simpara>
|
||||
</tip>
|
||||
|
||||
</refsect1>
|
||||
|
||||
<refsect1 id="repmgr-standby-clone-recovery-conf">
|
||||
@@ -333,7 +350,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-clone-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_clone</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -94,7 +94,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-follow-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_follow</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-promote-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_promote</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -159,7 +159,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_register</literal> <link linkend="event-notifications">event notification</link>
|
||||
|
||||
@@ -196,7 +196,7 @@
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-switchover-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
<literal>standby_switchover</literal> and <literal>standby_promote</literal>
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-standby-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>standby_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -50,7 +50,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-witness-register-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>witness_register</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -92,7 +92,7 @@
|
||||
</refsect1>
|
||||
|
||||
|
||||
<refsect1>
|
||||
<refsect1 id="repmgr-witness-unregister-events">
|
||||
<title>Event notifications</title>
|
||||
<para>
|
||||
A <literal>witness_unregister</literal> <link linkend="event-notifications">event notification</link> will be generated.
|
||||
|
||||
@@ -25,7 +25,13 @@
|
||||
<para>
|
||||
This is the official documentation of &repmgr; &repmgrversion; for
|
||||
use with PostgreSQL 9.3 - PostgreSQL 10.
|
||||
It describes the functionality supported by the current version of &repmgr;.
|
||||
</para>
|
||||
<para>
|
||||
&repmgr; is being continually developed and we strongly recommend using the
|
||||
latest version. Please check the
|
||||
<ulink url="https://repmgr.org/">repmgr website</ulink> for details
|
||||
about the current &repmgr; version as well as the
|
||||
<ulink url="https://repmgr.org/docs/current/index.html">current documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
</para>
|
||||
<note>
|
||||
<simpara>
|
||||
Due to the nature of BDR, it's only safe to use this solution for
|
||||
Due to the nature of BDR 1.x/2.x, it's only safe to use this solution for
|
||||
a two-node scenario. Introducing additional nodes will create an inherent
|
||||
risk of node desynchronisation if a node goes down without being cleanly
|
||||
removed from the cluster.
|
||||
|
||||
@@ -34,24 +34,6 @@
|
||||
the <ulink url="https://www.postgresql.org/docs/current/static/runtime-config-client.html#GUC-SHARED-PRELOAD-LIBRARIES">PostgreSQL documentation</ulink>.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To apply configuration file changes to a running <application>repmgrd</application>
|
||||
daemon, execute the operating system's r<application>repmgrd</application> service reload command
|
||||
(see <xref linkend="appendix-packages"> for examples),
|
||||
or for instances which were manually started, execute <command>kill -HUP</command>, e.g.
|
||||
<command>kill -HUP `cat /tmp/repmgrd.pid`</command>.
|
||||
</para>
|
||||
<note>
|
||||
<para>
|
||||
Check the <application>repmgrd</application> log to see what changes were
|
||||
applied, or if any issues were encountered when reloading the configuration.
|
||||
</para>
|
||||
</note>
|
||||
<para>
|
||||
Note that only a subset of configuration file parameters can be changed on a
|
||||
running <application>repmgrd</application> daemon.
|
||||
</para>
|
||||
|
||||
|
||||
<sect2 id="repmgrd-automatic-failover-configuration">
|
||||
<title>automatic failover configuration</title>
|
||||
@@ -64,8 +46,17 @@
|
||||
follow_command='/usr/bin/repmgr standby follow -f /etc/repmgr.conf --log-to-file --upstream-node-id=%n'</programlisting>
|
||||
</para>
|
||||
<para>
|
||||
Adjust file paths as appropriate; we recomment specifying the full path to the &repmgr; binary.
|
||||
Adjust file paths as appropriate; alway specify the full path to the &repmgr; binary.
|
||||
</para>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
&repmgr; will not apply <option>pg_bindir</option> when executing <option>promote_command</option>
|
||||
or <option>follow_command</option>; these can be user-defined scripts so must always be
|
||||
specified with the full path.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
<para>
|
||||
Note that the <literal>--log-to-file</literal> option will cause
|
||||
output generated by the &repmgr; command, when executed by <application>repmgrd</application>,
|
||||
@@ -135,7 +126,7 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-monitoring-configuration">
|
||||
<sect2 id="repmgrd-monitoring-configuration" xreflabel="repmgrd monitoring configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring configuration</secondary>
|
||||
@@ -158,6 +149,203 @@
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2 id="repmgrd-reloading-configuration"xreflabel="reloading repmgrd configuration">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>applying configuration changes</secondary>
|
||||
</indexterm>
|
||||
<title>Applying configuration changes to repmgrd</title>
|
||||
<para>
|
||||
To apply configuration file changes to a running <application>repmgrd</application>
|
||||
daemon, execute the operating system's <application>repmgrd</application> service reload command
|
||||
(see <xref linkend="appendix-packages"> for examples),
|
||||
or for instances which were manually started, execute <command>kill -HUP</command>, e.g.
|
||||
<command>kill -HUP `cat /tmp/repmgrd.pid`</command>.
|
||||
</para>
|
||||
<tip>
|
||||
<para>
|
||||
Check the <application>repmgrd</application> log to see what changes were
|
||||
applied, or if any issues were encountered when reloading the configuration.
|
||||
</para>
|
||||
</tip>
|
||||
<para>
|
||||
Note that only the following subset of configuration file parameters can be changed on a
|
||||
running <application>repmgrd</application> daemon:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>async_query_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>bdr_local_monitoring_only</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>bdr_recovery_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>conninfo</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>degraded_monitoring_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>event_notification_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>event_notifications</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>failover</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>follow_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_facility</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_file</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_level</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>log_status_interval</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>monitor_interval_secs</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>monitoring_history</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>primary_notification_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>promote_command</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>reconnect_attempts</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>reconnect_interval</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>repmgrd_standby_startup_timeout</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
The following set of configuration file parameters must be updated via
|
||||
<command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>,
|
||||
as they require changes to the <literal>repmgr.nodes</literal> table so they are visible to
|
||||
all nodes in the replication cluster:
|
||||
</para>
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>node_id</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>node_name</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>data_directory</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>location</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
|
||||
<listitem>
|
||||
<simpara>
|
||||
<varname>priority</varname>
|
||||
</simpara>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
|
||||
<note>
|
||||
<para>
|
||||
After executing <command><link linkend="repmgr-standby-register">repmgr standby register --force</link></command>,
|
||||
<application>repmgrd</application> <emphasis>must</emphasis> be restarted for the changes to take effect.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="repmgrd-daemon">
|
||||
@@ -323,25 +511,34 @@ REPMGRD_ENABLED=no
|
||||
<secondary>repmgrd</secondary>
|
||||
</indexterm>
|
||||
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>log rotation</secondary>
|
||||
</indexterm>
|
||||
|
||||
<title>repmgrd log rotation</title>
|
||||
<para>
|
||||
To ensure the current <application>repmgrd</application> logfile
|
||||
(specified in <filename>repmgr.conf</filename> with the parameter
|
||||
<option>log_file</option> does not grow indefinitely, configure your
|
||||
<option>log_file</option>) does not grow indefinitely, configure your
|
||||
system's <command>logrotate</command> to regularly rotate it.
|
||||
</para>
|
||||
<para>
|
||||
Sample configuration to rotate logfiles weekly with retention for
|
||||
up to 52 weeks and rotation forced if a file grows beyond 100Mb:
|
||||
<programlisting>
|
||||
/var/log/postgresql/repmgr-9.6.log {
|
||||
/var/log/repmgr/repmgrd.log {
|
||||
missingok
|
||||
compress
|
||||
rotate 52
|
||||
maxsize 100M
|
||||
weekly
|
||||
create 0600 postgres postgres
|
||||
postrotate
|
||||
/usr/bin/killall -HUP repmgrd
|
||||
endscript
|
||||
}</programlisting>
|
||||
</para>
|
||||
|
||||
</sect1>
|
||||
</chapter>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<chapter id="repmgrd-degraded-monitoring">
|
||||
<chapter id="repmgrd-degraded-monitoring" xreflabel="repmgrd degraded monitoring">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>degraded monitoring</secondary>
|
||||
@@ -7,8 +7,8 @@
|
||||
<title>"degraded monitoring" mode</title>
|
||||
<para>
|
||||
In certain circumstances, <application>repmgrd</application> is not able to fulfill its primary mission
|
||||
of monitoring the nodes' upstream server. In these cases it enters "degraded
|
||||
monitoring" mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
of monitoring the node's upstream server. In these cases it enters "degraded monitoring"
|
||||
mode, where <application>repmgrd</application> remains active but is waiting for the situation
|
||||
to be resolved.
|
||||
</para>
|
||||
<para>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<chapter id="repmgrd-monitoring">
|
||||
<chapter id="repmgrd-monitoring" xreflabel="Monitoring with repmgrd">
|
||||
<indexterm>
|
||||
<primary>repmgrd</primary>
|
||||
<secondary>monitoring</secondary>
|
||||
|
||||
@@ -40,8 +40,8 @@
|
||||
In a failover situation, <application>repmgrd</application> will check if any servers in the
|
||||
same location as the current primary node are visible. If not, <application>repmgrd</application>
|
||||
will assume a network interruption and not promote any node in any
|
||||
other location (it will however enter <xref linkend="repmgrd-degraded-monitoring"> mode until
|
||||
a primary becomes visible).
|
||||
other location (it will however enter <link linkend="repmgrd-degraded-monitoring">degraded monitoring</link>
|
||||
mode until a primary becomes visible).
|
||||
</para>
|
||||
|
||||
</chapter>
|
||||
|
||||
@@ -60,6 +60,13 @@
|
||||
&repmgr; being able to shut down the current primary server quickly and cleanly.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Ensure that the promotion candidate has sufficient free walsenders available
|
||||
(PostgreSQL configuration item <varname>max_wal_senders</varname>), and if replication
|
||||
slots are in use, at least one free slot is available for the demotion candidate (
|
||||
PostgreSQL configuration item <varname>max_replication_slots</varname>).
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Ensure that a passwordless SSH connection is possible from the promotion candidate
|
||||
(standby) to the demotion candidate (current primary). If <literal>--siblings-follow</literal>
|
||||
|
||||
@@ -1 +1 @@
|
||||
<!ENTITY repmgrversion "4.1dev">
|
||||
<!ENTITY repmgrversion "4.1.1">
|
||||
|
||||
@@ -84,6 +84,7 @@ do_cluster_show(void)
|
||||
ItemList warnings = {NULL, NULL};
|
||||
bool success = false;
|
||||
bool error_found = false;
|
||||
bool connection_error_found = false;
|
||||
|
||||
/* Connect to local database to obtain cluster connection data */
|
||||
log_verbose(LOG_INFO, _("connecting to database"));
|
||||
@@ -141,14 +142,26 @@ do_cluster_show(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
char error[MAXLEN];
|
||||
|
||||
strncpy(error, PQerrorMessage(cell->node_info->conn), MAXLEN);
|
||||
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||
cell->node_info->recovery_type = RECTYPE_UNKNOWN;
|
||||
item_list_append_format(&warnings,
|
||||
"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
|
||||
cell->node_info->node_name, cell->node_info->node_id, trim(error));
|
||||
|
||||
connection_error_found = true;
|
||||
|
||||
if (runtime_options.verbose)
|
||||
{
|
||||
char error[MAXLEN];
|
||||
|
||||
strncpy(error, PQerrorMessage(cell->node_info->conn), MAXLEN);
|
||||
item_list_append_format(&warnings,
|
||||
"when attempting to connect to node \"%s\" (ID: %i), following error encountered :\n\"%s\"",
|
||||
cell->node_info->node_name, cell->node_info->node_id, trim(error));
|
||||
}
|
||||
else
|
||||
{
|
||||
item_list_append_format(&warnings,
|
||||
"unable to connect to node \"%s\" (ID: %i)",
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
}
|
||||
}
|
||||
|
||||
initPQExpBuffer(&details);
|
||||
@@ -437,6 +450,11 @@ do_cluster_show(void)
|
||||
{
|
||||
printf(_(" - %s\n"), cell->string);
|
||||
}
|
||||
|
||||
if (runtime_options.verbose == false && connection_error_found == true)
|
||||
{
|
||||
log_hint(_("execute with --verbose option to see connection error messages"));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1332,6 +1350,7 @@ do_cluster_cleanup(void)
|
||||
PGconn *conn = NULL;
|
||||
PGconn *primary_conn = NULL;
|
||||
int entries_to_delete = 0;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
@@ -1345,7 +1364,13 @@ do_cluster_cleanup(void)
|
||||
|
||||
entries_to_delete = get_number_of_monitoring_records_to_delete(primary_conn, runtime_options.keep_history);
|
||||
|
||||
if (entries_to_delete == 0)
|
||||
if (entries_to_delete < 0)
|
||||
{
|
||||
log_error(_("unable to query number of monitoring records to clean up"));
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
else if (entries_to_delete == 0)
|
||||
{
|
||||
log_info(_("no monitoring records to delete"));
|
||||
PQfinish(primary_conn);
|
||||
@@ -1355,10 +1380,23 @@ do_cluster_cleanup(void)
|
||||
log_debug("at least %i monitoring records for deletion",
|
||||
entries_to_delete);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
if (delete_monitoring_records(primary_conn, runtime_options.keep_history) == false)
|
||||
{
|
||||
log_error(_("unable to delete monitoring records"));
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to delete monitoring records"));
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
log_detail("%s", PQerrorMessage(primary_conn));
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"cluster_cleanup",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
@@ -1370,7 +1408,22 @@ do_cluster_cleanup(void)
|
||||
log_detail("%s", PQerrorMessage(primary_conn));
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("monitoring records deleted"));
|
||||
|
||||
if (runtime_options.keep_history > 0)
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("; records newer than %i day(s) retained"),
|
||||
runtime_options.keep_history);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"cluster_cleanup",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(primary_conn);
|
||||
|
||||
if (runtime_options.keep_history > 0)
|
||||
|
||||
@@ -2417,6 +2417,54 @@ do_node_rejoin(void)
|
||||
success = is_downstream_node_attached(upstream_conn, config_file_options.node_name);
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle replication slots:
|
||||
* - if a slot for the new upstream exists, delete that
|
||||
* - warn about any other inactive replication slots
|
||||
*/
|
||||
if (runtime_options.force_rewind_used == false && config_file_options.use_replication_slots)
|
||||
{
|
||||
PGconn *local_conn = NULL;
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("unable to connect to local node to check replication slot status"));
|
||||
log_hint(_("execute \"repmgr node check\" to check inactive slots and drop manually if necessary"));
|
||||
}
|
||||
else
|
||||
{
|
||||
KeyValueList inactive_replication_slots = {NULL, NULL};
|
||||
KeyValueListCell *cell = NULL;
|
||||
int inactive_count = 0;
|
||||
PQExpBufferData slotinfo;
|
||||
|
||||
drop_replication_slot_if_exists(local_conn,
|
||||
config_file_options.node_id,
|
||||
primary_node_record.slot_name);
|
||||
|
||||
(void) get_inactive_replication_slots(local_conn, &inactive_replication_slots);
|
||||
|
||||
initPQExpBuffer(&slotinfo);
|
||||
for (cell = inactive_replication_slots.head; cell; cell = cell->next)
|
||||
{
|
||||
appendPQExpBuffer(&slotinfo,
|
||||
" - %s (%s)", cell->key, cell->value);
|
||||
inactive_count++;
|
||||
}
|
||||
|
||||
if (inactive_count > 0)
|
||||
{
|
||||
log_warning(_("%i inactive replication slots detected"), inactive_count);
|
||||
log_detail(_("inactive replication slots:\n%s"), slotinfo.data);
|
||||
log_hint(_("these replication slots may need to be removed manually"));
|
||||
}
|
||||
|
||||
termPQExpBuffer(&slotinfo);
|
||||
|
||||
PQfinish(local_conn);
|
||||
}
|
||||
}
|
||||
|
||||
if (success == true)
|
||||
{
|
||||
@@ -2426,7 +2474,8 @@ do_node_rejoin(void)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* if we reach here, no record found in upstream node's pg_stat_replication */
|
||||
* if we reach here, no record found in upstream node's pg_stat_replication
|
||||
*/
|
||||
log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
|
||||
log_hint(_("you will need to manually check the node's replication status"));
|
||||
}
|
||||
|
||||
@@ -64,12 +64,10 @@ do_primary_register(void)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("connection to node lost"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
log_error(_("unable to determine server's recovery type"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_CONN);
|
||||
}
|
||||
|
||||
log_verbose(LOG_INFO, _("server is not in recovery"));
|
||||
|
||||
@@ -89,8 +89,6 @@ static int run_file_backup(t_node_info *node_record);
|
||||
|
||||
static void copy_configuration_files(bool delete_after_copy);
|
||||
|
||||
static void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||
|
||||
static void tablespace_data_append(TablespaceDataList *list, const char *name, const char *oid, const char *location);
|
||||
|
||||
static void get_barman_property(char *dst, char *name, char *local_repmgr_directory);
|
||||
@@ -471,6 +469,7 @@ do_standby_clone(void)
|
||||
termPQExpBuffer(&msg);
|
||||
|
||||
r = test_ssh_connection(runtime_options.host, runtime_options.remote_user);
|
||||
|
||||
if (r != 0)
|
||||
{
|
||||
log_error(_("remote host \"%s\" is not reachable via SSH - unable to copy external configuration files"),
|
||||
@@ -498,32 +497,41 @@ do_standby_clone(void)
|
||||
|
||||
termPQExpBuffer(&msg);
|
||||
|
||||
|
||||
/*
|
||||
* Here we'll attempt an initial test copy of the detected external
|
||||
* files, to detect any issues before we run the base backup.
|
||||
*
|
||||
* Note this will exit with an error, unless -F/--force supplied.
|
||||
*
|
||||
* We don't do this during a --dry-run as it may introduce unexpected changes
|
||||
* on the local node; during an actual clone operation, any problems with
|
||||
* copying files will be detected early and the operation aborted before
|
||||
* the actual database cloning commences.
|
||||
*
|
||||
* TODO: put the files in a temporary directory and move to their final
|
||||
* destination once the database has been cloned.
|
||||
*/
|
||||
|
||||
if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_SAMEPATH)
|
||||
if (runtime_options.dry_run == false)
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the same path as on the source server;
|
||||
* don't delete after copying.
|
||||
*/
|
||||
copy_configuration_files(false);
|
||||
if (runtime_options.copy_external_config_files_destination == CONFIG_FILE_SAMEPATH)
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the same path as on the source server;
|
||||
* don't delete after copying.
|
||||
*/
|
||||
copy_configuration_files(false);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the data directory - delete after copying.
|
||||
* They'll be copied again later; see TODO above.
|
||||
*/
|
||||
copy_configuration_files(true);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Files will be placed in the data directory - delete after copying.
|
||||
* They'll be copied again later; see TODO above.
|
||||
*/
|
||||
copy_configuration_files(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1558,8 +1566,8 @@ do_standby_register(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
log_warning(_("this node does not appear to be attached to upstream node \"%s\" (ID: %i)"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id);
|
||||
upstream_node_record.node_name,
|
||||
upstream_node_record.node_id);
|
||||
}
|
||||
PQfinish(upstream_conn);
|
||||
}
|
||||
@@ -2050,6 +2058,8 @@ _do_standby_promote_internal(PGconn *conn)
|
||||
local_node_record.node_name,
|
||||
local_node_record.node_id,
|
||||
script);
|
||||
log_detail(_("waiting up to %i seconds (parameter \"promote_check_timeout\") for promotion to complete"),
|
||||
config_file_options.promote_check_timeout);
|
||||
|
||||
r = system(script);
|
||||
if (r != 0)
|
||||
@@ -2075,6 +2085,8 @@ _do_standby_promote_internal(PGconn *conn)
|
||||
if (recovery_type == RECTYPE_STANDBY)
|
||||
{
|
||||
log_error(_("STANDBY PROMOTE failed, node is still a standby"));
|
||||
log_detail(_("node still in recovery after %i seconds"), config_file_options.promote_check_timeout);
|
||||
log_hint(_("the node may need more time to promote itself, check the PostgreSQL log for details"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_PROMOTION_FAIL);
|
||||
}
|
||||
@@ -2720,6 +2732,10 @@ do_standby_follow_internal(PGconn *primary_conn, t_node_info *primary_node_recor
|
||||
* If replication slots are in use, and an inactive one for this node
|
||||
* exists on the former upstream, drop it.
|
||||
*
|
||||
* Note that if this function is called by do_standby_switchover(), the
|
||||
* "repmgr node rejoin" command executed on the demotion candidate may already
|
||||
* have removed the slot, so there may be nothing to do.
|
||||
*
|
||||
* XXX check if former upstream is current primary?
|
||||
*/
|
||||
|
||||
@@ -2827,6 +2843,12 @@ do_standby_switchover(void)
|
||||
int reachable_sibling_nodes_with_slot_count = 0;
|
||||
int unreachable_sibling_node_count = 0;
|
||||
|
||||
/* number of free walsenders required on promotion candidate */
|
||||
int min_required_wal_senders = 1;
|
||||
|
||||
/* this will be calculated as max_wal_senders - COUNT(*) FROM pg_stat_replication */
|
||||
int available_wal_senders = 0;
|
||||
|
||||
/* number of free replication slots required on promotion candidate */
|
||||
int min_required_free_slots = 0;
|
||||
|
||||
@@ -3096,6 +3118,176 @@ do_standby_switchover(void)
|
||||
}
|
||||
termPQExpBuffer(&command_output);
|
||||
|
||||
/*
|
||||
* populate local node record with current state of various replication-related
|
||||
* values, so we can check for sufficient walsenders and replication slots
|
||||
*/
|
||||
get_node_replication_stats(local_conn, server_version_num, &local_node_record);
|
||||
|
||||
available_wal_senders = local_node_record.max_wal_senders -
|
||||
local_node_record.attached_wal_receivers;
|
||||
|
||||
/*
|
||||
* If --siblings-follow specified, get list and check they're reachable
|
||||
* (if not just issue a warning)
|
||||
*/
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_record.node_id,
|
||||
local_node_record.upstream_node_id,
|
||||
&sibling_nodes);
|
||||
|
||||
if (runtime_options.siblings_follow == false)
|
||||
{
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
||||
sibling_nodes.node_count);
|
||||
log_detail(_("these nodes will remain attached to the current primary"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char host[MAXLEN] = "";
|
||||
NodeInfoListCell *cell;
|
||||
|
||||
log_verbose(LOG_INFO, _("%i active sibling nodes found"),
|
||||
sibling_nodes.node_count);
|
||||
|
||||
if (sibling_nodes.node_count == 0)
|
||||
{
|
||||
log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist"));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* include walsender for promotion candidate in total */
|
||||
|
||||
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
/* get host from node record */
|
||||
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
||||
r = test_ssh_connection(host, runtime_options.remote_user);
|
||||
|
||||
if (r != 0)
|
||||
{
|
||||
cell->node_info->reachable = false;
|
||||
unreachable_sibling_node_count++;
|
||||
}
|
||||
else
|
||||
{
|
||||
cell->node_info->reachable = true;
|
||||
reachable_sibling_node_count++;
|
||||
min_required_wal_senders++;
|
||||
|
||||
if (cell->node_info->slot_name[0] != '\0')
|
||||
{
|
||||
reachable_sibling_nodes_with_slot_count++;
|
||||
min_required_free_slots++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (unreachable_sibling_node_count > 0)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
|
||||
/* display list of unreachable sibling nodes */
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
if (cell->node_info->reachable == true)
|
||||
continue;
|
||||
log_detail(" %s (ID: %i)",
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
}
|
||||
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_hint(_("use -F/--force to proceed in any case"));
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_detail(_("F/--force specified, would proceed anyway"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_detail(_("F/--force specified, proceeding anyway"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char *msg = _("all sibling nodes are reachable via SSH");
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info("%s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_verbose(LOG_INFO, "%s", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* check there are sufficient free walsenders - obviously there's potential
|
||||
* for a later race condition if some walsenders come into use before the
|
||||
* switchover operation gets around to attaching the sibling nodes, but
|
||||
* this should catch any actual existing configuration issue (and if anyone's
|
||||
* performing a switchover in such an unstable environment, they only have
|
||||
* themselves to blame).
|
||||
*/
|
||||
if (available_wal_senders < min_required_wal_senders)
|
||||
{
|
||||
if (runtime_options.force == false || runtime_options.dry_run == true)
|
||||
{
|
||||
log_error(_("insufficient free walsenders on promotion candidate"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case"));
|
||||
|
||||
if (runtime_options.dry_run == false)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("insufficient free walsenders on promotion candidate"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("%i walsenders required, %i available"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* check demotion candidate can make replication connection to promotion candidate */
|
||||
{
|
||||
initPQExpBuffer(&remote_command_str);
|
||||
@@ -3339,171 +3531,6 @@ do_standby_switchover(void)
|
||||
|
||||
PQfinish(remote_conn);
|
||||
|
||||
/*
|
||||
* populate local node record with current state of various replication-related
|
||||
* values, so we can check for sufficient walsenders and replication slots
|
||||
*/
|
||||
get_node_replication_stats(local_conn, server_version_num, &local_node_record);
|
||||
|
||||
/*
|
||||
* If --siblings-follow specified, get list and check they're reachable
|
||||
* (if not just issue a warning)
|
||||
*/
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_record.node_id,
|
||||
local_node_record.upstream_node_id,
|
||||
&sibling_nodes);
|
||||
|
||||
if (runtime_options.siblings_follow == false)
|
||||
{
|
||||
if (sibling_nodes.node_count > 0)
|
||||
{
|
||||
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
||||
sibling_nodes.node_count);
|
||||
log_detail(_("these nodes will remain attached to the current primary"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char host[MAXLEN] = "";
|
||||
NodeInfoListCell *cell;
|
||||
|
||||
log_verbose(LOG_INFO, _("%i active sibling nodes found"),
|
||||
sibling_nodes.node_count);
|
||||
|
||||
if (sibling_nodes.node_count == 0)
|
||||
{
|
||||
log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist"));
|
||||
}
|
||||
else
|
||||
{
|
||||
/* include walsender for promotion candidate in total */
|
||||
int min_required_wal_senders = 1;
|
||||
int available_wal_senders = local_node_record.max_wal_senders -
|
||||
local_node_record.attached_wal_receivers;
|
||||
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
/* get host from node record */
|
||||
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
||||
r = test_ssh_connection(host, runtime_options.remote_user);
|
||||
|
||||
if (r != 0)
|
||||
{
|
||||
cell->node_info->reachable = false;
|
||||
unreachable_sibling_node_count++;
|
||||
}
|
||||
else
|
||||
{
|
||||
cell->node_info->reachable = true;
|
||||
reachable_sibling_node_count++;
|
||||
min_required_wal_senders++;
|
||||
|
||||
if (cell->node_info->slot_name[0] != '\0')
|
||||
{
|
||||
reachable_sibling_nodes_with_slot_count++;
|
||||
min_required_free_slots++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (unreachable_sibling_node_count > 0)
|
||||
{
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_error(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||
unreachable_sibling_node_count,
|
||||
sibling_nodes.node_count);
|
||||
}
|
||||
|
||||
/* display list of unreachable sibling nodes */
|
||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
if (cell->node_info->reachable == true)
|
||||
continue;
|
||||
log_detail(" %s (ID: %i)",
|
||||
cell->node_info->node_name,
|
||||
cell->node_info->node_id);
|
||||
}
|
||||
|
||||
if (runtime_options.force == false)
|
||||
{
|
||||
log_hint(_("use -F/--force to proceed in any case"));
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_detail(_("F/--force specified, would proceed anyway"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_detail(_("F/--force specified, proceeding anyway"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
char *msg = _("all sibling nodes are reachable via SSH");
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info("%s", msg);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_verbose(LOG_INFO, "%s", msg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* check there are sufficient free walsenders - obviously there's potential
|
||||
* for a later race condition if some walsenders come into use before the
|
||||
* switchover operation gets around to attaching the sibling nodes, but
|
||||
* this should catch any actual existing configuration issue.
|
||||
*/
|
||||
if (available_wal_senders < min_required_wal_senders)
|
||||
{
|
||||
if (runtime_options.force == false || runtime_options.dry_run == true)
|
||||
{
|
||||
log_error(_("insufficient free walsenders to attach all sibling nodes"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case"));
|
||||
|
||||
if (runtime_options.dry_run == false)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning(_("insufficient free walsenders to attach all sibling nodes"));
|
||||
log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("%i walsenders required, %i available"),
|
||||
min_required_wal_senders,
|
||||
available_wal_senders);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* if replication slots are required by demotion candidate and/or siblings,
|
||||
@@ -5111,65 +5138,81 @@ run_basebackup(t_node_info *node_record)
|
||||
{
|
||||
PGconn *upstream_conn = NULL;
|
||||
|
||||
upstream_conn = establish_db_connection(upstream_node_record.conninfo, true);
|
||||
upstream_conn = establish_db_connection(upstream_node_record.conninfo, false);
|
||||
|
||||
record_status = get_slot_record(upstream_conn, node_record->slot_name, &slot_info);
|
||||
|
||||
if (record_status == RECORD_FOUND)
|
||||
/*
|
||||
* It's possible the upstream node is not yet running, in which case we'll
|
||||
* have to rely on the user taking action to create the slot
|
||||
*/
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_verbose(LOG_INFO,
|
||||
_("replication slot \"%s\" aleady exists on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
slot_exists_on_upstream = true;
|
||||
log_warning(_("unable to connect to upstream node to create replication slot"));
|
||||
/*
|
||||
* TODO: if slot creation also handled by "standby register", update warning
|
||||
*/
|
||||
log_hint(_("you may need to create the replication slot manually"));
|
||||
}
|
||||
else
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
record_status = get_slot_record(upstream_conn, node_record->slot_name, &slot_info);
|
||||
|
||||
log_notice(_("creating replication slot \"%s\" on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
|
||||
get_superuser_connection(&upstream_conn, &superuser_conn, &privileged_conn);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
if (create_replication_slot(privileged_conn, node_record->slot_name, source_server_version_num, &event_details) == false)
|
||||
if (record_status == RECORD_FOUND)
|
||||
{
|
||||
log_error("%s", event_details.data);
|
||||
log_verbose(LOG_INFO,
|
||||
_("replication slot \"%s\" aleady exists on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
slot_exists_on_upstream = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"standby_clone",
|
||||
false,
|
||||
event_details.data);
|
||||
log_notice(_("creating replication slot \"%s\" on upstream node %i"),
|
||||
node_record->slot_name,
|
||||
upstream_node_id);
|
||||
|
||||
PQfinish(source_conn);
|
||||
get_superuser_connection(&upstream_conn, &superuser_conn, &privileged_conn);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
if (create_replication_slot(privileged_conn, node_record->slot_name, source_server_version_num, &event_details) == false)
|
||||
{
|
||||
log_error("%s", event_details.data);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"standby_clone",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(source_conn);
|
||||
|
||||
if (superuser_conn != NULL)
|
||||
PQfinish(superuser_conn);
|
||||
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
if (superuser_conn != NULL)
|
||||
PQfinish(superuser_conn);
|
||||
|
||||
exit(ERR_DB_QUERY);
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
|
||||
if (superuser_conn != NULL)
|
||||
PQfinish(superuser_conn);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(upstream_conn);
|
||||
}
|
||||
|
||||
PQfinish(upstream_conn);
|
||||
}
|
||||
|
||||
/* delete slot on source server */
|
||||
|
||||
get_superuser_connection(&source_conn, &superuser_conn, &privileged_conn);
|
||||
|
||||
if (slot_info.active == false)
|
||||
{
|
||||
if (slot_exists_on_upstream == false)
|
||||
{
|
||||
if (drop_replication_slot(source_conn, node_record->slot_name) == true)
|
||||
if (drop_replication_slot(privileged_conn, node_record->slot_name) == true)
|
||||
{
|
||||
log_notice(_("replication slot \"%s\" deleted on source node"), node_record->slot_name);
|
||||
}
|
||||
@@ -5827,7 +5870,7 @@ get_barman_property(char *dst, char *name, char *local_repmgr_directory)
|
||||
initPQExpBuffer(&command_output);
|
||||
|
||||
maxlen_snprintf(command,
|
||||
"grep \"^\t%s:\" %s/show-server.txt",
|
||||
"grep \"^[[:space:]]%s:\" %s/show-server.txt",
|
||||
name, local_repmgr_tmp_directory);
|
||||
(void) local_command(command, &command_output);
|
||||
|
||||
@@ -6024,45 +6067,6 @@ check_recovery_type(PGconn *conn)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
||||
{
|
||||
t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER;
|
||||
RecordStatus record_status = get_slot_record(conn, slot_name, &slot_info);
|
||||
|
||||
log_verbose(LOG_DEBUG, "attempting to delete slot \"%s\" on node %i",
|
||||
slot_name, node_id);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
log_info(_("no slot record found for slot \"%s\" on node %i"),
|
||||
slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (slot_info.active == false)
|
||||
{
|
||||
if (drop_replication_slot(conn, slot_name) == true)
|
||||
{
|
||||
log_notice(_("replication slot \"%s\" deleted on node %i"), slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to delete replication slot \"%s\" on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if active replication slot exists, call Houston as we have a
|
||||
* problem
|
||||
*/
|
||||
else
|
||||
{
|
||||
log_warning(_("replication slot \"%s\" is still active on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Creates a recovery.conf file for a standby
|
||||
|
||||
@@ -237,5 +237,6 @@ extern void get_node_config_directory(char *config_dir_buf);
|
||||
extern void get_node_data_directory(char *data_dir_buf);
|
||||
extern void init_node_record(t_node_info *node_record);
|
||||
extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
||||
extern void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||
|
||||
#endif /* _REPMGR_CLIENT_GLOBAL_H_ */
|
||||
|
||||
@@ -2978,3 +2978,46 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
|
||||
|
||||
return can_use;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
||||
{
|
||||
t_replication_slot slot_info = T_REPLICATION_SLOT_INITIALIZER;
|
||||
RecordStatus record_status = get_slot_record(conn, slot_name, &slot_info);
|
||||
|
||||
log_verbose(LOG_DEBUG, "attempting to delete slot \"%s\" on node %i",
|
||||
slot_name, node_id);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
/* this is a good thing */
|
||||
log_verbose(LOG_INFO,
|
||||
_("slot \"%s\" does not exist on node %i, nothing to remove"),
|
||||
slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (slot_info.active == false)
|
||||
{
|
||||
if (drop_replication_slot(conn, slot_name) == true)
|
||||
{
|
||||
log_notice(_("replication slot \"%s\" deleted on node %i"), slot_name, node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to delete replication slot \"%s\" on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* if active replication slot exists, call Houston as we have a
|
||||
* problem
|
||||
*/
|
||||
else
|
||||
{
|
||||
log_warning(_("replication slot \"%s\" is still active on node %i"), slot_name, node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
4
repmgr.c
4
repmgr.c
@@ -416,9 +416,9 @@ unset_bdr_failover_handler(PG_FUNCTION_ARGS)
|
||||
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
|
||||
|
||||
shared_state->bdr_failover_handler = UNKNOWN_NODE_ID;
|
||||
|
||||
LWLockRelease(shared_state->lock);
|
||||
}
|
||||
|
||||
LWLockRelease(shared_state->lock);
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
@@ -143,6 +143,11 @@
|
||||
# Debian/Ubuntu users: you will probably need to
|
||||
# set this to the directory where `pg_ctl` is located,
|
||||
# e.g. /usr/lib/postgresql/9.6/bin/
|
||||
#
|
||||
# *NOTE* "pg_bindir" is only used when repmgr directly
|
||||
# executes PostgreSQL binaries; any user-defined scripts
|
||||
# *must* be specified with the full path
|
||||
#
|
||||
#use_primary_conninfo_password=false # explicitly set "password" in recovery.conf's
|
||||
# "primary_conninfo" parameter using the value contained
|
||||
# in the environment variable PGPASSWORD
|
||||
@@ -156,7 +161,7 @@
|
||||
# Examples:
|
||||
#
|
||||
# pg_ctl_options='-s'
|
||||
# pg_basebackup_options='--label=repmgr_backup
|
||||
# pg_basebackup_options='--label=repmgr_backup'
|
||||
# rsync_options=--archive --checksum --compress --progress --rsh="ssh -o \"StrictHostKeyChecking no\""
|
||||
# ssh_options=-o "StrictHostKeyChecking no"
|
||||
|
||||
@@ -183,11 +188,11 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# parameter can be provided multiple times.
|
||||
|
||||
#restore_command='' # This will be placed in the recovery.conf file generated
|
||||
# by repmgr.
|
||||
# by repmgr.
|
||||
|
||||
#archive_cleanup_command='' # This will be placed in the recovery.conf file generated
|
||||
# by repmgr. Note we recommend using Barman for managing
|
||||
# WAL archives (see: https://www.pgbarman.org )
|
||||
# by repmgr. Note we recommend using Barman for managing
|
||||
# WAL archives (see: https://www.pgbarman.org )
|
||||
|
||||
#recovery_min_apply_delay= # If provided, "recovery_min_apply_delay" in recovery.conf
|
||||
# will be set to this value (PostgreSQL 9.4 and later).
|
||||
@@ -259,10 +264,10 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# are defaults.
|
||||
|
||||
#repmgrd_pid_file= # Path of PID file to use for repmgrd; if not set, a PID file will
|
||||
# be generated in a temporary directory specified by the environment
|
||||
# variable $TMPDIR, or if not set, in "/tmp". This value can be overridden
|
||||
# by the command line option "-p/--pid-file"; the command line option
|
||||
# "--no-pid-file" will force PID file creation to be skipped.
|
||||
# be generated in a temporary directory specified by the environment
|
||||
# variable $TMPDIR, or if not set, in "/tmp". This value can be overridden
|
||||
# by the command line option "-p/--pid-file"; the command line option
|
||||
# "--no-pid-file" will force PID file creation to be skipped.
|
||||
#failover=manual # one of 'automatic', 'manual'.
|
||||
# determines what action to take in the event of upstream failure
|
||||
#
|
||||
@@ -276,9 +281,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# a value of zero prevents the node being promoted to primary
|
||||
# (default: 100)
|
||||
|
||||
#reconnect_attempts=6 # Number attempts which will be made to reconnect to an unreachable
|
||||
#reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable
|
||||
# primary (or other upstream node)
|
||||
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
||||
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
||||
# primary (or other upstream node)
|
||||
#promote_command= # command repmgrd executes when promoting a new primary; use something like:
|
||||
#
|
||||
@@ -332,7 +337,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
#
|
||||
# Debian/Ubuntu users: use "sudo pg_ctlcluster" to execute service control commands.
|
||||
#
|
||||
# For more details, see: https://repmgr.org/docs/4.0/configuration-service-commands.html
|
||||
# For more details, see: https://repmgr.org/docs/4.1/configuration-service-commands.html
|
||||
|
||||
#service_start_command = ''
|
||||
#service_stop_command = ''
|
||||
@@ -376,7 +381,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
#bdr_local_monitoring_only=false # Only monitor the local node; no checks will be
|
||||
# performed on the other node
|
||||
# performed on the other node
|
||||
#bdr_recovery_timeout # If a BDR node was offline and has become available
|
||||
# maximum length of time in seconds to wait for the
|
||||
# node to reconnect to the cluster
|
||||
# maximum length of time in seconds to wait for the
|
||||
# node to reconnect to the cluster
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
#define REPMGR_VERSION_DATE ""
|
||||
#define REPMGR_VERSION "4.1dev"
|
||||
#define REPMGR_VERSION "4.1.2"
|
||||
|
||||
@@ -214,7 +214,8 @@ monitor_bdr(void)
|
||||
|
||||
log_warning(_("unable to connect to node %s (ID %i)"),
|
||||
cell->node_info->node_name, cell->node_info->node_id);
|
||||
cell->node_info->conn = try_reconnect(cell->node_info);
|
||||
//cell->node_info->conn = try_reconnect(cell->node_info);
|
||||
try_reconnect(&cell->node_info->conn, cell->node_info);
|
||||
|
||||
/* node has recovered - log and continue */
|
||||
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||
@@ -293,7 +294,7 @@ loop:
|
||||
/*
|
||||
* if we can reload, then could need to change local_conn
|
||||
*/
|
||||
if (reload_config(&config_file_options))
|
||||
if (reload_config(&config_file_options, BDR))
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
@@ -303,11 +304,12 @@ loop:
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
|
||||
/* XXX this looks like it will never be called */
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
if (reload_config(&config_file_options, BDR))
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
@@ -60,6 +60,8 @@ static int primary_node_id = UNKNOWN_NODE_ID;
|
||||
static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
|
||||
static NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
|
||||
static instr_time last_monitoring_update;
|
||||
|
||||
|
||||
static ElectionResult do_election(void);
|
||||
static const char *_print_election_result(ElectionResult result);
|
||||
@@ -81,6 +83,8 @@ static bool do_witness_failover(void);
|
||||
|
||||
static void update_monitoring_history(void);
|
||||
|
||||
static void handle_sighup(PGconn **conn, t_server_type server_type);
|
||||
|
||||
static const char * format_failover_state(FailoverState failover_state);
|
||||
|
||||
|
||||
@@ -264,7 +268,12 @@ monitor_streaming_primary(void)
|
||||
* TODO: cache node list here, refresh at `node_list_refresh_interval`
|
||||
* also return reason for inavailability so we can log it
|
||||
*/
|
||||
if (is_server_available(local_node_info.conninfo) == false)
|
||||
|
||||
(void) connection_ping(local_conn);
|
||||
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
|
||||
/* local node is down, we were expecting it to be up */
|
||||
@@ -284,8 +293,6 @@ monitor_streaming_primary(void)
|
||||
|
||||
local_node_info.node_status = NODE_STATUS_UNKNOWN;
|
||||
|
||||
close_connection(&local_conn);
|
||||
|
||||
/*
|
||||
* as we're monitoring the primary, no point in trying to
|
||||
* write the event to the database
|
||||
@@ -301,11 +308,12 @@ monitor_streaming_primary(void)
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
local_conn = try_reconnect(&local_node_info);
|
||||
try_reconnect(&local_conn, &local_node_info);
|
||||
|
||||
if (local_node_info.node_status == NODE_STATUS_UP)
|
||||
{
|
||||
int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
@@ -322,6 +330,17 @@ monitor_streaming_primary(void)
|
||||
event_details.data);
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
/*
|
||||
* If the local node was restarted, we'll need to reinitialise values
|
||||
* stored in shared memory.
|
||||
*/
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
}
|
||||
|
||||
goto loop;
|
||||
}
|
||||
|
||||
@@ -545,26 +564,7 @@ loop:
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
handle_sighup(&local_conn, PRIMARY);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
|
||||
@@ -582,9 +582,11 @@ monitor_streaming_standby(void)
|
||||
instr_time log_status_interval_start;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
log_debug("monitor_streaming_standby()");
|
||||
|
||||
reset_node_voting_status();
|
||||
|
||||
log_debug("monitor_streaming_standby()");
|
||||
INSTR_TIME_SET_ZERO(last_monitoring_update);
|
||||
|
||||
/*
|
||||
* If no upstream node id is specified in the metadata, we'll try and
|
||||
@@ -733,10 +735,9 @@ monitor_streaming_standby(void)
|
||||
_("unable to connect to upstream node \"%s\" (node ID: %i)"),
|
||||
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||
|
||||
/* */
|
||||
/* XXX possible pre-action event */
|
||||
if (upstream_node_info.type == STANDBY)
|
||||
{
|
||||
/* XXX possible pre-action event */
|
||||
create_event_record(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
@@ -758,8 +759,6 @@ monitor_streaming_standby(void)
|
||||
log_warning("%s", event_details.data);
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
close_connection(&upstream_conn);
|
||||
|
||||
/*
|
||||
* if local node is unreachable, make a last-minute attempt to reconnect
|
||||
* before continuing with the failover process
|
||||
@@ -770,13 +769,18 @@ monitor_streaming_standby(void)
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
}
|
||||
|
||||
upstream_conn = try_reconnect(&upstream_node_info);
|
||||
try_reconnect(&upstream_conn, &upstream_node_info);
|
||||
|
||||
/* Node has recovered - log and continue */
|
||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||
{
|
||||
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
|
||||
|
||||
if (upstream_node_info.type == PRIMARY)
|
||||
{
|
||||
primary_conn = upstream_conn;
|
||||
}
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
@@ -784,7 +788,7 @@ monitor_streaming_standby(void)
|
||||
upstream_node_unreachable_elapsed);
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"repmgrd_upstream_reconnect",
|
||||
@@ -1004,6 +1008,13 @@ monitor_streaming_standby(void)
|
||||
continue;
|
||||
}
|
||||
|
||||
/* skip witness node - we can't possibly "follow" that */
|
||||
|
||||
if (cell->node_info->type == WITNESS)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
||||
|
||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||
@@ -1026,6 +1037,7 @@ monitor_streaming_standby(void)
|
||||
follow_new_primary(follow_node_id);
|
||||
}
|
||||
}
|
||||
|
||||
clear_node_info_list(&sibling_nodes);
|
||||
}
|
||||
}
|
||||
@@ -1054,8 +1066,7 @@ loop:
|
||||
|
||||
if (config_file_options.failover == FAILOVER_MANUAL)
|
||||
{
|
||||
appendPQExpBuffer(
|
||||
&monitoring_summary,
|
||||
appendPQExpBuffer(&monitoring_summary,
|
||||
_(" (automatic failover disabled)"));
|
||||
}
|
||||
|
||||
@@ -1065,6 +1076,18 @@ loop:
|
||||
{
|
||||
log_detail(_("waiting for upstream or another primary to reappear"));
|
||||
}
|
||||
else if (config_file_options.monitoring_history == true)
|
||||
{
|
||||
if (INSTR_TIME_IS_ZERO(last_monitoring_update))
|
||||
{
|
||||
log_detail(_("no monitoring statistics have been written yet"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_detail(_("last monitoring statistics update was %i seconds ago"),
|
||||
calculate_elapsed(last_monitoring_update));
|
||||
}
|
||||
}
|
||||
|
||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||
}
|
||||
@@ -1076,7 +1099,16 @@ loop:
|
||||
}
|
||||
else
|
||||
{
|
||||
connection_ping(local_conn);
|
||||
if (config_file_options.monitoring_history == true)
|
||||
{
|
||||
log_verbose(LOG_WARNING, _("monitoring_history requested but primary connection not available"));
|
||||
}
|
||||
|
||||
/*
|
||||
* if monitoring not in use, we'll need to ensure the local connection
|
||||
* handle isn't stale
|
||||
*/
|
||||
(void) connection_ping(local_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1129,8 +1161,11 @@ loop:
|
||||
}
|
||||
else
|
||||
{
|
||||
/* we've reconnected to the local node after an outage */
|
||||
if (local_node_info.active == false)
|
||||
{
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
|
||||
@@ -1146,45 +1181,36 @@ loop:
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
|
||||
log_warning("%s", event_details.data)
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_recovery",
|
||||
true,
|
||||
event_details.data);
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_recovery",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the local node was restarted, we'll need to reinitialise values
|
||||
* stored in shared memory.
|
||||
*/
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
handle_sighup(&local_conn, STANDBY);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
|
||||
@@ -1204,36 +1230,18 @@ monitor_streaming_witness(void)
|
||||
PQExpBufferData event_details;
|
||||
RecordStatus record_status;
|
||||
|
||||
int primary_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
reset_node_voting_status();
|
||||
|
||||
log_debug("monitor_streaming_witness()");
|
||||
|
||||
if (get_primary_node_record(local_conn, &upstream_node_info) == false)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to retrieve record for primary node"));
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
log_hint(_("execute \"repmgr witness register --force\" to update the witness node "));
|
||||
close_connection(&local_conn);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"repmgrd_shutdown",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||
/*
|
||||
* At this point we can't trust the local copy of "repmgr.nodes", as
|
||||
* it may not have been updated. We'll scan the cluster for the current
|
||||
['' * primary and refresh the copy from that before proceeding further.
|
||||
*/
|
||||
primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
|
||||
|
||||
/*
|
||||
* Primary node must be running at repmgrd startup.
|
||||
@@ -1258,7 +1266,7 @@ monitor_streaming_witness(void)
|
||||
* refresh upstream node record from primary, so it's as up-to-date
|
||||
* as possible
|
||||
*/
|
||||
record_status = get_node_record(primary_conn, upstream_node_info.node_id, &upstream_node_info);
|
||||
record_status = get_node_record(primary_conn, primary_node_id, &upstream_node_info);
|
||||
|
||||
/*
|
||||
* This is unlikely to happen; if it does emit a warning for diagnostic
|
||||
@@ -1330,8 +1338,7 @@ monitor_streaming_witness(void)
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
close_connection(&primary_conn);
|
||||
primary_conn = try_reconnect(&upstream_node_info);
|
||||
try_reconnect(&primary_conn, &upstream_node_info);
|
||||
|
||||
/* Node has recovered - log and continue */
|
||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||
@@ -1345,7 +1352,7 @@ monitor_streaming_witness(void)
|
||||
upstream_node_unreachable_elapsed);
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"repmgrd_upstream_reconnect",
|
||||
@@ -1468,6 +1475,105 @@ monitor_streaming_witness(void)
|
||||
}
|
||||
loop:
|
||||
|
||||
/*
|
||||
* handle local node failure
|
||||
*
|
||||
* currently we'll just check the connection, and try to reconnect
|
||||
*
|
||||
* TODO: add timeout, after which we run in degraded state
|
||||
*/
|
||||
|
||||
(void) connection_ping(local_conn);
|
||||
|
||||
check_connection(&local_node_info, &local_conn);
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
if (local_node_info.active == true)
|
||||
{
|
||||
bool success = true;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
local_node_info.active = false;
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to connect to local node \"%s\" (ID: %i), marking inactive"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, false) == false)
|
||||
{
|
||||
success = false;
|
||||
log_warning(_("unable to mark node \"%s\" (ID: %i) as inactive"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
}
|
||||
}
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_failure",
|
||||
success,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* we've reconnected to the local node after an outage */
|
||||
if (local_node_info.active == false)
|
||||
{
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
local_node_info.active = true;
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("reconnected to local node \"%s\" (ID: %i), marking active"),
|
||||
local_node_info.node_name,
|
||||
local_node_info.node_id);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_recovery",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If the local node was restarted, we'll need to reinitialise values
|
||||
* stored in shared memory.
|
||||
*/
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* refresh repmgr.nodes after "witness_sync_interval" seconds */
|
||||
|
||||
{
|
||||
@@ -1511,28 +1617,10 @@ loop:
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options))
|
||||
{
|
||||
close_connection(&local_conn);
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
handle_sighup(&local_conn, WITNESS);
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
|
||||
@@ -1778,12 +1866,21 @@ update_monitoring_history(void)
|
||||
long long unsigned int replication_lag_bytes = 0;
|
||||
|
||||
/* both local and primary connections must be available */
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK || PQstatus(local_conn) != CONNECTION_OK)
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("primary connection is not available, unable to update monitoring history"));
|
||||
return;
|
||||
}
|
||||
|
||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("local connection is not available, unable to update monitoring history"));
|
||||
return;
|
||||
}
|
||||
|
||||
if (get_replication_info(local_conn, &replication_info) == false)
|
||||
{
|
||||
log_warning(_("unable to retrieve replication status information"));
|
||||
log_warning(_("unable to retrieve replication status information, unable to update monitoring history"));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1835,8 +1932,7 @@ update_monitoring_history(void)
|
||||
replication_lag_bytes = 0;
|
||||
}
|
||||
|
||||
add_monitoring_record(
|
||||
primary_conn,
|
||||
add_monitoring_record(primary_conn,
|
||||
local_conn,
|
||||
primary_node_id,
|
||||
local_node_info.node_id,
|
||||
@@ -1846,6 +1942,8 @@ update_monitoring_history(void)
|
||||
replication_info.last_xact_replay_timestamp,
|
||||
replication_lag_bytes,
|
||||
apply_lag_bytes);
|
||||
|
||||
INSTR_TIME_SET_CURRENT(last_monitoring_update);
|
||||
}
|
||||
|
||||
|
||||
@@ -1870,7 +1968,7 @@ do_upstream_standby_failover(void)
|
||||
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
RecoveryType primary_type = RECTYPE_UNKNOWN;
|
||||
int i, r;
|
||||
int i, standby_follow_result;
|
||||
char parsed_follow_command[MAXPGPATH] = "";
|
||||
|
||||
close_connection(&upstream_conn);
|
||||
@@ -1904,9 +2002,18 @@ do_upstream_standby_failover(void)
|
||||
|
||||
if (primary_type != RECTYPE_PRIMARY)
|
||||
{
|
||||
log_error(_("last known primary\"%s\" (ID: %i) is in recovery, not following"),
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
if (primary_type == RECTYPE_STANDBY)
|
||||
{
|
||||
log_error(_("last known primary \"%s\" (ID: %i) is in recovery, not following"),
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to determine status of last known primary \"%s\" (ID: %i), not following"),
|
||||
primary_node_info.node_name,
|
||||
primary_node_info.node_id);
|
||||
}
|
||||
|
||||
close_connection(&primary_conn);
|
||||
monitoring_state = MS_DEGRADED;
|
||||
@@ -1917,8 +2024,6 @@ do_upstream_standby_failover(void)
|
||||
/* Close the connection to this server */
|
||||
close_connection(&local_conn);
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
log_debug(_("standby follow command is:\n \"%s\""),
|
||||
config_file_options.follow_command);
|
||||
|
||||
@@ -1928,10 +2033,12 @@ do_upstream_standby_failover(void)
|
||||
*/
|
||||
parse_follow_command(parsed_follow_command, config_file_options.follow_command, primary_node_info.node_id);
|
||||
|
||||
r = system(parsed_follow_command);
|
||||
standby_follow_result = system(parsed_follow_command);
|
||||
|
||||
if (r != 0)
|
||||
if (standby_follow_result != 0)
|
||||
{
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to execute follow command:\n %s"),
|
||||
config_file_options.follow_command);
|
||||
@@ -1942,8 +2049,7 @@ do_upstream_standby_failover(void)
|
||||
* It may not possible to write to the event notification table but we
|
||||
* should be able to generate an external notification if required.
|
||||
*/
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1956,6 +2062,10 @@ do_upstream_standby_failover(void)
|
||||
/*
|
||||
* It's possible that the standby is still starting up after the "follow_command"
|
||||
* completes, so poll for a while until we get a connection.
|
||||
*
|
||||
* NOTE: we've previously closed the local connection, so even if the follow command
|
||||
* failed for whatever reason and the local node remained up, we can re-open
|
||||
* the local connection.
|
||||
*/
|
||||
|
||||
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
|
||||
@@ -1965,7 +2075,7 @@ do_upstream_standby_failover(void)
|
||||
if (PQstatus(local_conn) == CONNECTION_OK)
|
||||
break;
|
||||
|
||||
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
|
||||
log_debug("sleeping 1 second; %i of %i (\"repmgrd_standby_startup_timeout\") attempts to reconnect to local node",
|
||||
i + 1,
|
||||
config_file_options.repmgrd_standby_startup_timeout);
|
||||
sleep(1);
|
||||
@@ -1981,28 +2091,47 @@ do_upstream_standby_failover(void)
|
||||
/* refresh shared memory settings which will have been zapped by the restart */
|
||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||
|
||||
if (update_node_record_set_upstream(primary_conn,
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id) == false)
|
||||
/*
|
||||
*
|
||||
*/
|
||||
|
||||
if (standby_follow_result != 0)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to set node %i's new upstream ID to %i"),
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id);
|
||||
monitoring_state = MS_DEGRADED;
|
||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||
}
|
||||
|
||||
create_event_notification(
|
||||
NULL,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
/*
|
||||
* update upstream_node_id to primary node (but only if follow command
|
||||
* was successful)
|
||||
*/
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
{
|
||||
if (update_node_record_set_upstream(primary_conn,
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id) == false)
|
||||
{
|
||||
initPQExpBuffer(&event_details);
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to set node %i's new upstream ID to %i"),
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
log_error("%s", event_details.data);
|
||||
|
||||
create_event_notification(NULL,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
/* refresh own internal node record */
|
||||
@@ -2018,6 +2147,8 @@ do_upstream_standby_failover(void)
|
||||
local_node_info.upstream_node_id = primary_node_info.node_id;
|
||||
}
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node %i is now following primary node %i"),
|
||||
local_node_info.node_id,
|
||||
@@ -2025,8 +2156,7 @@ do_upstream_standby_failover(void)
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -2264,6 +2394,8 @@ follow_new_primary(int new_primary_id)
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
bool new_primary_ok = false;
|
||||
|
||||
log_verbose(LOG_DEBUG, "follow_new_primary(): new primary id is %i", new_primary_id);
|
||||
|
||||
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
@@ -2483,20 +2615,26 @@ witness_follow_new_primary(int new_primary_id)
|
||||
{
|
||||
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
|
||||
|
||||
if (primary_recovery_type == RECTYPE_PRIMARY)
|
||||
switch (primary_recovery_type)
|
||||
{
|
||||
new_primary_ok = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
new_primary_ok = false;
|
||||
log_warning(_("new primary is not in recovery"));
|
||||
close_connection(&upstream_conn);
|
||||
case RECTYPE_PRIMARY:
|
||||
new_primary_ok = true;
|
||||
break;
|
||||
case RECTYPE_STANDBY:
|
||||
new_primary_ok = false;
|
||||
log_warning(_("new primary is not in recovery"));
|
||||
break;
|
||||
case RECTYPE_UNKNOWN:
|
||||
new_primary_ok = false;
|
||||
log_warning(_("unable to determine status of new primary"));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (new_primary_ok == false)
|
||||
{
|
||||
close_connection(&upstream_conn);
|
||||
|
||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||
}
|
||||
|
||||
@@ -2936,9 +3074,18 @@ check_connection(t_node_info *node_info, PGconn **conn)
|
||||
}
|
||||
else
|
||||
{
|
||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
||||
|
||||
log_info(_("reconnected to node \"%s\" (ID: %i)"),
|
||||
node_info->node_name,
|
||||
node_info->node_id);
|
||||
|
||||
stored_local_node_id = repmgrd_get_local_node_id(*conn);
|
||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||
{
|
||||
repmgrd_set_local_node_id(*conn, config_file_options.node_id);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2982,3 +3129,30 @@ format_failover_state(FailoverState failover_state)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
handle_sighup(PGconn **conn, t_server_type server_type)
|
||||
{
|
||||
log_debug("SIGHUP received");
|
||||
|
||||
if (reload_config(&config_file_options, server_type))
|
||||
{
|
||||
PQfinish(*conn);
|
||||
*conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
}
|
||||
|
||||
if (*config_file_options.log_file)
|
||||
{
|
||||
FILE *fd;
|
||||
|
||||
log_debug("reopening %s", config_file_options.log_file);
|
||||
|
||||
fd = freopen(config_file_options.log_file, "a", stderr);
|
||||
if (fd == NULL)
|
||||
{
|
||||
fprintf(stderr, "error reopening stderr to \"%s\": %s",
|
||||
config_file_options.log_file, strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
|
||||
52
repmgrd.c
52
repmgrd.c
@@ -320,8 +320,6 @@ main(int argc, char **argv)
|
||||
strncpy(config_file_options.log_level, cli_log_level, MAXLEN);
|
||||
}
|
||||
|
||||
log_notice(_("repmgrd (repmgr %s) starting up"), REPMGR_VERSION);
|
||||
|
||||
/*
|
||||
* -m/--monitoring-history, if provided, will override repmgr.conf's
|
||||
* monitoring_history; this is for backwards compatibility as it's
|
||||
@@ -349,6 +347,8 @@ main(int argc, char **argv)
|
||||
|
||||
logger_init(&config_file_options, progname());
|
||||
|
||||
log_notice(_("repmgrd (%s %s) starting up"), progname(), REPMGR_VERSION);
|
||||
|
||||
if (verbose)
|
||||
logger_set_verbose();
|
||||
|
||||
@@ -770,10 +770,10 @@ show_help(void)
|
||||
}
|
||||
|
||||
|
||||
PGconn *
|
||||
try_reconnect(t_node_info *node_info)
|
||||
void
|
||||
try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||
{
|
||||
PGconn *conn;
|
||||
PGconn *our_conn;
|
||||
t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||
|
||||
int i;
|
||||
@@ -782,7 +782,6 @@ try_reconnect(t_node_info *node_info)
|
||||
|
||||
initialize_conninfo_params(&conninfo_params, false);
|
||||
|
||||
|
||||
/* we assume by now the conninfo string is parseable */
|
||||
(void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false);
|
||||
|
||||
@@ -805,18 +804,47 @@ try_reconnect(t_node_info *node_info)
|
||||
* degraded monitoring? - make that configurable
|
||||
*/
|
||||
|
||||
conn = establish_db_connection_by_params(&conninfo_params, false);
|
||||
our_conn = establish_db_connection_by_params(&conninfo_params, false);
|
||||
|
||||
if (PQstatus(conn) == CONNECTION_OK)
|
||||
if (PQstatus(our_conn) == CONNECTION_OK)
|
||||
{
|
||||
free_conninfo_params(&conninfo_params);
|
||||
|
||||
log_info(_("connection to node %i succeeded"), node_info->node_id);
|
||||
|
||||
if (PQstatus(*conn) == CONNECTION_BAD)
|
||||
{
|
||||
log_verbose(LOG_INFO, "original connection handle returned CONNECTION_BAD, using new connection");
|
||||
close_connection(conn);
|
||||
*conn = our_conn;
|
||||
}
|
||||
else
|
||||
{
|
||||
ExecStatusType ping_result;
|
||||
|
||||
ping_result = connection_ping(*conn);
|
||||
|
||||
if (ping_result != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_info("original connnection no longer available, using new connection");
|
||||
close_connection(conn);
|
||||
*conn = our_conn;
|
||||
}
|
||||
else
|
||||
{
|
||||
log_info(_("original connection is still available"));
|
||||
|
||||
PQfinish(our_conn);
|
||||
}
|
||||
}
|
||||
|
||||
node_info->node_status = NODE_STATUS_UP;
|
||||
return conn;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
close_connection(&conn);
|
||||
log_notice(_("unable to reconnect to node"));
|
||||
close_connection(&our_conn);
|
||||
log_notice(_("unable to reconnect to node %i"), node_info->node_id);
|
||||
}
|
||||
|
||||
if (i + 1 < max_attempts)
|
||||
@@ -835,7 +863,7 @@ try_reconnect(t_node_info *node_info)
|
||||
|
||||
free_conninfo_params(&conninfo_params);
|
||||
|
||||
return NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ extern t_node_info local_node_info;
|
||||
extern PGconn *local_conn;
|
||||
extern bool startup_event_logged;
|
||||
|
||||
PGconn *try_reconnect(t_node_info *node_info);
|
||||
void try_reconnect(PGconn **conn, t_node_info *node_info);
|
||||
|
||||
int calculate_elapsed(instr_time start_time);
|
||||
const char *print_monitoring_state(MonitoringState monitoring_state);
|
||||
|
||||
Reference in New Issue
Block a user