diff --git a/FAQ.md b/FAQ.md index 0ecca1e9..2abbfdce 100644 --- a/FAQ.md +++ b/FAQ.md @@ -38,7 +38,7 @@ General No. Hash indexes and replication do not mix well and their use is explicitly discouraged; see: - http://www.postgresql.org/docs/current/interactive/sql-createindex.html#AEN74175 + https://www.postgresql.org/docs/current/interactive/sql-createindex.html#AEN74175 `repmgr` -------- diff --git a/HISTORY b/HISTORY index 9e594e10..69cff214 100644 --- a/HISTORY +++ b/HISTORY @@ -1,3 +1,13 @@ +3.1.4 2016-07- + repmgr: new configuration option for setting "restore_command" + in the recovery.conf file generated by repmgr (Martín) + repmgr: add --csv option to "repmgr cluster show" (Gianni) + repmgr: enable provision of a conninfo string as the -d/--dbname + parameter, similar to other PostgreSQL utilities (Ian) + repmgr: during switchover operations improve detection of + demotion candidate shutdown (Ian) + various bugfixes and documentation updates (Ian, Martín) + 3.1.3 2016-05-17 repmgrd: enable monitoring when a standby is catching up by replaying archived WAL (Ian) diff --git a/Makefile b/Makefile index 1fe7ec6b..bbf778fd 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ HEADERS = $(wildcard *.h) repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o -repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o +repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o dirmod.o DATA = repmgr.sql uninstall_repmgr.sql diff --git a/README.md b/README.md index 37d721d6..d700a8f8 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ This guide assumes that you are familiar with PostgreSQL administration and streaming replication concepts. For further details on streaming replication, see this link: - http://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION + https://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION The following terms are used throughout the `repmgr` documentation. @@ -237,15 +237,19 @@ both servers. On the master server, a PostgreSQL instance must be initialised and running. The following replication settings must be included in `postgresql.conf`: + + # Enable replication connections; set this figure to at least one more + # than the number of standbys which will connect to this server + # (note that repmgr will execute `pg_basebackup` in WAL streaming mode, + # which requires two free WAL senders) + + max_wal_senders = 10 + # Ensure WAL files contain enough information to enable read-only queries # on the standby wal_level = 'hot_standby' - # Enable up to 10 replication connections - - max_wal_senders = 10 - # How much WAL to retain on the master to allow a temporarily # disconnected standby to catch up again. The larger this is, the # longer the standby can be disconnected. This is needed only in @@ -259,6 +263,14 @@ The following replication settings must be included in `postgresql.conf`: hot_standby = on + # Enable WAL file archiving + archive_mode = on + + # Set archive command to a script or application that will safely store + # you WALs in a secure place. /bin/true is an example of a command that + # ignores archiving. Use something more sensible. + archive_command = '/bin/true' + * * * @@ -458,7 +470,11 @@ so should be used with care. Further options can be passed to the `pg_basebackup` utility via the setting `pg_basebackup_options` in `repmgr.conf`. See the PostgreSQL documentation for more details of available options: +<<<<<<< HEAD http://www.postgresql.org/docs/current/static/app-pgbasebackup.html +======= + https://www.postgresql.org/docs/current/static/app-pgbasebackup.html +>>>>>>> 72f9b0145afab1060dd1202c8f8937653c8b2e39 ### Using rsync to clone a standby @@ -594,13 +610,13 @@ place. If using the default `pg_basebackup` method, we recommend setting pg_basebackup_options='--xlog-method=stream' See the `pg_basebackup` documentation for details: - http://www.postgresql.org/docs/current/static/app-pgbasebackup.html + https://www.postgresql.org/docs/current/static/app-pgbasebackup.html Otherwise it's necessary to set `wal_keep_segments` to an appropriately high value. Further information on replication slots in the PostgreSQL documentation: - http://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION-SLOTS + https://www.postgresql.org/docs/current/interactive/warm-standby.html#STREAMING-REPLICATION-SLOTS Promoting a standby server with repmgr @@ -699,8 +715,9 @@ updated to reflect this: Note that with cascading replication, `repmgr standby follow` can also be -used to detach a standby from its current upstream server and follow another -upstream server, including the master. +used to detach a standby from its current upstream server and follow the +master. However it's currently not possible to have it follow another standby; +we hope to improve this in a future release. Performing a switchover with repmgr @@ -727,7 +744,7 @@ both passwordless SSH access and the path of `repmgr.conf` on that server. > careful preparation and with adequate attention. In particular you should > be confident that your network environment is stable and reliable. > -> We recommend running `repmgr standby switchover` at the most verbose +> We recommend running `repmgr standby switchover` at the most verbose > logging level (`--log-level DEBUG --verbose`) and capturing all output > to assist troubleshooting any problems. > @@ -793,7 +810,7 @@ should have been updated to reflect this: ### Caveats -- the functionality provided `repmgr standby switchover` is primarily aimed +- The functionality provided `repmgr standby switchover` is primarily aimed at a two-server master/standby replication cluster and currently does not support additional standbys. - `repmgr standby switchover` is designed to use the `pg_rewind` utility, @@ -802,11 +819,16 @@ should have been updated to reflect this: - `pg_rewind` *requires* that either `wal_log_hints` is enabled, or that data checksums were enabled when the cluster was initialized. See the `pg_rewind` documentation for details: - http://www.postgresql.org/docs/current/static/app-pgrewind.html + https://www.postgresql.org/docs/current/static/app-pgrewind.html - `repmgrd` should not be running when a switchover is carried out, otherwise the `repmgrd` may try and promote a standby by itself. - Any other standbys attached to the old master will need to be manually instructed to point to the new master (e.g. with `repmgr standby follow`). +- You must ensure that following a server start using `pg_ctl`, log output + is not send to STDERR (the default behaviour). If logging is not configured, + We recommend setting `logging_collector=on` in `postgresql.conf` and + providing an explicit `-l/--log` setting in `repmgr.conf`'s `pg_ctl_options` + parameter. We hope to remove some of these restrictions in future versions of `repmgr`. @@ -860,8 +882,8 @@ Adjust schema and node ID accordingly. A future `repmgr` release will make it possible to unregister failed standbys. -Automatic failover with repmgrd -------------------------------- +Automatic failover with `repmgrd` +--------------------------------- `repmgrd` is a management and monitoring daemon which runs on standby nodes and which can automate actions such as failover and updating standbys to @@ -981,8 +1003,8 @@ during the failover: (3 rows) -repmgrd log rotation --------------------- +`repmgrd` log rotation +---------------------- Note that currently `repmgrd` does not provide logfile rotation. To ensure the current logfile does not grow indefinitely, configure your system's `logrotate` @@ -998,8 +1020,29 @@ for up to 52 weeks and rotation forced if a file grows beyond 100Mb: create 0600 postgres postgres } -Monitoring ----------- + +`repmgrd` and PostgreSQL connection settings +-------------------------------------------- + +In addition to the `repmgr` configuration settings, parameters in the +`conninfo` string influence how `repmgr` makes a network connection to +PostgreSQL. In particular, if another server in the replication cluster +is unreachable at network level, system network settings will influence +the length of time it takes to determine that the connection is not possible. + +In particular explicitly setting a parameter for `connect_timeout` should +be considered; the effective minimum value of `2` (seconds) will ensure +that a connection failure at network level is reported as soon as possible, +otherwise dependeing on the system settings (e.g. `tcp_syn_retries` in Linux) +a delay of a minute or more is possible. + +For further details on `conninfo` network connection parameters, see: + + https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS + + +Monitoring with `repmgrd` +------------------------- When `repmgrd` is running with the option `-m/--monitoring-history`, it will constantly write standby node status information to the `repl_monitor` table, diff --git a/RHEL/repmgr3-93.spec b/RHEL/repmgr3-93.spec deleted file mode 100644 index e5f9407e..00000000 --- a/RHEL/repmgr3-93.spec +++ /dev/null @@ -1,61 +0,0 @@ -Summary: repmgr -Name: repmgr -Version: 3.0 -Release: 1 -License: GPLv3 -Group: System Environment/Daemons -URL: http://repmgr.org -Packager: Ian Barwick -Vendor: 2ndQuadrant Limited -Distribution: centos -Source0: %{name}-%{version}.tar.gz -BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root - -%description -repmgr is a utility suite which greatly simplifies -the process of setting up and managing replication -using streaming replication within a cluster of -PostgreSQL servers. - -%prep -%setup - -%build -export PATH=$PATH:/usr/pgsql-9.3/bin/ -%{__make} USE_PGXS=1 - -%install -[ "%{buildroot}" != "/" ] && %{__rm} -rf %{buildroot} - -export PATH=$PATH:/usr/pgsql-9.3/bin/ -%{__make} USE_PGXS=1 install DESTDIR=%{buildroot} INSTALL="install -p" -%{__make} USE_PGXS=1 install_prog DESTDIR=%{buildroot} INSTALL="install -p" -%{__make} USE_PGXS=1 install_rhel DESTDIR=%{buildroot} INSTALL="install -p" - - -%clean -[ "%{buildroot}" != "/" ] && %{__rm} -rf %{buildroot} - - -%files -%defattr(-,root,root) -/usr/bin/repmgr -/usr/bin/repmgrd -/usr/pgsql-9.3/bin/repmgr -/usr/pgsql-9.3/bin/repmgrd -/usr/pgsql-9.3/lib/repmgr_funcs.so -/usr/pgsql-9.3/share/contrib/repmgr.sql -/usr/pgsql-9.3/share/contrib/repmgr_funcs.sql -/usr/pgsql-9.3/share/contrib/uninstall_repmgr.sql -/usr/pgsql-9.3/share/contrib/uninstall_repmgr_funcs.sql -%attr(0755,root,root)/etc/init.d/repmgrd -%attr(0644,root,root)/etc/sysconfig/repmgrd -%attr(0644,root,root)/etc/repmgr/repmgr.conf.sample - -%changelog -* Tue Mar 10 2015 Ian Barwick ian@2ndquadrant.com> -- build for repmgr 3.0 -* Thu Jun 05 2014 Nathan Van Overloop 2.0.2 -- fix witness creation to create db and user if needed -* Fri Apr 04 2014 Nathan Van Overloop 2.0.1 -- initial build for RHEL6 diff --git a/RHEL/repmgrd.init b/RHEL/repmgrd.init deleted file mode 100755 index 6d79e598..00000000 --- a/RHEL/repmgrd.init +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/sh -# -# chkconfig: - 75 16 -# description: Enable repmgrd replication management and monitoring daemon for PostgreSQL -# processname: repmgrd -# pidfile="/var/run/${NAME}.pid" - -# Source function library. -INITD=/etc/rc.d/init.d -. $INITD/functions - -# Get function listing for cross-distribution logic. -TYPESET=`typeset -f|grep "declare"` - -# Get network config. -. /etc/sysconfig/network - -DESC="PostgreSQL replication management and monitoring daemon" -NAME=repmgrd - -REPMGRD_ENABLED=no -REPMGRD_OPTS= -REPMGRD_USER=postgres -REPMGRD_BIN=/usr/pgsql-9.3/bin/repmgrd -REPMGRD_PIDFILE=/var/run/repmgrd.pid -REPMGRD_LOCK=/var/lock/subsys/${NAME} -REPMGRD_LOG=/var/lib/pgsql/9.3/data/pg_log/repmgrd.log - -# Read configuration variable file if it is present -[ -r /etc/sysconfig/$NAME ] && . /etc/sysconfig/$NAME - -# For SELinux we need to use 'runuser' not 'su' -if [ -x /sbin/runuser ] -then - SU=runuser -else - SU=su -fi - -test -x $REPMGRD_BIN || exit 0 - -case "$REPMGRD_ENABLED" in - [Yy]*) - break - ;; - *) - exit 0 - ;; -esac - - -if [ -z "${REPMGRD_OPTS}" ] -then - echo "Not starting ${NAME}, REPMGRD_OPTS not set in /etc/sysconfig/${NAME}" - exit 0 -fi - -start() -{ - REPMGRD_START=$"Starting ${NAME} service: " - - # Make sure startup-time log file is valid - if [ ! -e "${REPMGRD_LOG}" -a ! -h "${REPMGRD_LOG}" ] - then - touch "${REPMGRD_LOG}" || exit 1 - chown ${REPMGRD_USER}:postgres "${REPMGRD_LOG}" - chmod go-rwx "${REPMGRD_LOG}" - [ -x /sbin/restorecon ] && /sbin/restorecon "${REPMGRD_LOG}" - fi - - echo -n "${REPMGRD_START}" - $SU -l $REPMGRD_USER -c "${REPMGRD_BIN} ${REPMGRD_OPTS} -p ${REPMGRD_PIDFILE} &" >> "${REPMGRD_LOG}" 2>&1 < /dev/null - sleep 2 - pid=`head -n 1 "${REPMGRD_PIDFILE}" 2>/dev/null` - if [ "x${pid}" != "x" ] - then - success "${REPMGRD_START}" - touch "${REPMGRD_LOCK}" - echo $pid > "${REPMGRD_PIDFILE}" - echo - else - failure "${REPMGRD_START}" - echo - script_result=1 - fi -} - -stop() -{ - echo -n $"Stopping ${NAME} service: " - if [ -e "${REPMGRD_LOCK}" ] - then - killproc ${NAME} - ret=$? - if [ $ret -eq 0 ] - then - echo_success - rm -f "${REPMGRD_PIDFILE}" - rm -f "${REPMGRD_LOCK}" - else - echo_failure - script_result=1 - fi - else - # not running; per LSB standards this is "ok" - echo_success - fi - echo -} - - -# See how we were called. -case "$1" in - start) - start - ;; - stop) - stop - ;; - status) - status -p $REPMGRD_PIDFILE $NAME - script_result=$? - ;; - restart) - stop - start - ;; - *) - echo $"Usage: $0 {start|stop|status|restart}" - exit 2 -esac - -exit $script_result diff --git a/RHEL/repmgrd.sysconfig b/RHEL/repmgrd.sysconfig deleted file mode 100644 index 07136a50..00000000 --- a/RHEL/repmgrd.sysconfig +++ /dev/null @@ -1,21 +0,0 @@ -# default settings for repmgrd. This file is source by /bin/sh from -# /etc/init.d/repmgrd - -# disable repmgrd by default so it won't get started upon installation -# valid values: yes/no -REPMGRD_ENABLED=no - -# Options for repmgrd (required) -#REPMGRD_OPTS="--verbose -d -f /var/lib/pgsql/repmgr/repmgr.conf" - -# User to run repmgrd as -#REPMGRD_USER=postgres - -# repmgrd binary -#REPMGRD_BIN=/usr/bin/repmgrd - -# pid file -#REPMGRD_PIDFILE=/var/lib/pgsql/repmgr/repmgrd.pid - -# log file -#REPMGRD_LOG=/var/lib/pgsql/repmgr/repmgrd.log diff --git a/TODO b/TODO index 4ec153c4..3b78a9b6 100644 --- a/TODO +++ b/TODO @@ -53,8 +53,9 @@ Planned feature improvements requested, activate the replication slot using pg_receivexlog to negate the need to set `wal_keep_segments` just for the initial clone (9.4 and 9.5). -* Take into account the fact that a standby can obtain WAL from an archive, - so even if direct streaming replication is interrupted, it may be up-to-date +* repmgr: enable "standby follow" to point a standby at another standby, not + just the replication cluster master (see GitHub #130) + Usability improvements ====================== diff --git a/config.c b/config.c index c32166bb..f6b47954 100644 --- a/config.c +++ b/config.c @@ -28,7 +28,7 @@ static void parse_event_notifications_list(t_configuration_options *options, con static void tablespace_list_append(t_configuration_options *options, const char *arg); static void exit_with_errors(ErrorList *config_errors); -const static char *_progname = '\0'; +const static char *_progname = NULL; static char config_file_path[MAXPGPATH]; static bool config_file_provided = false; bool config_file_found = false; @@ -224,6 +224,7 @@ parse_config(t_configuration_options *options) memset(options->pg_bindir, 0, sizeof(options->pg_bindir)); memset(options->pg_ctl_options, 0, sizeof(options->pg_ctl_options)); memset(options->pg_basebackup_options, 0, sizeof(options->pg_basebackup_options)); + memset(options->restore_command, 0, sizeof(options->restore_command)); /* default master_response_timeout is 60 seconds */ options->master_response_timeout = 60; @@ -239,6 +240,8 @@ parse_config(t_configuration_options *options) options->witness_repl_nodes_sync_interval_secs = 30; memset(options->event_notification_command, 0, sizeof(options->event_notification_command)); + options->event_notifications.head = NULL; + options->event_notifications.tail = NULL; options->tablespace_mapping.head = NULL; options->tablespace_mapping.tail = NULL; @@ -340,7 +343,8 @@ parse_config(t_configuration_options *options) strncpy(options->follow_command, value, MAXLEN); else if (strcmp(name, "master_response_timeout") == 0) options->master_response_timeout = repmgr_atoi(value, "master_response_timeout", &config_errors, false); - /* 'primary_response_timeout' as synonym for 'master_response_timeout' - + /* + * 'primary_response_timeout' as synonym for 'master_response_timeout' - * we'll switch terminology in a future release (3.1?) */ else if (strcmp(name, "primary_response_timeout") == 0) @@ -372,6 +376,8 @@ parse_config(t_configuration_options *options) parse_event_notifications_list(options, value); else if (strcmp(name, "tablespace_mapping") == 0) tablespace_list_append(options, value); + else if (strcmp(name, "restore_command") == 0) + strncpy(options->restore_command, value, MAXLEN); else { known_parameter = false; diff --git a/config.h b/config.h index b5f3099b..bc6b92bf 100644 --- a/config.h +++ b/config.h @@ -72,6 +72,7 @@ typedef struct char pg_bindir[MAXLEN]; char pg_ctl_options[MAXLEN]; char pg_basebackup_options[MAXLEN]; + char restore_command[MAXLEN]; char logfile[MAXLEN]; int monitor_interval_secs; int retry_promote_interval_secs; @@ -82,7 +83,11 @@ typedef struct TablespaceList tablespace_mapping; } t_configuration_options; -#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} } +/* + * The following will initialize the structure with a minimal set of options; + * actual defaults are set in parse_config() before parsing the configuration file + */ +#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, { NULL, NULL } } typedef struct ErrorListCell { diff --git a/dbutils.c b/dbutils.c index efc0b75b..c962ba0d 100644 --- a/dbutils.c +++ b/dbutils.c @@ -31,6 +31,7 @@ char repmgr_schema[MAXLEN] = ""; char repmgr_schema_quoted[MAXLEN] = ""; +static int _get_node_record(PGconn *conn, char *cluster, char *sqlquery, t_node_info *node_info); PGconn * _establish_db_connection(const char *conninfo, const bool exit_on_error, const bool log_notice) @@ -538,7 +539,7 @@ get_conninfo_value(const char *conninfo, const char *keyword, char *output) conninfo_options = PQconninfoParse(conninfo, NULL); - if (conninfo_options == false) + if (conninfo_options == NULL) { log_err(_("Unable to parse provided conninfo string \"%s\""), conninfo); return false; @@ -1681,8 +1682,7 @@ int get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info) { char sqlquery[QUERY_STR_LEN]; - PGresult *res; - int ntuples; + int result; sqlquery_snprintf( sqlquery, @@ -1696,6 +1696,51 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info log_verbose(LOG_DEBUG, "get_node_record():\n%s\n", sqlquery); + result = _get_node_record(conn, cluster, sqlquery, node_info); + + if (result == 0) + { + log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %i\n", node_id); + } + + return result; +} + +int +get_node_record_by_name(PGconn *conn, char *cluster, const char *node_name, t_node_info *node_info) +{ + char sqlquery[QUERY_STR_LEN]; + int result; + + sqlquery_snprintf( + sqlquery, + "SELECT id, type, upstream_node_id, name, conninfo, slot_name, priority, active" + " FROM %s.repl_nodes " + " WHERE cluster = '%s' " + " AND name = '%s'", + get_repmgr_schema_quoted(conn), + cluster, + node_name); + + log_verbose(LOG_DEBUG, "get_node_record_by_name():\n%s\n", sqlquery); + + result = _get_node_record(conn, cluster, sqlquery, node_info); + + if (result == 0) + { + log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %s\n", node_name); + } + + return result; +} + + +static int +_get_node_record(PGconn *conn, char *cluster, char *sqlquery, t_node_info *node_info) +{ + int ntuples; + PGresult *res; + res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { @@ -1706,7 +1751,6 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info if (ntuples == 0) { - log_verbose(LOG_DEBUG, "get_node_record(): no record found for node %i\n", node_id); return 0; } @@ -1727,6 +1771,9 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info } + + + int get_node_replication_state(PGconn *conn, char *node_name, char *output) { diff --git a/dbutils.h b/dbutils.h index 8d980d8e..d96667fa 100644 --- a/dbutils.h +++ b/dbutils.h @@ -52,18 +52,6 @@ typedef struct s_node_info } t_node_info; -/* - * Struct to store replication slot information - */ - -typedef struct s_replication_slot -{ - char slot_name[MAXLEN]; - char slot_type[MAXLEN]; - bool active; -} t_replication_slot; - - #define T_NODE_INFO_INITIALIZER { \ NODE_NOT_FOUND, \ NO_UPSTREAM_NODE, \ @@ -78,6 +66,19 @@ typedef struct s_replication_slot InvalidXLogRecPtr \ } +/* + * Struct to store replication slot information + */ + +typedef struct s_replication_slot +{ + char slot_name[MAXLEN]; + char slot_type[MAXLEN]; + bool active; +} t_replication_slot; + + + PGconn *_establish_db_connection(const char *conninfo, const bool exit_on_error, const bool log_notice); @@ -125,6 +126,7 @@ bool witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *c bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active); bool delete_node_record(PGconn *conn, int node, char *action); int get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info); +int get_node_record_by_name(PGconn *conn, char *cluster, const char *node_name, t_node_info *node_info); bool update_node_record_status(PGconn *conn, char *cluster_name, int this_node_id, char *type, int upstream_node_id, bool active); bool update_node_record_set_upstream(PGconn *conn, char *cluster_name, int this_node_id, int new_upstream_node_id); bool create_event_record(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details); diff --git a/debian/DEBIAN/control b/debian/DEBIAN/control index 698b0850..4ca9a262 100644 --- a/debian/DEBIAN/control +++ b/debian/DEBIAN/control @@ -1,9 +1,9 @@ Package: repmgr-auto -Version: 3.0.1 +Version: 3.1.3 Section: database Priority: optional Architecture: all -Depends: rsync, postgresql-9.3 | postgresql-9.4 +Depends: rsync, postgresql-9.3 | postgresql-9.4 | postgresql-9.5 Maintainer: Self built package Description: PostgreSQL replication setup, magament and monitoring has two main executables diff --git a/dirmod.c b/dirmod.c new file mode 100644 index 00000000..82837f66 --- /dev/null +++ b/dirmod.c @@ -0,0 +1,194 @@ +/* + * + * dirmod.c + * directory handling functions + * + * Copyright (C) 2ndQuadrant, 2010-2016 + * + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#include "postgres_fe.h" + +/* Don't modify declarations in system headers */ + +#include +#include +#include + +/* + * pgfnames + * + * return a list of the names of objects in the argument directory. Caller + * must call pgfnames_cleanup later to free the memory allocated by this + * function. + */ +char ** +pgfnames(const char *path) +{ + DIR *dir; + struct dirent *file; + char **filenames; + int numnames = 0; + int fnsize = 200; /* enough for many small dbs */ + + dir = opendir(path); + if (dir == NULL) + { + return NULL; + } + + filenames = (char **) palloc(fnsize * sizeof(char *)); + + while (errno = 0, (file = readdir(dir)) != NULL) + { + if (strcmp(file->d_name, ".") != 0 && strcmp(file->d_name, "..") != 0) + { + if (numnames + 1 >= fnsize) + { + fnsize *= 2; + filenames = (char **) repalloc(filenames, + fnsize * sizeof(char *)); + } + filenames[numnames++] = pstrdup(file->d_name); + } + } + + if (errno) + { + fprintf(stderr, _("could not read directory \"%s\": %s\n"), + path, strerror(errno)); + } + + filenames[numnames] = NULL; + + if (closedir(dir)) + { + fprintf(stderr, _("could not close directory \"%s\": %s\n"), + path, strerror(errno)); + } + + return filenames; +} + + +/* + * pgfnames_cleanup + * + * deallocate memory used for filenames + */ +void +pgfnames_cleanup(char **filenames) +{ + char **fn; + + for (fn = filenames; *fn; fn++) + pfree(*fn); + + pfree(filenames); +} + + +/* + * rmtree + * + * Delete a directory tree recursively. + * Assumes path points to a valid directory. + * Deletes everything under path. + * If rmtopdir is true deletes the directory too. + * Returns true if successful, false if there was any problem. + * (The details of the problem are reported already, so caller + * doesn't really have to say anything more, but most do.) + */ +bool +rmtree(const char *path, bool rmtopdir) +{ + bool result = true; + char pathbuf[MAXPGPATH]; + char **filenames; + char **filename; + struct stat statbuf; + + /* + * we copy all the names out of the directory before we start modifying + * it. + */ + filenames = pgfnames(path); + + if (filenames == NULL) + return false; + + /* now we have the names we can start removing things */ + for (filename = filenames; *filename; filename++) + { + snprintf(pathbuf, MAXPGPATH, "%s/%s", path, *filename); + + /* + * It's ok if the file is not there anymore; we were just about to + * delete it anyway. + * + * This is not an academic possibility. One scenario where this + * happens is when bgwriter has a pending unlink request for a file in + * a database that's being dropped. In dropdb(), we call + * ForgetDatabaseFsyncRequests() to flush out any such pending unlink + * requests, but because that's asynchronous, it's not guaranteed that + * the bgwriter receives the message in time. + */ + if (lstat(pathbuf, &statbuf) != 0) + { + if (errno != ENOENT) + { + result = false; + } + continue; + } + + if (S_ISDIR(statbuf.st_mode)) + { + /* call ourselves recursively for a directory */ + if (!rmtree(pathbuf, true)) + { + /* we already reported the error */ + result = false; + } + } + else + { + if (unlink(pathbuf) != 0) + { + if (errno != ENOENT) + { + result = false; + } + } + } + } + + if (rmtopdir) + { + if (rmdir(path) != 0) + { + result = false; + } + } + + pgfnames_cleanup(filenames); + + return result; +} + diff --git a/dirmod.h b/dirmod.h new file mode 100644 index 00000000..47caf982 --- /dev/null +++ b/dirmod.h @@ -0,0 +1,23 @@ +/* + * dirmod.h + * Copyright (c) 2ndQuadrant, 2010-2016 + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#ifndef _DIRMOD_H_ +#define _DIRMOD_H_ + +#endif diff --git a/errcode.h b/errcode.h index 3a9b70e3..153d7c00 100644 --- a/errcode.h +++ b/errcode.h @@ -38,5 +38,6 @@ #define ERR_INTERNAL 15 #define ERR_MONITORING_FAIL 16 #define ERR_BAD_BACKUP_LABEL 17 +#define ERR_SWITCHOVER_FAIL 18 #endif /* _ERRCODE_H_ */ diff --git a/repmgr.c b/repmgr.c index e73dbd43..053b1da4 100644 --- a/repmgr.c +++ b/repmgr.c @@ -87,7 +87,7 @@ static bool create_recovery_file(const char *data_dir); static int test_ssh_connection(char *host, char *remote_user); static int copy_remote_files(char *host, char *remote_user, char *remote_path, char *local_path, bool is_directory, int server_version_num); -static int run_basebackup(const char *data_dir); +static int run_basebackup(const char *data_dir, int server_version); static void check_parameters_for_action(const int action); static bool create_schema(PGconn *conn); static void write_primary_conninfo(char *line); @@ -121,7 +121,7 @@ static bool remote_command(const char *host, const char *user, const char *comma static void format_db_cli_params(const char *conninfo, char *output); static bool copy_file(const char *old_filename, const char *new_filename); -static void read_backup_label(const char *local_data_directory, struct BackupLabel *out_backup_label); +static bool read_backup_label(const char *local_data_directory, struct BackupLabel *out_backup_label); /* Global variables */ static const char *keywords[6]; @@ -187,6 +187,7 @@ main(int argc, char **argv) {"config-archive-dir", required_argument, NULL, 5}, {"pg_rewind", optional_argument, NULL, 6}, {"pwprompt", optional_argument, NULL, 7}, + {"csv", no_argument, NULL, 8}, {"help", no_argument, NULL, '?'}, {"version", no_argument, NULL, 'V'}, {NULL, 0, NULL, 0} @@ -427,6 +428,9 @@ main(int argc, char **argv) case 7: runtime_options.witness_pwprompt = true; break; + case 8: + runtime_options.csv_mode = true; + break; default: { @@ -764,9 +768,9 @@ do_cluster_show(void) conn = establish_db_connection(options.conninfo, true); sqlquery_snprintf(sqlquery, - "SELECT conninfo, type, name, upstream_node_name" - " FROM %s.repl_show_nodes", - get_repmgr_schema_quoted(conn)); + "SELECT conninfo, type, name, upstream_node_name, id" + " FROM %s.repl_show_nodes", + get_repmgr_schema_quoted(conn)); log_verbose(LOG_DEBUG, "do_cluster_show(): \n%s\n",sqlquery ); @@ -813,21 +817,24 @@ do_cluster_show(void) upstream_length = upstream_length_cur; } - printf("Role | %-*s | %-*s | Connection String\n", name_length, name_header, upstream_length, upstream_header); - printf("----------+-"); + if (! runtime_options.csv_mode) + { + printf("Role | %-*s | %-*s | Connection String\n", name_length, name_header, upstream_length, upstream_header); + printf("----------+-"); - for (i = 0; i < name_length; i++) - printf("-"); + for (i = 0; i < name_length; i++) + printf("-"); - printf("-|-"); - for (i = 0; i < upstream_length; i++) - printf("-"); + printf("-|-"); + for (i = 0; i < upstream_length; i++) + printf("-"); - printf("-|-"); - for (i = 0; i < conninfo_length; i++) - printf("-"); + printf("-|-"); + for (i = 0; i < conninfo_length; i++) + printf("-"); - printf("\n"); + printf("\n"); + } for (i = 0; i < PQntuples(res); i++) { @@ -841,11 +848,20 @@ do_cluster_show(void) else strcpy(node_role, "* master"); - printf("%-10s", node_role); - printf("| %-*s ", name_length, PQgetvalue(res, i, 2)); - printf("| %-*s ", upstream_length, PQgetvalue(res, i, 3)); - printf("| %s\n", PQgetvalue(res, i, 0)); - + if (runtime_options.csv_mode) + { + int connection_status = + (PQstatus(conn) == CONNECTION_OK) ? + (is_standby(conn) ? 1 : 0) : -1; + printf("%s,%d\n", PQgetvalue(res, i, 4), connection_status); + } + else + { + printf("%-10s", node_role); + printf("| %-*s ", name_length, PQgetvalue(res, i, 2)); + printf("| %-*s ", upstream_length, PQgetvalue(res, i, 3)); + printf("| %s\n", PQgetvalue(res, i, 0)); + } PQfinish(conn); } @@ -1110,8 +1126,9 @@ do_standby_register(void) PGconn *master_conn; int ret; - bool record_created; + t_node_info node_record = T_NODE_INFO_INITIALIZER; + int node_result; log_info(_("connecting to standby database\n")); conn = establish_db_connection(options.conninfo, true); @@ -1168,6 +1185,28 @@ do_standby_register(void) } } + /* + * Check that an active node with the same node_name doesn't exist already + */ + + node_result = get_node_record_by_name(master_conn, + options.cluster_name, + options.node_name, + &node_record); + + if (node_result) + { + if (node_record.active == true) + { + log_err(_("Node %i exists already with node_name \"%s\"\n"), + node_record.node_id, + options.node_name); + PQfinish(master_conn); + PQfinish(conn); + exit(ERR_BAD_CONFIG); + } + } + record_created = create_node_record(master_conn, "standby register", options.node, @@ -1393,7 +1432,7 @@ do_standby_clone(void) if (*runtime_options.recovery_min_apply_delay) { - if (get_server_version(upstream_conn, NULL) < 90400) + if (server_version_num < 90400) { log_err(_("PostgreSQL 9.4 or greater required for --recovery-min-apply-delay\n")); PQfinish(upstream_conn); @@ -1417,7 +1456,7 @@ do_standby_clone(void) { TablespaceListCell *cell; - if (get_server_version(upstream_conn, NULL) < 90400 && !runtime_options.rsync_only) + if (server_version_num < 90400 && !runtime_options.rsync_only) { log_err(_("in PostgreSQL 9.3, tablespace mapping can only be used in conjunction with --rsync-only\n")); PQfinish(upstream_conn); @@ -1673,12 +1712,13 @@ do_standby_clone(void) } /* Read backup label copied from primary */ - /* XXX ensure this function does not exit on error as we'd need to stop the backup */ - read_backup_label(local_data_directory, &backup_label); + if (read_backup_label(local_data_directory, &backup_label) == false) + { + r = retval = ERR_BAD_BACKUP_LABEL; + goto stop_backup; + } - printf("Label: %s; file: %s\n", backup_label.label, backup_label.start_wal_file); - - /* Handle tablespaces */ + /* Copy tablespaces and, if required, remap to a new location */ sqlquery_snprintf(sqlquery, " SELECT oid, pg_tablespace_location(oid) AS spclocation " @@ -1715,7 +1755,6 @@ do_standby_clone(void) appendPQExpBuffer(&tblspc_dir_src, "%s", PQgetvalue(res, i, 1)); /* Check if tablespace path matches one of the provided tablespace mappings */ - if (options.tablespace_mapping.head != NULL) { for (cell = options.tablespace_mapping.head; cell; cell = cell->next) @@ -1758,10 +1797,15 @@ do_standby_clone(void) goto stop_backup; } - /* Update symlinks in pg_tblspc */ + /* + * If a valid mapping was provide for this tablespace, arrange for it to + * be remapped + * (if no tablespace mappings was provided, the link will be copied as-is + * by pg_basebackup or rsync and no action is required) + */ if (mapping_found == true) { - /* 9.5 and later - create a tablespace_map file */ + /* 9.5 and later - append to the tablespace_map file */ if (server_version_num >= 90500) { tablespace_map_rewrite = true; @@ -1805,6 +1849,13 @@ do_standby_clone(void) PQclear(res); + /* + * For 9.5 and later, if tablespace remapping was requested, we'll need + * to rewrite the tablespace map file ourselves. + * The tablespace map file is read on startup and any links created by + * the backend; we could do this ourselves like for pre-9.5 servers, but + * it's better to rely on functionality the backend provides. + */ if (server_version_num >= 90500 && tablespace_map_rewrite == true) { PQExpBufferData tablespace_map_filename; @@ -1817,7 +1868,9 @@ do_standby_clone(void) /* Unlink any existing file (it should be there, but we don't care if it isn't) */ if (unlink(tablespace_map_filename.data) < 0 && errno != ENOENT) { - log_err(_("unable to remove tablespace_map file %s\n"), tablespace_map_filename.data); + log_err(_("unable to remove tablespace_map file %s: %s\n"), + tablespace_map_filename.data, + strerror(errno)); r = retval = ERR_BAD_BASEBACKUP; goto stop_backup; @@ -1845,7 +1898,7 @@ do_standby_clone(void) } else { - r = run_basebackup(local_data_directory); + r = run_basebackup(local_data_directory, server_version_num); if (r != 0) { log_warning(_("standby clone: base backup failed\n")); @@ -1988,8 +2041,8 @@ stop_backup: */ if (runtime_options.rsync_only) { - char script[MAXLEN]; char label_path[MAXPGPATH]; + char dirpath[MAXLEN] = ""; if (runtime_options.force) { @@ -1997,13 +2050,13 @@ stop_backup: * Remove any existing WAL from the target directory, since * rsync's --exclude option doesn't do it. */ - maxlen_snprintf(script, "rm -rf %s/pg_xlog/*", - local_data_directory); - r = system(script); - if (r != 0) + + maxlen_snprintf(dirpath, "%s/pg_xlog/", local_data_directory); + + if (!rmtree(dirpath, false)) { - log_err(_("unable to empty local WAL directory %s/pg_xlog/\n"), - local_data_directory); + log_err(_("unable to empty local WAL directory %s\n"), + dirpath); exit(ERR_BAD_RSYNC); } } @@ -2024,17 +2077,17 @@ stop_backup: * functionality of replication slots */ if (server_version_num >= 90400 && - backup_label.min_failover_slot_lsn == InvalidXLogRecPtr) + backup_label.min_failover_slot_lsn == InvalidXLogRecPtr) { - maxlen_snprintf(script, "rm -rf %s/pg_replslot/*", + maxlen_snprintf(dirpath, "%s/pg_replslot/*", local_data_directory); log_debug("deleting pg_replslot directory contents\n"); - r = system(script); - if (r != 0) + + if (!rmtree(dirpath, false)) { - log_err(_("unable to empty replication slot directory %s/pg_replslot/\n"), - local_data_directory); + log_err(_("unable to empty replication slot directory %s\n"), + dirpath); exit(ERR_BAD_RSYNC); } } @@ -2151,8 +2204,7 @@ parse_label_lsn(const char *label_key, const char *label_value) { log_err(_("Couldn't parse backup label entry \"%s: %s\" as lsn"), label_key, label_value); - - exit(ERR_BAD_BACKUP_LABEL); + return InvalidXLogRecPtr; } return ptr; @@ -2173,7 +2225,7 @@ parse_label_lsn(const char *label_key, const char *label_value) * *====================================== */ -static void +static bool read_backup_label(const char *local_data_directory, struct BackupLabel *out_backup_label) { char label_path[MAXPGPATH]; @@ -2198,7 +2250,7 @@ read_backup_label(const char *local_data_directory, struct BackupLabel *out_back { log_err(_("read_backup_label: could not open backup label file %s: %s"), label_path, strerror(errno)); - exit(ERR_BAD_BACKUP_LABEL); + return false; } log_info(_("read_backup_label: parsing backup label file '%s'\n"), @@ -2220,7 +2272,7 @@ read_backup_label(const char *local_data_directory, struct BackupLabel *out_back { log_err(_("read_backup_label: line too long in backup label file. Line begins \"%s: %s\""), label_key, label_value); - exit(ERR_BAD_BACKUP_LABEL); + return false; } log_debug("standby clone: got backup label entry \"%s: %s\"\n", @@ -2232,14 +2284,19 @@ read_backup_label(const char *local_data_directory, struct BackupLabel *out_back char wal_filename[MAXLEN]; nmatches = sscanf(label_value, "%" MAXLEN_STR "s (file %" MAXLEN_STR "[^)]", start_wal_location, wal_filename); + if (nmatches != 2) { log_err(_("read_backup_label: unable to parse \"START WAL LOCATION\" in backup label\n")); - exit(ERR_BAD_BACKUP_LABEL); + return false; } + out_backup_label->start_wal_location = parse_label_lsn(&label_key[0], start_wal_location); + if (out_backup_label->start_wal_location == InvalidXLogRecPtr) + return false; + (void) strncpy(out_backup_label->start_wal_file, wal_filename, MAXLEN); out_backup_label->start_wal_file[MAXLEN-1] = '\0'; } @@ -2247,6 +2304,9 @@ read_backup_label(const char *local_data_directory, struct BackupLabel *out_back { out_backup_label->checkpoint_location = parse_label_lsn(&label_key[0], &label_value[0]); + + if (out_backup_label->checkpoint_location == InvalidXLogRecPtr) + return false; } else if (strcmp(label_key, "BACKUP METHOD") == 0) { @@ -2272,6 +2332,9 @@ read_backup_label(const char *local_data_directory, struct BackupLabel *out_back { out_backup_label->min_failover_slot_lsn = parse_label_lsn(&label_key[0], &label_value[0]); + + if (out_backup_label->min_failover_slot_lsn == InvalidXLogRecPtr) + return false; } else { @@ -2281,6 +2344,10 @@ read_backup_label(const char *local_data_directory, struct BackupLabel *out_back } (void) fclose(label_file); + + log_debug(_("read_backup_label: label is %s; start wal file is %s\n"), out_backup_label->label, out_backup_label->start_wal_file); + + return true; } static void @@ -2663,6 +2730,8 @@ do_standby_follow(void) * * TODO: * - make connection test timeouts/intervals configurable (see below) + * - add command line option --remote_pg_bindir or similar to + * optionally handle cases where the remote pg_bindir is different */ static void @@ -2673,7 +2742,7 @@ do_standby_switchover(void) int server_version_num; bool use_pg_rewind; - /* the remote server is the primary which will be demoted */ + /* the remote server is the primary to be demoted */ char remote_conninfo[MAXCONNINFO] = ""; char remote_host[MAXLEN]; char remote_data_directory[MAXLEN]; @@ -2689,9 +2758,9 @@ do_standby_switchover(void) char repmgr_db_cli_params[MAXLEN] = ""; int query_result; - t_node_info remote_node_record; - bool connection_success; - + t_node_info remote_node_record = T_NODE_INFO_INITIALIZER; + bool connection_success, + shutdown_success; /* * SANITY CHECKS @@ -2711,7 +2780,7 @@ do_standby_switchover(void) log_err(_("switchover must be executed from the standby node to be promoted\n")); PQfinish(local_conn); - exit(ERR_BAD_CONFIG); + exit(ERR_SWITCHOVER_FAIL); } server_version_num = check_server_version(local_conn, "master", true, NULL); @@ -2821,8 +2890,8 @@ do_standby_switchover(void) /* 9.5 and later have pg_rewind built-in - always use that */ use_pg_rewind = true; maxlen_snprintf(remote_pg_rewind, - "%s/pg_rewind", - pg_bindir); + "%s", + make_pg_path("pg_rewind")); } else { @@ -2842,8 +2911,8 @@ do_standby_switchover(void) else { maxlen_snprintf(remote_pg_rewind, - "%s/pg_rewind", - pg_bindir); + "%s", + make_pg_path("pg_rewind")); } } else @@ -3038,8 +3107,8 @@ do_standby_switchover(void) log_verbose(LOG_DEBUG, "remote_archive_config_dir: %s\n", remote_archive_config_dir); maxlen_snprintf(command, - "%s/repmgr standby archive-config -f %s --config-archive-dir=%s", - pg_bindir, + "%s standby archive-config -f %s --config-archive-dir=%s", + make_pg_path("repmgr"), runtime_options.remote_config_file, remote_archive_config_dir); @@ -3094,41 +3163,63 @@ do_standby_switchover(void) termPQExpBuffer(&command_output); - connection_success = false; + shutdown_success = false; /* loop for timeout waiting for current primary to stop */ - for(i = 0; i < options.reconnect_attempts; i++) + for (i = 0; i < options.reconnect_attempts; i++) { /* Check whether primary is available */ - remote_conn = test_db_connection(remote_conninfo, false); /* don't fail on error */ + PGPing ping_res = PQping(remote_conninfo); - /* XXX failure to connect doesn't mean the server is necessarily - * completely stopped - we need to better detect the reason for - * connection failure ("server not listening" vs "shutting down") - * - * -> check is_pgup() - */ - if (PQstatus(remote_conn) != CONNECTION_OK) + /* database server could not be contacted */ + if (ping_res == PQPING_NO_RESPONSE) { - connection_success = true; + bool command_success; - log_notice(_("current master has been stopped\n")); - break; + /* + * directly access the server and check that the + * pidfile has gone away so we can be sure the server is actually + * shut down and the PQPING_NO_RESPONSE is not due to other issues + * such as coincidental network failure. + */ + initPQExpBuffer(&command_output); + + maxlen_snprintf(command, + "ls %s/postmaster.pid >/dev/null 2>&1 && echo 1 || echo 0", + remote_data_directory); + + command_success = remote_command( + remote_host, + runtime_options.remote_user, + command, + &command_output); + + if (command_success == true && *command_output.data == '0') + { + shutdown_success = true; + + log_notice(_("current master has been stopped\n")); + + termPQExpBuffer(&command_output); + + break; + } + + termPQExpBuffer(&command_output); } - PQfinish(remote_conn); - // configurable? + /* XXX make configurable? */ sleep(options.reconnect_interval); i++; } - if (connection_success == false) + if (shutdown_success == false) { log_err(_("master server did not shut down\n")); log_hint(_("check the master server status before performing any further actions")); - exit(ERR_FAILOVER_FAIL); + exit(ERR_SWITCHOVER_FAIL); } /* promote this standby */ @@ -3151,8 +3242,8 @@ do_standby_switchover(void) /* Execute pg_rewind */ maxlen_snprintf(command, - "%s/pg_rewind -D %s --source-server=\\'%s\\'", - pg_bindir, + "%s -D %s --source-server=\\'%s\\'", + remote_pg_rewind, remote_data_directory, options.conninfo); @@ -3173,8 +3264,8 @@ do_standby_switchover(void) /* Restore any previously archived config files */ maxlen_snprintf(command, - "%s/repmgr standby restore-config -D %s --config-archive-dir=%s", - pg_bindir, + "%s standby restore-config -D %s --config-archive-dir=%s", + make_pg_path("repmgr"), remote_data_directory, remote_archive_config_dir); @@ -3231,8 +3322,8 @@ do_standby_switchover(void) format_db_cli_params(options.conninfo, repmgr_db_cli_params); maxlen_snprintf(command, - "%s/repmgr -D %s -f %s %s --rsync-only --force --ignore-external-config-files standby clone", - pg_bindir, + "%s -D %s -f %s %s --rsync-only --force --ignore-external-config-files standby clone", + make_pg_path("repmgr"), remote_data_directory, runtime_options.remote_config_file, repmgr_db_cli_params @@ -3257,8 +3348,8 @@ do_standby_switchover(void) */ format_db_cli_params(options.conninfo, repmgr_db_cli_params); maxlen_snprintf(command, - "%s/repmgr -D %s -f %s %s standby follow", - pg_bindir, + "%s -D %s -f %s %s standby follow", + make_pg_path("repmgr"), remote_data_directory, runtime_options.remote_config_file, repmgr_db_cli_params @@ -3292,7 +3383,7 @@ do_standby_switchover(void) if (is_standby(remote_conn) == 0) { log_err(_("new standby (old master) is not a standby\n")); - exit(ERR_FAILOVER_FAIL); + exit(ERR_SWITCHOVER_FAIL); } connection_success = true; break; @@ -3306,7 +3397,7 @@ do_standby_switchover(void) if (connection_success == false) { log_err(_("unable to connect to new standby (old master)\n")); - exit(ERR_FAILOVER_FAIL); + exit(ERR_SWITCHOVER_FAIL); } log_debug("new standby is in recovery\n"); @@ -3315,15 +3406,14 @@ do_standby_switchover(void) local_conn = establish_db_connection(options.conninfo, true); - query_result = get_node_replication_state(local_conn, remote_node_record.name, remote_node_replication_state); + if (query_result == -1) { log_err(_("unable to retrieve replication status for node %i\n"), remote_node_id); PQfinish(local_conn); - // errcode? - exit(ERR_DB_QUERY); + exit(ERR_SWITCHOVER_FAIL); } if (query_result == 0) @@ -3332,7 +3422,6 @@ do_standby_switchover(void) } else { - /* XXX other valid values? */ /* XXX we should poll for a while in case the node takes time to connect to the primary */ if (strcmp(remote_node_replication_state, "streaming") == 0 || strcmp(remote_node_replication_state, "catchup") == 0) @@ -3341,9 +3430,16 @@ do_standby_switchover(void) } else { - log_err(_("node %i replication state is \"%s\"\n"), remote_node_id, remote_node_replication_state); + /* + * Other possible replication states are: + * - startup + * - backup + * - UNKNOWN + */ + log_err(_("node %i has unexpected replication state \"%s\"\n"), + remote_node_id, remote_node_replication_state); PQfinish(local_conn); - exit(ERR_DB_QUERY); + exit(ERR_SWITCHOVER_FAIL); } } @@ -3355,7 +3451,7 @@ do_standby_switchover(void) if (options.use_replication_slots) { - t_node_info local_node_record; + t_node_info local_node_record = T_NODE_INFO_INITIALIZER; query_result = get_node_record(local_conn, options.cluster_name, options.node, &local_node_record); @@ -4120,12 +4216,13 @@ do_help(void) printf(_(" -w, --wal-keep-segments=VALUE (standby clone) minimum value for the GUC\n" \ " wal_keep_segments (default: %s)\n"), DEFAULT_WAL_KEEP_SEGMENTS); printf(_(" -W, --wait (standby follow) wait for a master to appear\n")); - printf(_(" -m, --mode (standby switchover) shutdown mode (smart|fast|immediate)\n")); + printf(_(" -m, --mode (standby switchover) shutdown mode (\"fast\" - default, \"smart\" or \"immediate\")\n")); printf(_(" -C, --remote-config-file (standby switchover) path to the configuration file on\n" \ " the current master\n")); printf(_(" --pg_rewind[=VALUE] (standby switchover) 9.3/9.4 only - use pg_rewind if available,\n" \ " optionally providing a path to the binary\n")); printf(_(" -k, --keep-history=VALUE (cluster cleanup) retain indicated number of days of history (default: 0)\n")); + printf(_(" --csv (cluster show) output in CSV mode (0 = master, 1 = standby, -1 = down)\n")); /* printf(_(" --initdb-no-pwprompt (witness server) no superuser password prompt during initdb\n"));*/ printf(_(" -P, --pwprompt (witness server) prompt for password when creating users\n")); printf(_(" -S, --superuser=USERNAME (witness server) superuser username for witness database\n" \ @@ -4216,6 +4313,15 @@ create_recovery_file(const char *data_dir) log_debug(_("recovery.conf: %s"), line); } + /* If restore_command is set, we use it as restore_command in recovery.conf */ + if (strcmp(options.restore_command, "") != 0) + { + maxlen_snprintf(line, "restore_command = '%s'\n", + options.restore_command); + if (write_recovery_file_line(recovery_file, recovery_file_path, line) == false) + return false; + log_debug(_("recovery.conf: %s"), line); + } fclose(recovery_file); return true; @@ -4371,12 +4477,12 @@ copy_remote_files(char *host, char *remote_user, char *remote_path, static int -run_basebackup(const char *data_dir) +run_basebackup(const char *data_dir, int server_version) { char script[MAXLEN]; int r = 0; PQExpBufferData params; - TablespaceListCell *cell; + TablespaceListCell *cell; /* Create pg_basebackup command line options */ @@ -4411,6 +4517,28 @@ run_basebackup(const char *data_dir) } } + /* + * To ensure we have all the WALs needed during basebackup execution we stream + * them as the backup is taking place. + * Not necessary if on 9.6 if we have replication slots set in repmgr.conf + * (starting at 9.6 there is an option, which we use, to reserve the LSN at + * creation time) + */ + if (server_version < 90600 || !options.use_replication_slots) + { + /* + * We're going to check first if the user set the xlog method in the repmgr.conf + * file. We don't want to have conflicts with pg_basebackup due to specifying the + * method twice. + */ + const char xlog_short[4] = "-X "; + const char xlog_long[14] = "--xlog-method"; + if (strstr(options.pg_basebackup_options, xlog_short) == NULL && strstr(options.pg_basebackup_options, xlog_long) == NULL ) + { + appendPQExpBuffer(¶ms, " -X stream"); + } + } + maxlen_snprintf(script, "%s -l \"repmgr base backup\" %s %s", make_pg_path("pg_basebackup"), @@ -4423,7 +4551,7 @@ run_basebackup(const char *data_dir) /* * As of 9.4, pg_basebackup only ever returns 0 or 1 - */ + */ r = system(script); @@ -4624,6 +4752,15 @@ check_parameters_for_action(const int action) } } + /* Warn about parameters which apply to CLUSTER SHOW only */ + if (action != CLUSTER_SHOW) + { + if (runtime_options.csv_mode) + { + error_list_append(&cli_warnings, _("--csv can only be used when executing CLUSTER SHOW")); + } + } + return; } @@ -5080,6 +5217,10 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error) NULL, }; + /* + * Note that in 9.6+, "hot_standby" and "archive" are accepted as aliases + * for "replica", but current_setting() will of course always return "replica" + */ char *levels_96plus[] = { "replica", "logical", diff --git a/repmgr.conf.sample b/repmgr.conf.sample index fc2bd5b4..6b1af192 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -19,7 +19,7 @@ # Node ID and name # (Note: we recommend to avoid naming nodes after their initial -# replication funcion, as this will cause confusion when e.g. +# replication function, as this will cause confusion when e.g. # "standby2" is promoted to primary) #node=2 # a unique integer #node_name=node2 # an arbitrary (but unique) string; we recommend using @@ -28,8 +28,16 @@ # Database connection information as a conninfo string # This must be accessible to all servers in the cluster; for details see: -# http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING +# +# https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING +# #conninfo='host=192.168.204.104 dbname=repmgr_db user=repmgr_usr' +# +# If repmgrd is in use, consider explicitly setting `connect_timeout` in the +# conninfo string to determine the length of time which elapses before +# a network connection attempt is abandoned; for details see: +# +# https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNECT-CONNECT-TIMEOUT # Optional configuration items # ============================ @@ -113,6 +121,10 @@ # # tablespace_mapping=/path/to/original/tablespace=/path/to/new/tablespace +# You can specify a restore_command to be used in the recovery.conf that +# will be placed in the cloned standby +# +# restore_command = cp /path/to/archived/wals/%f %p # Failover settings (repmgrd) # --------------------------- diff --git a/repmgr.h b/repmgr.h index 780dccf7..20690eea 100644 --- a/repmgr.h +++ b/repmgr.h @@ -28,6 +28,7 @@ #include "dbutils.h" #include "errcode.h" #include "config.h" +#include "dirmod.h" #define MIN_SUPPORTED_VERSION "9.3" #define MIN_SUPPORTED_VERSION_NUM 90300 @@ -69,7 +70,7 @@ typedef struct bool rsync_only; bool fast_checkpoint; bool ignore_external_config_files; - char pg_ctl_mode[MAXLEN]; + bool csv_mode; char masterport[MAXLEN]; /* * configuration file parameters which can be overridden on the @@ -80,6 +81,7 @@ typedef struct /* parameter used by STANDBY SWITCHOVER */ char remote_config_file[MAXLEN]; char pg_rewind[MAXPGPATH]; + char pg_ctl_mode[MAXLEN]; /* parameter used by STANDBY {ARCHIVE_CONFIG | RESTORE_CONFIG} */ char config_archive_dir[MAXLEN]; /* parameter used by CLUSTER CLEANUP */ @@ -94,7 +96,7 @@ typedef struct bool initdb_no_pwprompt; } t_runtime_options; -#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, "smart", "", "", "", "", "", 0, "", "", "", false } +#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, false, false, false, false, false, false, "", "", "", "", "fast", "", 0, "", "", "", false } struct BackupLabel { diff --git a/repmgrd.c b/repmgrd.c index b0d26f32..3c856d9a 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -44,11 +44,11 @@ /* Local info */ -t_configuration_options local_options; +t_configuration_options local_options = T_CONFIGURATION_OPTIONS_INITIALIZER; PGconn *my_local_conn = NULL; /* Master info */ -t_configuration_options master_options; +t_configuration_options master_options = T_CONFIGURATION_OPTIONS_INITIALIZER; PGconn *master_conn = NULL; @@ -61,8 +61,6 @@ bool failover_done = false; char *pid_file = NULL; -t_configuration_options config = T_CONFIGURATION_OPTIONS_INITIALIZER; - static void help(void); static void usage(void); static void check_cluster_configuration(PGconn *conn); @@ -399,7 +397,7 @@ main(int argc, char **argv) case STANDBY: /* We need the node id of the master server as well as a connection to it */ - log_info(_("connecting to master node '%s'\n"), + log_info(_("connecting to master node of cluster '%s'\n"), local_options.cluster_name); master_conn = get_master_connection(my_local_conn, @@ -462,16 +460,16 @@ main(int argc, char **argv) do { - log_verbose(LOG_DEBUG, "standby check loop...\n"); - - if (node_info.type == WITNESS) - { - witness_monitor(); - } - else if (node_info.type == STANDBY) + if (node_info.type == STANDBY) { + log_verbose(LOG_DEBUG, "standby check loop...\n"); standby_monitor(); } + else if (node_info.type == WITNESS) + { + log_verbose(LOG_DEBUG, "witness check loop...\n"); + witness_monitor(); + } sleep(local_options.monitor_interval_secs); @@ -667,7 +665,7 @@ witness_monitor(void) " replication_lag, apply_lag )" " VALUES(%d, %d, " " '%s'::TIMESTAMP WITH TIME ZONE, NULL, " - " pg_current_xlog_location(), NULL, " + " pg_catalog.pg_current_xlog_location(), NULL, " " 0, 0) ", get_repmgr_schema_quoted(my_local_conn), master_options.node, @@ -695,7 +693,7 @@ standby_monitor(void) { PGresult *res; char monitor_standby_timestamp[MAXLEN]; - char last_wal_master_location[MAXLEN]; + char last_wal_primary_location[MAXLEN]; char last_xlog_receive_location[MAXLEN]; char last_xlog_replay_location[MAXLEN]; char last_xact_replay_timestamp[MAXLEN]; @@ -706,6 +704,9 @@ standby_monitor(void) XLogRecPtr lsn_last_xlog_receive_location; XLogRecPtr lsn_last_xlog_replay_location; + long long unsigned int replication_lag; + long long unsigned int apply_lag; + int connection_retries, ret; bool did_retry = false; @@ -750,10 +751,9 @@ standby_monitor(void) ? "master" : "upstream"; - // ZZZ "5 minutes"? /* - * Check if the upstream node is still available, if after 5 minutes of retries - * we cannot reconnect, try to get a new upstream node. + * Check that the upstream node is still available + * If not, initiate failover process */ check_connection(&upstream_conn, upstream_node_type, upstream_conninfo); @@ -820,26 +820,24 @@ standby_monitor(void) else if (local_options.failover == AUTOMATIC_FAILOVER) { /* - * When we returns from this function we will have a new master + * When we return from this function we will have a new master * and a new master_conn - */ - - /* + * * Failover handling is handled differently depending on whether * the failed node is the master or a cascading standby */ upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id); - if (upstream_node.type == MASTER) - { - log_debug(_("failure detected on master node (%i); attempting to promote a standby\n"), - node_info.upstream_node_id); - do_master_failover(); - } - else - { - log_debug(_("failure detected on upstream node %i; attempting to reconnect to new upstream node\n"), - node_info.upstream_node_id); + if (upstream_node.type == MASTER) + { + log_debug(_("failure detected on master node (%i); attempting to promote a standby\n"), + node_info.upstream_node_id); + do_master_failover(); + } + else + { + log_debug(_("failure detected on upstream node %i; attempting to reconnect to new upstream node\n"), + node_info.upstream_node_id); if (!do_upstream_standby_failover(upstream_node)) { @@ -847,20 +845,20 @@ standby_monitor(void) initPQExpBuffer(&errmsg); appendPQExpBuffer(&errmsg, - _("unable to reconnect to new upstream node, terminating...")); + _("unable to reconnect to new upstream node, terminating...")); log_err("%s\n", errmsg.data); create_event_record(master_conn, - &local_options, - local_options.node, - "repmgrd_shutdown", - false, - errmsg.data); + &local_options, + local_options.node, + "repmgrd_shutdown", + false, + errmsg.data); terminate(ERR_DB_CON); } - } + } return; } } @@ -963,7 +961,7 @@ standby_monitor(void) if (active_master_id != master_options.node) { - log_notice(_("connecting to active master (node %i)...\n"), active_master_id); \ + log_notice(_("connecting to active master (node %i)...\n"), active_master_id); if (master_conn != NULL) { PQfinish(master_conn); @@ -986,9 +984,11 @@ standby_monitor(void) /* Get local xlog info */ sqlquery_snprintf(sqlquery, - "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " - "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), " - "pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()"); + "SELECT CURRENT_TIMESTAMP, " + "pg_catalog.pg_last_xlog_receive_location(), " + "pg_catalog.pg_last_xlog_replay_location(), " + "pg_catalog.pg_last_xact_replay_timestamp(), " + "pg_catalog.pg_last_xlog_receive_location() >= pg_catalog.pg_last_xlog_replay_location()"); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) @@ -1038,7 +1038,12 @@ standby_monitor(void) "Replayed WAL newer than received WAL - is this standby connected to its upstream?\n"); } - /* Get master xlog info */ + /* + * Get master xlog position + * + * TODO: investigate whether pg_current_xlog_insert_location() would be a better + * choice; see: https://github.com/2ndQuadrant/repmgr/issues/189 + */ sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()"); res = PQexec(master_conn, sqlquery); @@ -1049,23 +1054,43 @@ standby_monitor(void) return; } - strncpy(last_wal_master_location, PQgetvalue(res, 0, 0), MAXLEN); + strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN); PQclear(res); - /* Calculate the lag */ - lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_master_location, NULL); - + lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_primary_location, NULL); lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL); + /* Calculate apply lag */ if (last_xlog_receive_location_gte_replayed == false) { - lsn_last_xlog_receive_location = lsn_last_xlog_replay_location; + /* + * We're not receiving streaming WAL - in this case the receive location + * equals the last replayed location + */ + apply_lag = 0; + strncpy(last_xlog_receive_location, last_xlog_replay_location, MAXLEN); + lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL); } else { + apply_lag = (long long unsigned int)lsn_last_xlog_receive_location - lsn_last_xlog_replay_location; lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL); } + /* Calculate replication lag */ + if (lsn_master_current_xlog_location >= lsn_last_xlog_receive_location) + { + replication_lag = (long long unsigned int)(lsn_master_current_xlog_location - lsn_last_xlog_receive_location); + } + else + { + /* This should never happen, but in case it does set lag to zero */ + log_warning("Master xlog (%s) location appears less than standby receive location (%s)\n", + last_wal_primary_location, + last_xlog_receive_location); + replication_lag = 0; + } + /* * Build the SQL to execute on master */ @@ -1092,11 +1117,10 @@ standby_monitor(void) local_options.node, monitor_standby_timestamp, last_xact_replay_timestamp, - last_wal_master_location, + last_wal_primary_location, last_xlog_receive_location, - (long long unsigned int)(lsn_master_current_xlog_location - lsn_last_xlog_receive_location), - (long long unsigned int)(lsn_last_xlog_receive_location - lsn_last_xlog_replay_location)); - + replication_lag, + apply_lag); /* * Execute the query asynchronously, but don't check for a result. We will * check the result next time we pause for a monitor step. @@ -1143,8 +1167,8 @@ do_master_failover(void) */ t_node_info nodes[FAILOVER_NODES_MAX_CHECK]; - /* Store details of the failed node here */ - t_node_info failed_master = T_NODE_INFO_INITIALIZER; + /* Store details of the failed node here */ + t_node_info failed_master = T_NODE_INFO_INITIALIZER; /* Store details of the best candidate for promotion to master here */ t_node_info best_candidate = T_NODE_INFO_INITIALIZER; @@ -1154,7 +1178,7 @@ do_master_failover(void) "SELECT id, conninfo, type, upstream_node_id " " FROM %s.repl_nodes " " WHERE cluster = '%s' " - " AND active IS TRUE " + " AND active IS TRUE " " AND priority > 0 " " ORDER BY priority DESC, id " " LIMIT %i ", @@ -1167,7 +1191,6 @@ do_master_failover(void) { log_err(_("unable to retrieve node records: %s\n"), PQerrorMessage(my_local_conn)); PQclear(res); - PQfinish(my_local_conn); terminate(ERR_DB_QUERY); } @@ -1541,12 +1564,12 @@ do_master_failover(void) log_notice(_("Original master reappeared before this standby was promoted - no action taken\n")); PQfinish(master_conn); + master_conn = NULL; + /* no failover occurred but we'll want to restart connections */ failover_done = true; return; } - - PQfinish(my_local_conn); } log_err(_("promote command failed. You could check and try it manually.\n")); @@ -1901,7 +1924,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo) static bool set_local_node_status(void) { - PGresult *res; + PGresult *res; char sqlquery[QUERY_STR_LEN]; int active_master_node_id = NODE_NOT_FOUND; char master_conninfo[MAXLEN]; @@ -1994,10 +2017,12 @@ check_cluster_configuration(PGconn *conn) log_info(_("checking cluster configuration with schema '%s'\n"), get_repmgr_schema()); sqlquery_snprintf(sqlquery, - "SELECT oid FROM pg_class " + "SELECT oid FROM pg_catalog.pg_class " " WHERE oid = '%s.repl_nodes'::regclass ", - get_repmgr_schema_quoted(master_conn)); + get_repmgr_schema_quoted(master_conn)); + res = PQexec(conn, sqlquery); + if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("PQexec failed: %s\n"), PQerrorMessage(conn)); @@ -2416,6 +2441,8 @@ get_node_info(PGconn *conn, char *cluster, int node_id) errmsg.data); PQfinish(conn); + conn = NULL; + terminate(ERR_DB_QUERY); } diff --git a/sql/repmgr2_repmgr3.sql b/sql/repmgr2_repmgr3.sql index f99e84d0..a4eacd86 100644 --- a/sql/repmgr2_repmgr3.sql +++ b/sql/repmgr2_repmgr3.sql @@ -63,6 +63,15 @@ UPDATE repl_nodes SET type = 'master' WHERE id = $master_id; -- UPDATE repl_nodes SET active = FALSE WHERE id IN (...); +/* There's also an event table which we need to create */ +CREATE TABLE repl_events ( + node_id INTEGER NOT NULL, + event TEXT NOT NULL, + successful BOOLEAN NOT NULL DEFAULT TRUE, + event_timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, + details TEXT NULL +); + /* When you're sure of your changes, commit them */ -- COMMIT; diff --git a/sql/repmgr3.1.1_repmgr3.1.2.sql b/sql/repmgr3.1.1_repmgr3.1.2.sql index 7dedacc8..36b3dc79 100644 --- a/sql/repmgr3.1.1_repmgr3.1.2.sql +++ b/sql/repmgr3.1.1_repmgr3.1.2.sql @@ -27,5 +27,6 @@ BEGIN; -ALTER TABLE repl_nodes ALTER CONSTRAINT repl_nodes_upstream_node_id_fkey DEFERRABLE; +ALTER TABLE repl_nodes DROP CONSTRAINT repl_nodes_upstream_node_id_fkey, + ADD CONSTRAINT repl_nodes_upstream_node_id_fkey FOREIGN KEY (upstream_node_id) REFERENCES repl_nodes(id) DEFERRABLE; COMMIT;