Compare commits

..

1 Commits
v3.0.1 ... v3.0

Author SHA1 Message Date
Ian Barwick
372f4f7d3d Bump specfile version number 2015-03-27 11:47:23 +09:00
13 changed files with 337 additions and 425 deletions

19
FAQ.md
View File

@@ -90,23 +90,6 @@ General
This option is only available when using the `--rsync-only` option.
- How can I make the witness server use a particular port?
By default the witness server is configured to use port 5499; this
is intended to support running the witness server as a separate
instance on a normal node server, rather than on its own dedicated server.
To specify a port for the witness server, supply the port number to
repmgr with the `-l/--local-port` command line option.
- Do I need to include `shared_preload_libraries = 'repmgr_funcs'`
in `postgresql.conf` if I'm not using `repmgrd`?
No, the `repmgr_funcs` library is only needed when running `repmgrd`.
If you later decide to run `repmgrd`, you just need to add
`shared_preload_libraries = 'repmgr_funcs'` and restart PostgreSQL.
`repmgrd`
---------
@@ -119,7 +102,7 @@ General
- How can I prevent a node from ever being promoted to master?
In `repmgr.conf`, set its priority to a value of 0 or less.
In `rempgr.conf`, set its priority to a value of 0 or less.
- Does `repmgrd` support delayed standbys?

14
HISTORY
View File

@@ -1,11 +1,4 @@
3.0.1 2015-04-16
Prevent repmgrd from looping infinitely if node was not registered (Ian)
When promoting a standby, have repmgr (not repmgrd) handle metadata updates (Ian)
Re-use replication slot if it already exists (Ian)
Prevent a test SSH connection being made when not needed (Ian)
Correct monitoring table column names (Ian)
3.0 2015-03-27
3.0
Require PostgreSQL 9.3 or later (Ian)
Use `pg_basebackup` by default (instead of `rsync`) to clone standby servers (Ian)
Use `pg_ctl promote` to promote a standby to primary
@@ -18,11 +11,6 @@
General usability and logging message improvements (Ian)
Code consolidation and cleanup (Ian)
2.0.3 2015-04-16
Add -S/--superuser option for witness database creation (Ian)
Add -c/--fast-checkpoint option for cloning (Christoph)
Add option "--initdb-no-pwprompt" (Ian)
2.0.2 2015-02-17
Add "--checksum" in rsync when using "--force" (Jaime)
Use createdb/createuser instead of psql (Jaime)

View File

@@ -98,7 +98,7 @@ packages installed::
sudo apt-get install libxslt-dev libxml2-dev libpam-dev libedit-dev
If you're using Debian packages for PostgreSQL and are building repmgr with the
If your using Debian packages for PostgreSQL and are building repmgr with the
USE_PGXS option you also need to install the corresponding development package::
sudo apt-get install postgresql-server-dev-9.0

View File

@@ -71,10 +71,7 @@ Standby setup
[2015-03-03 18:18:23] [NOTICE] HINT: You can now start your postgresql server
[2015-03-03 18:18:23] [NOTICE] for example : pg_ctl -D /path/to/standby/data start
Note that the `repmgr.conf` file is not required when cloning a standby.
However we recommend providing a valid `repmgr.conf` if you wish to use
replication slots, or want `repmgr` to log the clone event to the
`repl_events` table.
Note that at this point it does not matter if the `repmgr.conf` file is not found.
This will clone the PostgreSQL database files from the master, including its
`postgresql.conf` and `pg_hba.conf` files, and additionally automatically create

View File

@@ -1,114 +1,89 @@
#!/bin/sh
#!/bin/bash
#
# chkconfig: - 75 16
# description: Enable repmgrd replication management and monitoring daemon for PostgreSQL
# processname: repmgrd
# pidfile="/var/run/${NAME}.pid"
# repmgrd Start up the repmgrd daemon
# repmgrd (replication manager daemon)
#
# chkconfig: - 75 16
# description: repmgrd is the replication manager daemon \
# The repmgrd replication management and monitoring daemon for PostgreSQL.
### BEGIN INIT INFO
# Provides: repmgrd
# Required-Start: $local_fs $remote_fs $network $syslog postgresql
# Required-Stop: $local_fs $remote_fs $network $syslog postgresql
# Should-Start: $syslog postgresql-9.3
# Should-Stop: $syslog postgresql-9.3
# Short-Description: start and stop repmgrd
# Description: Enable repmgrd replication management and monitoring daemon for PostgreSQL
# this is used to monitor a postgresql cluster.
### END INIT INFO
# Source function library.
INITD=/etc/rc.d/init.d
. $INITD/functions
. /etc/init.d/functions
# Get function listing for cross-distribution logic.
TYPESET=`typeset -f|grep "declare"`
# Get network config.
# Source networking configuration.
. /etc/sysconfig/network
DESC="PostgreSQL replication management and monitoring daemon"
NAME=repmgrd
REPMGRD_ENABLED=no
prog=repmgrd
REPMGRD_ENABLED=yes
REPMGRD_OPTS=
REPMGRD_USER=postgres
REPMGRD_BIN=/usr/pgsql-9.3/bin/repmgrd
REPMGRD_PIDFILE=/var/run/repmgrd.pid
REPMGRD_LOCK=/var/lock/subsys/${NAME}
REPMGRD_LOG=/var/lib/pgsql/9.3/data/pg_log/repmgrd.log
DAEMONIZE="-d"
# Read configuration variable file if it is present
[ -r /etc/sysconfig/$NAME ] && . /etc/sysconfig/$NAME
# pull in sysconfig settings
[ -f /etc/sysconfig/repmgrd ] && . /etc/sysconfig/repmgrd
# For SELinux we need to use 'runuser' not 'su'
if [ -x /sbin/runuser ]
then
SU=runuser
else
SU=su
fi
test -x $REPMGRD_BIN || exit 0
LOCKFILE=/var/lock/subsys/$prog
RETVAL=0
case "$REPMGRD_ENABLED" in
[Yy]*)
break
#nothing to do here
;;
*)
exit 0
exit 2
;;
esac
if [ -z "${REPMGRD_OPTS}" ]
if [ -z "$REPMGRD_OPTS" ]
then
echo "Not starting ${NAME}, REPMGRD_OPTS not set in /etc/sysconfig/${NAME}"
exit 0
echo "Not starting $prog, REPMGRD_OPTS not set in /etc/sysconfig/$prog"
exit 2
fi
start()
{
REPMGRD_START=$"Starting ${NAME} service: "
start() {
[ "$EUID" != "0" ] && exit 4
[ "$NETWORKING" = "no" ] && exit 1
# Make sure startup-time log file is valid
if [ ! -e "${REPMGRD_LOG}" -a ! -h "${REPMGRD_LOG}" ]
then
touch "${REPMGRD_LOG}" || exit 1
chown ${REPMGRD_USER}:postgres "${REPMGRD_LOG}"
chmod go-rwx "${REPMGRD_LOG}"
[ -x /sbin/restorecon ] && /sbin/restorecon "${REPMGRD_LOG}"
fi
echo -n "${REPMGRD_START}"
$SU -l $REPMGRD_USER -c "${REPMGRD_BIN} ${REPMGRD_OPTS} -p ${REPMGRD_PIDFILE} &" >> "${REPMGRD_LOG}" 2>&1 < /dev/null
sleep 2
pid=`head -n 1 "${REPMGRD_PIDFILE}" 2>/dev/null`
if [ "x${pid}" != "x" ]
then
success "${REPMGRD_START}"
touch "${REPMGRD_LOCK}"
echo $pid > "${REPMGRD_PIDFILE}"
# Start daemons.
echo -n $"Starting $prog: "
daemon --user $REPMGRD_USER $prog $DAEMONIZE $REPMGRD_OPTS
RETVAL=$?
echo
else
failure "${REPMGRD_START}"
echo
script_result=1
fi
[ $RETVAL -eq 0 ] && touch $LOCKFILE
return $RETVAL
}
stop()
{
echo -n $"Stopping ${NAME} service: "
if [ -e "${REPMGRD_LOCK}" ]
then
killproc ${NAME}
ret=$?
if [ $ret -eq 0 ]
then
echo_success
rm -f "${REPMGRD_PIDFILE}"
rm -f "${REPMGRD_LOCK}"
stop() {
[ "$EUID" != "0" ] && exit 4
echo -n $"Shutting down $prog: "
killproc $prog
RETVAL=$?
echo
[ $RETVAL -eq 0 ] && rm -f $LOCKFILE
return $RETVAL
}
status() {
if [ -f "$LOCKFILE" ]; then
echo "$prog is running"
else
echo_failure
script_result=1
RETVAL=3
echo "$prog is stopped"
fi
else
# not running; per LSB standards this is "ok"
echo_success
fi
echo
return $RETVAL
}
# See how we were called.
case "$1" in
start)
@@ -118,16 +93,22 @@ case "$1" in
stop
;;
status)
status -p $REPMGRD_PIDFILE $NAME
script_result=$?
status $prog
;;
restart)
restart|force-reload)
stop
start
start
;;
try-restart|condrestart)
if status $prog > /dev/null; then
stop
start
fi
;;
reload)
exit 3
;;
*)
echo $"Usage: $0 {start|stop|status|restart}"
echo $"Usage: $0 {start|stop|status|restart|try-restart|force-reload}"
exit 2
esac
exit $script_result

View File

@@ -1,21 +1,4 @@
# default settings for repmgrd. This file is sourced by /bin/sh from
# /etc/init.d/repmgrd
#default sysconfig file for repmgrd
#custom overrides can be placed here
# disable repmgrd by default so it won't get started upon installation
# valid values: yes/no
REPMGRD_ENABLED=no
# Options for repmgrd (required)
#REPMGRD_OPTS="--verbose -d -f /var/lib/pgsql/repmgr/repmgr.conf"
# User to run repmgrd as
#REPMGRD_USER=postgres
# repmgrd binary
#REPMGRD_BIN=/usr/bin/repmgr
# pid file
#REPMGRD_PIDFILE=/var/lib/pgsql/repmgr/repmgrd.pid
# log file
#REPMGRD_LOG=/var/lib/pgsql/repmgr/repmgrd.log
REPMGRD_OPTS="-f /etc/repmgr/repmgr.conf"

2
TODO
View File

@@ -5,8 +5,6 @@ Known issues in repmgr
the database server using the ``pg_ctl`` command may accidentally
terminate after their associated ssh session ends.
* PGPASSFILE may not be passed to pg_basebackup
Planned feature improvements
============================

View File

@@ -197,7 +197,7 @@ is_pgup(PGconn *conn, int timeout)
/*
* Return the id of the active master node, or NODE_NOT_FOUND if no
* Return the id of the active master node, or -1 if no
* record available.
*
* This reports the value stored in the database only and
@@ -224,12 +224,12 @@ get_master_node_id(PGconn *conn, char *cluster)
{
log_err(_("get_master_node_id(): query failed\n%s\n"),
PQerrorMessage(conn));
retval = NODE_NOT_FOUND;
retval = -1;
}
else if (PQntuples(res) == 0)
{
log_warning(_("get_master_node_id(): no active primary found\n"));
retval = NODE_NOT_FOUND;
retval = -1;
}
else
{
@@ -511,7 +511,7 @@ get_master_connection(PGconn *standby_conn, char *cluster,
if(master_id != NULL)
{
*master_id = NODE_NOT_FOUND;
*master_id = -1;
}
/* find all nodes belonging to this cluster */
@@ -728,49 +728,6 @@ create_replication_slot(PGconn *conn, char *slot_name)
char sqlquery[QUERY_STR_LEN];
PGresult *res;
/*
* Check whether slot exists already; if it exists and is active, that
* means another active standby is using it, which creates an error situation;
* if not we can reuse it as-is
*/
sqlquery_snprintf(sqlquery,
"SELECT active, slot_type "
" FROM pg_replication_slots "
" WHERE slot_name = '%s' ",
slot_name);
res = PQexec(conn, sqlquery);
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("unable to query pg_replication_slots: %s\n"),
PQerrorMessage(conn));
PQclear(res);
return false;
}
if(PQntuples(res))
{
if(strcmp(PQgetvalue(res, 0, 1), "physical") != 0)
{
log_err(_("Slot '%s' exists and is not a physical slot\n"),
slot_name);
PQclear(res);
}
if(strcmp(PQgetvalue(res, 0, 0), "f") == 0)
{
PQclear(res);
log_debug(_("Replication slot '%s' exists but is inactive; reusing\n"),
slot_name);
return true;
}
PQclear(res);
log_err(_("Slot '%s' already exists as an active slot\n"),
slot_name);
return false;
}
sqlquery_snprintf(sqlquery,
"SELECT * FROM pg_create_physical_replication_slot('%s')",
slot_name);
@@ -1094,12 +1051,7 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
bool success = true;
struct tm ts;
/* Only attempt to write a record if a connection handle was provided.
Also check that the repmgr schema has been properly initialised - if
not it means no configuration file was provided, which can happen with
e.g. `repmgr standby clone`, and we won't know which schema to write to.
*/
if(conn != NULL && strcmp(repmgr_schema, DEFAULT_REPMGR_SCHEMA_PREFIX) != 0)
if(conn != NULL)
{
int n_node_id = htonl(node_id);
char *t_successful = successful ? "TRUE" : "FALSE";

331
repmgr.c
View File

@@ -77,7 +77,6 @@ static bool write_recovery_file_line(FILE *recovery_file, char *recovery_file_pa
static void check_master_standby_version_match(PGconn *conn, PGconn *master_conn);
static int check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *server_version_string);
static bool check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error);
static bool update_node_record_set_master(PGconn *conn, int this_node_id);
static char *make_pg_path(char *file);
@@ -140,11 +139,11 @@ main(int argc, char **argv)
{"verbose", no_argument, NULL, 'v'},
{"pg_bindir", required_argument, NULL, 'b'},
{"rsync-only", no_argument, NULL, 'r'},
{"fast-checkpoint", no_argument, NULL, 'c'},
{"initdb-no-pwprompt", no_argument, NULL, 1},
{"check-upstream-config", no_argument, NULL, 2},
{"recovery-min-apply-delay", required_argument, NULL, 3},
{"ignore-external-config-files", no_argument, NULL, 4},
{"fast-checkpoint", no_argument, NULL, 4},
{"ignore-external-config-files", no_argument, NULL, 5},
{NULL, 0, NULL, 0}
};
@@ -175,7 +174,7 @@ main(int argc, char **argv)
/* Prevent getopt_long() from printing an error message */
opterr = 0;
while ((c = getopt_long(argc, argv, "d:h:p:U:S:D:l:f:R:w:k:FWIvb:r:c", long_options,
while ((c = getopt_long(argc, argv, "d:h:p:U:S:D:l:f:R:w:k:FWIvr:b:", long_options,
&optindex)) != -1)
{
switch (c)
@@ -240,9 +239,6 @@ main(int argc, char **argv)
case 'r':
runtime_options.rsync_only = true;
break;
case 'c':
runtime_options.fast_checkpoint = true;
break;
case 1:
runtime_options.initdb_no_pwprompt = true;
break;
@@ -271,6 +267,9 @@ main(int argc, char **argv)
strncpy(runtime_options.recovery_min_apply_delay, optarg, MAXLEN);
break;
case 4:
runtime_options.fast_checkpoint = true;
break;
case 5:
runtime_options.ignore_external_config_files = true;
break;
default:
@@ -486,7 +485,7 @@ main(int argc, char **argv)
*/
if (config_file_required)
{
if (options.node == NODE_NOT_FOUND)
if (options.node == -1)
{
if(config_file_parsed == true)
{
@@ -926,7 +925,7 @@ do_standby_clone(void)
int i;
bool pg_start_backup_executed = false;
bool target_directory_provided = false;
bool external_config_file_copy_required = false;
bool config_file_copy_required = false;
char master_data_directory[MAXFILENAME];
char local_data_directory[MAXFILENAME];
@@ -1102,7 +1101,7 @@ do_standby_clone(void)
if(strcmp(PQgetvalue(res, i, 2), "f") == 0)
{
config_file_outside_pgdata = true;
external_config_file_copy_required = true;
config_file_copy_required = true;
strncpy(master_config_file, PQgetvalue(res, i, 1), MAXFILENAME);
}
}
@@ -1111,7 +1110,7 @@ do_standby_clone(void)
if(strcmp(PQgetvalue(res, i, 2), "f") == 0)
{
hba_file_outside_pgdata = true;
external_config_file_copy_required = true;
config_file_copy_required = true;
strncpy(master_hba_file, PQgetvalue(res, i, 1), MAXFILENAME);
}
}
@@ -1120,14 +1119,13 @@ do_standby_clone(void)
if(strcmp(PQgetvalue(res, i, 2), "f") == 0)
{
ident_file_outside_pgdata = true;
external_config_file_copy_required = true;
config_file_copy_required = true;
strncpy(master_ident_file, PQgetvalue(res, i, 1), MAXFILENAME);
}
}
else
log_warning(_("unknown parameter: %s\n"), PQgetvalue(res, i, 0));
}
PQclear(res);
/*
@@ -1318,6 +1316,12 @@ do_standby_clone(void)
}
PQclear(res);
/*
* With rsync we'll need to explicitly copy configuration files in any
* case
*/
config_file_copy_required = true;
}
else
{
@@ -1331,14 +1335,14 @@ do_standby_clone(void)
}
/*
* If configuration files were not inside the data directory, we'll need to
* copy them via SSH (unless `--ignore-external-config-files` was provided)
* If configuration files were not in the data directory, we need to copy
* them via SSH
*
* TODO: add option to place these files in the same location on the
* standby server as on the primary?
*/
if(external_config_file_copy_required && !runtime_options.ignore_external_config_files)
if(config_file_copy_required == true)
{
log_notice(_("copying configuration files from master\n"));
r = test_ssh_connection(runtime_options.host, runtime_options.remote_user);
@@ -1350,80 +1354,101 @@ do_standby_clone(void)
goto stop_backup;
}
if(config_file_outside_pgdata)
if(strlen(master_config_file))
{
log_info(_("standby clone: master config file '%s'\n"), master_config_file);
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_config_file, local_config_file, false, server_version_num);
if (r != 0)
if(runtime_options.ignore_external_config_files && config_file_outside_pgdata)
{
log_err(_("standby clone: failed copying master config file '%s'\n"),
master_config_file);
retval = ERR_BAD_SSH;
goto stop_backup;
log_notice(_("standby clone: not copying master config file '%s'\n"), master_config_file);
}
else
{
log_info(_("standby clone: master config file '%s'\n"), master_config_file);
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_config_file, local_config_file, false, server_version_num);
if (r != 0)
{
log_warning(_("standby clone: failed copying master config file '%s'\n"),
master_config_file);
retval = ERR_BAD_SSH;
goto stop_backup;
}
}
}
if(hba_file_outside_pgdata)
if(strlen(master_hba_file))
{
log_info(_("standby clone: master hba file '%s'\n"), master_hba_file);
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_hba_file, local_hba_file, false, server_version_num);
if (r != 0)
if(runtime_options.ignore_external_config_files && hba_file_outside_pgdata)
{
log_err(_("standby clone: failed copying master hba file '%s'\n"),
master_hba_file);
retval = ERR_BAD_SSH;
goto stop_backup;
log_notice(_("standby clone: not copying master config file '%s'\n"), master_hba_file);
}
else
{
log_info(_("standby clone: master hba file '%s'\n"), master_hba_file);
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_hba_file, local_hba_file, false, server_version_num);
if (r != 0)
{
log_warning(_("standby clone: failed copying master hba file '%s'\n"),
master_hba_file);
retval = ERR_BAD_SSH;
goto stop_backup;
}
}
}
if(ident_file_outside_pgdata)
if(strlen(master_ident_file))
{
log_info(_("standby clone: master ident file '%s'\n"), master_ident_file);
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_ident_file, local_ident_file, false, server_version_num);
if (r != 0)
if(runtime_options.ignore_external_config_files && ident_file_outside_pgdata)
{
log_err(_("standby clone: failed copying master ident file '%s'\n"),
master_ident_file);
retval = ERR_BAD_SSH;
goto stop_backup;
log_notice(_("standby clone: not copying master config file '%s'\n"), master_ident_file);
}
else
{
log_info(_("standby clone: master ident file '%s'\n"), master_ident_file);
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_ident_file, local_ident_file, false, server_version_num);
if (r != 0)
{
log_warning(_("standby clone: failed copying master ident file '%s'\n"),
master_ident_file);
retval = ERR_BAD_SSH;
goto stop_backup;
}
}
}
}
/*
* When using rsync, copy pg_control file last, emulating the base backup
* protocol.
*/
if(runtime_options.rsync_only)
{
maxlen_snprintf(local_control_file, "%s/global", local_data_directory);
log_info(_("standby clone: local control file '%s'\n"),
local_control_file);
if (!create_dir(local_control_file))
/*
* When using rsync, copy pg_control file last, emulating the base backup
* protocol.
*/
if(runtime_options.rsync_only)
{
log_err(_("couldn't create directory %s ...\n"),
local_control_file);
goto stop_backup;
}
maxlen_snprintf(local_control_file, "%s/global", local_data_directory);
maxlen_snprintf(master_control_file, "%s/global/pg_control",
master_data_directory);
log_info(_("standby clone: master control file '%s'\n"),
log_info(_("standby clone: local control file '%s'\n"),
local_control_file);
if (!create_dir(local_control_file))
{
log_err(_("couldn't create directory %s ...\n"),
local_control_file);
goto stop_backup;
}
maxlen_snprintf(master_control_file, "%s/global/pg_control",
master_data_directory);
log_info(_("standby clone: master control file '%s'\n"),
master_control_file);
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_control_file, local_control_file,
false, server_version_num);
if (r != 0)
{
log_warning(_("standby clone: failed copying master control file '%s'\n"),
master_control_file);
retval = ERR_BAD_SSH;
goto stop_backup;
r = copy_remote_files(runtime_options.host, runtime_options.remote_user,
master_control_file, local_control_file,
false, server_version_num);
if (r != 0)
{
log_warning(_("standby clone: failed copying master control file '%s'\n"),
master_control_file);
retval = ERR_BAD_SSH;
goto stop_backup;
}
}
}
@@ -1537,9 +1562,8 @@ do_standby_promote(void)
int i,
promote_check_timeout = 60,
promote_check_interval = 2;
bool promote_success = false;
bool promote_sucess = false;
bool success;
PQExpBufferData details;
/* We need to connect to check configuration */
log_info(_("connecting to standby database\n"));
@@ -1613,57 +1637,50 @@ do_standby_promote(void)
retval = is_standby(conn);
if(!retval)
{
promote_success = true;
promote_sucess = true;
break;
}
sleep(promote_check_interval);
}
if (promote_success == false)
if (promote_sucess == false)
{
PQExpBufferData details;
initPQExpBuffer(&details);
appendPQExpBuffer(&details,
"Node %i could not be promoted to master",
options.node);
create_event_record(old_master_conn,
&options,
options.node,
"standby_promote",
false,
details.data);
/* XXX exit with error? */
log_err(_(retval == 1 ?
"STANDBY PROMOTE failed, this is still a standby node.\n" :
"connection to node lost!\n"));
exit(ERR_FAILOVER_FAIL);
}
/* update node information to reflect new status */
if(update_node_record_set_master(conn, options.node) == false)
else
{
PQExpBufferData details;
initPQExpBuffer(&details);
appendPQExpBuffer(&details,
_("unable to update node record for node %i"),
"Node %i was successfully promoted to master",
options.node);
log_err("%s\n", details.data);
create_event_record(NULL,
log_notice(_("STANDBY PROMOTE successful. You should REINDEX any hash indexes you have.\n"));
/* Log the event */
create_event_record(conn,
&options,
options.node,
"repmgrd_failover_promote",
false,
"standby_promote",
true,
details.data);
exit(ERR_DB_QUERY);
}
initPQExpBuffer(&details);
appendPQExpBuffer(&details,
"Node %i was successfully promoted to master",
options.node);
log_notice(_("STANDBY PROMOTE successful. You should REINDEX any hash indexes you have.\n"));
/* Log the event */
create_event_record(conn,
&options,
options.node,
"standby_promote",
true,
details.data);
PQfinish(old_master_conn);
PQfinish(conn);
return;
@@ -2215,7 +2232,7 @@ help(const char *progname)
printf(_(" -p, --port=PORT database server port\n"));
printf(_(" -U, --username=USERNAME database user name to connect as\n"));
printf(_("\nConfiguration options:\n"));
printf(_(" -b, --pg_bindir=PATH path to PostgreSQL binaries (optional)\n"));
printf(_(" -b. --pg_bindir=PATH path to PostgreSQL binaries (optional)\n"));
printf(_(" -D, --data-dir=DIR local directory where the files will be\n" \
" copied to\n"));
printf(_(" -l, --local-port=PORT standby or witness server local port\n"));
@@ -2225,13 +2242,15 @@ help(const char *progname)
" (default: postgres)\n"));
printf(_(" -w, --wal-keep-segments=VALUE minimum value for the GUC\n" \
" wal_keep_segments (default: %s)\n"), DEFAULT_WAL_KEEP_SEGMENTS);
printf(_(" -k, --keep-history=VALUE keeps indicated number of days of history\n"));
printf(_(" -F, --force force potentially dangerous operations to happen\n"));
printf(_(" -k, --keep-history=VALUE keeps indicated number of days of\n" \
" history\n"));
printf(_(" -F, --force force potentially dangerous operations\n" \
" to happen\n"));
printf(_(" -W, --wait wait for a master to appear\n"));
printf(_(" -r, --rsync-only use only rsync to clone a standby\n"));
printf(_(" -c, --fast-checkpoint force fast checkpoint when cloning a standby\n"));
printf(_(" --recovery-min-apply-delay=VALUE set recovery_min_apply_delay in recovery.conf\n" \
" when cloning a standby (PostgreSQL 9.4 and later)\n"));
printf(_(" --fast-checkpoint force fast checkpoint when cloning a standby\n"));
printf(_(" --ignore-external-config-files don't copy configuration files located outside \n" \
" the data directory when cloning a standby\n"));
printf(_(" --initdb-no-pwprompt don't require superuser password when running initdb\n"));
@@ -2421,7 +2440,7 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
*/
if (is_directory)
{
/* Files which we don't want */
/* Files we don't want */
appendPQExpBuffer(&rsync_flags, "%s",
" --exclude=postmaster.pid --exclude=postmaster.opts --exclude=global/pg_control");
@@ -2435,11 +2454,11 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
" --exclude=postgresql.auto.conf.tmp");
}
/* Temporary files which we don't want, if they exist */
/* Temporary files we don't want, if they exist */
appendPQExpBuffer(&rsync_flags, " --exclude=%s*",
PG_TEMP_FILE_PREFIX);
/* Directories which we don't want */
/* Directories we don't want */
appendPQExpBuffer(&rsync_flags, "%s",
" --exclude=pg_xlog/* --exclude=pg_log/* --exclude=pg_stat_tmp/*");
@@ -2682,6 +2701,7 @@ create_schema(PGconn *conn)
char sqlquery[QUERY_STR_LEN];
PGresult *res;
/* create schema */
sqlquery_snprintf(sqlquery, "CREATE SCHEMA %s", get_repmgr_schema_quoted(conn));
log_debug(_("master register: %s\n"), sqlquery);
@@ -3251,92 +3271,6 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
}
/*
 * update_node_record_set_master()
 *
 * Inside a single transaction, mark every active 'master' record of the
 * configured cluster as inactive, then set the record for `this_node_id`
 * to type 'master' with no upstream node.
 *
 * conn:          connection on which the statements are executed
 * this_node_id:  id of the node to be recorded as the new master
 *
 * Returns true if the transaction was committed. Returns false, after
 * logging the error, if BEGIN, either UPDATE or COMMIT failed; when an
 * UPDATE fails a ROLLBACK is issued before returning.
 */
static bool
update_node_record_set_master(PGconn *conn, int this_node_id)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	log_debug(_("Setting %i as master and marking existing master as failed\n"), this_node_id);

	res = PQexec(conn, "BEGIN");
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to begin transaction: %s\n"),
				PQerrorMessage(conn));
		PQclear(res);
		return false;
	}
	PQclear(res);

	/*
	 * Step 1: deactivate whichever node is currently recorded as the
	 * active master.
	 *
	 * NOTE(review): the cluster name is interpolated into the query text
	 * as-is; presumably it originates from the local configuration file -
	 * confirm it can never carry untrusted input.
	 */
	sqlquery_snprintf(sqlquery,
					  " UPDATE %s.repl_nodes "
					  " SET active = FALSE "
					  " WHERE cluster = '%s' "
					  " AND type = 'master' "
					  " AND active IS TRUE ",
					  get_repmgr_schema_quoted(conn),
					  options.cluster_name);

	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to set old master node as inactive: %s\n"),
				PQerrorMessage(conn));
		PQclear(res);

		/* PQexec() always hands back a result object which must be freed */
		PQclear(PQexec(conn, "ROLLBACK"));
		return false;
	}
	PQclear(res);

	/* Step 2: record this node as the master, with no upstream node */
	sqlquery_snprintf(sqlquery,
					  " UPDATE %s.repl_nodes "
					  " SET type = 'master', "
					  " upstream_node_id = NULL "
					  " WHERE cluster = '%s' "
					  " AND id = %i ",
					  get_repmgr_schema_quoted(conn),
					  options.cluster_name,
					  this_node_id);

	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to set current node %i as active master: %s\n"),
				this_node_id,
				PQerrorMessage(conn));
		PQclear(res);

		/* PQexec() always hands back a result object which must be freed */
		PQclear(PQexec(conn, "ROLLBACK"));
		return false;
	}
	PQclear(res);

	res = PQexec(conn, "COMMIT");
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to set commit transaction: %s\n"),
				PQerrorMessage(conn));
		PQclear(res);
		return false;
	}
	PQclear(res);

	return true;
}
static void
do_check_upstream_config(void)
{
@@ -3373,6 +3307,7 @@ do_check_upstream_config(void)
}
static char *
make_pg_path(char *file)
{

View File

@@ -7,11 +7,8 @@
#
# repmgr and repmgrd require these items to be configured:
# Cluster name - this will be used by repmgr to generate its internal
# schema (pattern: "repmgr_{cluster}"); while this name will be quoted
# to preserve case, we recommend using lower case and avoiding whitespace
# to facilitate easier querying of the repmgr views and tables.
cluster=example_cluster
# Cluster name
cluster=test
# Node ID and name
# (Note: we recommend to avoid naming nodes after their initial

View File

@@ -49,7 +49,6 @@
#define MANUAL_FAILOVER 0
#define AUTOMATIC_FAILOVER 1
#define NODE_NOT_FOUND -1
#define NO_UPSTREAM_NODE -1

141
repmgrd.c
View File

@@ -91,6 +91,7 @@ static void witness_monitor(void);
static bool check_connection(PGconn *conn, const char *type);
static bool set_local_node_failed(void);
static bool update_node_record_set_master(PGconn *conn, int this_node_id, int old_master_node_id);
static bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
static void update_shared_memory(char *last_wal_standby_applied);
@@ -283,17 +284,9 @@ main(int argc, char **argv)
terminate(ERR_BAD_CONFIG);
}
/* Retrieve record for this node from the local database */
/* Retrieve record for this node from the database */
node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);
/* No node record found - exit gracefully */
if(node_info.node_id == NODE_NOT_FOUND)
{
log_err(_("No metadata record found for this node - terminating\n"));
log_notice(_("HINT: was this node registered with 'repmgr (master|standby) register'?\n"));
terminate(ERR_BAD_CONFIG);
}
log_debug("node id is %i, upstream is %i\n", node_info.node_id, node_info.upstream_node_id);
/*
@@ -342,9 +335,9 @@ main(int argc, char **argv)
log_info(_("starting continuous master connection check\n"));
/*
* Check that master is still alive.
* XXX We should also check that the
* standby servers are sending info
* Check that master is still alive.
* XXX We should also check that the
* standby servers are sending info
*/
/*
@@ -644,9 +637,9 @@ witness_monitor(void)
*/
sqlquery_snprintf(sqlquery,
"INSERT INTO %s.repl_monitor "
" (primary_node, standby_node, "
" (master_node, standby_node, "
" last_monitor_time, last_apply_time, "
" last_wal_primary_location, last_wal_standby_location, "
" last_wal_master_location, last_wal_standby_location, "
" replication_lag, apply_lag )"
" VALUES(%d, %d, "
" '%s'::TIMESTAMP WITH TIME ZONE, NULL, "
@@ -1000,9 +993,9 @@ standby_monitor(void)
*/
sqlquery_snprintf(sqlquery,
"INSERT INTO %s.repl_monitor "
" (primary_node, standby_node, "
" (master_node, standby_node, "
" last_monitor_time, last_apply_time, "
" last_wal_primary_location, last_wal_standby_location, "
" last_wal_master_location, last_wal_standby_location, "
" replication_lag, apply_lag ) "
" VALUES(%d, %d, "
" '%s'::TIMESTAMP WITH TIME ZONE, '%s'::TIMESTAMP WITH TIME ZONE, "
@@ -1441,6 +1434,25 @@ do_master_failover(void)
/* and reconnect to the local database */
my_local_conn = establish_db_connection(local_options.conninfo, true);
/* update node information to reflect new status */
if(update_node_record_set_master(my_local_conn, node_info.node_id, failed_master.node_id) == false)
{
appendPQExpBuffer(&event_details,
_("unable to update node record for node %i (promoted to master following failure of node %i)"),
node_info.node_id,
failed_master.node_id);
log_err("%s\n", event_details.data);
create_event_record(NULL,
&local_options,
node_info.node_id,
"repmgrd_failover_promote",
false,
event_details.data);
terminate(ERR_DB_QUERY);
}
/* update internal record for this node */
node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);
@@ -1737,7 +1749,7 @@ set_local_node_failed(void)
{
PGresult *res;
char sqlquery[QUERY_STR_LEN];
int active_master_node_id = NODE_NOT_FOUND;
int active_master_node_id = -1;
char master_conninfo[MAXLEN];
if (!check_connection(master_conn, "master"))
@@ -2077,7 +2089,7 @@ update_registration(void)
log_err("%s\n", errmsg.data);
create_event_record(master_conn,
create_event_record(my_local_conn,
&local_options,
local_options.node,
"repmgrd_shutdown",
@@ -2222,7 +2234,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
char sqlquery[QUERY_STR_LEN];
PGresult *res;
t_node_info node_info = { NODE_NOT_FOUND, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
t_node_info node_info = {-1, NO_UPSTREAM_NODE, "", InvalidXLogRecPtr, UNKNOWN, false, false};
sprintf(sqlquery,
"SELECT id, upstream_node_id, conninfo, type, slot_name, active "
@@ -2248,7 +2260,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
log_err("%s\n", errmsg.data);
create_event_record(NULL,
create_event_record(my_local_conn,
&local_options,
local_options.node,
"repmgrd_shutdown",
@@ -2262,7 +2274,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
if (!PQntuples(res)) {
log_warning(_("No record found record for node %i\n"), node_id);
PQclear(res);
node_info.node_id = NODE_NOT_FOUND;
node_info.node_id = -1;
return node_info;
}
@@ -2301,6 +2313,93 @@ parse_node_type(const char *type)
}
/*
 * update_node_record_set_master()
 *
 * Inside a single transaction, mark the record of the failed master
 * (`old_master_node_id`) as inactive, then set the record for
 * `this_node_id` to type 'master' with no upstream node.
 *
 * conn:                connection on which the statements are executed
 * this_node_id:        id of the node to be recorded as the new master
 * old_master_node_id:  id of the node whose record is set inactive
 *
 * Returns true if the transaction was committed. Returns false, after
 * logging the error, if BEGIN, either UPDATE or COMMIT failed; when an
 * UPDATE fails a ROLLBACK is issued before returning.
 */
static bool
update_node_record_set_master(PGconn *conn, int this_node_id, int old_master_node_id)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	log_debug(_("Setting failed node %i inactive; marking node %i as master\n"), old_master_node_id, this_node_id);

	res = PQexec(conn, "BEGIN");
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to begin transaction: %s\n"),
				PQerrorMessage(conn));
		PQclear(res);
		return false;
	}
	PQclear(res);

	/*
	 * Step 1: deactivate the record of the failed master.
	 *
	 * NOTE(review): the cluster name is interpolated into the query text
	 * as-is; presumably it originates from the local configuration file -
	 * confirm it can never carry untrusted input.
	 */
	sqlquery_snprintf(sqlquery,
					  " UPDATE %s.repl_nodes "
					  " SET active = FALSE "
					  " WHERE cluster = '%s' "
					  " AND id = %i ",
					  get_repmgr_schema_quoted(conn),
					  local_options.cluster_name,
					  old_master_node_id);

	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to set old master node %i as inactive: %s\n"),
				old_master_node_id,
				PQerrorMessage(conn));
		PQclear(res);

		/* PQexec() always hands back a result object which must be freed */
		PQclear(PQexec(conn, "ROLLBACK"));
		return false;
	}
	PQclear(res);

	/* Step 2: record this node as the master, with no upstream node */
	sqlquery_snprintf(sqlquery,
					  " UPDATE %s.repl_nodes "
					  " SET type = 'master', "
					  " upstream_node_id = NULL "
					  " WHERE cluster = '%s' "
					  " AND id = %i ",
					  get_repmgr_schema_quoted(conn),
					  local_options.cluster_name,
					  this_node_id);

	res = PQexec(conn, sqlquery);
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to set current node %i as active master: %s\n"),
				this_node_id,
				PQerrorMessage(conn));
		PQclear(res);

		/* PQexec() always hands back a result object which must be freed */
		PQclear(PQexec(conn, "ROLLBACK"));
		return false;
	}
	PQclear(res);

	res = PQexec(conn, "COMMIT");
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		log_err(_("Unable to set commit transaction: %s\n"),
				PQerrorMessage(conn));
		PQclear(res);
		return false;
	}
	PQclear(res);

	return true;
}
static bool
update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id)
{

View File

@@ -1,6 +1,6 @@
#ifndef _VERSION_H_
#define _VERSION_H_
#define REPMGR_VERSION "3.0.1"
#define REPMGR_VERSION "3.0dev"
#endif