mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
Compare commits
31 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9209248420 | ||
|
|
6693b99288 | ||
|
|
8e7b487838 | ||
|
|
7f796e2d15 | ||
|
|
5e04ab6eae | ||
|
|
a1f4285e2b | ||
|
|
493133986d | ||
|
|
8b370dc581 | ||
|
|
43af00aa12 | ||
|
|
3c8df59eb9 | ||
|
|
b410772627 | ||
|
|
d99024ba11 | ||
|
|
1afaa3a26f | ||
|
|
3b66a31ac9 | ||
|
|
ad3630e7a9 | ||
|
|
2e7acf03c4 | ||
|
|
2bc8044fda | ||
|
|
b0b44a157f | ||
|
|
49a2531930 | ||
|
|
4191b77e70 | ||
|
|
2a5d431481 | ||
|
|
93a999adc7 | ||
|
|
088ca29fe3 | ||
|
|
30e9d06172 | ||
|
|
cd1a84252e | ||
|
|
2e19b3688b | ||
|
|
de883a4c84 | ||
|
|
499a501afd | ||
|
|
0a9107d76d | ||
|
|
95ec0450da | ||
|
|
57aa95f674 |
4
CREDITS
4
CREDITS
@@ -10,3 +10,7 @@ Hannu Krosing <hannu@2ndQuadrant.com>
|
||||
Cédric Villemain <cedric@2ndquadrant.com>
|
||||
Charles Duffy <charles@dyfis.net>
|
||||
Daniel Farina <daniel@heroku.com>
|
||||
Shawn Ellis <shawn.ellis17@gmail.com>
|
||||
Jay Taylor <jay@jaytaylor.com>
|
||||
Christian Kruse <christian@2ndQuadrant.com>
|
||||
Krzysztof Gajdemski <songo@debian.org.pl>
|
||||
|
||||
9
HISTORY
9
HISTORY
@@ -1,4 +1,11 @@
|
||||
2.0beta 2012-07-27
|
||||
2.0beta2 2013-12-19
|
||||
Improve autofailover logic and algorithms (Jaime, Andres)
|
||||
Ignore pg_log when cloning (Jaime)
|
||||
Add timestamps to log line in stderr (Christian)
|
||||
Correctly check wal_keep_segments (Jay Taylor)
|
||||
Add a ssh_options parameter (Jay Taylor)
|
||||
|
||||
2.0beta1 2012-07-27
|
||||
Make CLONE command try to make an exact copy including $PGDATA location (Cedric)
|
||||
Add detection of master failure (Jaime)
|
||||
Add the notion of a witness server (Jaime)
|
||||
|
||||
@@ -112,7 +112,7 @@ Log in node2.
|
||||
Clone the node1 (the current Master)::
|
||||
|
||||
su - postgres
|
||||
repmgr -d repmgr -U repmgr standby clone node1
|
||||
repmgr -d repmgr -U repmgr -h node1 standby clone
|
||||
|
||||
Start the PostgreSQL server::
|
||||
|
||||
@@ -173,7 +173,7 @@ Log in witness.
|
||||
Initialize the witness server::
|
||||
|
||||
su - postgres
|
||||
repmgr -d repmgr -U repmgr -h 192.168.1.10 -D $WITNESS_PGDATA -f /etc/repmgr/repmgr.conf witness create node1
|
||||
repmgr -d repmgr -U repmgr -h 192.168.1.10 -D $WITNESS_PGDATA -f /etc/repmgr/repmgr.conf witness create
|
||||
|
||||
It needs information to connect to the master to copy the configuration of the cluster; it also needs to know where it should initialize its own $PGDATA.
|
||||
As part of the process it also asks for the superuser password so it can connect when needed.
|
||||
|
||||
@@ -127,10 +127,10 @@ mkdir_p(char *path, mode_t omode)
|
||||
{
|
||||
struct stat sb;
|
||||
mode_t numask,
|
||||
oumask;
|
||||
oumask;
|
||||
int first,
|
||||
last,
|
||||
retval;
|
||||
last,
|
||||
retval;
|
||||
char *p;
|
||||
|
||||
p = path;
|
||||
|
||||
4
config.c
4
config.c
@@ -41,6 +41,7 @@ parse_config(const char *config_file, t_configuration_options *options)
|
||||
memset(options->promote_command, 0, sizeof(options->promote_command));
|
||||
memset(options->follow_command, 0, sizeof(options->follow_command));
|
||||
memset(options->rsync_options, 0, sizeof(options->rsync_options));
|
||||
memset(options->ssh_options, 0, sizeof(options->ssh_options));
|
||||
|
||||
/* if nothing has been provided defaults to 60 */
|
||||
options->master_response_timeout = 60;
|
||||
@@ -78,6 +79,8 @@ parse_config(const char *config_file, t_configuration_options *options)
|
||||
strncpy (options->conninfo, value, MAXLEN);
|
||||
else if (strcmp(name, "rsync_options") == 0)
|
||||
strncpy (options->rsync_options, value, QUERY_STR_LEN);
|
||||
else if (strcmp(name, "ssh_options") == 0)
|
||||
strncpy (options->ssh_options, value, QUERY_STR_LEN);
|
||||
else if (strcmp(name, "loglevel") == 0)
|
||||
strncpy (options->loglevel, value, MAXLEN);
|
||||
else if (strcmp(name, "logfacility") == 0)
|
||||
@@ -283,6 +286,7 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
|
||||
strcpy(orig_options->promote_command, new_options.promote_command);
|
||||
strcpy(orig_options->follow_command, new_options.follow_command);
|
||||
strcpy(orig_options->rsync_options, new_options.rsync_options);
|
||||
strcpy(orig_options->ssh_options, new_options.ssh_options);
|
||||
orig_options->master_response_timeout = new_options.master_response_timeout;
|
||||
orig_options->reconnect_attempts = new_options.reconnect_attempts;
|
||||
orig_options->reconnect_intvl = new_options.reconnect_intvl;
|
||||
|
||||
1
config.h
1
config.h
@@ -36,6 +36,7 @@ typedef struct
|
||||
char loglevel[MAXLEN];
|
||||
char logfacility[MAXLEN];
|
||||
char rsync_options[QUERY_STR_LEN];
|
||||
char ssh_options[QUERY_STR_LEN];
|
||||
int master_response_timeout;
|
||||
int reconnect_attempts;
|
||||
int reconnect_intvl;
|
||||
|
||||
74
dbutils.c
74
dbutils.c
@@ -146,15 +146,16 @@ is_pgup(PGconn *conn, int timeout)
|
||||
/*
|
||||
* Send a SELECT 1 just to check if the connection is OK
|
||||
*/
|
||||
CancelQuery(conn, timeout);
|
||||
if (!CancelQuery(conn, timeout))
|
||||
goto failed;
|
||||
if (wait_connection_availability(conn, timeout) != 1)
|
||||
goto failed;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT 1");
|
||||
if (PQsendQuery(conn, sqlquery) == 0)
|
||||
{
|
||||
log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
goto failed;
|
||||
}
|
||||
if (wait_connection_availability(conn, timeout) != 1)
|
||||
@@ -249,6 +250,40 @@ guc_setted(PGconn *conn, const char *parameter, const char *op,
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Just like guc_setted except with an extra parameter containing the name of
|
||||
* the pg datatype so that the comparison can be done properly.
|
||||
*/
|
||||
bool
|
||||
guc_setted_typed(PGconn *conn, const char *parameter, const char *op,
|
||||
const char *value, const char *datatype)
|
||||
{
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings "
|
||||
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
||||
parameter, datatype, op, value, datatype);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("GUC setting check PQexec failed: %s"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
if (PQntuples(res) == 0)
|
||||
{
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
get_cluster_size(PGconn *conn)
|
||||
@@ -396,7 +431,7 @@ getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
|
||||
|
||||
/*
|
||||
* wait until current query finishes ignoring any results, this could be an async command
|
||||
* or a cancelation of a query
|
||||
* or a cancelation of a query
|
||||
* return 1 if Ok; 0 if any error occurred; -1 if timeout reached
|
||||
*/
|
||||
int
|
||||
@@ -408,11 +443,11 @@ wait_connection_availability(PGconn *conn, int timeout)
|
||||
{
|
||||
if (PQconsumeInput(conn) == 0)
|
||||
{
|
||||
log_warning(_("PQconsumeInput: Query could not be sent to primary. %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
log_warning(_("wait_connection_availability: could not receive data from master. %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
if (PQisBusy(conn) == 0)
|
||||
{
|
||||
res = PQgetResult(conn);
|
||||
@@ -424,23 +459,40 @@ wait_connection_availability(PGconn *conn, int timeout)
|
||||
}
|
||||
if (timeout >= 0)
|
||||
return 1;
|
||||
else
|
||||
else {
|
||||
log_warning(_("wait_connection_availability: timeout reached"));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
bool
|
||||
CancelQuery(PGconn *conn, int timeout)
|
||||
{
|
||||
char errbuf[ERRBUFF_SIZE];
|
||||
PGcancel *pgcancel;
|
||||
|
||||
wait_connection_availability(conn, timeout);
|
||||
if (wait_connection_availability(conn, timeout) != 1)
|
||||
return false;
|
||||
|
||||
pgcancel = PQgetCancel(conn);
|
||||
|
||||
if (!pgcancel || PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
||||
if (pgcancel == NULL)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* PQcancel can only return 0 if socket()/connect()/send()
|
||||
* fails, in any of those cases we can assume something
|
||||
* bad happened to the connection
|
||||
*/
|
||||
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
||||
{
|
||||
log_warning(_("Can't stop current query: %s\n"), errbuf);
|
||||
PQfreeCancel(pgcancel);
|
||||
return false;
|
||||
}
|
||||
|
||||
PQfreeCancel(pgcancel);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -31,11 +31,13 @@ bool is_witness(PGconn *conn, char *schema, char *cluster, int node_id);
|
||||
bool is_pgup(PGconn *conn, int timeout);
|
||||
char *pg_version(PGconn *conn, char* major_version);
|
||||
bool guc_setted(PGconn *conn, const char *parameter, const char *op,
|
||||
const char *value);
|
||||
const char *value);
|
||||
bool guc_setted_typed(PGconn *conn, const char *parameter, const char *op,
|
||||
const char *value, const char *datatype);
|
||||
const char *get_cluster_size(PGconn *conn);
|
||||
PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
|
||||
int *master_id, char *master_conninfo_out);
|
||||
|
||||
int wait_connection_availability(PGconn *conn, int timeout);
|
||||
void CancelQuery(PGconn *conn, int timeout);
|
||||
bool CancelQuery(PGconn *conn, int timeout);
|
||||
#endif
|
||||
|
||||
6
debian/DEBIAN/control
vendored
6
debian/DEBIAN/control
vendored
@@ -1,9 +1,9 @@
|
||||
Package: repmgr-auto
|
||||
Version: 1.0-1
|
||||
Version: 2.0beta2
|
||||
Section: database
|
||||
Priority: optional
|
||||
Architecture: all
|
||||
Depends: rsync, postgresql-9.0
|
||||
Maintainer: Greg Smith <greg@2ndQuadrant.com>
|
||||
Depends: rsync, postgresql-9.0 | postgresql-9.1 | postgresql-9.2 | postgresql-9.3
|
||||
Maintainer: Jaime Casanova <jaime@2ndQuadrant.com>
|
||||
Description: PostgreSQL replication setup, management and monitoring
|
||||
has two main executables
|
||||
|
||||
14
debian/repmgr.repmgrd.default
vendored
Normal file
14
debian/repmgr.repmgrd.default
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
#!/bin/sh
# Default settings for repmgrd. This file is sourced by /bin/sh from
# /etc/init.d/repmgrd, which reads the REPMGRD_* variables set below.

# Options passed to repmgrd on start
REPMGRD_OPTS=""

# repmgrd binary (the init script tests this path with "test -x")
REPMGRD_BIN="/usr/bin/repmgrd"

# pid file used by start-stop-daemon to start and stop the daemon
REPMGRD_PIDFILE="/var/run/repmgrd.pid"
|
||||
|
||||
|
||||
48
debian/repmgr.repmgrd.init
vendored
Normal file
48
debian/repmgr.repmgrd.init
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
#!/bin/sh
### BEGIN INIT INFO
# Provides:          repmgrd
# Required-Start:    $local_fs $remote_fs $network $syslog $postgresql
# Required-Stop:     $local_fs $remote_fs $network $syslog $postgresql
# Should-Start:      $syslog $postgresql
# Should-Stop:       $syslog $postgresql
# Default-Start:     2 3 4 5
# Default-Stop:      0 1 6
# Short-Description: Start/stop repmgrd
### END INIT INFO

# Abort on the first failing command.
set -e

# Pull in local overrides (REPMGRD_OPTS, REPMGRD_BIN, REPMGRD_PIDFILE) if present.
if test -f /etc/default/repmgrd; then
	. /etc/default/repmgrd
fi

# Fall back to built-in locations when the defaults file did not set them.
if [ -z "$REPMGRD_BIN" ]; then
	REPMGRD_BIN="/usr/bin/repmgrd"
fi

if [ -z "$REPMGRD_PIDFILE" ]; then
	REPMGRD_PIDFILE="/var/run/repmgrd.pid"
fi

# Nothing to do (and not an error) when the daemon binary is not executable.
test -x "$REPMGRD_BIN" || exit 0

case "$1" in
	start)
		# NOTE(review): --make-pidfile records the PID of the process that
		# start-stop-daemon launches; confirm this matches how repmgrd
		# daemonizes, otherwise "stop" may not find the running process.
		# REPMGRD_OPTS is intentionally left unquoted so that it is split
		# into separate arguments.
		start-stop-daemon --start --quiet --make-pidfile --pidfile "$REPMGRD_PIDFILE" --exec "$REPMGRD_BIN" $REPMGRD_OPTS
		;;

	stop)
		# --oknodo: succeed even when no matching process is running.
		start-stop-daemon --stop --oknodo --quiet --pidfile "$REPMGRD_PIDFILE"
		;;

	restart)
		"$0" stop && "$0" start || exit 1
		;;

	*)
		echo "Usage: $0 {start|stop|restart}"
		exit 1
		;;
esac

exit 0
|
||||
27
log.c
27
log.c
@@ -25,9 +25,11 @@
|
||||
|
||||
#ifdef HAVE_SYSLOG
|
||||
#include <syslog.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <time.h>
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#define DEFAULT_IDENT "repmgr"
|
||||
@@ -37,6 +39,29 @@
|
||||
|
||||
/* #define REPMGR_DEBUG */
|
||||
|
||||
void stderr_log_with_level(const char *level_name, int level, const char *fmt, ...) {
|
||||
size_t len = strlen(fmt);
|
||||
char fmt1[len + 150];
|
||||
time_t t;
|
||||
struct tm *tm;
|
||||
char buff[100];
|
||||
va_list ap;
|
||||
|
||||
if(log_level >= level) {
|
||||
time(&t);
|
||||
tm = localtime(&t);
|
||||
|
||||
va_start(ap, fmt);
|
||||
|
||||
strftime(buff, 100, "[%Y-%m-%d %H:%M:%S]", tm);
|
||||
snprintf(fmt1, len + 150, "%s [%s] %s", buff, level_name, fmt);
|
||||
vfprintf(stderr, fmt1, ap);
|
||||
|
||||
va_end(ap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int detect_log_level(const char* level);
|
||||
static int detect_log_facility(const char* facility);
|
||||
|
||||
|
||||
18
log.h
18
log.h
@@ -25,15 +25,17 @@
|
||||
#define REPMGR_SYSLOG 1
|
||||
#define REPMGR_STDERR 2
|
||||
|
||||
void stderr_log_with_level(const char *level_name, int level, const char *fmt, ...);
|
||||
|
||||
/* Standard error logging */
|
||||
#define stderr_log_debug(...) if (log_level >= LOG_DEBUG) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_info(...) if (log_level >= LOG_INFO) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_notice(...) if (log_level >= LOG_NOTICE) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_warning(...) if (log_level >= LOG_WARNING) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_err(...) if (log_level >= LOG_ERR) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_crit(...) if (log_level >= LOG_CRIT) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_alert(...) if (log_level >= LOG_ALERT) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_emerg(...) if (log_level >= LOG_EMERG) fprintf(stderr, __VA_ARGS__)
|
||||
#define stderr_log_debug(...) stderr_log_with_level("DEBUG", LOG_DEBUG, __VA_ARGS__)
|
||||
#define stderr_log_info(...) stderr_log_with_level("INFO", LOG_INFO, __VA_ARGS__)
|
||||
#define stderr_log_notice(...) stderr_log_with_level("NOTICE", LOG_NOTICE, __VA_ARGS__)
|
||||
#define stderr_log_warning(...) stderr_log_with_level("WARNING", LOG_WARNING, __VA_ARGS__)
|
||||
#define stderr_log_err(...) stderr_log_with_level("ERROR", LOG_ERR, __VA_ARGS__)
|
||||
#define stderr_log_crit(...) stderr_log_with_level("CRITICAL", LOG_CRIT, __VA_ARGS__)
|
||||
#define stderr_log_alert(...) stderr_log_with_level("ALERT", LOG_ALERT, __VA_ARGS__)
|
||||
#define stderr_log_emerg(...) stderr_log_with_level("EMERGENCY", LOG_EMERG, __VA_ARGS__)
|
||||
|
||||
#ifdef HAVE_SYSLOG
|
||||
|
||||
|
||||
145
repmgr.c
145
repmgr.c
@@ -30,6 +30,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/wait.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
@@ -55,7 +56,7 @@
|
||||
static bool create_recovery_file(const char *data_dir);
|
||||
static int test_ssh_connection(char *host, char *remote_user);
|
||||
static int copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||
char *local_path, bool is_directory);
|
||||
char *local_path, bool is_directory);
|
||||
static bool check_parameters_for_action(const int action);
|
||||
static bool create_schema(PGconn *conn);
|
||||
static bool copy_configuration(PGconn *masterconn, PGconn *witnessconn);
|
||||
@@ -84,8 +85,8 @@ bool need_a_node = true;
|
||||
bool require_password = false;
|
||||
|
||||
/* Initialization of runtime options */
|
||||
t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, "", "", 0 };
|
||||
t_configuration_options options = { "", -1, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", -1 };
|
||||
t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, "", "", 0 };
|
||||
t_configuration_options options = { "", -1, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1 };
|
||||
|
||||
static char *server_mode = NULL;
|
||||
static char *server_cmd = NULL;
|
||||
@@ -104,8 +105,9 @@ main(int argc, char **argv)
|
||||
{"config-file", required_argument, NULL, 'f'},
|
||||
{"remote-user", required_argument, NULL, 'R'},
|
||||
{"wal-keep-segments", required_argument, NULL, 'w'},
|
||||
{"keep-history", required_argument, NULL, 'k'},
|
||||
{"keep-history", required_argument, NULL, 'k'},
|
||||
{"force", no_argument, NULL, 'F'},
|
||||
{"wait", no_argument, NULL, 'W'},
|
||||
{"ignore-rsync-warning", no_argument, NULL, 'I'},
|
||||
{"verbose", no_argument, NULL, 'v'},
|
||||
{NULL, 0, NULL, 0}
|
||||
@@ -132,7 +134,7 @@ main(int argc, char **argv)
|
||||
}
|
||||
|
||||
|
||||
while ((c = getopt_long(argc, argv, "d:h:p:U:D:l:f:R:w:k:F:I:v", long_options,
|
||||
while ((c = getopt_long(argc, argv, "d:h:p:U:D:l:f:R:w:k:FWIv", long_options,
|
||||
&optindex)) != -1)
|
||||
{
|
||||
switch (c)
|
||||
@@ -171,11 +173,14 @@ main(int argc, char **argv)
|
||||
if (atoi(optarg) > 0)
|
||||
runtime_options.keep_history = atoi(optarg);
|
||||
else
|
||||
runtime_options.keep_history = 0;
|
||||
runtime_options.keep_history = 0;
|
||||
break;
|
||||
case 'F':
|
||||
runtime_options.force = true;
|
||||
break;
|
||||
case 'W':
|
||||
runtime_options.wait_for_master = true;
|
||||
break;
|
||||
case 'I':
|
||||
runtime_options.ignore_rsync_warn = true;
|
||||
break;
|
||||
@@ -269,7 +274,7 @@ main(int argc, char **argv)
|
||||
break;
|
||||
default:
|
||||
log_err(_("%s: too many command-line arguments (first extra is \"%s\")\n"),
|
||||
progname, argv[optind + 1]);
|
||||
progname, argv[optind]);
|
||||
usage();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
@@ -391,7 +396,7 @@ do_cluster_show(void)
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("Can't get nodes informations, have you regitered them?\n%s\n"), PQerrorMessage(conn));
|
||||
log_err(_("Can't get nodes information, have you registered them?\n%s\n"), PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
@@ -420,7 +425,7 @@ do_cluster_show(void)
|
||||
PQclear(res);
|
||||
}
|
||||
|
||||
static void
|
||||
static void
|
||||
do_cluster_cleanup(void)
|
||||
{
|
||||
int master_id;
|
||||
@@ -429,14 +434,14 @@ do_cluster_cleanup(void)
|
||||
PGresult *res;
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
/* We need to connect to check configuration */
|
||||
log_info(_("%s connecting to database\n"), progname);
|
||||
conn = establishDBConnection(options.conninfo, true);
|
||||
/* We need to connect to check configuration */
|
||||
log_info(_("%s connecting to database\n"), progname);
|
||||
conn = establishDBConnection(options.conninfo, true);
|
||||
|
||||
/* check if there is a master in this cluster */
|
||||
log_info(_("%s connecting to master database\n"), progname);
|
||||
master_conn = getMasterConnection(conn, repmgr_schema, options.cluster_name,
|
||||
&master_id, NULL);
|
||||
&master_id, NULL);
|
||||
if (!master_conn)
|
||||
{
|
||||
log_err(_("cluster cleanup: cannot connect to master\n"));
|
||||
@@ -448,8 +453,8 @@ do_cluster_cleanup(void)
|
||||
if (runtime_options.keep_history > 0)
|
||||
{
|
||||
sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_monitor "
|
||||
" WHERE age(now(), last_monitor_time) >= '%d days'::interval;",
|
||||
repmgr_schema, runtime_options.keep_history);
|
||||
" WHERE age(now(), last_monitor_time) >= '%d days'::interval;",
|
||||
repmgr_schema, runtime_options.keep_history);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -561,7 +566,7 @@ do_master_register(void)
|
||||
int id;
|
||||
|
||||
/* Ensure there isn't any other master already registered */
|
||||
master_conn = getMasterConnection(conn, repmgr_schema,
|
||||
master_conn = getMasterConnection(conn, repmgr_schema,
|
||||
options.cluster_name, &id,NULL);
|
||||
if (master_conn != NULL)
|
||||
{
|
||||
@@ -590,8 +595,8 @@ do_master_register(void)
|
||||
|
||||
sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes (id, cluster, name, conninfo, priority) "
|
||||
"VALUES (%d, '%s', '%s', '%s', %d)",
|
||||
repmgr_schema, options.node, options.cluster_name, options.node_name,
|
||||
options.conninfo, options.priority);
|
||||
repmgr_schema, options.node, options.cluster_name, options.node_name,
|
||||
options.conninfo, options.priority);
|
||||
log_debug(_("master register: %s\n"), sqlquery);
|
||||
|
||||
if (!PQexec(conn, sqlquery))
|
||||
@@ -733,8 +738,8 @@ do_standby_register(void)
|
||||
|
||||
sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes(id, cluster, name, conninfo, priority) "
|
||||
"VALUES (%d, '%s', '%s', '%s', %d)",
|
||||
repmgr_schema, options.node, options.cluster_name, options.node_name,
|
||||
options.conninfo, options.priority);
|
||||
repmgr_schema, options.node, options.cluster_name, options.node_name,
|
||||
options.conninfo, options.priority);
|
||||
log_debug(_("standby register: %s\n"), sqlquery);
|
||||
|
||||
if (!PQexec(master_conn, sqlquery))
|
||||
@@ -837,7 +842,7 @@ do_standby_clone(void)
|
||||
log_err(_("%s needs parameter 'wal_level' to be set to 'hot_standby'\n"), progname);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
if (!guc_setted(conn, "wal_keep_segments", ">=", runtime_options.wal_keep_segments))
|
||||
if (!guc_setted_typed(conn, "wal_keep_segments", ">=", runtime_options.wal_keep_segments, "integer"))
|
||||
{
|
||||
PQfinish(conn);
|
||||
log_err(_("%s needs parameter 'wal_keep_segments' to be set to %s or greater (see the '-w' option or edit the postgresql.conf of the PostgreSQL master.)\n"), progname, runtime_options.wal_keep_segments);
|
||||
@@ -920,7 +925,7 @@ do_standby_clone(void)
|
||||
PQfinish(conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
|
||||
/* We need all 5 parameters, and they can be retrieved only by superusers */
|
||||
if (PQntuples(res) != 5)
|
||||
{
|
||||
@@ -947,7 +952,7 @@ do_standby_clone(void)
|
||||
}
|
||||
PQclear(res);
|
||||
|
||||
log_info(_("Succesfully connected to primary. Current installation size is %s\n"), get_cluster_size(conn));
|
||||
log_info(_("Successfully connected to primary. Current installation size is %s\n"), get_cluster_size(conn));
|
||||
|
||||
/*
|
||||
* XXX master_xlog_directory should be discovered from master configuration
|
||||
@@ -983,8 +988,8 @@ do_standby_clone(void)
|
||||
|
||||
log_notice(_("Starting backup...\n"));
|
||||
|
||||
/*
|
||||
* in pg 9.1 default is to wait for a sync standby to ack,
|
||||
/*
|
||||
* in pg 9.1 default is to wait for a sync standby to ack,
|
||||
* avoid that by turning off sync rep for this session
|
||||
*/
|
||||
sqlquery_snprintf(sqlquery, "SET synchronous_commit TO OFF");
|
||||
@@ -1377,10 +1382,19 @@ do_standby_follow(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* we also need to check if there is any master in the cluster */
|
||||
log_info(_("%s connecting to master database\n"), progname);
|
||||
master_conn = getMasterConnection(conn, repmgr_schema,
|
||||
options.cluster_name, &master_id,(char *) &master_conninfo);
|
||||
/*
|
||||
* we also need to check if there is any master in the cluster
|
||||
* or wait for one to appear if we have set the wait option
|
||||
*/
|
||||
log_info(_("%s discovering new master...\n"), progname);
|
||||
|
||||
do
|
||||
{
|
||||
master_conn = getMasterConnection(conn, repmgr_schema,
|
||||
options.cluster_name, &master_id,(char *) &master_conninfo);
|
||||
}
|
||||
while (master_conn == NULL && runtime_options.wait_for_master);
|
||||
|
||||
if (master_conn == NULL)
|
||||
{
|
||||
log_err(_("There isn't a master to follow in this cluster\n"));
|
||||
@@ -1486,7 +1500,7 @@ do_witness_create(void)
|
||||
if (!create_pgdir(runtime_options.dest_dir, runtime_options.force))
|
||||
{
|
||||
log_err(_("witness create: couldn't create data directory (\"%s\") for witness"),
|
||||
runtime_options.dest_dir);
|
||||
runtime_options.dest_dir);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
@@ -1521,7 +1535,7 @@ do_witness_create(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
log_info(_("Succesfully connected to primary.\n"));
|
||||
log_info(_("Successfully connected to primary.\n"));
|
||||
|
||||
r = test_ssh_connection(runtime_options.host, runtime_options.remote_user);
|
||||
if (r != 0)
|
||||
@@ -1582,8 +1596,8 @@ do_witness_create(void)
|
||||
|
||||
/* Get the pg_hba.conf full path */
|
||||
sqlquery_snprintf(sqlquery, "SELECT name, setting "
|
||||
" FROM pg_settings "
|
||||
" WHERE name IN ('hba_file')");
|
||||
" FROM pg_settings "
|
||||
" WHERE name IN ('hba_file')");
|
||||
log_debug(_("witness create: %s"), sqlquery);
|
||||
res = PQexec(masterconn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -1658,7 +1672,7 @@ do_witness_create(void)
|
||||
PQfinish(masterconn);
|
||||
PQfinish(witnessconn);
|
||||
|
||||
log_notice(_("Configuration has been succesfully copied to the witness\n"));
|
||||
log_notice(_("Configuration has been successfully copied to the witness\n"));
|
||||
}
|
||||
|
||||
|
||||
@@ -1697,8 +1711,9 @@ help(const char *progname)
|
||||
printf(_(" -R, --remote-user=USERNAME database server username for rsync\n"));
|
||||
printf(_(" -w, --wal-keep-segments=VALUE minimum value for the GUC wal_keep_segments (default: 5000)\n"));
|
||||
printf(_(" -I, --ignore-rsync-warning ignore rsync partial transfer warning\n"));
|
||||
printf(_(" -k, --keep-history=VALUE keeps indicated number of days of history\n"));
|
||||
printf(_(" -k, --keep-history=VALUE keeps indicated number of days of history\n"));
|
||||
printf(_(" -F, --force force potentially dangerous operations to happen\n"));
|
||||
printf(_(" -W, --wait wait for a master to appear"));
|
||||
|
||||
printf(_("\n%s performs some tasks like clone a node, promote it "), progname);
|
||||
printf(_("or making follow another node and then exits.\n"));
|
||||
@@ -1709,8 +1724,8 @@ help(const char *progname)
|
||||
printf(_(" standby promote - allows manual promotion of a specific standby into a "));
|
||||
printf(_("new master in the event of a failover\n"));
|
||||
printf(_(" standby follow - allows the standby to re-point itself to a new master\n"));
|
||||
printf(_(" cluster show - print node informations\n"));
|
||||
printf(_(" cluster cleanup - cleans monitor's history\n"));
|
||||
printf(_(" cluster show - print node information\n"));
|
||||
printf(_(" cluster cleanup - cleans monitor's history\n"));
|
||||
}
|
||||
|
||||
|
||||
@@ -1762,11 +1777,18 @@ test_ssh_connection(char *host, char *remote_user)
|
||||
char script[MAXLEN];
|
||||
int r;
|
||||
|
||||
/* On some OS, true is located in a different place than in Linux */
|
||||
#ifdef __FreeBSD__
|
||||
#define TRUEBIN_PATH "/usr/bin/true"
|
||||
#else
|
||||
#define TRUEBIN_PATH "/bin/true"
|
||||
#endif
|
||||
|
||||
/* Check if we have ssh connectivity to host before trying to rsync */
|
||||
if (!remote_user[0])
|
||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s /bin/true", host);
|
||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s %s %s", options.ssh_options, host, TRUEBIN_PATH);
|
||||
else
|
||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s -l %s /bin/true", host, remote_user);
|
||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s %s -l %s %s", options.ssh_options, host, remote_user, TRUEBIN_PATH);
|
||||
|
||||
log_debug(_("command is: %s"), script);
|
||||
r = system(script);
|
||||
@@ -1805,7 +1827,7 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||
|
||||
if (is_directory)
|
||||
{
|
||||
strcat(rsync_flags, " --exclude=pg_xlog* --exclude=pg_control --exclude=*.pid");
|
||||
strcat(rsync_flags, " --exclude=pg_xlog* --exclude=pg_log* --exclude=pg_control --exclude=*.pid");
|
||||
maxlen_snprintf(script, "rsync %s %s:%s/* %s",
|
||||
rsync_flags, host_string, remote_path, local_path);
|
||||
}
|
||||
@@ -2004,7 +2026,7 @@ create_schema(PGconn *conn)
|
||||
sqlquery_snprintf(sqlquery, "CREATE TABLE %s.repl_nodes ( "
|
||||
" id integer primary key, "
|
||||
" cluster text not null, "
|
||||
" name text not null, "
|
||||
" name text not null, "
|
||||
" conninfo text not null, "
|
||||
" priority integer not null, "
|
||||
" witness boolean not null default false)", repmgr_schema);
|
||||
@@ -2037,8 +2059,8 @@ create_schema(PGconn *conn)
|
||||
/* a view */
|
||||
sqlquery_snprintf(sqlquery, "CREATE VIEW %s.repl_status AS "
|
||||
" SELECT primary_node, standby_node, name AS standby_name, last_monitor_time, "
|
||||
" last_wal_primary_location, last_wal_standby_location, "
|
||||
" pg_size_pretty(replication_lag) replication_lag, "
|
||||
" last_wal_primary_location, last_wal_standby_location, "
|
||||
" pg_size_pretty(replication_lag) replication_lag, "
|
||||
" pg_size_pretty(apply_lag) apply_lag, "
|
||||
" age(now(), last_monitor_time) AS time_lag "
|
||||
" FROM %s.repl_monitor JOIN %s.repl_nodes ON standby_node = id "
|
||||
@@ -2056,8 +2078,8 @@ create_schema(PGconn *conn)
|
||||
|
||||
/* an index to improve performance of the view */
|
||||
sqlquery_snprintf(sqlquery, "CREATE INDEX idx_repl_status_sort "
|
||||
" ON %s.repl_monitor (last_monitor_time, standby_node) ",
|
||||
repmgr_schema);
|
||||
" ON %s.repl_monitor (last_monitor_time, standby_node) ",
|
||||
repmgr_schema);
|
||||
log_debug(_("master register: %s\n"), sqlquery);
|
||||
if (!PQexec(conn, sqlquery))
|
||||
{
|
||||
@@ -2069,9 +2091,9 @@ create_schema(PGconn *conn)
|
||||
|
||||
/* XXX Here we MUST try to load the repmgr_function.sql not hardcode it here */
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"CREATE OR REPLACE FUNCTION %s.repmgr_update_standby_location(text) RETURNS boolean "
|
||||
"AS '$libdir/repmgr_funcs', 'repmgr_update_standby_location' "
|
||||
"LANGUAGE C STRICT ", repmgr_schema);
|
||||
"CREATE OR REPLACE FUNCTION %s.repmgr_update_standby_location(text) RETURNS boolean "
|
||||
"AS '$libdir/repmgr_funcs', 'repmgr_update_standby_location' "
|
||||
"LANGUAGE C STRICT ", repmgr_schema);
|
||||
if (!PQexec(conn, sqlquery))
|
||||
{
|
||||
fprintf(stderr, "Cannot create the function repmgr_update_standby_location: %s\n",
|
||||
@@ -2080,9 +2102,9 @@ create_schema(PGconn *conn)
|
||||
}
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
"CREATE OR REPLACE FUNCTION %s.repmgr_get_last_standby_location() RETURNS text "
|
||||
"AS '$libdir/repmgr_funcs', 'repmgr_get_last_standby_location' "
|
||||
"LANGUAGE C STRICT ", repmgr_schema);
|
||||
"CREATE OR REPLACE FUNCTION %s.repmgr_get_last_standby_location() RETURNS text "
|
||||
"AS '$libdir/repmgr_funcs', 'repmgr_get_last_standby_location' "
|
||||
"LANGUAGE C STRICT ", repmgr_schema);
|
||||
if (!PQexec(conn, sqlquery))
|
||||
{
|
||||
fprintf(stderr, "Cannot create the function repmgr_get_last_standby_location: %s\n",
|
||||
@@ -2154,30 +2176,35 @@ write_primary_conninfo(char* line)
|
||||
|
||||
/* Environment variable for password (UGLY, please use .pgpass!) */
|
||||
const char *password = getenv("PGPASSWORD");
|
||||
if (password != NULL) {
|
||||
if (password != NULL)
|
||||
{
|
||||
maxlen_snprintf(password_buf, " password=%s", password);
|
||||
}
|
||||
else if (require_password) {
|
||||
else if (require_password)
|
||||
{
|
||||
log_err(_("%s: PGPASSWORD not set, but having one is required\n"),
|
||||
progname);
|
||||
progname);
|
||||
exit(ERR_BAD_PASSWORD);
|
||||
}
|
||||
|
||||
if (runtime_options.host[0]) {
|
||||
if (runtime_options.host[0])
|
||||
{
|
||||
maxlen_snprintf(host_buf, " host=%s", runtime_options.host);
|
||||
}
|
||||
|
||||
if (runtime_options.username[0]) {
|
||||
if (runtime_options.username[0])
|
||||
{
|
||||
maxlen_snprintf(user_buf, " user=%s", runtime_options.username);
|
||||
}
|
||||
|
||||
if (options.node_name[0]) {
|
||||
if (options.node_name[0])
|
||||
{
|
||||
maxlen_snprintf(appname_buf, " application_name=%s", options.node_name);
|
||||
}
|
||||
|
||||
maxlen_snprintf(conn_buf, "port=%s%s%s%s%s",
|
||||
(runtime_options.masterport[0]) ? runtime_options.masterport : "5432", host_buf, user_buf, password_buf,
|
||||
appname_buf);
|
||||
(runtime_options.masterport[0]) ? runtime_options.masterport : "5432", host_buf, user_buf, password_buf,
|
||||
appname_buf);
|
||||
|
||||
maxlen_snprintf(line, "primary_conninfo = '%s'", conn_buf);
|
||||
|
||||
|
||||
@@ -11,7 +11,8 @@ node_name=standby2
|
||||
|
||||
# Connection information
|
||||
conninfo='host=192.168.204.104'
|
||||
rsync_options=--archive --checksum --compress --progress --rsh=ssh
|
||||
rsync_options=--archive --checksum --compress --progress --rsh="ssh -o \"StrictHostKeyChecking no\""
|
||||
ssh_options=-o "StrictHostKeyChecking no"
|
||||
|
||||
# How many seconds we wait for master response before declaring master failure
|
||||
master_response_timeout=60
|
||||
@@ -23,8 +24,8 @@ reconnect_interval=10
|
||||
# Autofailover options
|
||||
failover=automatic
|
||||
priority=-1
|
||||
promote_command='repmgr promote'
|
||||
follow_command='repmgr follow'
|
||||
promote_command='repmgr standby promote -f /path/to/repmgr.conf'
|
||||
follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'
|
||||
|
||||
# Log level: possible values are DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG
|
||||
# Default: NOTICE
|
||||
|
||||
1
repmgr.h
1
repmgr.h
@@ -59,6 +59,7 @@ typedef struct
|
||||
char wal_keep_segments[MAXLEN];
|
||||
bool verbose;
|
||||
bool force;
|
||||
bool wait_for_master;
|
||||
bool ignore_rsync_warn;
|
||||
|
||||
char masterport[MAXLEN];
|
||||
|
||||
587
repmgrd.c
587
repmgrd.c
@@ -32,9 +32,39 @@
|
||||
#include "strutil.h"
|
||||
#include "version.h"
|
||||
|
||||
/* PostgreSQL's headers needed to export some functionality */
|
||||
#include "access/xlogdefs.h"
|
||||
#include "libpq/pqsignal.h"
|
||||
|
||||
/*
|
||||
* we do not export InvalidXLogRecPtr so we need to define it
|
||||
* but since 9.3 it will be defined in xlogdefs.h which we include
|
||||
* so better to ask if it's defined to be future proof
|
||||
*/
|
||||
#ifndef InvalidXLogRecPtr
|
||||
const XLogRecPtr InvalidXLogRecPtr = {0, 0};
|
||||
#endif
|
||||
|
||||
/*
 * Version-independent helpers for XLogRecPtr values.
 *
 * From PostgreSQL 9.3 on, XLogRecPtr is a plain integer (the 9.3 branch
 * assigns and compares it directly); earlier releases use a struct with
 * the two 32-bit members xlogid and xrecoff.
 *
 *   XLAssign(a, b)                 copy location b into a
 *   XLAssignValue(a, hi, lo)       build location a from its two 32-bit halves
 *   XLByteLT(a, b)                 true when a is strictly before b
 *                                  (only defined here for >= 9.3; older
 *                                  releases are expected to get it from
 *                                  xlogdefs.h)
 *
 * The assignment macros are wrapped in do { } while (0) so they behave as a
 * single statement (e.g. under an unbraced "if"), and every argument is
 * parenthesized.
 */
#if PG_VERSION_NUM >= 90300
#define XLAssign(a, b) \
	do { (a) = (b); } while (0)

/*
 * Combine both halves: the log id is the high 32 bits, the offset the low
 * 32 bits.  Storing only the offset would make locations from different
 * log ids compare incorrectly.
 */
#define XLAssignValue(a, xlogid, xrecoff) \
	do { \
		(a) = (((XLogRecPtr) (xlogid)) << 32) | ((XLogRecPtr) (xrecoff)); \
	} while (0)

#define XLByteLT(a, b) \
	((a) < (b))

#else
#define XLAssign(a, b) \
	do { \
		(a).xlogid = (b).xlogid; \
		(a).xrecoff = (b).xrecoff; \
	} while (0)

#define XLAssignValue(a, uxlogid, uxrecoff) \
	do { \
		(a).xlogid = (uxlogid); \
		(a).xrecoff = (uxrecoff); \
	} while (0)
#endif
|
||||
|
||||
/*
|
||||
* Struct to keep info about the nodes, used in the voting process in
|
||||
* do_failover()
|
||||
@@ -42,8 +72,11 @@
|
||||
/*
 * NOTE: do_failover() initializes a nodeInfo positionally
 * ({-1, "", InvalidXLogRecPtr, false, false, false}), so the order of the
 * fields below must not change without updating that initializer.
 */
typedef struct nodeInfo
|
||||
{
|
||||
int nodeId; /* "id" column of the node's repl_nodes row */
|
||||
char conninfostr[MAXLEN]; /* "conninfo" column; used to connect to the node */
|
||||
XLogRecPtr xlog_location; /* WAL location parsed from the node's answer; starts at 0/0 */
|
||||
bool is_ready; /* starts false; set once a non-zero last standby location was read (witnesses are marked ready unconditionally) */
|
||||
bool is_visible; /* starts false; set when a connection to the node succeeded */
|
||||
bool is_witness; /* true when the "witness" column is "t" */
|
||||
} nodeInfo;
|
||||
|
||||
|
||||
@@ -68,6 +101,8 @@ bool verbose = false;
|
||||
bool monitoring_history = false;
|
||||
char repmgr_schema[MAXLEN];
|
||||
|
||||
bool failover_done = false;
|
||||
|
||||
/*
|
||||
* should initialize with {0} to be ANSI compliant? but this raises
|
||||
* error with gcc -Wall
|
||||
@@ -100,7 +135,7 @@ static void setup_event_handlers(void);
|
||||
|
||||
#define CloseConnections() \
|
||||
if (PQisBusy(primaryConn) == 1) \
|
||||
CancelQuery(primaryConn, local_options.master_response_timeout); \
|
||||
(void) CancelQuery(primaryConn, local_options.master_response_timeout); \
|
||||
if (myLocalConn != NULL) \
|
||||
PQfinish(myLocalConn); \
|
||||
if (primaryConn != NULL && primaryConn != myLocalConn) \
|
||||
@@ -190,63 +225,37 @@ main(int argc, char **argv)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Set my server mode, establish a connection to primary
|
||||
* and start monitor
|
||||
*/
|
||||
if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
|
||||
myLocalMode = WITNESS_MODE;
|
||||
else if (is_standby(myLocalConn))
|
||||
myLocalMode = STANDBY_MODE;
|
||||
else /* is the master */
|
||||
myLocalMode = PRIMARY_MODE;
|
||||
|
||||
switch (myLocalMode)
|
||||
* MAIN LOOP
|
||||
* This loop cycles once per failover and at startup
|
||||
* Requisites:
|
||||
* - myLocalConn must already be set to an active connection
|
||||
* - no master connection
|
||||
*/
|
||||
do
|
||||
{
|
||||
case PRIMARY_MODE:
|
||||
primary_options.node = local_options.node;
|
||||
strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
|
||||
primaryConn = myLocalConn;
|
||||
|
||||
checkClusterConfiguration(myLocalConn, primaryConn);
|
||||
checkNodeConfiguration(local_options.conninfo);
|
||||
|
||||
if (reload_configuration(config_file, &local_options))
|
||||
{
|
||||
PQfinish(myLocalConn);
|
||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||
primaryConn = myLocalConn;
|
||||
update_registration();
|
||||
}
|
||||
|
||||
log_info(_("%s Starting continuous primary connection check\n"), progname);
|
||||
/* Check that primary is still alive, and standbies are sending info */
|
||||
/*
|
||||
* Every SLEEP_MONITOR seconds, do master checks
|
||||
* XXX
|
||||
* Check that standbies are sending info
|
||||
*/
|
||||
for (;;)
|
||||
* Set my server mode, establish a connection to primary
|
||||
* and start monitor
|
||||
*/
|
||||
if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
|
||||
myLocalMode = WITNESS_MODE;
|
||||
else if (is_standby(myLocalConn))
|
||||
myLocalMode = STANDBY_MODE;
|
||||
else /* is the master */
|
||||
myLocalMode = PRIMARY_MODE;
|
||||
|
||||
switch (myLocalMode)
|
||||
{
|
||||
if (CheckPrimaryConnection())
|
||||
{
|
||||
/*
|
||||
CheckActiveStandbiesConnections();
|
||||
CheckInactiveStandbies();
|
||||
*/
|
||||
sleep(SLEEP_MONITOR);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* XXX
|
||||
* May we do something more verbose ?
|
||||
*/
|
||||
exit (1);
|
||||
}
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
/* if we can reload, then could need to change myLocalConn */
|
||||
case PRIMARY_MODE:
|
||||
primary_options.node = local_options.node;
|
||||
strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
|
||||
primaryConn = myLocalConn;
|
||||
|
||||
checkClusterConfiguration(myLocalConn, primaryConn);
|
||||
checkNodeConfiguration(local_options.conninfo);
|
||||
|
||||
if (reload_configuration(config_file, &local_options))
|
||||
{
|
||||
PQfinish(myLocalConn);
|
||||
@@ -254,70 +263,112 @@ main(int argc, char **argv)
|
||||
primaryConn = myLocalConn;
|
||||
update_registration();
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case WITNESS_MODE:
|
||||
case STANDBY_MODE:
|
||||
/* I need the id of the primary as well as a connection to it */
|
||||
log_info(_("%s Connecting to primary for cluster '%s'\n"),
|
||||
progname, local_options.cluster_name);
|
||||
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
|
||||
local_options.cluster_name,
|
||||
&primary_options.node, NULL);
|
||||
if (primaryConn == NULL)
|
||||
{
|
||||
CloseConnections();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
log_info(_("%s Starting continuous primary connection check\n"), progname);
|
||||
|
||||
/* Check that primary is still alive, and standbies are sending info */
|
||||
|
||||
/*
|
||||
* Every SLEEP_MONITOR seconds, do master checks
|
||||
* XXX
|
||||
* Check that standbies are sending info
|
||||
*/
|
||||
do
|
||||
{
|
||||
if (CheckPrimaryConnection())
|
||||
{
|
||||
/*
|
||||
CheckActiveStandbiesConnections();
|
||||
CheckInactiveStandbies();
|
||||
*/
|
||||
sleep(SLEEP_MONITOR);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* XXX
|
||||
* May we do something more verbose ?
|
||||
*/
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
/* if we can reload, then could need to change myLocalConn */
|
||||
if (reload_configuration(config_file, &local_options))
|
||||
{
|
||||
PQfinish(myLocalConn);
|
||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||
primaryConn = myLocalConn;
|
||||
update_registration();
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
} while (!failover_done);
|
||||
break;
|
||||
case WITNESS_MODE:
|
||||
case STANDBY_MODE:
|
||||
/* I need the id of the primary as well as a connection to it */
|
||||
log_info(_("%s Connecting to primary for cluster '%s'\n"),
|
||||
progname, local_options.cluster_name);
|
||||
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
|
||||
local_options.cluster_name,
|
||||
&primary_options.node, NULL);
|
||||
if (primaryConn == NULL)
|
||||
{
|
||||
CloseConnections();
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
checkClusterConfiguration(myLocalConn, primaryConn);
|
||||
checkNodeConfiguration(local_options.conninfo);
|
||||
checkClusterConfiguration(myLocalConn, primaryConn);
|
||||
checkNodeConfiguration(local_options.conninfo);
|
||||
|
||||
if (reload_configuration(config_file, &local_options))
|
||||
{
|
||||
PQfinish(myLocalConn);
|
||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||
update_registration();
|
||||
}
|
||||
|
||||
/*
|
||||
* Every SLEEP_MONITOR seconds, do checks
|
||||
*/
|
||||
if (myLocalMode == WITNESS_MODE)
|
||||
{
|
||||
log_info(_("%s Starting continuous witness node monitoring\n"), progname);
|
||||
}
|
||||
else if (myLocalMode == STANDBY_MODE)
|
||||
{
|
||||
log_info(_("%s Starting continuous standby node monitoring\n"), progname);
|
||||
}
|
||||
|
||||
for (;;)
|
||||
{
|
||||
if (myLocalMode == WITNESS_MODE)
|
||||
WitnessMonitor();
|
||||
else if (myLocalMode == STANDBY_MODE)
|
||||
StandbyMonitor();
|
||||
sleep(SLEEP_MONITOR);
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
/* if we can reload, then could need to change myLocalConn */
|
||||
if (reload_configuration(config_file, &local_options))
|
||||
{
|
||||
PQfinish(myLocalConn);
|
||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||
update_registration();
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Every SLEEP_MONITOR seconds, do checks
|
||||
*/
|
||||
if (myLocalMode == WITNESS_MODE)
|
||||
{
|
||||
log_info(_("%s Starting continuous witness node monitoring\n"), progname);
|
||||
}
|
||||
else if (myLocalMode == STANDBY_MODE)
|
||||
{
|
||||
log_info(_("%s Starting continuous standby node monitoring\n"), progname);
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
if (myLocalMode == WITNESS_MODE)
|
||||
WitnessMonitor();
|
||||
else if (myLocalMode == STANDBY_MODE)
|
||||
StandbyMonitor();
|
||||
sleep(SLEEP_MONITOR);
|
||||
|
||||
if (got_SIGHUP)
|
||||
{
|
||||
/* if we can reload, then could need to change myLocalConn */
|
||||
if (reload_configuration(config_file, &local_options))
|
||||
{
|
||||
PQfinish(myLocalConn);
|
||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||
update_registration();
|
||||
}
|
||||
got_SIGHUP = false;
|
||||
}
|
||||
} while (!failover_done);
|
||||
break;
|
||||
default:
|
||||
log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
|
||||
}
|
||||
|
||||
failover_done = false;
|
||||
|
||||
} while (true);
|
||||
|
||||
/* Prevent a double-free */
|
||||
if (primaryConn == myLocalConn)
|
||||
@@ -365,7 +416,8 @@ WitnessMonitor(void)
|
||||
* Cancel any query that is still being executed,
|
||||
* so i can insert the current record
|
||||
*/
|
||||
CancelQuery(primaryConn, local_options.master_response_timeout);
|
||||
if (!CancelQuery(primaryConn, local_options.master_response_timeout))
|
||||
return;
|
||||
if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
|
||||
return;
|
||||
|
||||
@@ -438,7 +490,7 @@ StandbyMonitor(void)
|
||||
log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
|
||||
for (connection_retries = 0; connection_retries < 6; connection_retries++)
|
||||
{
|
||||
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
|
||||
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
|
||||
local_options.cluster_name, &primary_options.node, NULL);
|
||||
if (PQstatus(primaryConn) == CONNECTION_OK)
|
||||
{
|
||||
@@ -467,6 +519,7 @@ StandbyMonitor(void)
|
||||
* a new primaryConn
|
||||
*/
|
||||
do_failover();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -486,7 +539,8 @@ StandbyMonitor(void)
|
||||
* Cancel any query that is still being executed,
|
||||
* so i can insert the current record
|
||||
*/
|
||||
CancelQuery(primaryConn, local_options.master_response_timeout);
|
||||
if (!CancelQuery(primaryConn, local_options.master_response_timeout))
|
||||
return;
|
||||
if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
|
||||
return;
|
||||
|
||||
@@ -557,22 +611,22 @@ StandbyMonitor(void)
|
||||
static void
|
||||
do_failover(void)
|
||||
{
|
||||
PGresult *res1;
|
||||
PGresult *res2;
|
||||
PGresult *res;
|
||||
char sqlquery[8192];
|
||||
|
||||
int total_nodes = 0;
|
||||
int visible_nodes = 0;
|
||||
int ready_nodes = 0;
|
||||
|
||||
bool find_best = false;
|
||||
|
||||
int i;
|
||||
int r;
|
||||
|
||||
int node;
|
||||
char nodeConninfo[MAXLEN];
|
||||
uint32 uxlogid;
|
||||
uint32 uxrecoff;
|
||||
XLogRecPtr xlog_recptr;
|
||||
|
||||
unsigned int uxlogid;
|
||||
unsigned int uxrecoff;
|
||||
char last_wal_standby_applied[MAXLEN];
|
||||
|
||||
PGconn *nodeConn = NULL;
|
||||
@@ -582,91 +636,59 @@ do_failover(void)
|
||||
* which seems to be large enough for most scenarios
|
||||
*/
|
||||
nodeInfo nodes[50];
|
||||
nodeInfo best_candidate;
|
||||
|
||||
/* first we get info about this node, and update shared memory */
|
||||
sprintf(sqlquery, "SELECT pg_last_xlog_replay_location()");
|
||||
res1 = PQexec(myLocalConn, sqlquery);
|
||||
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
|
||||
PQclear(res1);
|
||||
sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
|
||||
update_shared_memory(last_wal_standby_applied);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
/* write last location in shared memory */
|
||||
update_shared_memory(PQgetvalue(res1, 0, 0));
|
||||
|
||||
/*
|
||||
* we sleep the monitor time + one second
|
||||
* we bet it should be enough for other repmgrd to update their own data
|
||||
*/
|
||||
sleep(SLEEP_MONITOR + 1);
|
||||
/* initialize to keep compiler quiet */
|
||||
nodeInfo best_candidate = {-1, "", InvalidXLogRecPtr, false, false, false};
|
||||
|
||||
/* get a list of standby nodes, including myself */
|
||||
sprintf(sqlquery, "SELECT id, conninfo "
|
||||
sprintf(sqlquery, "SELECT id, conninfo, witness "
|
||||
" FROM %s.repl_nodes "
|
||||
" WHERE id <> %d "
|
||||
" AND cluster = '%s' "
|
||||
" ORDER BY priority ",
|
||||
repmgr_schema, primary_options.node, local_options.cluster_name);
|
||||
" WHERE cluster = '%s' "
|
||||
" ORDER BY priority, id ",
|
||||
repmgr_schema, local_options.cluster_name);
|
||||
|
||||
res1 = PQexec(myLocalConn, sqlquery);
|
||||
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
|
||||
res = PQexec(myLocalConn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn));
|
||||
PQclear(res1);
|
||||
log_err(_("Can't get nodes' info: %s\n"), PQerrorMessage(myLocalConn));
|
||||
PQclear(res);
|
||||
PQfinish(myLocalConn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
/* ask for the locations */
|
||||
for (i = 0; i < PQntuples(res1); i++)
|
||||
/*
|
||||
* total nodes that are registered
|
||||
*/
|
||||
total_nodes = PQntuples(res);
|
||||
log_debug(_("%s: there are %d nodes registered\n"), progname, total_nodes);
|
||||
|
||||
/* Build an array with the nodes and indicate which ones are visible and ready */
|
||||
for (i = 0; i < total_nodes; i++)
|
||||
{
|
||||
node = atoi(PQgetvalue(res1, i, 0));
|
||||
nodes[i].nodeId = atoi(PQgetvalue(res, i, 0));
|
||||
strncpy(nodes[i].conninfostr, PQgetvalue(res, i, 1), MAXLEN);
|
||||
nodes[i].is_witness = (strcmp(PQgetvalue(res, i, 2), "t") == 0) ? true : false;
|
||||
|
||||
/* Initialize on false so if we can't reach this node we know that later */
|
||||
nodes[i].is_visible = false;
|
||||
nodes[i].is_ready = false;
|
||||
strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN);
|
||||
nodeConn = establishDBConnection(nodeConninfo, false);
|
||||
XLAssignValue(nodes[i].xlog_location, 0, 0);
|
||||
|
||||
log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s\n"),
|
||||
progname, nodes[i].nodeId, nodes[i].conninfostr, (nodes[i].is_witness) ? "true" : "false");
|
||||
|
||||
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
|
||||
/* if we can't see the node just skip it */
|
||||
if (PQstatus(nodeConn) != CONNECTION_OK)
|
||||
continue;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
|
||||
res2 = PQexec(nodeConn, sqlquery);
|
||||
if (PQresultStatus(res2) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
|
||||
log_info(_("Connection details: %s\n"), nodeConninfo);
|
||||
PQclear(res2);
|
||||
PQfinish(nodeConn);
|
||||
continue;
|
||||
}
|
||||
|
||||
visible_nodes++;
|
||||
|
||||
if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
|
||||
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0));
|
||||
|
||||
nodes[i].nodeId = node;
|
||||
nodes[i].xlog_location.xlogid = uxlogid;
|
||||
nodes[i].xlog_location.xrecoff = uxrecoff;
|
||||
nodes[i].is_ready = true;
|
||||
|
||||
PQclear(res2);
|
||||
nodes[i].is_visible = true;
|
||||
PQfinish(nodeConn);
|
||||
}
|
||||
PQclear(res1);
|
||||
/* Close the connection to this server */
|
||||
PQfinish(myLocalConn);
|
||||
PQclear(res);
|
||||
|
||||
/*
|
||||
* total nodes that are registered, include master which is a node but was
|
||||
* not counted because it's not a standby
|
||||
*/
|
||||
total_nodes = i + 1;
|
||||
log_debug(_("Total nodes counted: registered=%d, visible=%d\n"), total_nodes, visible_nodes);
|
||||
|
||||
/*
|
||||
* am i on the group that should keep alive?
|
||||
@@ -680,41 +702,201 @@ do_failover(void)
|
||||
exit(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
/* Query all the nodes to determine which ones are ready */
|
||||
for (i = 0; i < total_nodes; i++)
|
||||
{
|
||||
/* if the node is not visible, skip it */
|
||||
if (!nodes[i].is_visible)
|
||||
continue;
|
||||
|
||||
if (nodes[i].is_witness)
|
||||
continue;
|
||||
|
||||
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
|
||||
/* XXX
|
||||
* This shouldn't happen, if this happens it means this is a major problem
|
||||
* maybe network outages? anyway, is better for a human to react
|
||||
*/
|
||||
if (PQstatus(nodeConn) != CONNECTION_OK)
|
||||
{
|
||||
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
|
||||
exit(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||
res = PQexec(nodeConn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
|
||||
log_info(_("Connection details: %s\n"), nodes[i].conninfostr);
|
||||
PQclear(res);
|
||||
PQfinish(nodeConn);
|
||||
exit(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
|
||||
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0));
|
||||
|
||||
log_debug("XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
|
||||
nodes[i].nodeId, uxlogid, uxlogid, uxrecoff, uxrecoff);
|
||||
|
||||
/* If position is 0/0, error */
|
||||
if (uxlogid == 0 && uxrecoff == 0)
|
||||
{
|
||||
PQclear(res);
|
||||
PQfinish(nodeConn);
|
||||
log_info(_("InvalidXLogRecPtr detected in a standby\n"));
|
||||
exit(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff);
|
||||
|
||||
PQclear(res);
|
||||
PQfinish(nodeConn);
|
||||
}
|
||||
|
||||
/* last we get info about this node, and update shared memory */
|
||||
sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||
res = PQexec(myLocalConn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
|
||||
PQfinish(myLocalConn);
|
||||
PQclear(res);
|
||||
sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
|
||||
update_shared_memory(last_wal_standby_applied);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
/* write last location in shared memory */
|
||||
update_shared_memory(PQgetvalue(res, 0, 0));
|
||||
PQclear(res);
|
||||
|
||||
for (i = 0; i < total_nodes; i++)
|
||||
{
|
||||
while (!nodes[i].is_ready)
|
||||
{
|
||||
/*
|
||||
* the witness will always be marked as ready if it's still
|
||||
* not marked that way and avoid a useless query
|
||||
*/
|
||||
if (nodes[i].is_witness)
|
||||
{
|
||||
if (!nodes[i].is_ready)
|
||||
{
|
||||
nodes[i].is_ready = true;
|
||||
ready_nodes++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* if the node is not visible, skip it */
|
||||
if (!nodes[i].is_visible)
|
||||
break;
|
||||
|
||||
/* if the node is ready there is nothing to check, skip it too */
|
||||
if (nodes[i].is_ready)
|
||||
break;
|
||||
|
||||
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
|
||||
/* XXX
|
||||
* This shouldn't happen, if this happens it means this is a major problem
|
||||
* maybe network outages? anyway, is better for a human to react
|
||||
*/
|
||||
if (PQstatus(nodeConn) != CONNECTION_OK)
|
||||
{
|
||||
/* XXX */
|
||||
log_info(_("At this point, it could be some race conditions that are acceptable, assume the node is restarting and starting failover procedure\n"));
|
||||
break;
|
||||
}
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
|
||||
res = PQexec(nodeConn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(nodeConn));
|
||||
PQclear(res);
|
||||
PQfinish(nodeConn);
|
||||
exit(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
|
||||
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0));
|
||||
|
||||
PQclear(res);
|
||||
PQfinish(nodeConn);
|
||||
/* If position is 0/0, keep checking */
|
||||
if (uxlogid == 0 && uxrecoff == 0)
|
||||
continue;
|
||||
|
||||
XLAssignValue(xlog_recptr, uxlogid, uxrecoff);
|
||||
|
||||
if (XLByteLT(nodes[i].xlog_location, xlog_recptr))
|
||||
{
|
||||
XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff);
|
||||
}
|
||||
|
||||
log_debug("Last XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
|
||||
nodes[i].nodeId, uxlogid, uxlogid,
|
||||
uxrecoff, uxrecoff);
|
||||
|
||||
ready_nodes++;
|
||||
nodes[i].is_ready = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(myLocalConn);
|
||||
|
||||
/*
|
||||
* determine which one is the best candidate to promote to primary
|
||||
*/
|
||||
for (i = 0; i < total_nodes - 1; i++)
|
||||
for (i = 0; i < total_nodes; i++)
|
||||
{
|
||||
if (!nodes[i].is_ready)
|
||||
/* witness is never a good candidate */
|
||||
if (nodes[i].is_witness)
|
||||
continue;
|
||||
else if (!find_best)
|
||||
|
||||
if (!nodes[i].is_ready || !nodes[i].is_visible)
|
||||
continue;
|
||||
|
||||
if (!find_best)
|
||||
{
|
||||
/* start with the first ready node, and then move on to the next one */
|
||||
best_candidate.nodeId = nodes[i].nodeId;
|
||||
best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid;
|
||||
best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
|
||||
XLAssign(best_candidate.xlog_location, nodes[i].xlog_location);
|
||||
best_candidate.is_ready = nodes[i].is_ready;
|
||||
best_candidate.is_witness = nodes[i].is_witness;
|
||||
find_best = true;
|
||||
}
|
||||
|
||||
/* we use the macros provided by xlogdefs.h to compare XLogPtr */
|
||||
/* we use the macros provided by xlogdefs.h to compare XLogRecPtr */
|
||||
/*
|
||||
* Nodes are retrieved ordered by priority, so if the current
|
||||
* best candidate is lower or equal to the next node's wal location
|
||||
* best candidate is lower than the next node's wal location
|
||||
* then assign next node as the new best candidate.
|
||||
*/
|
||||
if (XLByteLE(best_candidate.xlog_location, nodes[i].xlog_location))
|
||||
if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location))
|
||||
{
|
||||
best_candidate.nodeId = nodes[i].nodeId;
|
||||
best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid;
|
||||
best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
|
||||
XLAssign(best_candidate.xlog_location, nodes[i].xlog_location);
|
||||
best_candidate.is_ready = nodes[i].is_ready;
|
||||
best_candidate.is_witness = nodes[i].is_witness;
|
||||
}
|
||||
}
|
||||
|
||||
/* once we know who is the best candidate, promote it */
|
||||
if (find_best && (best_candidate.nodeId == local_options.node))
|
||||
{
|
||||
if (best_candidate.is_witness)
|
||||
{
|
||||
log_err(_("%s: Node selected as new master is a witness. Can't be promoted.\n"), progname);
|
||||
exit(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
/* wait */
|
||||
sleep(5);
|
||||
|
||||
if (verbose)
|
||||
log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
|
||||
progname);
|
||||
@@ -728,6 +910,9 @@ do_failover(void)
|
||||
}
|
||||
else if (find_best)
|
||||
{
|
||||
/* wait */
|
||||
sleep(10);
|
||||
|
||||
if (verbose)
|
||||
log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
|
||||
progname, best_candidate.nodeId);
|
||||
@@ -749,6 +934,9 @@ do_failover(void)
|
||||
exit(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
/* to force it to re-calculate mode and master node */
|
||||
failover_done = true;
|
||||
|
||||
/* and reconnect to the local database */
|
||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||
}
|
||||
@@ -769,9 +957,9 @@ CheckPrimaryConnection(void)
|
||||
{
|
||||
if (!is_pgup(primaryConn, local_options.master_response_timeout))
|
||||
{
|
||||
log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
|
||||
progname,
|
||||
(local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
|
||||
log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
|
||||
progname,
|
||||
(local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
|
||||
/* wait local_options.reconnect_intvl seconds between retries */
|
||||
sleep(local_options.reconnect_intvl);
|
||||
}
|
||||
@@ -781,7 +969,7 @@ CheckPrimaryConnection(void)
|
||||
{
|
||||
log_info(_("%s: Connection to master has been restored.\n"), progname);
|
||||
}
|
||||
break;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (!is_pgup(primaryConn, local_options.master_response_timeout))
|
||||
@@ -878,7 +1066,7 @@ checkNodeConfiguration(char *conninfo)
|
||||
"VALUES (%d, '%s', '%s', '%s', 0, 'f')",
|
||||
repmgr_schema, local_options.node,
|
||||
local_options.cluster_name,
|
||||
local_options.node_name,
|
||||
local_options.node_name,
|
||||
local_options.conninfo);
|
||||
|
||||
if (!PQexec(primaryConn, sqlquery))
|
||||
@@ -889,7 +1077,7 @@ checkNodeConfiguration(char *conninfo)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
PQclear(res);
|
||||
}
|
||||
@@ -937,6 +1125,7 @@ static void
|
||||
handle_sigint(SIGNAL_ARGS)
|
||||
{
|
||||
CloseConnections();
|
||||
logger_shutdown();
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@@ -962,7 +1151,7 @@ update_shared_memory(char *last_wal_standby_applied)
|
||||
PGresult *res;
|
||||
|
||||
sprintf(sqlquery, "SELECT %s.repmgr_update_standby_location('%s')",
|
||||
repmgr_schema, last_wal_standby_applied);
|
||||
repmgr_schema, last_wal_standby_applied);
|
||||
|
||||
/* If an error happens, just inform about that and continue */
|
||||
res = PQexec(myLocalConn, sqlquery);
|
||||
|
||||
Reference in New Issue
Block a user