mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 17:06:29 +00:00
Merge branch 'REL2_0_STABLE'
Conflicts: HISTORY dbutils.h repmgr.c repmgrd.c version.h
This commit is contained in:
4
CREDITS
4
CREDITS
@@ -10,3 +10,7 @@ Hannu Krosing <hannu@2ndQuadrant.com>
|
|||||||
Cédric Villemain <cedric@2ndquadrant.com>
|
Cédric Villemain <cedric@2ndquadrant.com>
|
||||||
Charles Duffy <charles@dyfis.net>
|
Charles Duffy <charles@dyfis.net>
|
||||||
Daniel Farina <daniel@heroku.com>
|
Daniel Farina <daniel@heroku.com>
|
||||||
|
Shawn Ellis <shawn.ellis17@gmail.com>
|
||||||
|
Jay Taylor <jay@jaytaylor.com>
|
||||||
|
Christian Kruse <christian@2ndQuadrant.com>
|
||||||
|
Krzysztof Gajdemski <songo@debian.org.pl>
|
||||||
|
|||||||
9
HISTORY
9
HISTORY
@@ -1,4 +1,11 @@
|
|||||||
2.0beta 2012-07-27
|
2.0beta2 2013-12-19
|
||||||
|
Improve autofailover logic and algorithms (Jaime, Andres)
|
||||||
|
Ignore pg_log when cloning (Jaime)
|
||||||
|
Add timestamps to log line in stderr (Christian)
|
||||||
|
Correctly check wal_keep_segments (Jay Taylor)
|
||||||
|
Add a ssh_options parameter (Jay Taylor)
|
||||||
|
|
||||||
|
2.0beta1 2012-07-27
|
||||||
Make CLONE command try to make an exact copy including $PGDATA location (Cedric)
|
Make CLONE command try to make an exact copy including $PGDATA location (Cedric)
|
||||||
Add detection of master failure (Jaime)
|
Add detection of master failure (Jaime)
|
||||||
Add the notion of a witness server (Jaime)
|
Add the notion of a witness server (Jaime)
|
||||||
|
|||||||
12
check_dir.c
12
check_dir.c
@@ -225,12 +225,12 @@ is_pg_dir(char *dir)
|
|||||||
struct stat sb;
|
struct stat sb;
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
// test pgdata
|
/* test pgdata */
|
||||||
xsnprintf(path, buf_sz, "%s/PG_VERSION", dir);
|
xsnprintf(path, buf_sz, "%s/PG_VERSION", dir);
|
||||||
if (stat(path, &sb) == 0)
|
if (stat(path, &sb) == 0)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
// test tablespace dir
|
/* test tablespace dir */
|
||||||
sprintf(path, "ls %s/PG_*/ -I*", dir);
|
sprintf(path, "ls %s/PG_*/ -I*", dir);
|
||||||
r = system(path);
|
r = system(path);
|
||||||
if (r == 0)
|
if (r == 0)
|
||||||
@@ -256,7 +256,7 @@ create_pgdir(char *dir, bool force)
|
|||||||
{
|
{
|
||||||
log_err(_("couldn't create directory \"%s\"...\n"),
|
log_err(_("couldn't create directory \"%s\"...\n"),
|
||||||
dir);
|
dir);
|
||||||
exit(ERR_BAD_CONFIG);
|
return false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
@@ -268,7 +268,7 @@ create_pgdir(char *dir, bool force)
|
|||||||
{
|
{
|
||||||
log_err(_("could not change permissions of directory \"%s\": %s\n"),
|
log_err(_("could not change permissions of directory \"%s\": %s\n"),
|
||||||
dir, strerror(errno));
|
dir, strerror(errno));
|
||||||
exit(ERR_BAD_CONFIG);
|
return false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
@@ -293,7 +293,7 @@ create_pgdir(char *dir, bool force)
|
|||||||
"If you are sure you want to clone here, "
|
"If you are sure you want to clone here, "
|
||||||
"please check there is no PostgreSQL server "
|
"please check there is no PostgreSQL server "
|
||||||
"running and use the --force option\n"));
|
"running and use the --force option\n"));
|
||||||
exit(ERR_BAD_CONFIG);
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
@@ -301,7 +301,7 @@ create_pgdir(char *dir, bool force)
|
|||||||
/* Trouble accessing directory */
|
/* Trouble accessing directory */
|
||||||
log_err(_("could not access directory \"%s\": %s\n"),
|
log_err(_("could not access directory \"%s\": %s\n"),
|
||||||
dir, strerror(errno));
|
dir, strerror(errno));
|
||||||
exit(ERR_BAD_CONFIG);
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
36
config.c
36
config.c
@@ -41,6 +41,9 @@ parse_config(const char *config_file, t_configuration_options *options)
|
|||||||
memset(options->promote_command, 0, sizeof(options->promote_command));
|
memset(options->promote_command, 0, sizeof(options->promote_command));
|
||||||
memset(options->follow_command, 0, sizeof(options->follow_command));
|
memset(options->follow_command, 0, sizeof(options->follow_command));
|
||||||
memset(options->rsync_options, 0, sizeof(options->rsync_options));
|
memset(options->rsync_options, 0, sizeof(options->rsync_options));
|
||||||
|
memset(options->ssh_options, 0, sizeof(options->ssh_options));
|
||||||
|
memset(options->pg_bindir, 0, sizeof(options->pg_bindir));
|
||||||
|
memset(options->pgctl_options, 0, sizeof(options->pgctl_options));
|
||||||
|
|
||||||
/* if nothing has been provided defaults to 60 */
|
/* if nothing has been provided defaults to 60 */
|
||||||
options->master_response_timeout = 60;
|
options->master_response_timeout = 60;
|
||||||
@@ -78,6 +81,8 @@ parse_config(const char *config_file, t_configuration_options *options)
|
|||||||
strncpy (options->conninfo, value, MAXLEN);
|
strncpy (options->conninfo, value, MAXLEN);
|
||||||
else if (strcmp(name, "rsync_options") == 0)
|
else if (strcmp(name, "rsync_options") == 0)
|
||||||
strncpy (options->rsync_options, value, QUERY_STR_LEN);
|
strncpy (options->rsync_options, value, QUERY_STR_LEN);
|
||||||
|
else if (strcmp(name, "ssh_options") == 0)
|
||||||
|
strncpy (options->ssh_options, value, QUERY_STR_LEN);
|
||||||
else if (strcmp(name, "loglevel") == 0)
|
else if (strcmp(name, "loglevel") == 0)
|
||||||
strncpy (options->loglevel, value, MAXLEN);
|
strncpy (options->loglevel, value, MAXLEN);
|
||||||
else if (strcmp(name, "logfacility") == 0)
|
else if (strcmp(name, "logfacility") == 0)
|
||||||
@@ -111,6 +116,10 @@ parse_config(const char *config_file, t_configuration_options *options)
|
|||||||
options->reconnect_attempts = atoi(value);
|
options->reconnect_attempts = atoi(value);
|
||||||
else if (strcmp(name, "reconnect_interval") == 0)
|
else if (strcmp(name, "reconnect_interval") == 0)
|
||||||
options->reconnect_intvl = atoi(value);
|
options->reconnect_intvl = atoi(value);
|
||||||
|
else if (strcmp(name, "pg_bindir") == 0)
|
||||||
|
strncpy (options->pg_bindir, value, MAXLEN);
|
||||||
|
else if (strcmp(name, "pg_ctl_options") == 0)
|
||||||
|
strncpy (options->pgctl_options, value, MAXLEN);
|
||||||
else
|
else
|
||||||
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
|
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
|
||||||
}
|
}
|
||||||
@@ -148,6 +157,12 @@ parse_config(const char *config_file, t_configuration_options *options)
|
|||||||
log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
|
log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (*options->pg_bindir == '\0')
|
||||||
|
{
|
||||||
|
log_err(_("pg_bindir config value not found. Check the configuration file.\n"));
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -218,49 +233,49 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
|
|||||||
parse_config(config_file, &new_options);
|
parse_config(config_file, &new_options);
|
||||||
if (new_options.node == -1)
|
if (new_options.node == -1)
|
||||||
{
|
{
|
||||||
log_warning(_("\nCannot load new configuration, will keep current one.\n"));
|
log_warning(_("Cannot load new configuration, will keep current one.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strcmp(new_options.cluster_name, orig_options->cluster_name) != 0)
|
if (strcmp(new_options.cluster_name, orig_options->cluster_name) != 0)
|
||||||
{
|
{
|
||||||
log_warning(_("\nCannot change cluster name, will keep current configuration.\n"));
|
log_warning(_("Cannot change cluster name, will keep current configuration.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (new_options.node != orig_options->node)
|
if (new_options.node != orig_options->node)
|
||||||
{
|
{
|
||||||
log_warning(_("\nCannot change node number, will keep current configuration.\n"));
|
log_warning(_("Cannot change node number, will keep current configuration.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (new_options.node_name != orig_options->node_name)
|
if (strcmp(new_options.node_name, orig_options->node_name) != 0)
|
||||||
{
|
{
|
||||||
log_warning(_("\nCannot change standby name, will keep current configuration.\n"));
|
log_warning(_("Cannot change standby name, will keep current configuration.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (new_options.failover != MANUAL_FAILOVER && new_options.failover != AUTOMATIC_FAILOVER)
|
if (new_options.failover != MANUAL_FAILOVER && new_options.failover != AUTOMATIC_FAILOVER)
|
||||||
{
|
{
|
||||||
log_warning(_("\nNew value for failover is not valid. Should be MANUAL or AUTOMATIC.\n"));
|
log_warning(_("New value for failover is not valid. Should be MANUAL or AUTOMATIC.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (new_options.master_response_timeout <= 0)
|
if (new_options.master_response_timeout <= 0)
|
||||||
{
|
{
|
||||||
log_warning(_("\nNew value for master_response_timeout is not valid. Should be greater than zero.\n"));
|
log_warning(_("New value for master_response_timeout is not valid. Should be greater than zero.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (new_options.reconnect_attempts < 0)
|
if (new_options.reconnect_attempts < 0)
|
||||||
{
|
{
|
||||||
log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
|
log_warning(_("New value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (new_options.reconnect_intvl < 0)
|
if (new_options.reconnect_intvl < 0)
|
||||||
{
|
{
|
||||||
log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
|
log_warning(_("New value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -268,7 +283,7 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
|
|||||||
conn = establishDBConnection(new_options.conninfo, false);
|
conn = establishDBConnection(new_options.conninfo, false);
|
||||||
if (!conn || (PQstatus(conn) != CONNECTION_OK))
|
if (!conn || (PQstatus(conn) != CONNECTION_OK))
|
||||||
{
|
{
|
||||||
log_warning(_("\nconninfo string is not valid, will keep current configuration.\n"));
|
log_warning(_("conninfo string is not valid, will keep current configuration.\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
@@ -283,6 +298,7 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
|
|||||||
strcpy(orig_options->promote_command, new_options.promote_command);
|
strcpy(orig_options->promote_command, new_options.promote_command);
|
||||||
strcpy(orig_options->follow_command, new_options.follow_command);
|
strcpy(orig_options->follow_command, new_options.follow_command);
|
||||||
strcpy(orig_options->rsync_options, new_options.rsync_options);
|
strcpy(orig_options->rsync_options, new_options.rsync_options);
|
||||||
|
strcpy(orig_options->ssh_options, new_options.ssh_options);
|
||||||
orig_options->master_response_timeout = new_options.master_response_timeout;
|
orig_options->master_response_timeout = new_options.master_response_timeout;
|
||||||
orig_options->reconnect_attempts = new_options.reconnect_attempts;
|
orig_options->reconnect_attempts = new_options.reconnect_attempts;
|
||||||
orig_options->reconnect_intvl = new_options.reconnect_intvl;
|
orig_options->reconnect_intvl = new_options.reconnect_intvl;
|
||||||
|
|||||||
5
config.h
5
config.h
@@ -36,11 +36,16 @@ typedef struct
|
|||||||
char loglevel[MAXLEN];
|
char loglevel[MAXLEN];
|
||||||
char logfacility[MAXLEN];
|
char logfacility[MAXLEN];
|
||||||
char rsync_options[QUERY_STR_LEN];
|
char rsync_options[QUERY_STR_LEN];
|
||||||
|
char ssh_options[QUERY_STR_LEN];
|
||||||
int master_response_timeout;
|
int master_response_timeout;
|
||||||
int reconnect_attempts;
|
int reconnect_attempts;
|
||||||
int reconnect_intvl;
|
int reconnect_intvl;
|
||||||
|
char pg_bindir[MAXLEN];
|
||||||
|
char pgctl_options[MAXLEN];
|
||||||
} t_configuration_options;
|
} t_configuration_options;
|
||||||
|
|
||||||
|
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "" }
|
||||||
|
|
||||||
void parse_config(const char *config_file, t_configuration_options *options);
|
void parse_config(const char *config_file, t_configuration_options *options);
|
||||||
void parse_line(char *buff, char *name, char *value);
|
void parse_line(char *buff, char *name, char *value);
|
||||||
char *trim(char *s);
|
char *trim(char *s);
|
||||||
|
|||||||
40
dbutils.c
40
dbutils.c
@@ -138,7 +138,7 @@ is_pgup(PGconn *conn, int timeout)
|
|||||||
{
|
{
|
||||||
if (twice)
|
if (twice)
|
||||||
return false;
|
return false;
|
||||||
PQreset(conn); // reconnect
|
PQreset(conn); /* reconnect */
|
||||||
twice = true;
|
twice = true;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -164,10 +164,10 @@ is_pgup(PGconn *conn, int timeout)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
failed:
|
failed:
|
||||||
// we need to retry, because we might just have loose the connection once
|
/* we need to retry, because we might just have loose the connection once */
|
||||||
if (twice)
|
if (twice)
|
||||||
return false;
|
return false;
|
||||||
PQreset(conn); // reconnect
|
PQreset(conn); /* reconnect */
|
||||||
twice = true;
|
twice = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -250,6 +250,40 @@ guc_setted(PGconn *conn, const char *parameter, const char *op,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Just like guc_setted except with an extra parameter containing the name of
|
||||||
|
* the pg datatype so that the comparison can be done properly.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
guc_setted_typed(PGconn *conn, const char *parameter, const char *op,
|
||||||
|
const char *value, const char *datatype)
|
||||||
|
{
|
||||||
|
PGresult *res;
|
||||||
|
char sqlquery[QUERY_STR_LEN];
|
||||||
|
|
||||||
|
sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings "
|
||||||
|
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
||||||
|
parameter, datatype, op, value, datatype);
|
||||||
|
|
||||||
|
res = PQexec(conn, sqlquery);
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_err(_("GUC setting check PQexec failed: %s"),
|
||||||
|
PQerrorMessage(conn));
|
||||||
|
PQclear(res);
|
||||||
|
PQfinish(conn);
|
||||||
|
exit(ERR_DB_QUERY);
|
||||||
|
}
|
||||||
|
if (PQntuples(res) == 0)
|
||||||
|
{
|
||||||
|
PQclear(res);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
const char *
|
const char *
|
||||||
get_cluster_size(PGconn *conn)
|
get_cluster_size(PGconn *conn)
|
||||||
|
|||||||
@@ -32,6 +32,9 @@ bool is_pgup(PGconn *conn, int timeout);
|
|||||||
char *pg_version(PGconn *conn, char* major_version);
|
char *pg_version(PGconn *conn, char* major_version);
|
||||||
bool guc_setted(PGconn *conn, const char *parameter, const char *op,
|
bool guc_setted(PGconn *conn, const char *parameter, const char *op,
|
||||||
const char *value);
|
const char *value);
|
||||||
|
bool guc_setted_typed(PGconn *conn, const char *parameter, const char *op,
|
||||||
|
const char *value, const char *datatype);
|
||||||
|
|
||||||
const char *get_cluster_size(PGconn *conn);
|
const char *get_cluster_size(PGconn *conn);
|
||||||
PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
|
PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
|
||||||
int *master_id, char *master_conninfo_out);
|
int *master_id, char *master_conninfo_out);
|
||||||
|
|||||||
6
debian/DEBIAN/control
vendored
6
debian/DEBIAN/control
vendored
@@ -1,9 +1,9 @@
|
|||||||
Package: repmgr-auto
|
Package: repmgr-auto
|
||||||
Version: 1.0-1
|
Version: 2.0beta2
|
||||||
Section: database
|
Section: database
|
||||||
Priority: optional
|
Priority: optional
|
||||||
Architecture: all
|
Architecture: all
|
||||||
Depends: rsync, postgresql-9.0
|
Depends: rsync, postgresql-9.0 | postgresql-9.1 | postgresql-9.2 | postgresql-9.3
|
||||||
Maintainer: Greg Smith <greg@2ndQuadrant.com>
|
Maintainer: Jaime Casanova <jaime@2ndQuadrant.com>
|
||||||
Description: PostgreSQL replication setup, magament and monitoring
|
Description: PostgreSQL replication setup, magament and monitoring
|
||||||
has two main executables
|
has two main executables
|
||||||
|
|||||||
14
debian/repmgr.repmgrd.default
vendored
Normal file
14
debian/repmgr.repmgrd.default
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# default settings for repmgrd. This file is source by /bin/sh from
|
||||||
|
# /etc/init.d/repmgrd
|
||||||
|
|
||||||
|
# Options for repmgrd
|
||||||
|
REPMGRD_OPTS=""
|
||||||
|
|
||||||
|
# repmgrd binary
|
||||||
|
REPMGR_BIN="/usr/bin/repmgr"
|
||||||
|
|
||||||
|
# pid file
|
||||||
|
REPMGR_PIDFILE="/var/run/repmgrd.pid"
|
||||||
|
|
||||||
|
|
||||||
48
debian/repmgr.repmgrd.init
vendored
Normal file
48
debian/repmgr.repmgrd.init
vendored
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
### BEGIN INIT INFO
|
||||||
|
# Provides: repmgrd
|
||||||
|
# Required-Start: $local_fs $remote_fs $network $syslog $postgresql
|
||||||
|
# Required-Stop: $local_fs $remote_fs $network $syslog $postgresql
|
||||||
|
# Should-Start: $syslog $postgresql
|
||||||
|
# Should-Start: $syslog $postgresql
|
||||||
|
# Default-Start: 2 3 4 5
|
||||||
|
# Default-Stop: 0 1 6
|
||||||
|
# Short-Description: Start/stop repmgrd
|
||||||
|
### END INIT INFO
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
if test -f /etc/default/repmgrd; then
|
||||||
|
. /etc/default/repmgrd
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$REPMGRD_BIN" ]; then
|
||||||
|
REPMGRD_BIN="/usr/bin/repmgrd"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$REPMGRD_PIDFILE" ]; then
|
||||||
|
REPMGRD_PIDFILE="/var/run/repmgrd.pid"
|
||||||
|
fi
|
||||||
|
|
||||||
|
test -x $REPMGRD_BIN || exit 0
|
||||||
|
|
||||||
|
case "$1" in
|
||||||
|
start)
|
||||||
|
start-stop-daemon --start --quiet --make-pidfile --pidfile $REPMGRD_PIDFILE --exec $REPMGRD_BIN $REPMGRD_OPTS
|
||||||
|
;;
|
||||||
|
|
||||||
|
stop)
|
||||||
|
start-stop-daemon --stop --oknodo --quiet --pidfile $REPMGRD_PIDFILE
|
||||||
|
;;
|
||||||
|
|
||||||
|
restart)
|
||||||
|
$0 stop && $0 start || exit 1
|
||||||
|
;;
|
||||||
|
|
||||||
|
*)
|
||||||
|
echo "Usage: $0 {start|stop|restart}"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
exit 0
|
||||||
@@ -35,5 +35,6 @@
|
|||||||
#define ERR_STR_OVERFLOW 10
|
#define ERR_STR_OVERFLOW 10
|
||||||
#define ERR_FAILOVER_FAIL 11
|
#define ERR_FAILOVER_FAIL 11
|
||||||
#define ERR_BAD_SSH 12
|
#define ERR_BAD_SSH 12
|
||||||
|
#define ERR_SYS_FAILURE 13
|
||||||
|
|
||||||
#endif /* _ERRCODE_H_ */
|
#endif /* _ERRCODE_H_ */
|
||||||
|
|||||||
27
log.c
27
log.c
@@ -25,9 +25,11 @@
|
|||||||
|
|
||||||
#ifdef HAVE_SYSLOG
|
#ifdef HAVE_SYSLOG
|
||||||
#include <syslog.h>
|
#include <syslog.h>
|
||||||
#include <stdarg.h>
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
#define DEFAULT_IDENT "repmgr"
|
#define DEFAULT_IDENT "repmgr"
|
||||||
@@ -37,6 +39,29 @@
|
|||||||
|
|
||||||
/* #define REPMGR_DEBUG */
|
/* #define REPMGR_DEBUG */
|
||||||
|
|
||||||
|
void stderr_log_with_level(const char *level_name, int level, const char *fmt, ...) {
|
||||||
|
size_t len = strlen(fmt);
|
||||||
|
char fmt1[len + 150];
|
||||||
|
time_t t;
|
||||||
|
struct tm *tm;
|
||||||
|
char buff[100];
|
||||||
|
va_list ap;
|
||||||
|
|
||||||
|
if(log_level >= level) {
|
||||||
|
time(&t);
|
||||||
|
tm = localtime(&t);
|
||||||
|
|
||||||
|
va_start(ap, fmt);
|
||||||
|
|
||||||
|
strftime(buff, 100, "[%Y-%m-%d %H:%M:%S]", tm);
|
||||||
|
snprintf(fmt1, len + 150, "%s [%s] %s", buff, level_name, fmt);
|
||||||
|
vfprintf(stderr, fmt1, ap);
|
||||||
|
|
||||||
|
va_end(ap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static int detect_log_level(const char* level);
|
static int detect_log_level(const char* level);
|
||||||
static int detect_log_facility(const char* facility);
|
static int detect_log_facility(const char* facility);
|
||||||
|
|
||||||
|
|||||||
18
log.h
18
log.h
@@ -25,15 +25,17 @@
|
|||||||
#define REPMGR_SYSLOG 1
|
#define REPMGR_SYSLOG 1
|
||||||
#define REPMGR_STDERR 2
|
#define REPMGR_STDERR 2
|
||||||
|
|
||||||
|
void stderr_log_with_level(const char *level_name, int level, const char *fmt, ...);
|
||||||
|
|
||||||
/* Standard error logging */
|
/* Standard error logging */
|
||||||
#define stderr_log_debug(...) if (log_level >= LOG_DEBUG) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_debug(...) stderr_log_with_level("DEBUG", LOG_DEBUG, __VA_ARGS__)
|
||||||
#define stderr_log_info(...) if (log_level >= LOG_INFO) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_info(...) stderr_log_with_level("INFO", LOG_INFO, __VA_ARGS__)
|
||||||
#define stderr_log_notice(...) if (log_level >= LOG_NOTICE) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_notice(...) stderr_log_with_level("NOTICE", LOG_NOTICE, __VA_ARGS__)
|
||||||
#define stderr_log_warning(...) if (log_level >= LOG_WARNING) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_warning(...) stderr_log_with_level("WARNING", LOG_WARNING, __VA_ARGS__)
|
||||||
#define stderr_log_err(...) if (log_level >= LOG_ERR) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_err(...) stderr_log_with_level("ERROR", LOG_ERR, __VA_ARGS__)
|
||||||
#define stderr_log_crit(...) if (log_level >= LOG_CRIT) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_crit(...) stderr_log_with_level("CRITICAL", LOG_CRIT, __VA_ARGS__)
|
||||||
#define stderr_log_alert(...) if (log_level >= LOG_ALERT) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_alert(...) stderr_log_with_level("ALERT", LOG_ALERT, __VA_ARGS__)
|
||||||
#define stderr_log_emerg(...) if (log_level >= LOG_EMERG) fprintf(stderr, __VA_ARGS__)
|
#define stderr_log_emerg(...) stderr_log_with_level("EMERGENCY", LOG_EMERG, __VA_ARGS__)
|
||||||
|
|
||||||
#ifdef HAVE_SYSLOG
|
#ifdef HAVE_SYSLOG
|
||||||
|
|
||||||
|
|||||||
88
repmgr.c
88
repmgr.c
@@ -85,8 +85,8 @@ bool need_a_node = true;
|
|||||||
bool require_password = false;
|
bool require_password = false;
|
||||||
|
|
||||||
/* Initialization of runtime options */
|
/* Initialization of runtime options */
|
||||||
t_runtime_options runtime_options = { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, "", "", 0 };
|
t_runtime_options runtime_options = T_RUNTIME_OPTIONS_INITIALIZER;
|
||||||
t_configuration_options options = { "", -1, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", -1 };
|
t_configuration_options options = T_CONFIGURATION_OPTIONS_INITIALIZER;
|
||||||
|
|
||||||
static char *server_mode = NULL;
|
static char *server_mode = NULL;
|
||||||
static char *server_cmd = NULL;
|
static char *server_cmd = NULL;
|
||||||
@@ -396,7 +396,7 @@ do_cluster_show(void)
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_err(_("Cannot get node information, have you registered them?\n%s\n"), PQerrorMessage(conn));
|
log_err(_("Can't get nodes information, have you registered them?\n%s\n"), PQerrorMessage(conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
@@ -565,6 +565,22 @@ do_master_register(void)
|
|||||||
PGconn *master_conn;
|
PGconn *master_conn;
|
||||||
int id;
|
int id;
|
||||||
|
|
||||||
|
if (runtime_options.force)
|
||||||
|
{
|
||||||
|
sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_nodes "
|
||||||
|
" WHERE id = %d",
|
||||||
|
repmgr_schema, options.node);
|
||||||
|
log_debug(_("master register: %s\n"), sqlquery);
|
||||||
|
|
||||||
|
if (!PQexec(conn, sqlquery))
|
||||||
|
{
|
||||||
|
log_warning(_("Cannot delete node details, %s\n"),
|
||||||
|
PQerrorMessage(conn));
|
||||||
|
PQfinish(conn);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Ensure there isn't any other master already registered */
|
/* Ensure there isn't any other master already registered */
|
||||||
master_conn = getMasterConnection(conn, repmgr_schema,
|
master_conn = getMasterConnection(conn, repmgr_schema,
|
||||||
options.cluster_name, &id,NULL);
|
options.cluster_name, &id,NULL);
|
||||||
@@ -577,21 +593,6 @@ do_master_register(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Now register the master */
|
/* Now register the master */
|
||||||
if (runtime_options.force)
|
|
||||||
{
|
|
||||||
sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_nodes "
|
|
||||||
" WHERE id = %d",
|
|
||||||
repmgr_schema, options.node);
|
|
||||||
log_debug(_("master register: %s\n"), sqlquery);
|
|
||||||
|
|
||||||
if (!PQexec(conn, sqlquery))
|
|
||||||
{
|
|
||||||
log_warning(_("Cannot delete node details, %s\n"),
|
|
||||||
PQerrorMessage(conn));
|
|
||||||
PQfinish(conn);
|
|
||||||
exit(ERR_BAD_CONFIG);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes (id, cluster, name, conninfo, priority) "
|
sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes (id, cluster, name, conninfo, priority) "
|
||||||
"VALUES (%d, '%s', '%s', '%s', %d)",
|
"VALUES (%d, '%s', '%s', '%s', %d)",
|
||||||
@@ -742,7 +743,8 @@ do_standby_register(void)
|
|||||||
options.conninfo, options.priority);
|
options.conninfo, options.priority);
|
||||||
log_debug(_("standby register: %s\n"), sqlquery);
|
log_debug(_("standby register: %s\n"), sqlquery);
|
||||||
|
|
||||||
if (!PQexec(master_conn, sqlquery))
|
res = PQexec(master_conn, sqlquery);
|
||||||
|
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||||
{
|
{
|
||||||
log_err(_("Cannot insert node details, %s\n"),
|
log_err(_("Cannot insert node details, %s\n"),
|
||||||
PQerrorMessage(master_conn));
|
PQerrorMessage(master_conn));
|
||||||
@@ -767,7 +769,7 @@ do_standby_clone(void)
|
|||||||
PGresult *res;
|
PGresult *res;
|
||||||
char sqlquery[QUERY_STR_LEN];
|
char sqlquery[QUERY_STR_LEN];
|
||||||
|
|
||||||
int r = 0;
|
int r = 0, retval = SUCCESS;
|
||||||
int i;
|
int i;
|
||||||
bool flag_success = false;
|
bool flag_success = false;
|
||||||
bool test_mode = false;
|
bool test_mode = false;
|
||||||
@@ -842,7 +844,7 @@ do_standby_clone(void)
|
|||||||
log_err(_("%s needs parameter 'wal_level' to be set to 'hot_standby'\n"), progname);
|
log_err(_("%s needs parameter 'wal_level' to be set to 'hot_standby'\n"), progname);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
if (!guc_setted(conn, "wal_keep_segments", ">=", runtime_options.wal_keep_segments))
|
if (!guc_setted_typed(conn, "wal_keep_segments", ">=", runtime_options.wal_keep_segments, "integer"))
|
||||||
{
|
{
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
log_err(_("%s needs parameter 'wal_keep_segments' to be set to %s or greater (see the '-w' option or edit the postgresql.conf of the PostgreSQL master.)\n"), progname, runtime_options.wal_keep_segments);
|
log_err(_("%s needs parameter 'wal_keep_segments' to be set to %s or greater (see the '-w' option or edit the postgresql.conf of the PostgreSQL master.)\n"), progname, runtime_options.wal_keep_segments);
|
||||||
@@ -1036,6 +1038,8 @@ do_standby_clone(void)
|
|||||||
{
|
{
|
||||||
log_err(_("%s: couldn't use directory %s ...\nUse --force option to force\n"),
|
log_err(_("%s: couldn't use directory %s ...\nUse --force option to force\n"),
|
||||||
progname, local_data_directory);
|
progname, local_data_directory);
|
||||||
|
r = ERR_BAD_CONFIG;
|
||||||
|
retval = ERR_BAD_CONFIG;
|
||||||
goto stop_backup;
|
goto stop_backup;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1175,7 +1179,7 @@ stop_backup:
|
|||||||
log_err(_("Can't stop backup: %s\n"), PQerrorMessage(conn));
|
log_err(_("Can't stop backup: %s\n"), PQerrorMessage(conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
exit(ERR_STOP_BACKUP);
|
exit(retval);
|
||||||
}
|
}
|
||||||
last_wal_segment = PQgetvalue(res, 0, 0);
|
last_wal_segment = PQgetvalue(res, 0, 0);
|
||||||
|
|
||||||
@@ -1313,13 +1317,12 @@ do_standby_promote(void)
|
|||||||
rename(recovery_file_path, recovery_done_path);
|
rename(recovery_file_path, recovery_done_path);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We assume the pg_ctl script is in the PATH. Restart and wait for
|
* Restart and wait for the server to finish starting, so that the check
|
||||||
* the server to finish starting, so that the check below will
|
* below will find an active server rather than one starting up. This may
|
||||||
* find an active server rather than one starting up. This may
|
|
||||||
* hang for up the default timeout (60 seconds).
|
* hang for up the default timeout (60 seconds).
|
||||||
*/
|
*/
|
||||||
log_notice(_("%s: restarting server using pg_ctl\n"), progname);
|
log_notice(_("%s: restarting server using %s/pg_ctl\n"), progname, options.pg_bindir);
|
||||||
maxlen_snprintf(script, "pg_ctl -D %s -w -m fast restart", data_dir);
|
maxlen_snprintf(script, "%s/pg_ctl %s -D %s -w -m fast restart", options.pg_bindir, options.pgctl_options, data_dir);
|
||||||
r = system(script);
|
r = system(script);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
@@ -1464,8 +1467,7 @@ do_standby_follow(void)
|
|||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
|
|
||||||
/* Finally, restart the service */
|
/* Finally, restart the service */
|
||||||
/* We assume the pg_ctl script is in the PATH */
|
maxlen_snprintf(script, "%s/pg_ctl %s -w -D %s -m fast restart", options.pg_bindir, options.pgctl_options, data_dir);
|
||||||
maxlen_snprintf(script, "pg_ctl -w -D %s -m fast restart", data_dir);
|
|
||||||
r = system(script);
|
r = system(script);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
@@ -1496,14 +1498,6 @@ do_witness_create(void)
|
|||||||
|
|
||||||
char master_hba_file[MAXLEN];
|
char master_hba_file[MAXLEN];
|
||||||
|
|
||||||
/* Check this directory could be used as a PGDATA dir */
|
|
||||||
if (!create_pgdir(runtime_options.dest_dir, runtime_options.force))
|
|
||||||
{
|
|
||||||
log_err(_("witness create: couldn't create data directory (\"%s\") for witness"),
|
|
||||||
runtime_options.dest_dir);
|
|
||||||
exit(ERR_BAD_CONFIG);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Connection parameters for master only */
|
/* Connection parameters for master only */
|
||||||
keywords[0] = "host";
|
keywords[0] = "host";
|
||||||
values[0] = runtime_options.host;
|
values[0] = runtime_options.host;
|
||||||
@@ -1545,6 +1539,15 @@ do_witness_create(void)
|
|||||||
exit(ERR_BAD_SSH);
|
exit(ERR_BAD_SSH);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Check this directory could be used as a PGDATA dir */
|
||||||
|
if (!create_pgdir(runtime_options.dest_dir, runtime_options.force))
|
||||||
|
{
|
||||||
|
log_err(_("witness create: couldn't create data directory (\"%s\") for witness"),
|
||||||
|
runtime_options.dest_dir);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* To create a witness server we need to:
|
* To create a witness server we need to:
|
||||||
* 1) initialize the cluster
|
* 1) initialize the cluster
|
||||||
@@ -1553,8 +1556,7 @@ do_witness_create(void)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/* Create the cluster for witness */
|
/* Create the cluster for witness */
|
||||||
/* We assume the pg_ctl script is in the PATH */
|
sprintf(script, "%s/pg_ctl %s -D %s init -o \"-W\"", options.pg_bindir, options.pgctl_options, runtime_options.dest_dir);
|
||||||
sprintf(script, "pg_ctl -D %s init -o \"-W\"", runtime_options.dest_dir);
|
|
||||||
log_info("Initialize cluster for witness: %s.\n", script);
|
log_info("Initialize cluster for witness: %s.\n", script);
|
||||||
|
|
||||||
r = system(script);
|
r = system(script);
|
||||||
@@ -1627,7 +1629,7 @@ do_witness_create(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* start new instance */
|
/* start new instance */
|
||||||
sprintf(script, "pg_ctl -w -D %s start", runtime_options.dest_dir);
|
sprintf(script, "%s/pg_ctl %s -w -D %s start", options.pg_bindir, options.pgctl_options, runtime_options.dest_dir);
|
||||||
log_info(_("Start cluster for witness: %s"), script);
|
log_info(_("Start cluster for witness: %s"), script);
|
||||||
r = system(script);
|
r = system(script);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
@@ -1786,9 +1788,9 @@ test_ssh_connection(char *host, char *remote_user)
|
|||||||
|
|
||||||
/* Check if we have ssh connectivity to host before trying to rsync */
|
/* Check if we have ssh connectivity to host before trying to rsync */
|
||||||
if (!remote_user[0])
|
if (!remote_user[0])
|
||||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s %s", host, TRUEBIN_PATH);
|
maxlen_snprintf(script, "ssh -o Batchmode=yes %s %s %s", options.ssh_options, host, TRUEBIN_PATH);
|
||||||
else
|
else
|
||||||
maxlen_snprintf(script, "ssh -o Batchmode=yes %s -l %s %s", host, remote_user, TRUEBIN_PATH);
|
maxlen_snprintf(script, "ssh -o Batchmode=yes %s %s -l %s %s", options.ssh_options, host, remote_user, TRUEBIN_PATH);
|
||||||
|
|
||||||
log_debug(_("command is: %s"), script);
|
log_debug(_("command is: %s"), script);
|
||||||
r = system(script);
|
r = system(script);
|
||||||
@@ -1827,7 +1829,7 @@ copy_remote_files(char *host, char *remote_user, char *remote_path,
|
|||||||
|
|
||||||
if (is_directory)
|
if (is_directory)
|
||||||
{
|
{
|
||||||
strcat(rsync_flags, " --exclude=pg_xlog* --exclude=pg_control --exclude=*.pid");
|
strcat(rsync_flags, " --exclude=pg_xlog* --exclude=pg_log* --exclude=pg_control --exclude=*.pid");
|
||||||
maxlen_snprintf(script, "rsync %s %s:%s/* %s",
|
maxlen_snprintf(script, "rsync %s %s:%s/* %s",
|
||||||
rsync_flags, host_string, remote_path, local_path);
|
rsync_flags, host_string, remote_path, local_path);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,8 @@ node_name=standby2
|
|||||||
|
|
||||||
# Connection information
|
# Connection information
|
||||||
conninfo='host=192.168.204.104'
|
conninfo='host=192.168.204.104'
|
||||||
rsync_options=--archive --checksum --compress --progress --rsh=ssh
|
rsync_options=--archive --checksum --compress --progress --rsh="ssh -o \"StrictHostKeyChecking no\""
|
||||||
|
ssh_options=-o "StrictHostKeyChecking no"
|
||||||
|
|
||||||
# How many seconds we wait for master response before declaring master failure
|
# How many seconds we wait for master response before declaring master failure
|
||||||
master_response_timeout=60
|
master_response_timeout=60
|
||||||
@@ -33,3 +34,11 @@ loglevel=NOTICE
|
|||||||
# Logging facility: possible values are STDERR or - for Syslog integration - one of LOCAL0, LOCAL1, ..., LOCAL7, USER
|
# Logging facility: possible values are STDERR or - for Syslog integration - one of LOCAL0, LOCAL1, ..., LOCAL7, USER
|
||||||
# Default: STDERR
|
# Default: STDERR
|
||||||
logfacility=STDERR
|
logfacility=STDERR
|
||||||
|
|
||||||
|
# path to pg_ctl executable
|
||||||
|
pg_bindir=/usr/bin/
|
||||||
|
|
||||||
|
#
|
||||||
|
# you may add command line arguments for pg_ctl
|
||||||
|
#
|
||||||
|
# pg_ctl_options='-s'
|
||||||
|
|||||||
2
repmgr.h
2
repmgr.h
@@ -69,6 +69,8 @@ typedef struct
|
|||||||
int keep_history;
|
int keep_history;
|
||||||
} t_runtime_options;
|
} t_runtime_options;
|
||||||
|
|
||||||
|
#define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, "", "", 0 }
|
||||||
|
|
||||||
#define SLEEP_MONITOR 2
|
#define SLEEP_MONITOR 2
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
618
repmgrd.c
618
repmgrd.c
@@ -22,6 +22,9 @@
|
|||||||
|
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
|
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
@@ -45,6 +48,27 @@
|
|||||||
const XLogRecPtr InvalidXLogRecPtr = {0, 0};
|
const XLogRecPtr InvalidXLogRecPtr = {0, 0};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if PG_VERSION_NUM >= 90300
|
||||||
|
#define XLAssign(a, b) \
|
||||||
|
a = b
|
||||||
|
|
||||||
|
#define XLAssignValue(a, xlogid, xrecoff) \
|
||||||
|
a = xrecoff
|
||||||
|
|
||||||
|
#define XLByteLT(a, b) \
|
||||||
|
(a < b)
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define XLAssign(a, b) \
|
||||||
|
a.xlogid = b.xlogid; \
|
||||||
|
a.xrecoff = b.xrecoff
|
||||||
|
|
||||||
|
#define XLAssignValue(a, uxlogid, uxrecoff) \
|
||||||
|
a.xlogid = uxlogid; \
|
||||||
|
a.xrecoff = uxrecoff
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Struct to keep info about the nodes, used in the voting process in
|
* Struct to keep info about the nodes, used in the voting process in
|
||||||
* do_failover()
|
* do_failover()
|
||||||
@@ -52,8 +76,10 @@ const XLogRecPtr InvalidXLogRecPtr = {0, 0};
|
|||||||
typedef struct nodeInfo
|
typedef struct nodeInfo
|
||||||
{
|
{
|
||||||
int nodeId;
|
int nodeId;
|
||||||
|
char conninfostr[MAXLEN];
|
||||||
XLogRecPtr xlog_location;
|
XLogRecPtr xlog_location;
|
||||||
bool is_ready;
|
bool is_ready;
|
||||||
|
bool is_visible;
|
||||||
bool is_witness;
|
bool is_witness;
|
||||||
} nodeInfo;
|
} nodeInfo;
|
||||||
|
|
||||||
@@ -79,11 +105,15 @@ bool verbose = false;
|
|||||||
bool monitoring_history = false;
|
bool monitoring_history = false;
|
||||||
char repmgr_schema[MAXLEN];
|
char repmgr_schema[MAXLEN];
|
||||||
|
|
||||||
|
bool failover_done = false;
|
||||||
|
|
||||||
|
char *pid_file = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* should initialize with {0} to be ANSI complaint ? but this raises
|
* should initialize with {0} to be ANSI complaint ? but this raises
|
||||||
* error with gcc -Wall
|
* error with gcc -Wall
|
||||||
*/
|
*/
|
||||||
t_configuration_options config = {};
|
t_configuration_options config = T_CONFIGURATION_OPTIONS_INITIALIZER;
|
||||||
|
|
||||||
static void help(const char* progname);
|
static void help(const char* progname);
|
||||||
static void usage(void);
|
static void usage(void);
|
||||||
@@ -107,7 +137,10 @@ static volatile sig_atomic_t got_SIGHUP = false;
|
|||||||
|
|
||||||
static void handle_sighup(SIGNAL_ARGS);
|
static void handle_sighup(SIGNAL_ARGS);
|
||||||
static void handle_sigint(SIGNAL_ARGS);
|
static void handle_sigint(SIGNAL_ARGS);
|
||||||
|
|
||||||
|
#ifndef WIN32
|
||||||
static void setup_event_handlers(void);
|
static void setup_event_handlers(void);
|
||||||
|
#endif
|
||||||
|
|
||||||
#define CloseConnections() \
|
#define CloseConnections() \
|
||||||
if (PQisBusy(primaryConn) == 1) \
|
if (PQisBusy(primaryConn) == 1) \
|
||||||
@@ -126,11 +159,14 @@ main(int argc, char **argv)
|
|||||||
{"config", required_argument, NULL, 'f'},
|
{"config", required_argument, NULL, 'f'},
|
||||||
{"verbose", no_argument, NULL, 'v'},
|
{"verbose", no_argument, NULL, 'v'},
|
||||||
{"monitoring-history", no_argument, NULL, 'm'},
|
{"monitoring-history", no_argument, NULL, 'm'},
|
||||||
|
{"daemonize", no_argument, NULL, 'd'},
|
||||||
|
{"pid-file", required_argument, NULL, 'p'},
|
||||||
{NULL, 0, NULL, 0}
|
{NULL, 0, NULL, 0}
|
||||||
};
|
};
|
||||||
|
|
||||||
int optindex;
|
int optindex;
|
||||||
int c;
|
int c;
|
||||||
|
bool daemonize = false;
|
||||||
|
|
||||||
char standby_version[MAXVERSIONSTR];
|
char standby_version[MAXVERSIONSTR];
|
||||||
|
|
||||||
@@ -150,7 +186,7 @@ main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while ((c = getopt_long(argc, argv, "f:v:m", long_options, &optindex)) != -1)
|
while ((c = getopt_long(argc, argv, "f:v:mdp:", long_options, &optindex)) != -1)
|
||||||
{
|
{
|
||||||
switch (c)
|
switch (c)
|
||||||
{
|
{
|
||||||
@@ -163,13 +199,67 @@ main(int argc, char **argv)
|
|||||||
case 'm':
|
case 'm':
|
||||||
monitoring_history = true;
|
monitoring_history = true;
|
||||||
break;
|
break;
|
||||||
|
case 'd':
|
||||||
|
daemonize = true;
|
||||||
|
break;
|
||||||
|
case 'p':
|
||||||
|
pid_file = optarg;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
usage();
|
usage();
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (daemonize)
|
||||||
|
{
|
||||||
|
pid_t pid = fork();
|
||||||
|
switch (pid)
|
||||||
|
{
|
||||||
|
case -1:
|
||||||
|
log_err("Error in fork(): %s\n", strerror(errno));
|
||||||
|
exit(ERR_SYS_FAILURE);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 0: /* child process */
|
||||||
|
pid = setsid();
|
||||||
|
if (pid == (pid_t)-1)
|
||||||
|
{
|
||||||
|
log_err("Error in setsid(): %s\n", strerror(errno));
|
||||||
|
exit(ERR_SYS_FAILURE);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default: /* parent process */
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pid_file)
|
||||||
|
{
|
||||||
|
struct stat st;
|
||||||
|
FILE *fd;
|
||||||
|
|
||||||
|
if (stat(pid_file, &st) != -1)
|
||||||
|
{
|
||||||
|
log_err("PID file %s exists. If repmgrd is no longer alive remove the file and restart repmgrd.\n", pid_file);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
fd = fopen(pid_file, "w");
|
||||||
|
if (fd == NULL)
|
||||||
|
{
|
||||||
|
log_err("Could not open PID file %s!\n", pid_file);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(fd, "%d", getpid());
|
||||||
|
fclose(fd);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifndef WIN32
|
||||||
setup_event_handlers();
|
setup_event_handlers();
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Read the configuration file: repmgr.conf
|
* Read the configuration file: repmgr.conf
|
||||||
@@ -201,63 +291,37 @@ main(int argc, char **argv)
|
|||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Set my server mode, establish a connection to primary
|
* MAIN LOOP
|
||||||
* and start monitor
|
* This loops cicles once per failover and at startup
|
||||||
*/
|
* Requisites:
|
||||||
if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
|
* - myLocalConn needs to be already setted with an active connection
|
||||||
myLocalMode = WITNESS_MODE;
|
* - no master connection
|
||||||
else if (is_standby(myLocalConn))
|
*/
|
||||||
myLocalMode = STANDBY_MODE;
|
do
|
||||||
else /* is the master */
|
|
||||||
myLocalMode = PRIMARY_MODE;
|
|
||||||
|
|
||||||
switch (myLocalMode)
|
|
||||||
{
|
{
|
||||||
case PRIMARY_MODE:
|
|
||||||
primary_options.node = local_options.node;
|
|
||||||
strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
|
|
||||||
primaryConn = myLocalConn;
|
|
||||||
|
|
||||||
checkClusterConfiguration(myLocalConn, primaryConn);
|
|
||||||
checkNodeConfiguration(local_options.conninfo);
|
|
||||||
|
|
||||||
if (reload_configuration(config_file, &local_options))
|
|
||||||
{
|
|
||||||
PQfinish(myLocalConn);
|
|
||||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
|
||||||
primaryConn = myLocalConn;
|
|
||||||
update_registration();
|
|
||||||
}
|
|
||||||
|
|
||||||
log_info(_("%s Starting continuous primary connection check\n"), progname);
|
|
||||||
/* Check that primary is still alive, and standbies are sending info */
|
|
||||||
/*
|
/*
|
||||||
* Every SLEEP_MONITOR seconds, do master checks
|
* Set my server mode, establish a connection to primary
|
||||||
* XXX
|
* and start monitor
|
||||||
* Check that standbies are sending info
|
*/
|
||||||
*/
|
if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
|
||||||
for (;;)
|
myLocalMode = WITNESS_MODE;
|
||||||
{
|
else if (is_standby(myLocalConn))
|
||||||
if (CheckPrimaryConnection())
|
myLocalMode = STANDBY_MODE;
|
||||||
{
|
else /* is the master */
|
||||||
/*
|
myLocalMode = PRIMARY_MODE;
|
||||||
CheckActiveStandbiesConnections();
|
|
||||||
CheckInactiveStandbies();
|
switch (myLocalMode)
|
||||||
*/
|
{
|
||||||
sleep(SLEEP_MONITOR);
|
case PRIMARY_MODE:
|
||||||
}
|
primary_options.node = local_options.node;
|
||||||
else
|
strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
|
||||||
{
|
primaryConn = myLocalConn;
|
||||||
/* XXX
|
|
||||||
* May we do something more verbose ?
|
checkClusterConfiguration(myLocalConn, primaryConn);
|
||||||
*/
|
checkNodeConfiguration(local_options.conninfo);
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (got_SIGHUP)
|
|
||||||
{
|
|
||||||
/* if we can reload, then could need to change myLocalConn */
|
|
||||||
if (reload_configuration(config_file, &local_options))
|
if (reload_configuration(config_file, &local_options))
|
||||||
{
|
{
|
||||||
PQfinish(myLocalConn);
|
PQfinish(myLocalConn);
|
||||||
@@ -265,70 +329,112 @@ main(int argc, char **argv)
|
|||||||
primaryConn = myLocalConn;
|
primaryConn = myLocalConn;
|
||||||
update_registration();
|
update_registration();
|
||||||
}
|
}
|
||||||
got_SIGHUP = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case WITNESS_MODE:
|
|
||||||
case STANDBY_MODE:
|
|
||||||
/* I need the id of the primary as well as a connection to it */
|
|
||||||
log_info(_("%s Connecting to primary for cluster '%s'\n"),
|
|
||||||
progname, local_options.cluster_name);
|
|
||||||
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
|
|
||||||
local_options.cluster_name,
|
|
||||||
&primary_options.node, NULL);
|
|
||||||
if (primaryConn == NULL)
|
|
||||||
{
|
|
||||||
CloseConnections();
|
|
||||||
exit(ERR_BAD_CONFIG);
|
|
||||||
}
|
|
||||||
|
|
||||||
checkClusterConfiguration(myLocalConn, primaryConn);
|
log_info(_("%s Starting continuous primary connection check\n"), progname);
|
||||||
checkNodeConfiguration(local_options.conninfo);
|
|
||||||
|
|
||||||
if (reload_configuration(config_file, &local_options))
|
/* Check that primary is still alive, and standbies are sending info */
|
||||||
{
|
|
||||||
PQfinish(myLocalConn);
|
|
||||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
|
||||||
update_registration();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Every SLEEP_MONITOR seconds, do checks
|
* Every SLEEP_MONITOR seconds, do master checks
|
||||||
*/
|
* XXX
|
||||||
if (myLocalMode == WITNESS_MODE)
|
* Check that standbies are sending info
|
||||||
{
|
*/
|
||||||
log_info(_("%s Starting continuous witness node monitoring\n"), progname);
|
do
|
||||||
}
|
{
|
||||||
else if (myLocalMode == STANDBY_MODE)
|
if (CheckPrimaryConnection())
|
||||||
{
|
{
|
||||||
log_info(_("%s Starting continuous standby node monitoring\n"), progname);
|
/*
|
||||||
}
|
CheckActiveStandbiesConnections();
|
||||||
|
CheckInactiveStandbies();
|
||||||
|
*/
|
||||||
|
sleep(SLEEP_MONITOR);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* XXX
|
||||||
|
* May we do something more verbose ?
|
||||||
|
*/
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
for (;;)
|
if (got_SIGHUP)
|
||||||
{
|
{
|
||||||
if (myLocalMode == WITNESS_MODE)
|
/* if we can reload, then could need to change myLocalConn */
|
||||||
WitnessMonitor();
|
if (reload_configuration(config_file, &local_options))
|
||||||
else if (myLocalMode == STANDBY_MODE)
|
{
|
||||||
StandbyMonitor();
|
PQfinish(myLocalConn);
|
||||||
sleep(SLEEP_MONITOR);
|
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||||
|
primaryConn = myLocalConn;
|
||||||
|
update_registration();
|
||||||
|
}
|
||||||
|
got_SIGHUP = false;
|
||||||
|
}
|
||||||
|
} while (!failover_done);
|
||||||
|
break;
|
||||||
|
case WITNESS_MODE:
|
||||||
|
case STANDBY_MODE:
|
||||||
|
/* I need the id of the primary as well as a connection to it */
|
||||||
|
log_info(_("%s Connecting to primary for cluster '%s'\n"),
|
||||||
|
progname, local_options.cluster_name);
|
||||||
|
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
|
||||||
|
local_options.cluster_name,
|
||||||
|
&primary_options.node, NULL);
|
||||||
|
if (primaryConn == NULL)
|
||||||
|
{
|
||||||
|
CloseConnections();
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
checkClusterConfiguration(myLocalConn, primaryConn);
|
||||||
|
checkNodeConfiguration(local_options.conninfo);
|
||||||
|
|
||||||
if (got_SIGHUP)
|
|
||||||
{
|
|
||||||
/* if we can reload, then could need to change myLocalConn */
|
|
||||||
if (reload_configuration(config_file, &local_options))
|
if (reload_configuration(config_file, &local_options))
|
||||||
{
|
{
|
||||||
PQfinish(myLocalConn);
|
PQfinish(myLocalConn);
|
||||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||||
update_registration();
|
update_registration();
|
||||||
}
|
}
|
||||||
got_SIGHUP = false;
|
|
||||||
}
|
/*
|
||||||
|
* Every SLEEP_MONITOR seconds, do checks
|
||||||
|
*/
|
||||||
|
if (myLocalMode == WITNESS_MODE)
|
||||||
|
{
|
||||||
|
log_info(_("%s Starting continuous witness node monitoring\n"), progname);
|
||||||
|
}
|
||||||
|
else if (myLocalMode == STANDBY_MODE)
|
||||||
|
{
|
||||||
|
log_info(_("%s Starting continuous standby node monitoring\n"), progname);
|
||||||
|
}
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
if (myLocalMode == WITNESS_MODE)
|
||||||
|
WitnessMonitor();
|
||||||
|
else if (myLocalMode == STANDBY_MODE)
|
||||||
|
StandbyMonitor();
|
||||||
|
sleep(SLEEP_MONITOR);
|
||||||
|
|
||||||
|
if (got_SIGHUP)
|
||||||
|
{
|
||||||
|
/* if we can reload, then could need to change myLocalConn */
|
||||||
|
if (reload_configuration(config_file, &local_options))
|
||||||
|
{
|
||||||
|
PQfinish(myLocalConn);
|
||||||
|
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||||
|
update_registration();
|
||||||
|
}
|
||||||
|
got_SIGHUP = false;
|
||||||
|
}
|
||||||
|
} while (!failover_done);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
default:
|
failover_done = false;
|
||||||
log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
|
|
||||||
}
|
} while (true);
|
||||||
|
|
||||||
/* Prevent a double-free */
|
/* Prevent a double-free */
|
||||||
if (primaryConn == myLocalConn)
|
if (primaryConn == myLocalConn)
|
||||||
@@ -356,7 +462,7 @@ WitnessMonitor(void)
|
|||||||
* Check if the master is still available, if after 5 minutes of retries
|
* Check if the master is still available, if after 5 minutes of retries
|
||||||
* we cannot reconnect, return false.
|
* we cannot reconnect, return false.
|
||||||
*/
|
*/
|
||||||
CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
|
CheckPrimaryConnection(); /* this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds */
|
||||||
|
|
||||||
if (PQstatus(primaryConn) != CONNECTION_OK)
|
if (PQstatus(primaryConn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
@@ -441,7 +547,7 @@ StandbyMonitor(void)
|
|||||||
* Check if the master is still available, if after 5 minutes of retries
|
* Check if the master is still available, if after 5 minutes of retries
|
||||||
* we cannot reconnect, try to get a new master.
|
* we cannot reconnect, try to get a new master.
|
||||||
*/
|
*/
|
||||||
CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
|
CheckPrimaryConnection(); /* this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds */
|
||||||
|
|
||||||
if (PQstatus(primaryConn) != CONNECTION_OK)
|
if (PQstatus(primaryConn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
@@ -479,6 +585,7 @@ StandbyMonitor(void)
|
|||||||
* a new primaryConn
|
* a new primaryConn
|
||||||
*/
|
*/
|
||||||
do_failover();
|
do_failover();
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -570,23 +677,23 @@ StandbyMonitor(void)
|
|||||||
static void
|
static void
|
||||||
do_failover(void)
|
do_failover(void)
|
||||||
{
|
{
|
||||||
PGresult *res1;
|
PGresult *res;
|
||||||
PGresult *res2;
|
|
||||||
char sqlquery[8192];
|
char sqlquery[8192];
|
||||||
|
|
||||||
int total_nodes = 0;
|
int total_nodes = 0;
|
||||||
int visible_nodes = 0;
|
int visible_nodes = 0;
|
||||||
|
int ready_nodes = 0;
|
||||||
|
|
||||||
bool find_best = false;
|
bool find_best = false;
|
||||||
bool witness = false;
|
bool witness = false;
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
int node;
|
uint32 uxlogid;
|
||||||
char nodeConninfo[MAXLEN];
|
uint32 uxrecoff;
|
||||||
|
XLogRecPtr xlog_recptr;
|
||||||
|
|
||||||
unsigned int uxlogid;
|
|
||||||
unsigned int uxrecoff;
|
|
||||||
char last_wal_standby_applied[MAXLEN];
|
char last_wal_standby_applied[MAXLEN];
|
||||||
|
|
||||||
PGconn *nodeConn = NULL;
|
PGconn *nodeConn = NULL;
|
||||||
@@ -596,108 +703,62 @@ do_failover(void)
|
|||||||
* which seems to be large enough for most scenarios
|
* which seems to be large enough for most scenarios
|
||||||
*/
|
*/
|
||||||
nodeInfo nodes[50];
|
nodeInfo nodes[50];
|
||||||
|
|
||||||
/* initialize to keep compiler quiet */
|
/* initialize to keep compiler quiet */
|
||||||
nodeInfo best_candidate = {-1, InvalidXLogRecPtr, false, false};
|
nodeInfo best_candidate = {-1, "", InvalidXLogRecPtr, false, false, false};
|
||||||
|
|
||||||
/* first we get info about this node, and update shared memory */
|
|
||||||
sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
|
||||||
res1 = PQexec(myLocalConn, sqlquery);
|
|
||||||
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
|
|
||||||
{
|
|
||||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
|
|
||||||
PQclear(res1);
|
|
||||||
sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
|
|
||||||
update_shared_memory(last_wal_standby_applied);
|
|
||||||
exit(ERR_DB_QUERY);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* write last location in shared memory */
|
|
||||||
update_shared_memory(PQgetvalue(res1, 0, 0));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* we sleep the monitor time + one second
|
|
||||||
* we bet it should be enough for other repmgrd to update their own data
|
|
||||||
*/
|
|
||||||
sleep(SLEEP_MONITOR + 1);
|
|
||||||
|
|
||||||
/* get a list of standby nodes, including myself */
|
/* get a list of standby nodes, including myself */
|
||||||
sprintf(sqlquery, "SELECT id, conninfo, witness "
|
sprintf(sqlquery, "SELECT id, conninfo, witness "
|
||||||
" FROM %s.repl_nodes "
|
" FROM %s.repl_nodes "
|
||||||
" WHERE id <> %d "
|
" WHERE cluster = '%s' "
|
||||||
" AND cluster = '%s' "
|
|
||||||
" ORDER BY priority, id ",
|
" ORDER BY priority, id ",
|
||||||
repmgr_schema, primary_options.node, local_options.cluster_name);
|
repmgr_schema, local_options.cluster_name);
|
||||||
|
|
||||||
res1 = PQexec(myLocalConn, sqlquery);
|
res = PQexec(myLocalConn, sqlquery);
|
||||||
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn));
|
log_err(_("Can't get nodes' info: %s\n"), PQerrorMessage(myLocalConn));
|
||||||
PQclear(res1);
|
PQclear(res);
|
||||||
PQfinish(myLocalConn);
|
PQfinish(myLocalConn);
|
||||||
exit(ERR_DB_QUERY);
|
exit(ERR_DB_QUERY);
|
||||||
}
|
}
|
||||||
|
|
||||||
log_debug(_("%s: there are %d nodes registered"), progname, PQntuples(res1));
|
/*
|
||||||
/* ask for the locations */
|
* total nodes that are registered
|
||||||
for (i = 0; i < PQntuples(res1); i++)
|
*/
|
||||||
|
total_nodes = PQntuples(res);
|
||||||
|
log_debug(_("%s: there are %d nodes registered\n"), progname, total_nodes);
|
||||||
|
|
||||||
|
/* Build an array with the nodes and indicate which ones are visible and ready */
|
||||||
|
for (i = 0; i < total_nodes; i++)
|
||||||
{
|
{
|
||||||
node = atoi(PQgetvalue(res1, i, 0));
|
nodes[i].nodeId = atoi(PQgetvalue(res, i, 0));
|
||||||
|
strncpy(nodes[i].conninfostr, PQgetvalue(res, i, 1), MAXLEN);
|
||||||
|
nodes[i].is_witness = (strcmp(PQgetvalue(res, i, 2), "t") == 0) ? true : false;
|
||||||
|
|
||||||
/* Initialize on false so if we can't reach this node we know that later */
|
/* Initialize on false so if we can't reach this node we know that later */
|
||||||
|
nodes[i].is_visible = false;
|
||||||
nodes[i].is_ready = false;
|
nodes[i].is_ready = false;
|
||||||
strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN);
|
|
||||||
witness = (strcmp(PQgetvalue(res1, i, 2), "t") == 0) ? true : false;
|
|
||||||
|
|
||||||
log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s"), progname, node, nodeConninfo, (witness) ? "true" : "false");
|
XLAssignValue(nodes[i].xlog_location, 0, 0);
|
||||||
|
|
||||||
|
log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s\n"),
|
||||||
|
progname, nodes[i].nodeId, nodes[i].conninfostr, (nodes[i].is_witness) ? "true" : "false");
|
||||||
|
|
||||||
|
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
|
||||||
|
|
||||||
nodeConn = establishDBConnection(nodeConninfo, false);
|
|
||||||
/* if we can't see the node just skip it */
|
/* if we can't see the node just skip it */
|
||||||
if (PQstatus(nodeConn) != CONNECTION_OK)
|
if (PQstatus(nodeConn) != CONNECTION_OK)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* the witness will always show 0/0 so avoid a useless query */
|
|
||||||
if (!witness)
|
|
||||||
{
|
|
||||||
sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
|
|
||||||
res2 = PQexec(nodeConn, sqlquery);
|
|
||||||
if (PQresultStatus(res2) != PGRES_TUPLES_OK)
|
|
||||||
{
|
|
||||||
log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
|
|
||||||
log_info(_("Connection details: %s\n"), nodeConninfo);
|
|
||||||
PQclear(res2);
|
|
||||||
PQfinish(nodeConn);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
|
|
||||||
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0));
|
|
||||||
|
|
||||||
PQclear(res2);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
uxlogid = 0;
|
|
||||||
uxrecoff = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
visible_nodes++;
|
visible_nodes++;
|
||||||
|
nodes[i].is_visible = true;
|
||||||
nodes[i].nodeId = node;
|
|
||||||
nodes[i].xlog_location.xlogid = uxlogid;
|
|
||||||
nodes[i].xlog_location.xrecoff = uxrecoff;
|
|
||||||
nodes[i].is_ready = true;
|
|
||||||
nodes[i].is_witness = witness;
|
|
||||||
|
|
||||||
PQfinish(nodeConn);
|
PQfinish(nodeConn);
|
||||||
}
|
}
|
||||||
PQclear(res1);
|
PQclear(res);
|
||||||
/* Close the connection to this server */
|
|
||||||
PQfinish(myLocalConn);
|
|
||||||
|
|
||||||
/*
|
log_debug(_("Total nodes counted: registered=%d, visible=%d\n"), total_nodes, visible_nodes);
|
||||||
* total nodes that are registered, include master which is a node but was
|
|
||||||
* not counted because it's not a standby
|
|
||||||
*/
|
|
||||||
total_nodes = i + 1;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* am i on the group that should keep alive?
|
* am i on the group that should keep alive?
|
||||||
@@ -711,24 +772,169 @@ do_failover(void)
|
|||||||
exit(ERR_FAILOVER_FAIL);
|
exit(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Query all the nodes to determine which ones are ready */
|
||||||
|
for (i = 0; i < total_nodes; i++)
|
||||||
|
{
|
||||||
|
/* if the node is not visible, skip it */
|
||||||
|
if (!nodes[i].is_visible)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (nodes[i].is_witness)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
|
||||||
|
/* XXX
|
||||||
|
* This shouldn't happen, if this happens it means this is a major problem
|
||||||
|
* maybe network outages? anyway, is better for a human to react
|
||||||
|
*/
|
||||||
|
if (PQstatus(nodeConn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
|
||||||
|
exit(ERR_FAILOVER_FAIL);
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||||
|
res = PQexec(nodeConn, sqlquery);
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
|
||||||
|
log_info(_("Connection details: %s\n"), nodes[i].conninfostr);
|
||||||
|
PQclear(res);
|
||||||
|
PQfinish(nodeConn);
|
||||||
|
exit(ERR_FAILOVER_FAIL);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
|
||||||
|
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0));
|
||||||
|
|
||||||
|
log_debug("XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
|
||||||
|
nodes[i].nodeId, uxlogid, uxlogid, uxrecoff, uxrecoff);
|
||||||
|
|
||||||
|
/* If position is 0/0, error */
|
||||||
|
if (uxlogid == 0 && uxrecoff == 0)
|
||||||
|
{
|
||||||
|
PQclear(res);
|
||||||
|
PQfinish(nodeConn);
|
||||||
|
log_info(_("InvalidXLogRecPtr detected in a standby\n"));
|
||||||
|
exit(ERR_FAILOVER_FAIL);
|
||||||
|
}
|
||||||
|
|
||||||
|
XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff);
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
PQfinish(nodeConn);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* last we get info about this node, and update shared memory */
|
||||||
|
sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
|
||||||
|
res = PQexec(myLocalConn, sqlquery);
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
|
||||||
|
PQfinish(myLocalConn);
|
||||||
|
PQclear(res);
|
||||||
|
sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
|
||||||
|
update_shared_memory(last_wal_standby_applied);
|
||||||
|
exit(ERR_DB_QUERY);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* write last location in shared memory */
|
||||||
|
update_shared_memory(PQgetvalue(res, 0, 0));
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
for (i = 0; i < total_nodes; i++)
|
||||||
|
{
|
||||||
|
while (!nodes[i].is_ready)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* the witness will always be masked as ready if it's still
|
||||||
|
* not marked that way and avoid a useless query
|
||||||
|
*/
|
||||||
|
if (nodes[i].is_witness)
|
||||||
|
{
|
||||||
|
if (!nodes[i].is_ready)
|
||||||
|
{
|
||||||
|
nodes[i].is_ready = true;
|
||||||
|
ready_nodes++;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if the node is not visible, skip it */
|
||||||
|
if (!nodes[i].is_visible)
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* if the node is ready there is nothing to check, skip it too */
|
||||||
|
if (nodes[i].is_ready)
|
||||||
|
break;
|
||||||
|
|
||||||
|
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
|
||||||
|
/* XXX
|
||||||
|
* This shouldn't happen, if this happens it means this is a major problem
|
||||||
|
* maybe network outages? anyway, is better for a human to react
|
||||||
|
*/
|
||||||
|
if (PQstatus(nodeConn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
/* XXX */
|
||||||
|
log_info(_("At this point, it could be some race conditions that are acceptable, assume the node is restarting and starting failover procedure\n"));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
|
||||||
|
res = PQexec(nodeConn, sqlquery);
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(nodeConn));
|
||||||
|
PQclear(res);
|
||||||
|
PQfinish(nodeConn);
|
||||||
|
exit(ERR_DB_QUERY);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
|
||||||
|
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0));
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
PQfinish(nodeConn);
|
||||||
|
/* If position is 0/0, keep checking */
|
||||||
|
if (uxlogid == 0 && uxrecoff == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
XLAssignValue(xlog_recptr, uxlogid, uxrecoff);
|
||||||
|
|
||||||
|
if (XLByteLT(nodes[i].xlog_location, xlog_recptr))
|
||||||
|
{
|
||||||
|
XLAssignValue(nodes[i].xlog_location, uxlogid, uxrecoff);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_debug("Last XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
|
||||||
|
nodes[i].nodeId, uxlogid, uxlogid,
|
||||||
|
uxrecoff, uxrecoff);
|
||||||
|
|
||||||
|
ready_nodes++;
|
||||||
|
nodes[i].is_ready = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Close the connection to this server */
|
||||||
|
PQfinish(myLocalConn);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* determine which one is the best candidate to promote to primary
|
* determine which one is the best candidate to promote to primary
|
||||||
*/
|
*/
|
||||||
for (i = 0; i < total_nodes - 1; i++)
|
for (i = 0; i < total_nodes; i++)
|
||||||
{
|
{
|
||||||
/* witness is never a good candidate */
|
/* witness is never a good candidate */
|
||||||
if (nodes[i].is_witness)
|
if (nodes[i].is_witness)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (!nodes[i].is_ready)
|
if (!nodes[i].is_ready || !nodes[i].is_visible)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (!find_best)
|
if (!find_best)
|
||||||
{
|
{
|
||||||
/* start with the first ready node, and then move on to the next one */
|
/* start with the first ready node, and then move on to the next one */
|
||||||
best_candidate.nodeId = nodes[i].nodeId;
|
best_candidate.nodeId = nodes[i].nodeId;
|
||||||
best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid;
|
XLAssign(best_candidate.xlog_location, nodes[i].xlog_location);
|
||||||
best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
|
|
||||||
best_candidate.is_ready = nodes[i].is_ready;
|
best_candidate.is_ready = nodes[i].is_ready;
|
||||||
best_candidate.is_witness = nodes[i].is_witness;
|
best_candidate.is_witness = nodes[i].is_witness;
|
||||||
find_best = true;
|
find_best = true;
|
||||||
@@ -743,8 +949,7 @@ do_failover(void)
|
|||||||
if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location))
|
if (XLByteLT(best_candidate.xlog_location, nodes[i].xlog_location))
|
||||||
{
|
{
|
||||||
best_candidate.nodeId = nodes[i].nodeId;
|
best_candidate.nodeId = nodes[i].nodeId;
|
||||||
best_candidate.xlog_location.xlogid = nodes[i].xlog_location.xlogid;
|
XLAssign(best_candidate.xlog_location, nodes[i].xlog_location);
|
||||||
best_candidate.xlog_location.xrecoff = nodes[i].xlog_location.xrecoff;
|
|
||||||
best_candidate.is_ready = nodes[i].is_ready;
|
best_candidate.is_ready = nodes[i].is_ready;
|
||||||
best_candidate.is_witness = nodes[i].is_witness;
|
best_candidate.is_witness = nodes[i].is_witness;
|
||||||
}
|
}
|
||||||
@@ -759,6 +964,9 @@ do_failover(void)
|
|||||||
exit(ERR_FAILOVER_FAIL);
|
exit(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* wait */
|
||||||
|
sleep(5);
|
||||||
|
|
||||||
if (verbose)
|
if (verbose)
|
||||||
log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
|
log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
|
||||||
progname);
|
progname);
|
||||||
@@ -772,6 +980,9 @@ do_failover(void)
|
|||||||
}
|
}
|
||||||
else if (find_best)
|
else if (find_best)
|
||||||
{
|
{
|
||||||
|
/* wait */
|
||||||
|
sleep(10);
|
||||||
|
|
||||||
if (verbose)
|
if (verbose)
|
||||||
log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
|
log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
|
||||||
progname, best_candidate.nodeId);
|
progname, best_candidate.nodeId);
|
||||||
@@ -793,6 +1004,9 @@ do_failover(void)
|
|||||||
exit(ERR_FAILOVER_FAIL);
|
exit(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* to force it to re-calculate mode and master node */
|
||||||
|
failover_done = true;
|
||||||
|
|
||||||
/* and reconnect to the local database */
|
/* and reconnect to the local database */
|
||||||
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
myLocalConn = establishDBConnection(local_options.conninfo, true);
|
||||||
}
|
}
|
||||||
@@ -972,6 +1186,8 @@ void help(const char *progname)
|
|||||||
printf(_(" --verbose output verbose activity information\n"));
|
printf(_(" --verbose output verbose activity information\n"));
|
||||||
printf(_(" --monitoring-history track advance or lag of the replication in every standby in repl_monitor\n"));
|
printf(_(" --monitoring-history track advance or lag of the replication in every standby in repl_monitor\n"));
|
||||||
printf(_(" -f, --config_file=PATH configuration file\n"));
|
printf(_(" -f, --config_file=PATH configuration file\n"));
|
||||||
|
printf(_(" -d, --daemonize detach process from foreground\n"));
|
||||||
|
printf(_(" -p, --pid-file=PATH write a PID file\n"));
|
||||||
printf(_("\n%s monitors a cluster of servers.\n"), progname);
|
printf(_("\n%s monitors a cluster of servers.\n"), progname);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -981,6 +1197,13 @@ static void
|
|||||||
handle_sigint(SIGNAL_ARGS)
|
handle_sigint(SIGNAL_ARGS)
|
||||||
{
|
{
|
||||||
CloseConnections();
|
CloseConnections();
|
||||||
|
logger_shutdown();
|
||||||
|
|
||||||
|
if (pid_file)
|
||||||
|
{
|
||||||
|
remove(pid_file);
|
||||||
|
}
|
||||||
|
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -996,6 +1219,7 @@ setup_event_handlers(void)
|
|||||||
{
|
{
|
||||||
pqsignal(SIGHUP, handle_sighup);
|
pqsignal(SIGHUP, handle_sighup);
|
||||||
pqsignal(SIGINT, handle_sigint);
|
pqsignal(SIGINT, handle_sigint);
|
||||||
|
pqsignal(SIGTERM, handle_sigint);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user