mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-24 07:36:30 +00:00
Compare commits
48 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
274a30efa5 | ||
|
|
db63b5bb1c | ||
|
|
e100728b93 | ||
|
|
d104f2a914 | ||
|
|
2946c097f0 | ||
|
|
a538ceb0ea | ||
|
|
5a2a8d1c82 | ||
|
|
b5a7efa58e | ||
|
|
9f6f58e4ed | ||
|
|
c22f4eaf6f | ||
|
|
925d82f7a4 | ||
|
|
1db577e294 | ||
|
|
a886fddccc | ||
|
|
83e5f98171 | ||
|
|
eb31a56186 | ||
|
|
8cd2c6fd05 | ||
|
|
e3e1c5de4e | ||
|
|
f9a150504a | ||
|
|
5bc809466c | ||
|
|
5d32026b79 | ||
|
|
2a8d6f72c6 | ||
|
|
190cc7dcb4 | ||
|
|
819937d4bd | ||
|
|
57299cb978 | ||
|
|
59f503835b | ||
|
|
33e626cd75 | ||
|
|
491ec37adf | ||
|
|
c93790fc96 | ||
|
|
ecabe2c294 | ||
|
|
2ba57e5938 | ||
|
|
2eec17e25f | ||
|
|
c48c248c15 | ||
|
|
958e45f2b8 | ||
|
|
daafd70383 | ||
|
|
c828598bfb | ||
|
|
b55519c4a2 | ||
|
|
4cafd443e1 | ||
|
|
d400d7f9ac | ||
|
|
62bb3db1f8 | ||
|
|
d9961bbb17 | ||
|
|
e1b8982c14 | ||
|
|
2fe3b3c2a3 | ||
|
|
c6e1bc205a | ||
|
|
7241391ddc | ||
|
|
c8f449f178 | ||
|
|
49420c437f | ||
|
|
827ffef5f9 | ||
|
|
16296bb1c3 |
9
HISTORY
9
HISTORY
@@ -1,12 +1,3 @@
|
|||||||
3.1.3 2016-05-17
|
|
||||||
repmgrd: enable monitoring when a standby is catching up by
|
|
||||||
replaying archived WAL (Ian)
|
|
||||||
repmgrd: when upstream_node_id is NULL, assume upstream node
|
|
||||||
to be current master (Ian)
|
|
||||||
repmgrd: check for reappearance of the master node if standby
|
|
||||||
promotion fails (Ian)
|
|
||||||
improve handling of rsync failure conditions (Martín)
|
|
||||||
|
|
||||||
3.1.2 2016-04-12
|
3.1.2 2016-04-12
|
||||||
Fix pg_ctl path generation in do_standby_switchover() (Ian)
|
Fix pg_ctl path generation in do_standby_switchover() (Ian)
|
||||||
Regularly sync witness server repl_nodes table (Ian)
|
Regularly sync witness server repl_nodes table (Ian)
|
||||||
|
|||||||
25
README.md
25
README.md
@@ -259,6 +259,20 @@ The following replication settings must be included in `postgresql.conf`:
|
|||||||
|
|
||||||
hot_standby = on
|
hot_standby = on
|
||||||
|
|
||||||
|
# If archive_mode is enabled, check that 'archive_command' is non empty
|
||||||
|
# (however it's not practical to check that it actually represents a valid
|
||||||
|
# command).
|
||||||
|
#
|
||||||
|
# From PostgreSQL 9.5, archive_mode can be one of 'off', 'on' or 'always'
|
||||||
|
# so for ease of backwards compatibility, rather than explicitly check for an
|
||||||
|
# enabled mode, check that it's not "off".
|
||||||
|
archive_mode = on
|
||||||
|
|
||||||
|
# Set archive command to a script or application that will safely store
|
||||||
|
# your WALs in a secure place. /bin/true is an example of a command that
|
||||||
|
# ignores archiving. Use something more sensible.
|
||||||
|
archive_command = '/bin/true'
|
||||||
|
|
||||||
|
|
||||||
* * *
|
* * *
|
||||||
|
|
||||||
@@ -1002,11 +1016,8 @@ Monitoring
|
|||||||
----------
|
----------
|
||||||
|
|
||||||
When `repmgrd` is running with the option `-m/--monitoring-history`, it will
|
When `repmgrd` is running with the option `-m/--monitoring-history`, it will
|
||||||
constantly write standby node status information to the `repl_monitor` table,
|
constantly write node status information to the `repl_monitor` table, which can
|
||||||
providing a near-real time overview of replication status on all nodes
|
be queried easily using the view `repl_status`:
|
||||||
in the cluster.
|
|
||||||
|
|
||||||
The view `repl_status` shows the most recent state for each node, e.g.:
|
|
||||||
|
|
||||||
repmgr=# SELECT * FROM repmgr_test.repl_status;
|
repmgr=# SELECT * FROM repmgr_test.repl_status;
|
||||||
-[ RECORD 1 ]-------------+-----------------------------
|
-[ RECORD 1 ]-------------+-----------------------------
|
||||||
@@ -1031,10 +1042,6 @@ table , it's advisable to regularly purge historical data with
|
|||||||
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
|
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
|
||||||
many days' worth of data should be retained.
|
many days' worth of data should be retained.
|
||||||
|
|
||||||
Note that when a standby node is not streaming directly from its upstream
|
|
||||||
node, i.e. recovering WAL from an archive, `apply_lag` will always
|
|
||||||
appear as `0 bytes`.
|
|
||||||
|
|
||||||
|
|
||||||
Using a witness server with repmgrd
|
Using a witness server with repmgrd
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|||||||
37
dbutils.c
37
dbutils.c
@@ -420,7 +420,7 @@ guc_set_typed(PGconn *conn, const char *parameter, const char *op,
|
|||||||
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
||||||
parameter, datatype, op, value, datatype);
|
parameter, datatype, op, value, datatype);
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "guc_set_typed():\n%s\n", sqlquery);
|
log_verbose(LOG_DEBUG, "guc_set_typed():n%s\n", sqlquery);
|
||||||
|
|
||||||
res = PQexec(conn, sqlquery);
|
res = PQexec(conn, sqlquery);
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
@@ -587,7 +587,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
|||||||
upstream_conninfo = upstream_conninfo_out;
|
upstream_conninfo = upstream_conninfo_out;
|
||||||
|
|
||||||
sqlquery_snprintf(sqlquery,
|
sqlquery_snprintf(sqlquery,
|
||||||
" SELECT un.conninfo, un.id "
|
" SELECT un.conninfo, un.name, un.id "
|
||||||
" FROM %s.repl_nodes un "
|
" FROM %s.repl_nodes un "
|
||||||
"INNER JOIN %s.repl_nodes n "
|
"INNER JOIN %s.repl_nodes n "
|
||||||
" ON (un.id = n.upstream_node_id AND un.cluster = n.cluster)"
|
" ON (un.id = n.upstream_node_id AND un.cluster = n.cluster)"
|
||||||
@@ -604,7 +604,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_err(_("error when attempting to find upstream server\n%s\n"),
|
log_err(_("unable to get conninfo for upstream server\n%s\n"),
|
||||||
PQerrorMessage(standby_conn));
|
PQerrorMessage(standby_conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -612,36 +612,9 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
|||||||
|
|
||||||
if (!PQntuples(res))
|
if (!PQntuples(res))
|
||||||
{
|
{
|
||||||
|
log_notice(_("no record found for upstream server"));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
log_debug("no record found for upstream server\n");
|
return NULL;
|
||||||
|
|
||||||
sqlquery_snprintf(sqlquery,
|
|
||||||
" SELECT un.conninfo, un.id "
|
|
||||||
" FROM %s.repl_nodes un "
|
|
||||||
" WHERE un.cluster = '%s' "
|
|
||||||
" AND un.type='master' "
|
|
||||||
" AND un.active IS TRUE",
|
|
||||||
get_repmgr_schema_quoted(standby_conn),
|
|
||||||
cluster);
|
|
||||||
res = PQexec(standby_conn, sqlquery);
|
|
||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
|
||||||
{
|
|
||||||
log_err(_("error when attempting to find active master server\n%s\n"),
|
|
||||||
PQerrorMessage(standby_conn));
|
|
||||||
PQclear(res);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!PQntuples(res))
|
|
||||||
{
|
|
||||||
PQclear(res);
|
|
||||||
log_notice(_("no record found for active master server\n"));
|
|
||||||
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
log_debug("record found for active master server\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
strncpy(upstream_conninfo, PQgetvalue(res, 0, 0), MAXCONNINFO);
|
strncpy(upstream_conninfo, PQgetvalue(res, 0, 0), MAXCONNINFO);
|
||||||
|
|||||||
5
log.c
5
log.c
@@ -40,8 +40,7 @@
|
|||||||
/* #define REPMGR_DEBUG */
|
/* #define REPMGR_DEBUG */
|
||||||
|
|
||||||
static int detect_log_facility(const char *facility);
|
static int detect_log_facility(const char *facility);
|
||||||
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap)
|
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap);
|
||||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0)));
|
|
||||||
|
|
||||||
int log_type = REPMGR_STDERR;
|
int log_type = REPMGR_STDERR;
|
||||||
int log_level = LOG_NOTICE;
|
int log_level = LOG_NOTICE;
|
||||||
@@ -49,7 +48,7 @@ int last_log_level = LOG_NOTICE;
|
|||||||
int verbose_logging = false;
|
int verbose_logging = false;
|
||||||
int terse_logging = false;
|
int terse_logging = false;
|
||||||
|
|
||||||
extern void
|
void
|
||||||
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
|
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
|
||||||
{
|
{
|
||||||
va_list arglist;
|
va_list arglist;
|
||||||
|
|||||||
8
log.h
8
log.h
@@ -25,7 +25,7 @@
|
|||||||
#define REPMGR_SYSLOG 1
|
#define REPMGR_SYSLOG 1
|
||||||
#define REPMGR_STDERR 2
|
#define REPMGR_STDERR 2
|
||||||
|
|
||||||
extern void
|
void
|
||||||
stderr_log_with_level(const char *level_name, int level, const char *fmt,...)
|
stderr_log_with_level(const char *level_name, int level, const char *fmt,...)
|
||||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||||
|
|
||||||
@@ -123,10 +123,8 @@ bool logger_shutdown(void);
|
|||||||
void logger_set_verbose(void);
|
void logger_set_verbose(void);
|
||||||
void logger_set_terse(void);
|
void logger_set_terse(void);
|
||||||
|
|
||||||
void log_hint(const char *fmt, ...)
|
void log_hint(const char *fmt, ...);
|
||||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
|
void log_verbose(int level, const char *fmt, ...);
|
||||||
void log_verbose(int level, const char *fmt, ...)
|
|
||||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));
|
|
||||||
|
|
||||||
extern int log_type;
|
extern int log_type;
|
||||||
extern int log_level;
|
extern int log_level;
|
||||||
|
|||||||
10
repmgr.c
10
repmgr.c
@@ -1665,7 +1665,7 @@ do_standby_clone(void)
|
|||||||
It's quite common for this to happen on the data directory, particularly
|
It's quite common for this to happen on the data directory, particularly
|
||||||
with long running rsync on a busy server.
|
with long running rsync on a busy server.
|
||||||
*/
|
*/
|
||||||
if (!WIFEXITED(r) && WEXITSTATUS(r) != 24)
|
if (r != 0 && r != 24)
|
||||||
{
|
{
|
||||||
log_warning(_("standby clone: failed copying master data directory '%s'\n"),
|
log_warning(_("standby clone: failed copying master data directory '%s'\n"),
|
||||||
master_data_directory);
|
master_data_directory);
|
||||||
@@ -1751,7 +1751,7 @@ do_standby_clone(void)
|
|||||||
It's quite common for this to happen on the data directory, particularly
|
It's quite common for this to happen on the data directory, particularly
|
||||||
with long running rsync on a busy server.
|
with long running rsync on a busy server.
|
||||||
*/
|
*/
|
||||||
if (!WIFEXITED(r) && WEXITSTATUS(r) != 24)
|
if (r != 0 && r != 24)
|
||||||
{
|
{
|
||||||
log_warning(_("standby clone: failed copying tablespace directory '%s'\n"),
|
log_warning(_("standby clone: failed copying tablespace directory '%s'\n"),
|
||||||
tblspc_dir_src.data);
|
tblspc_dir_src.data);
|
||||||
@@ -5067,7 +5067,7 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
|
|||||||
char *wal_error_message = NULL;
|
char *wal_error_message = NULL;
|
||||||
|
|
||||||
/* Check that WAL level is set correctly */
|
/* Check that WAL level is set correctly */
|
||||||
if (server_version_num < 90400)
|
if (server_version_num < 90300)
|
||||||
{
|
{
|
||||||
i = guc_set(conn, "wal_level", "=", "hot_standby");
|
i = guc_set(conn, "wal_level", "=", "hot_standby");
|
||||||
wal_error_message = _("parameter 'wal_level' must be set to 'hot_standby'");
|
wal_error_message = _("parameter 'wal_level' must be set to 'hot_standby'");
|
||||||
@@ -5080,6 +5080,10 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
|
|||||||
NULL,
|
NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note that in 9.6+, "hot_standby" and "archive" are accepted as aliases
|
||||||
|
* for "replica", but current_setting() will of course always return "replica"
|
||||||
|
*/
|
||||||
char *levels_96plus[] = {
|
char *levels_96plus[] = {
|
||||||
"replica",
|
"replica",
|
||||||
"logical",
|
"logical",
|
||||||
|
|||||||
116
repmgrd.c
116
repmgrd.c
@@ -716,9 +716,8 @@ standby_monitor(void)
|
|||||||
t_node_info upstream_node;
|
t_node_info upstream_node;
|
||||||
|
|
||||||
int active_master_id;
|
int active_master_id;
|
||||||
const char *upstream_node_type = NULL;
|
const char *type = NULL;
|
||||||
|
|
||||||
bool receiving_streamed_wal = true;
|
|
||||||
/*
|
/*
|
||||||
* Verify that the local node is still available - if not there's
|
* Verify that the local node is still available - if not there's
|
||||||
* no point in doing much else anyway
|
* no point in doing much else anyway
|
||||||
@@ -743,10 +742,9 @@ standby_monitor(void)
|
|||||||
upstream_conn = get_upstream_connection(my_local_conn,
|
upstream_conn = get_upstream_connection(my_local_conn,
|
||||||
local_options.cluster_name,
|
local_options.cluster_name,
|
||||||
local_options.node,
|
local_options.node,
|
||||||
&upstream_node_id,
|
&upstream_node_id, upstream_conninfo);
|
||||||
upstream_conninfo);
|
|
||||||
|
|
||||||
upstream_node_type = (upstream_node_id == master_options.node)
|
type = upstream_node_id == master_options.node
|
||||||
? "master"
|
? "master"
|
||||||
: "upstream";
|
: "upstream";
|
||||||
|
|
||||||
@@ -756,7 +754,7 @@ standby_monitor(void)
|
|||||||
* we cannot reconnect, try to get a new upstream node.
|
* we cannot reconnect, try to get a new upstream node.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
|
check_connection(&upstream_conn, type, upstream_conninfo);
|
||||||
/*
|
/*
|
||||||
* This takes up to local_options.reconnect_attempts *
|
* This takes up to local_options.reconnect_attempts *
|
||||||
* local_options.reconnect_interval seconds
|
* local_options.reconnect_interval seconds
|
||||||
@@ -769,7 +767,7 @@ standby_monitor(void)
|
|||||||
|
|
||||||
if (local_options.failover == MANUAL_FAILOVER)
|
if (local_options.failover == MANUAL_FAILOVER)
|
||||||
{
|
{
|
||||||
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), upstream_node_type);
|
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), type);
|
||||||
|
|
||||||
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
||||||
{
|
{
|
||||||
@@ -828,7 +826,7 @@ standby_monitor(void)
|
|||||||
* Failover handling is handled differently depending on whether
|
* Failover handling is handled differently depending on whether
|
||||||
* the failed node is the master or a cascading standby
|
* the failed node is the master or a cascading standby
|
||||||
*/
|
*/
|
||||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, node_info.upstream_node_id);
|
||||||
|
|
||||||
if (upstream_node.type == MASTER)
|
if (upstream_node.type == MASTER)
|
||||||
{
|
{
|
||||||
@@ -931,7 +929,7 @@ standby_monitor(void)
|
|||||||
* from the upstream node to write monitoring information
|
* from the upstream node to write monitoring information
|
||||||
*/
|
*/
|
||||||
|
|
||||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, node_info.upstream_node_id);
|
||||||
|
|
||||||
sprintf(sqlquery,
|
sprintf(sqlquery,
|
||||||
"SELECT id "
|
"SELECT id "
|
||||||
@@ -1003,24 +1001,10 @@ standby_monitor(void)
|
|||||||
strncpy(last_xlog_receive_location, PQgetvalue(res, 0, 1), MAXLEN);
|
strncpy(last_xlog_receive_location, PQgetvalue(res, 0, 1), MAXLEN);
|
||||||
strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
|
strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
|
||||||
strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
||||||
|
|
||||||
last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
|
last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
|
||||||
? true
|
? true
|
||||||
: false;
|
: false;
|
||||||
|
|
||||||
/*
|
|
||||||
* If pg_last_xlog_receive_location is NULL, this means we're in archive
|
|
||||||
* recovery and will need to calculate lag based on pg_last_xlog_replay_location
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Replayed WAL is greater than received streamed WAL
|
|
||||||
*/
|
|
||||||
if (PQgetisnull(res, 0, 1))
|
|
||||||
{
|
|
||||||
receiving_streamed_wal = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -1032,10 +1016,11 @@ standby_monitor(void)
|
|||||||
* PostgreSQL log. In the absence of a better strategy, skip attempting
|
* PostgreSQL log. In the absence of a better strategy, skip attempting
|
||||||
* to insert a monitoring record.
|
* to insert a monitoring record.
|
||||||
*/
|
*/
|
||||||
if (receiving_streamed_wal == true && last_xlog_receive_location_gte_replayed == false)
|
if (last_xlog_receive_location_gte_replayed == false)
|
||||||
{
|
{
|
||||||
log_verbose(LOG_WARNING,
|
log_verbose(LOG_WARNING,
|
||||||
"Replayed WAL newer than received WAL - is this standby connected to its upstream?\n");
|
"Invalid replication_lag value calculated - is this standby connected to its upstream?\n");
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Get master xlog info */
|
/* Get master xlog info */
|
||||||
@@ -1054,18 +1039,9 @@ standby_monitor(void)
|
|||||||
|
|
||||||
/* Calculate the lag */
|
/* Calculate the lag */
|
||||||
lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_master_location, NULL);
|
lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_master_location, NULL);
|
||||||
|
lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
|
||||||
lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
|
lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
|
||||||
|
|
||||||
if (last_xlog_receive_location_gte_replayed == false)
|
|
||||||
{
|
|
||||||
lsn_last_xlog_receive_location = lsn_last_xlog_replay_location;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Build the SQL to execute on master
|
* Build the SQL to execute on master
|
||||||
*/
|
*/
|
||||||
@@ -1447,6 +1423,9 @@ do_master_failover(void)
|
|||||||
PQfinish(node_conn);
|
PQfinish(node_conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Close the connection to this server */
|
||||||
|
PQfinish(my_local_conn);
|
||||||
|
my_local_conn = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* determine which one is the best candidate to promote to master
|
* determine which one is the best candidate to promote to master
|
||||||
@@ -1494,24 +1473,18 @@ do_master_failover(void)
|
|||||||
terminate(ERR_FAILOVER_FAIL);
|
terminate(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
log_debug("best candidate node id is %i\n", best_candidate.node_id);
|
|
||||||
|
|
||||||
/* if local node is the best candidate, promote it */
|
/* if local node is the best candidate, promote it */
|
||||||
if (best_candidate.node_id == local_options.node)
|
if (best_candidate.node_id == local_options.node)
|
||||||
{
|
{
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
/* Close the connection to this server */
|
|
||||||
PQfinish(my_local_conn);
|
|
||||||
my_local_conn = NULL;
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
/* wait */
|
/* wait */
|
||||||
sleep(5);
|
sleep(5);
|
||||||
|
|
||||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||||
|
|
||||||
log_debug("promote command is: \"%s\"\n",
|
log_debug(_("promote command is: \"%s\"\n"),
|
||||||
local_options.promote_command);
|
local_options.promote_command);
|
||||||
|
|
||||||
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
||||||
@@ -1522,33 +1495,6 @@ do_master_failover(void)
|
|||||||
r = system(local_options.promote_command);
|
r = system(local_options.promote_command);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
/*
|
|
||||||
* Check whether the primary reappeared, which will have caused the
|
|
||||||
* promote command to fail
|
|
||||||
*/
|
|
||||||
my_local_conn = establish_db_connection(local_options.conninfo, false);
|
|
||||||
|
|
||||||
if (my_local_conn != NULL)
|
|
||||||
{
|
|
||||||
int master_node_id;
|
|
||||||
|
|
||||||
master_conn = get_master_connection(my_local_conn,
|
|
||||||
local_options.cluster_name,
|
|
||||||
&master_node_id, NULL);
|
|
||||||
|
|
||||||
if (master_conn != NULL && master_node_id == failed_master.node_id)
|
|
||||||
{
|
|
||||||
log_notice(_("Original master reappeared before this standby was promoted - no action taken\n"));
|
|
||||||
|
|
||||||
PQfinish(master_conn);
|
|
||||||
/* no failover occurred but we'll want to restart connections */
|
|
||||||
failover_done = true;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
PQfinish(my_local_conn);
|
|
||||||
}
|
|
||||||
|
|
||||||
log_err(_("promote command failed. You could check and try it manually.\n"));
|
log_err(_("promote command failed. You could check and try it manually.\n"));
|
||||||
|
|
||||||
terminate(ERR_DB_QUERY);
|
terminate(ERR_DB_QUERY);
|
||||||
@@ -1580,39 +1526,11 @@ do_master_failover(void)
|
|||||||
{
|
{
|
||||||
PGconn *new_master_conn;
|
PGconn *new_master_conn;
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
int master_node_id;
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
/* wait */
|
/* wait */
|
||||||
sleep(10);
|
sleep(10);
|
||||||
|
|
||||||
/*
|
|
||||||
* Check whether the primary reappeared while we were waiting, so we
|
|
||||||
* don't end up following the promotion candidate
|
|
||||||
*/
|
|
||||||
|
|
||||||
master_conn = get_master_connection(my_local_conn,
|
|
||||||
local_options.cluster_name,
|
|
||||||
&master_node_id, NULL);
|
|
||||||
|
|
||||||
if (master_conn != NULL && master_node_id == failed_master.node_id)
|
|
||||||
{
|
|
||||||
log_notice(_("Original master reappeared - no action taken\n"));
|
|
||||||
|
|
||||||
PQfinish(master_conn);
|
|
||||||
/* no failover occurred but we'll want to restart connections */
|
|
||||||
failover_done = true;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Close the connection to this server */
|
|
||||||
PQfinish(my_local_conn);
|
|
||||||
my_local_conn = NULL;
|
|
||||||
|
|
||||||
/* XXX double-check the promotion candidate did become the new primary */
|
|
||||||
|
|
||||||
log_notice(_("node %d is the best candidate for new master, attempting to follow...\n"),
|
log_notice(_("node %d is the best candidate for new master, attempting to follow...\n"),
|
||||||
best_candidate.node_id);
|
best_candidate.node_id);
|
||||||
|
|
||||||
@@ -1736,7 +1654,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
|||||||
|
|
||||||
if (PQntuples(res) == 0)
|
if (PQntuples(res) == 0)
|
||||||
{
|
{
|
||||||
log_err(_("no node with id %i found\n"), upstream_node_id);
|
log_err(_("no node with id %i found"), upstream_node_id);
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -2421,7 +2339,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
|
|||||||
|
|
||||||
if (res == 0)
|
if (res == 0)
|
||||||
{
|
{
|
||||||
log_warning(_("No record found for node %i\n"), node_id);
|
log_warning(_("No record found record for node %i\n"), node_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
return node_info;
|
return node_info;
|
||||||
|
|||||||
Reference in New Issue
Block a user