mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-25 08:06:29 +00:00
Compare commits
61 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
16896510dc | ||
|
|
1c155a1088 | ||
|
|
31d57f4122 | ||
|
|
7b313b9d71 | ||
|
|
cf126642bd | ||
|
|
52281fcde8 | ||
|
|
de573edaaa | ||
|
|
4cb7f301ad | ||
|
|
87d8de4441 | ||
|
|
6db742f81e | ||
|
|
c79933685c | ||
|
|
04ba672b9f | ||
|
|
4f4111063a | ||
|
|
3a3a536e6d | ||
|
|
6f7206a5a1 | ||
|
|
f9fd1dd227 | ||
|
|
8140ba9c27 | ||
|
|
32dba444e1 | ||
|
|
8212ff8d8a | ||
|
|
1ccd0edad2 | ||
|
|
59b31dd1ca | ||
|
|
300b9f0cc2 | ||
|
|
0efee4cf65 | ||
|
|
0cb2584886 | ||
|
|
b88d27248c | ||
|
|
683c54325e | ||
|
|
70d398cd47 | ||
|
|
7b7d80e5f2 | ||
|
|
96b0e26084 | ||
|
|
91c498f6f1 | ||
|
|
d48093e732 | ||
|
|
3f0d1754a4 | ||
|
|
f27979bbe1 | ||
|
|
e9445a5d5e | ||
|
|
9a2717b5e3 | ||
|
|
dd6ea1cd77 | ||
|
|
de5908c122 | ||
|
|
4b5c84921c | ||
|
|
aaa8d70cef | ||
|
|
ca31b846e7 | ||
|
|
a27cecb559 | ||
|
|
cf0cdfa6a1 | ||
|
|
31489d92c0 | ||
|
|
b7fd13aed2 | ||
|
|
3c4bf27aa7 | ||
|
|
0ebd9c15d9 | ||
|
|
f9dba283d4 | ||
|
|
205f1cebbb | ||
|
|
4d97c1ebf7 | ||
|
|
12c395e91f | ||
|
|
bd1e4f71d6 | ||
|
|
cb49071ea4 | ||
|
|
5ad674edff | ||
|
|
ac09bad89c | ||
|
|
009d92fec8 | ||
|
|
b3d8a68a1d | ||
|
|
05b47cb2a8 | ||
|
|
dc542a1b7d | ||
|
|
6ce8058749 | ||
|
|
2edcac77f0 | ||
|
|
f740374392 |
9
HISTORY
9
HISTORY
@@ -1,3 +1,12 @@
|
|||||||
|
3.1.3 2016-05-17
|
||||||
|
repmgrd: enable monitoring when a standby is catching up by
|
||||||
|
replaying archived WAL (Ian)
|
||||||
|
repmgrd: when upstream_node_id is NULL, assume upstream node
|
||||||
|
to be current master (Ian)
|
||||||
|
repmgrd: check for reappearance of the master node if standby
|
||||||
|
promotion fails (Ian)
|
||||||
|
improve handling of rsync failure conditions (Martín)
|
||||||
|
|
||||||
3.1.2 2016-04-12
|
3.1.2 2016-04-12
|
||||||
Fix pg_ctl path generation in do_standby_switchover() (Ian)
|
Fix pg_ctl path generation in do_standby_switchover() (Ian)
|
||||||
Regularly sync witness server repl_nodes table (Ian)
|
Regularly sync witness server repl_nodes table (Ian)
|
||||||
|
|||||||
25
README.md
25
README.md
@@ -259,20 +259,6 @@ The following replication settings must be included in `postgresql.conf`:
|
|||||||
|
|
||||||
hot_standby = on
|
hot_standby = on
|
||||||
|
|
||||||
# If archive_mode is enabled, check that 'archive_command' is non empty
|
|
||||||
# (however it's not practical to check that it actually represents a valid
|
|
||||||
# command).
|
|
||||||
#
|
|
||||||
# From PostgreSQL 9.5, archive_mode can be one of 'off', 'on' or 'always'
|
|
||||||
# so for ease of backwards compatibility, rather than explicitly check for an
|
|
||||||
# enabled mode, check that it's not "off".
|
|
||||||
archive_mode = on
|
|
||||||
|
|
||||||
# Set archive command to a script or application that will safely store
|
|
||||||
# your WALs in a secure place. /bin/true is an example of a command that
|
|
||||||
# ignores archiving. Use something more sensible.
|
|
||||||
archive_command = '/bin/true'
|
|
||||||
|
|
||||||
|
|
||||||
* * *
|
* * *
|
||||||
|
|
||||||
@@ -1016,8 +1002,11 @@ Monitoring
|
|||||||
----------
|
----------
|
||||||
|
|
||||||
When `repmgrd` is running with the option `-m/--monitoring-history`, it will
|
When `repmgrd` is running with the option `-m/--monitoring-history`, it will
|
||||||
constantly write node status information to the `repl_monitor` table, which can
|
constantly write standby node status information to the `repl_monitor` table,
|
||||||
be queried easily using the view `repl_status`:
|
providing a near-real-time overview of replication status on all nodes
|
||||||
|
in the cluster.
|
||||||
|
|
||||||
|
The view `repl_status` shows the most recent state for each node, e.g.:
|
||||||
|
|
||||||
repmgr=# SELECT * FROM repmgr_test.repl_status;
|
repmgr=# SELECT * FROM repmgr_test.repl_status;
|
||||||
-[ RECORD 1 ]-------------+-----------------------------
|
-[ RECORD 1 ]-------------+-----------------------------
|
||||||
@@ -1042,6 +1031,10 @@ table , it's advisable to regularly purge historical data with
|
|||||||
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
|
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
|
||||||
many days' worth of data should be retained.
|
many days' worth of data should be retained.
|
||||||
|
|
||||||
|
Note that when a standby node is not streaming directly from its upstream
|
||||||
|
node, i.e. recovering WAL from an archive, `apply_lag` will always
|
||||||
|
appear as `0 bytes`.
|
||||||
|
|
||||||
|
|
||||||
Using a witness server with repmgrd
|
Using a witness server with repmgrd
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|||||||
37
dbutils.c
37
dbutils.c
@@ -420,7 +420,7 @@ guc_set_typed(PGconn *conn, const char *parameter, const char *op,
|
|||||||
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
||||||
parameter, datatype, op, value, datatype);
|
parameter, datatype, op, value, datatype);
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "guc_set_typed():n%s\n", sqlquery);
|
log_verbose(LOG_DEBUG, "guc_set_typed():\n%s\n", sqlquery);
|
||||||
|
|
||||||
res = PQexec(conn, sqlquery);
|
res = PQexec(conn, sqlquery);
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
@@ -587,7 +587,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
|||||||
upstream_conninfo = upstream_conninfo_out;
|
upstream_conninfo = upstream_conninfo_out;
|
||||||
|
|
||||||
sqlquery_snprintf(sqlquery,
|
sqlquery_snprintf(sqlquery,
|
||||||
" SELECT un.conninfo, un.name, un.id "
|
" SELECT un.conninfo, un.id "
|
||||||
" FROM %s.repl_nodes un "
|
" FROM %s.repl_nodes un "
|
||||||
"INNER JOIN %s.repl_nodes n "
|
"INNER JOIN %s.repl_nodes n "
|
||||||
" ON (un.id = n.upstream_node_id AND un.cluster = n.cluster)"
|
" ON (un.id = n.upstream_node_id AND un.cluster = n.cluster)"
|
||||||
@@ -604,7 +604,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
|||||||
|
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
log_err(_("unable to get conninfo for upstream server\n%s\n"),
|
log_err(_("error when attempting to find upstream server\n%s\n"),
|
||||||
PQerrorMessage(standby_conn));
|
PQerrorMessage(standby_conn));
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -612,9 +612,36 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
|||||||
|
|
||||||
if (!PQntuples(res))
|
if (!PQntuples(res))
|
||||||
{
|
{
|
||||||
log_notice(_("no record found for upstream server"));
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return NULL;
|
log_debug("no record found for upstream server\n");
|
||||||
|
|
||||||
|
sqlquery_snprintf(sqlquery,
|
||||||
|
" SELECT un.conninfo, un.id "
|
||||||
|
" FROM %s.repl_nodes un "
|
||||||
|
" WHERE un.cluster = '%s' "
|
||||||
|
" AND un.type='master' "
|
||||||
|
" AND un.active IS TRUE",
|
||||||
|
get_repmgr_schema_quoted(standby_conn),
|
||||||
|
cluster);
|
||||||
|
res = PQexec(standby_conn, sqlquery);
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_err(_("error when attempting to find active master server\n%s\n"),
|
||||||
|
PQerrorMessage(standby_conn));
|
||||||
|
PQclear(res);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!PQntuples(res))
|
||||||
|
{
|
||||||
|
PQclear(res);
|
||||||
|
log_notice(_("no record found for active master server\n"));
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
log_debug("record found for active master server\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
strncpy(upstream_conninfo, PQgetvalue(res, 0, 0), MAXCONNINFO);
|
strncpy(upstream_conninfo, PQgetvalue(res, 0, 0), MAXCONNINFO);
|
||||||
|
|||||||
5
log.c
5
log.c
@@ -40,7 +40,8 @@
|
|||||||
/* #define REPMGR_DEBUG */
|
/* #define REPMGR_DEBUG */
|
||||||
|
|
||||||
static int detect_log_facility(const char *facility);
|
static int detect_log_facility(const char *facility);
|
||||||
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap);
|
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap)
|
||||||
|
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0)));
|
||||||
|
|
||||||
int log_type = REPMGR_STDERR;
|
int log_type = REPMGR_STDERR;
|
||||||
int log_level = LOG_NOTICE;
|
int log_level = LOG_NOTICE;
|
||||||
@@ -48,7 +49,7 @@ int last_log_level = LOG_NOTICE;
|
|||||||
int verbose_logging = false;
|
int verbose_logging = false;
|
||||||
int terse_logging = false;
|
int terse_logging = false;
|
||||||
|
|
||||||
void
|
extern void
|
||||||
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
|
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
|
||||||
{
|
{
|
||||||
va_list arglist;
|
va_list arglist;
|
||||||
|
|||||||
8
log.h
8
log.h
@@ -25,7 +25,7 @@
|
|||||||
#define REPMGR_SYSLOG 1
|
#define REPMGR_SYSLOG 1
|
||||||
#define REPMGR_STDERR 2
|
#define REPMGR_STDERR 2
|
||||||
|
|
||||||
void
|
extern void
|
||||||
stderr_log_with_level(const char *level_name, int level, const char *fmt,...)
|
stderr_log_with_level(const char *level_name, int level, const char *fmt,...)
|
||||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||||
|
|
||||||
@@ -123,8 +123,10 @@ bool logger_shutdown(void);
|
|||||||
void logger_set_verbose(void);
|
void logger_set_verbose(void);
|
||||||
void logger_set_terse(void);
|
void logger_set_terse(void);
|
||||||
|
|
||||||
void log_hint(const char *fmt, ...);
|
void log_hint(const char *fmt, ...)
|
||||||
void log_verbose(int level, const char *fmt, ...);
|
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
|
||||||
|
void log_verbose(int level, const char *fmt, ...)
|
||||||
|
__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));
|
||||||
|
|
||||||
extern int log_type;
|
extern int log_type;
|
||||||
extern int log_level;
|
extern int log_level;
|
||||||
|
|||||||
10
repmgr.c
10
repmgr.c
@@ -1665,7 +1665,7 @@ do_standby_clone(void)
|
|||||||
It's quite common for this to happen on the data directory, particularly
|
It's quite common for this to happen on the data directory, particularly
|
||||||
with long running rsync on a busy server.
|
with long running rsync on a busy server.
|
||||||
*/
|
*/
|
||||||
if (r != 0 && r != 24)
|
if (!WIFEXITED(r) && WEXITSTATUS(r) != 24)
|
||||||
{
|
{
|
||||||
log_warning(_("standby clone: failed copying master data directory '%s'\n"),
|
log_warning(_("standby clone: failed copying master data directory '%s'\n"),
|
||||||
master_data_directory);
|
master_data_directory);
|
||||||
@@ -1751,7 +1751,7 @@ do_standby_clone(void)
|
|||||||
It's quite common for this to happen on the data directory, particularly
|
It's quite common for this to happen on the data directory, particularly
|
||||||
with long running rsync on a busy server.
|
with long running rsync on a busy server.
|
||||||
*/
|
*/
|
||||||
if (r != 0 && r != 24)
|
if (!WIFEXITED(r) && WEXITSTATUS(r) != 24)
|
||||||
{
|
{
|
||||||
log_warning(_("standby clone: failed copying tablespace directory '%s'\n"),
|
log_warning(_("standby clone: failed copying tablespace directory '%s'\n"),
|
||||||
tblspc_dir_src.data);
|
tblspc_dir_src.data);
|
||||||
@@ -5067,7 +5067,7 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
|
|||||||
char *wal_error_message = NULL;
|
char *wal_error_message = NULL;
|
||||||
|
|
||||||
/* Check that WAL level is set correctly */
|
/* Check that WAL level is set correctly */
|
||||||
if (server_version_num < 90300)
|
if (server_version_num < 90400)
|
||||||
{
|
{
|
||||||
i = guc_set(conn, "wal_level", "=", "hot_standby");
|
i = guc_set(conn, "wal_level", "=", "hot_standby");
|
||||||
wal_error_message = _("parameter 'wal_level' must be set to 'hot_standby'");
|
wal_error_message = _("parameter 'wal_level' must be set to 'hot_standby'");
|
||||||
@@ -5080,10 +5080,6 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
|
|||||||
NULL,
|
NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
* Note that in 9.6+, "hot_standby" and "archive" are accepted as aliases
|
|
||||||
* for "replica", but current_setting() will of course always return "replica"
|
|
||||||
*/
|
|
||||||
char *levels_96plus[] = {
|
char *levels_96plus[] = {
|
||||||
"replica",
|
"replica",
|
||||||
"logical",
|
"logical",
|
||||||
|
|||||||
116
repmgrd.c
116
repmgrd.c
@@ -716,8 +716,9 @@ standby_monitor(void)
|
|||||||
t_node_info upstream_node;
|
t_node_info upstream_node;
|
||||||
|
|
||||||
int active_master_id;
|
int active_master_id;
|
||||||
const char *type = NULL;
|
const char *upstream_node_type = NULL;
|
||||||
|
|
||||||
|
bool receiving_streamed_wal = true;
|
||||||
/*
|
/*
|
||||||
* Verify that the local node is still available - if not there's
|
* Verify that the local node is still available - if not there's
|
||||||
* no point in doing much else anyway
|
* no point in doing much else anyway
|
||||||
@@ -742,9 +743,10 @@ standby_monitor(void)
|
|||||||
upstream_conn = get_upstream_connection(my_local_conn,
|
upstream_conn = get_upstream_connection(my_local_conn,
|
||||||
local_options.cluster_name,
|
local_options.cluster_name,
|
||||||
local_options.node,
|
local_options.node,
|
||||||
&upstream_node_id, upstream_conninfo);
|
&upstream_node_id,
|
||||||
|
upstream_conninfo);
|
||||||
|
|
||||||
type = upstream_node_id == master_options.node
|
upstream_node_type = (upstream_node_id == master_options.node)
|
||||||
? "master"
|
? "master"
|
||||||
: "upstream";
|
: "upstream";
|
||||||
|
|
||||||
@@ -754,7 +756,7 @@ standby_monitor(void)
|
|||||||
* we cannot reconnect, try to get a new upstream node.
|
* we cannot reconnect, try to get a new upstream node.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
check_connection(&upstream_conn, type, upstream_conninfo);
|
check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
|
||||||
/*
|
/*
|
||||||
* This takes up to local_options.reconnect_attempts *
|
* This takes up to local_options.reconnect_attempts *
|
||||||
* local_options.reconnect_interval seconds
|
* local_options.reconnect_interval seconds
|
||||||
@@ -767,7 +769,7 @@ standby_monitor(void)
|
|||||||
|
|
||||||
if (local_options.failover == MANUAL_FAILOVER)
|
if (local_options.failover == MANUAL_FAILOVER)
|
||||||
{
|
{
|
||||||
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), type);
|
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), upstream_node_type);
|
||||||
|
|
||||||
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
||||||
{
|
{
|
||||||
@@ -826,7 +828,7 @@ standby_monitor(void)
|
|||||||
* Failover handling is handled differently depending on whether
|
* Failover handling is handled differently depending on whether
|
||||||
* the failed node is the master or a cascading standby
|
* the failed node is the master or a cascading standby
|
||||||
*/
|
*/
|
||||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, node_info.upstream_node_id);
|
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
||||||
|
|
||||||
if (upstream_node.type == MASTER)
|
if (upstream_node.type == MASTER)
|
||||||
{
|
{
|
||||||
@@ -929,7 +931,7 @@ standby_monitor(void)
|
|||||||
* from the upstream node to write monitoring information
|
* from the upstream node to write monitoring information
|
||||||
*/
|
*/
|
||||||
|
|
||||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, node_info.upstream_node_id);
|
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
||||||
|
|
||||||
sprintf(sqlquery,
|
sprintf(sqlquery,
|
||||||
"SELECT id "
|
"SELECT id "
|
||||||
@@ -1001,10 +1003,24 @@ standby_monitor(void)
|
|||||||
strncpy(last_xlog_receive_location, PQgetvalue(res, 0, 1), MAXLEN);
|
strncpy(last_xlog_receive_location, PQgetvalue(res, 0, 1), MAXLEN);
|
||||||
strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
|
strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
|
||||||
strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
||||||
|
|
||||||
last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
|
last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
|
||||||
? true
|
? true
|
||||||
: false;
|
: false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If pg_last_xlog_receive_location is NULL, this means we're in archive
|
||||||
|
* recovery and will need to calculate lag based on pg_last_xlog_replay_location
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Replayed WAL is greater than received streamed WAL
|
||||||
|
*/
|
||||||
|
if (PQgetisnull(res, 0, 1))
|
||||||
|
{
|
||||||
|
receiving_streamed_wal = false;
|
||||||
|
}
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -1016,11 +1032,10 @@ standby_monitor(void)
|
|||||||
* PostgreSQL log. In the absence of a better strategy, skip attempting
|
* PostgreSQL log. In the absence of a better strategy, skip attempting
|
||||||
* to insert a monitoring record.
|
* to insert a monitoring record.
|
||||||
*/
|
*/
|
||||||
if (last_xlog_receive_location_gte_replayed == false)
|
if (receiving_streamed_wal == true && last_xlog_receive_location_gte_replayed == false)
|
||||||
{
|
{
|
||||||
log_verbose(LOG_WARNING,
|
log_verbose(LOG_WARNING,
|
||||||
"Invalid replication_lag value calculated - is this standby connected to its upstream?\n");
|
"Replayed WAL newer than received WAL - is this standby connected to its upstream?\n");
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Get master xlog info */
|
/* Get master xlog info */
|
||||||
@@ -1039,9 +1054,18 @@ standby_monitor(void)
|
|||||||
|
|
||||||
/* Calculate the lag */
|
/* Calculate the lag */
|
||||||
lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_master_location, NULL);
|
lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_master_location, NULL);
|
||||||
lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
|
|
||||||
lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
|
lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
|
||||||
|
|
||||||
|
if (last_xlog_receive_location_gte_replayed == false)
|
||||||
|
{
|
||||||
|
lsn_last_xlog_receive_location = lsn_last_xlog_replay_location;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Build the SQL to execute on master
|
* Build the SQL to execute on master
|
||||||
*/
|
*/
|
||||||
@@ -1423,9 +1447,6 @@ do_master_failover(void)
|
|||||||
PQfinish(node_conn);
|
PQfinish(node_conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Close the connection to this server */
|
|
||||||
PQfinish(my_local_conn);
|
|
||||||
my_local_conn = NULL;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* determine which one is the best candidate to promote to master
|
* determine which one is the best candidate to promote to master
|
||||||
@@ -1473,18 +1494,24 @@ do_master_failover(void)
|
|||||||
terminate(ERR_FAILOVER_FAIL);
|
terminate(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log_debug("best candidate node id is %i\n", best_candidate.node_id);
|
||||||
|
|
||||||
/* if local node is the best candidate, promote it */
|
/* if local node is the best candidate, promote it */
|
||||||
if (best_candidate.node_id == local_options.node)
|
if (best_candidate.node_id == local_options.node)
|
||||||
{
|
{
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
|
/* Close the connection to this server */
|
||||||
|
PQfinish(my_local_conn);
|
||||||
|
my_local_conn = NULL;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
/* wait */
|
/* wait */
|
||||||
sleep(5);
|
sleep(5);
|
||||||
|
|
||||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||||
|
|
||||||
log_debug(_("promote command is: \"%s\"\n"),
|
log_debug("promote command is: \"%s\"\n",
|
||||||
local_options.promote_command);
|
local_options.promote_command);
|
||||||
|
|
||||||
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
||||||
@@ -1495,6 +1522,33 @@ do_master_failover(void)
|
|||||||
r = system(local_options.promote_command);
|
r = system(local_options.promote_command);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Check whether the primary reappeared, which will have caused the
|
||||||
|
* promote command to fail
|
||||||
|
*/
|
||||||
|
my_local_conn = establish_db_connection(local_options.conninfo, false);
|
||||||
|
|
||||||
|
if (my_local_conn != NULL)
|
||||||
|
{
|
||||||
|
int master_node_id;
|
||||||
|
|
||||||
|
master_conn = get_master_connection(my_local_conn,
|
||||||
|
local_options.cluster_name,
|
||||||
|
&master_node_id, NULL);
|
||||||
|
|
||||||
|
if (master_conn != NULL && master_node_id == failed_master.node_id)
|
||||||
|
{
|
||||||
|
log_notice(_("Original master reappeared before this standby was promoted - no action taken\n"));
|
||||||
|
|
||||||
|
PQfinish(master_conn);
|
||||||
|
/* no failover occurred but we'll want to restart connections */
|
||||||
|
failover_done = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
PQfinish(my_local_conn);
|
||||||
|
}
|
||||||
|
|
||||||
log_err(_("promote command failed. You could check and try it manually.\n"));
|
log_err(_("promote command failed. You could check and try it manually.\n"));
|
||||||
|
|
||||||
terminate(ERR_DB_QUERY);
|
terminate(ERR_DB_QUERY);
|
||||||
@@ -1526,11 +1580,39 @@ do_master_failover(void)
|
|||||||
{
|
{
|
||||||
PGconn *new_master_conn;
|
PGconn *new_master_conn;
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
|
int master_node_id;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
/* wait */
|
/* wait */
|
||||||
sleep(10);
|
sleep(10);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check whether the primary reappeared while we were waiting, so we
|
||||||
|
* don't end up following the promotion candidate
|
||||||
|
*/
|
||||||
|
|
||||||
|
master_conn = get_master_connection(my_local_conn,
|
||||||
|
local_options.cluster_name,
|
||||||
|
&master_node_id, NULL);
|
||||||
|
|
||||||
|
if (master_conn != NULL && master_node_id == failed_master.node_id)
|
||||||
|
{
|
||||||
|
log_notice(_("Original master reappeared - no action taken\n"));
|
||||||
|
|
||||||
|
PQfinish(master_conn);
|
||||||
|
/* no failover occurred but we'll want to restart connections */
|
||||||
|
failover_done = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Close the connection to this server */
|
||||||
|
PQfinish(my_local_conn);
|
||||||
|
my_local_conn = NULL;
|
||||||
|
|
||||||
|
/* XXX double-check the promotion candidate did become the new primary */
|
||||||
|
|
||||||
log_notice(_("node %d is the best candidate for new master, attempting to follow...\n"),
|
log_notice(_("node %d is the best candidate for new master, attempting to follow...\n"),
|
||||||
best_candidate.node_id);
|
best_candidate.node_id);
|
||||||
|
|
||||||
@@ -1654,7 +1736,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
|||||||
|
|
||||||
if (PQntuples(res) == 0)
|
if (PQntuples(res) == 0)
|
||||||
{
|
{
|
||||||
log_err(_("no node with id %i found"), upstream_node_id);
|
log_err(_("no node with id %i found\n"), upstream_node_id);
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@@ -2339,7 +2421,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
|
|||||||
|
|
||||||
if (res == 0)
|
if (res == 0)
|
||||||
{
|
{
|
||||||
log_warning(_("No record found record for node %i\n"), node_id);
|
log_warning(_("No record found for node %i\n"), node_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
return node_info;
|
return node_info;
|
||||||
|
|||||||
Reference in New Issue
Block a user