mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-24 07:36:30 +00:00
Compare commits
48 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
274a30efa5 | ||
|
|
db63b5bb1c | ||
|
|
e100728b93 | ||
|
|
d104f2a914 | ||
|
|
2946c097f0 | ||
|
|
a538ceb0ea | ||
|
|
5a2a8d1c82 | ||
|
|
b5a7efa58e | ||
|
|
9f6f58e4ed | ||
|
|
c22f4eaf6f | ||
|
|
925d82f7a4 | ||
|
|
1db577e294 | ||
|
|
a886fddccc | ||
|
|
83e5f98171 | ||
|
|
eb31a56186 | ||
|
|
8cd2c6fd05 | ||
|
|
e3e1c5de4e | ||
|
|
f9a150504a | ||
|
|
5bc809466c | ||
|
|
5d32026b79 | ||
|
|
2a8d6f72c6 | ||
|
|
190cc7dcb4 | ||
|
|
819937d4bd | ||
|
|
57299cb978 | ||
|
|
59f503835b | ||
|
|
33e626cd75 | ||
|
|
491ec37adf | ||
|
|
c93790fc96 | ||
|
|
ecabe2c294 | ||
|
|
2ba57e5938 | ||
|
|
2eec17e25f | ||
|
|
c48c248c15 | ||
|
|
958e45f2b8 | ||
|
|
daafd70383 | ||
|
|
c828598bfb | ||
|
|
b55519c4a2 | ||
|
|
4cafd443e1 | ||
|
|
d400d7f9ac | ||
|
|
62bb3db1f8 | ||
|
|
d9961bbb17 | ||
|
|
e1b8982c14 | ||
|
|
2fe3b3c2a3 | ||
|
|
c6e1bc205a | ||
|
|
7241391ddc | ||
|
|
c8f449f178 | ||
|
|
49420c437f | ||
|
|
827ffef5f9 | ||
|
|
16296bb1c3 |
9
HISTORY
9
HISTORY
@@ -1,12 +1,3 @@
|
||||
3.1.3 2016-05-17
|
||||
repmgrd: enable monitoring when a standby is catching up by
|
||||
replaying archived WAL (Ian)
|
||||
repmgrd: when upstream_node_id is NULL, assume upstream node
|
||||
to be current master (Ian)
|
||||
repmgrd: check for reappearance of the master node if standby
|
||||
promotion fails (Ian)
|
||||
improve handling of rsync failure conditions (Martín)
|
||||
|
||||
3.1.2 2016-04-12
|
||||
Fix pg_ctl path generation in do_standby_switchover() (Ian)
|
||||
Regularly sync witness server repl_nodes table (Ian)
|
||||
|
||||
25
README.md
25
README.md
@@ -259,6 +259,20 @@ The following replication settings must be included in `postgresql.conf`:
|
||||
|
||||
hot_standby = on
|
||||
|
||||
# If archive_mode is enabled, check that 'archive_command' is non empty
|
||||
# (however it's not practical to check that it actually represents a valid
|
||||
# command).
|
||||
#
|
||||
# From PostgreSQL 9.5, archive_mode can be one of 'off', 'on' or 'always'
|
||||
# so for ease of backwards compatibility, rather than explicitly check for an
|
||||
# enabled mode, check that it's not "off".
|
||||
archive_mode = on
|
||||
|
||||
# Set archive command to a script or application that will safely store
|
||||
# your WALs in a secure place. /bin/true is an example of a command that
|
||||
# ignores archiving. Use something more sensible.
|
||||
archive_command = '/bin/true'
|
||||
|
||||
|
||||
* * *
|
||||
|
||||
@@ -1002,11 +1016,8 @@ Monitoring
|
||||
----------
|
||||
|
||||
When `repmgrd` is running with the option `-m/--monitoring-history`, it will
|
||||
constantly write standby node status information to the `repl_monitor` table,
|
||||
providing a near-real time overview of replication status on all nodes
|
||||
in the cluster.
|
||||
|
||||
The view `repl_status` shows the most recent state for each node, e.g.:
|
||||
constantly write node status information to the `repl_monitor` table, which can
|
||||
be queried easily using the view `repl_status`:
|
||||
|
||||
repmgr=# SELECT * FROM repmgr_test.repl_status;
|
||||
-[ RECORD 1 ]-------------+-----------------------------
|
||||
@@ -1031,10 +1042,6 @@ table , it's advisable to regularly purge historical data with
|
||||
`repmgr cluster cleanup`; use the `-k/--keep-history` option to specify how
|
||||
many days' worth of data should be retained.
|
||||
|
||||
Note that when a standby node is not streaming directly from its upstream
|
||||
node, i.e. recovering WAL from an archive, `apply_lag` will always
|
||||
appear as `0 bytes`.
|
||||
|
||||
|
||||
Using a witness server with repmgrd
|
||||
------------------------------------
|
||||
|
||||
37
dbutils.c
37
dbutils.c
@@ -420,7 +420,7 @@ guc_set_typed(PGconn *conn, const char *parameter, const char *op,
|
||||
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
|
||||
parameter, datatype, op, value, datatype);
|
||||
|
||||
log_verbose(LOG_DEBUG, "guc_set_typed():\n%s\n", sqlquery);
|
||||
log_verbose(LOG_DEBUG, "guc_set_typed():n%s\n", sqlquery);
|
||||
|
||||
res = PQexec(conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
@@ -587,7 +587,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
upstream_conninfo = upstream_conninfo_out;
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
" SELECT un.conninfo, un.id "
|
||||
" SELECT un.conninfo, un.name, un.id "
|
||||
" FROM %s.repl_nodes un "
|
||||
"INNER JOIN %s.repl_nodes n "
|
||||
" ON (un.id = n.upstream_node_id AND un.cluster = n.cluster)"
|
||||
@@ -604,7 +604,7 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("error when attempting to find upstream server\n%s\n"),
|
||||
log_err(_("unable to get conninfo for upstream server\n%s\n"),
|
||||
PQerrorMessage(standby_conn));
|
||||
PQclear(res);
|
||||
return NULL;
|
||||
@@ -612,36 +612,9 @@ get_upstream_connection(PGconn *standby_conn, char *cluster, int node_id,
|
||||
|
||||
if (!PQntuples(res))
|
||||
{
|
||||
log_notice(_("no record found for upstream server"));
|
||||
PQclear(res);
|
||||
log_debug("no record found for upstream server\n");
|
||||
|
||||
sqlquery_snprintf(sqlquery,
|
||||
" SELECT un.conninfo, un.id "
|
||||
" FROM %s.repl_nodes un "
|
||||
" WHERE un.cluster = '%s' "
|
||||
" AND un.type='master' "
|
||||
" AND un.active IS TRUE",
|
||||
get_repmgr_schema_quoted(standby_conn),
|
||||
cluster);
|
||||
res = PQexec(standby_conn, sqlquery);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_err(_("error when attempting to find active master server\n%s\n"),
|
||||
PQerrorMessage(standby_conn));
|
||||
PQclear(res);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!PQntuples(res))
|
||||
{
|
||||
PQclear(res);
|
||||
log_notice(_("no record found for active master server\n"));
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
log_debug("record found for active master server\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
strncpy(upstream_conninfo, PQgetvalue(res, 0, 0), MAXCONNINFO);
|
||||
|
||||
5
log.c
5
log.c
@@ -40,8 +40,7 @@
|
||||
/* #define REPMGR_DEBUG */
|
||||
|
||||
static int detect_log_facility(const char *facility);
|
||||
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap)
|
||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0)));
|
||||
static void _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_list ap);
|
||||
|
||||
int log_type = REPMGR_STDERR;
|
||||
int log_level = LOG_NOTICE;
|
||||
@@ -49,7 +48,7 @@ int last_log_level = LOG_NOTICE;
|
||||
int verbose_logging = false;
|
||||
int terse_logging = false;
|
||||
|
||||
extern void
|
||||
void
|
||||
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
|
||||
{
|
||||
va_list arglist;
|
||||
|
||||
8
log.h
8
log.h
@@ -25,7 +25,7 @@
|
||||
#define REPMGR_SYSLOG 1
|
||||
#define REPMGR_STDERR 2
|
||||
|
||||
extern void
|
||||
void
|
||||
stderr_log_with_level(const char *level_name, int level, const char *fmt,...)
|
||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));
|
||||
|
||||
@@ -123,10 +123,8 @@ bool logger_shutdown(void);
|
||||
void logger_set_verbose(void);
|
||||
void logger_set_terse(void);
|
||||
|
||||
void log_hint(const char *fmt, ...)
|
||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
|
||||
void log_verbose(int level, const char *fmt, ...)
|
||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 2, 3)));
|
||||
void log_hint(const char *fmt, ...);
|
||||
void log_verbose(int level, const char *fmt, ...);
|
||||
|
||||
extern int log_type;
|
||||
extern int log_level;
|
||||
|
||||
10
repmgr.c
10
repmgr.c
@@ -1665,7 +1665,7 @@ do_standby_clone(void)
|
||||
It's quite common for this to happen on the data directory, particularly
|
||||
with long running rsync on a busy server.
|
||||
*/
|
||||
if (!WIFEXITED(r) && WEXITSTATUS(r) != 24)
|
||||
if (r != 0 && r != 24)
|
||||
{
|
||||
log_warning(_("standby clone: failed copying master data directory '%s'\n"),
|
||||
master_data_directory);
|
||||
@@ -1751,7 +1751,7 @@ do_standby_clone(void)
|
||||
It's quite common for this to happen on the data directory, particularly
|
||||
with long running rsync on a busy server.
|
||||
*/
|
||||
if (!WIFEXITED(r) && WEXITSTATUS(r) != 24)
|
||||
if (r != 0 && r != 24)
|
||||
{
|
||||
log_warning(_("standby clone: failed copying tablespace directory '%s'\n"),
|
||||
tblspc_dir_src.data);
|
||||
@@ -5067,7 +5067,7 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
|
||||
char *wal_error_message = NULL;
|
||||
|
||||
/* Check that WAL level is set correctly */
|
||||
if (server_version_num < 90400)
|
||||
if (server_version_num < 90300)
|
||||
{
|
||||
i = guc_set(conn, "wal_level", "=", "hot_standby");
|
||||
wal_error_message = _("parameter 'wal_level' must be set to 'hot_standby'");
|
||||
@@ -5080,6 +5080,10 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
|
||||
* Note that in 9.6+, "hot_standby" and "archive" are accepted as aliases
|
||||
* for "replica", but current_setting() will of course always return "replica"
|
||||
*/
|
||||
char *levels_96plus[] = {
|
||||
"replica",
|
||||
"logical",
|
||||
|
||||
116
repmgrd.c
116
repmgrd.c
@@ -716,9 +716,8 @@ standby_monitor(void)
|
||||
t_node_info upstream_node;
|
||||
|
||||
int active_master_id;
|
||||
const char *upstream_node_type = NULL;
|
||||
const char *type = NULL;
|
||||
|
||||
bool receiving_streamed_wal = true;
|
||||
/*
|
||||
* Verify that the local node is still available - if not there's
|
||||
* no point in doing much else anyway
|
||||
@@ -743,10 +742,9 @@ standby_monitor(void)
|
||||
upstream_conn = get_upstream_connection(my_local_conn,
|
||||
local_options.cluster_name,
|
||||
local_options.node,
|
||||
&upstream_node_id,
|
||||
upstream_conninfo);
|
||||
&upstream_node_id, upstream_conninfo);
|
||||
|
||||
upstream_node_type = (upstream_node_id == master_options.node)
|
||||
type = upstream_node_id == master_options.node
|
||||
? "master"
|
||||
: "upstream";
|
||||
|
||||
@@ -756,7 +754,7 @@ standby_monitor(void)
|
||||
* we cannot reconnect, try to get a new upstream node.
|
||||
*/
|
||||
|
||||
check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
|
||||
check_connection(&upstream_conn, type, upstream_conninfo);
|
||||
/*
|
||||
* This takes up to local_options.reconnect_attempts *
|
||||
* local_options.reconnect_interval seconds
|
||||
@@ -769,7 +767,7 @@ standby_monitor(void)
|
||||
|
||||
if (local_options.failover == MANUAL_FAILOVER)
|
||||
{
|
||||
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), upstream_node_type);
|
||||
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), type);
|
||||
|
||||
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
||||
{
|
||||
@@ -828,7 +826,7 @@ standby_monitor(void)
|
||||
* Failover handling is handled differently depending on whether
|
||||
* the failed node is the master or a cascading standby
|
||||
*/
|
||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, node_info.upstream_node_id);
|
||||
|
||||
if (upstream_node.type == MASTER)
|
||||
{
|
||||
@@ -931,7 +929,7 @@ standby_monitor(void)
|
||||
* from the upstream node to write monitoring information
|
||||
*/
|
||||
|
||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, node_info.upstream_node_id);
|
||||
|
||||
sprintf(sqlquery,
|
||||
"SELECT id "
|
||||
@@ -1003,24 +1001,10 @@ standby_monitor(void)
|
||||
strncpy(last_xlog_receive_location, PQgetvalue(res, 0, 1), MAXLEN);
|
||||
strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
|
||||
strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
|
||||
|
||||
last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
|
||||
? true
|
||||
: false;
|
||||
|
||||
/*
|
||||
* If pg_last_xlog_receive_location is NULL, this means we're in archive
|
||||
* recovery and will need to calculate lag based on pg_last_xlog_replay_location
|
||||
*/
|
||||
|
||||
/*
|
||||
* Replayed WAL is greater than received streamed WAL
|
||||
*/
|
||||
if (PQgetisnull(res, 0, 1))
|
||||
{
|
||||
receiving_streamed_wal = false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
/*
|
||||
@@ -1032,10 +1016,11 @@ standby_monitor(void)
|
||||
* PostgreSQL log. In the absence of a better strategy, skip attempting
|
||||
* to insert a monitoring record.
|
||||
*/
|
||||
if (receiving_streamed_wal == true && last_xlog_receive_location_gte_replayed == false)
|
||||
if (last_xlog_receive_location_gte_replayed == false)
|
||||
{
|
||||
log_verbose(LOG_WARNING,
|
||||
"Replayed WAL newer than received WAL - is this standby connected to its upstream?\n");
|
||||
"Invalid replication_lag value calculated - is this standby connected to its upstream?\n");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Get master xlog info */
|
||||
@@ -1054,18 +1039,9 @@ standby_monitor(void)
|
||||
|
||||
/* Calculate the lag */
|
||||
lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_master_location, NULL);
|
||||
|
||||
lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
|
||||
lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
|
||||
|
||||
if (last_xlog_receive_location_gte_replayed == false)
|
||||
{
|
||||
lsn_last_xlog_receive_location = lsn_last_xlog_replay_location;
|
||||
}
|
||||
else
|
||||
{
|
||||
lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Build the SQL to execute on master
|
||||
*/
|
||||
@@ -1447,6 +1423,9 @@ do_master_failover(void)
|
||||
PQfinish(node_conn);
|
||||
}
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = NULL;
|
||||
|
||||
/*
|
||||
* determine which one is the best candidate to promote to master
|
||||
@@ -1494,24 +1473,18 @@ do_master_failover(void)
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
log_debug("best candidate node id is %i\n", best_candidate.node_id);
|
||||
|
||||
/* if local node is the best candidate, promote it */
|
||||
if (best_candidate.node_id == local_options.node)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = NULL;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
/* wait */
|
||||
sleep(5);
|
||||
|
||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||
|
||||
log_debug("promote command is: \"%s\"\n",
|
||||
log_debug(_("promote command is: \"%s\"\n"),
|
||||
local_options.promote_command);
|
||||
|
||||
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
||||
@@ -1522,33 +1495,6 @@ do_master_failover(void)
|
||||
r = system(local_options.promote_command);
|
||||
if (r != 0)
|
||||
{
|
||||
/*
|
||||
* Check whether the primary reappeared, which will have caused the
|
||||
* promote command to fail
|
||||
*/
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, false);
|
||||
|
||||
if (my_local_conn != NULL)
|
||||
{
|
||||
int master_node_id;
|
||||
|
||||
master_conn = get_master_connection(my_local_conn,
|
||||
local_options.cluster_name,
|
||||
&master_node_id, NULL);
|
||||
|
||||
if (master_conn != NULL && master_node_id == failed_master.node_id)
|
||||
{
|
||||
log_notice(_("Original master reappeared before this standby was promoted - no action taken\n"));
|
||||
|
||||
PQfinish(master_conn);
|
||||
/* no failover occurred but we'll want to restart connections */
|
||||
failover_done = true;
|
||||
return;
|
||||
}
|
||||
|
||||
PQfinish(my_local_conn);
|
||||
}
|
||||
|
||||
log_err(_("promote command failed. You could check and try it manually.\n"));
|
||||
|
||||
terminate(ERR_DB_QUERY);
|
||||
@@ -1580,39 +1526,11 @@ do_master_failover(void)
|
||||
{
|
||||
PGconn *new_master_conn;
|
||||
PQExpBufferData event_details;
|
||||
int master_node_id;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
/* wait */
|
||||
sleep(10);
|
||||
|
||||
/*
|
||||
* Check whether the primary reappeared while we were waiting, so we
|
||||
* don't end up following the promotion candidate
|
||||
*/
|
||||
|
||||
master_conn = get_master_connection(my_local_conn,
|
||||
local_options.cluster_name,
|
||||
&master_node_id, NULL);
|
||||
|
||||
if (master_conn != NULL && master_node_id == failed_master.node_id)
|
||||
{
|
||||
log_notice(_("Original master reappeared - no action taken\n"));
|
||||
|
||||
PQfinish(master_conn);
|
||||
/* no failover occurred but we'll want to restart connections */
|
||||
failover_done = true;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(my_local_conn);
|
||||
my_local_conn = NULL;
|
||||
|
||||
/* XXX double-check the promotion candidate did become the new primary */
|
||||
|
||||
log_notice(_("node %d is the best candidate for new master, attempting to follow...\n"),
|
||||
best_candidate.node_id);
|
||||
|
||||
@@ -1736,7 +1654,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
|
||||
if (PQntuples(res) == 0)
|
||||
{
|
||||
log_err(_("no node with id %i found\n"), upstream_node_id);
|
||||
log_err(_("no node with id %i found"), upstream_node_id);
|
||||
PQclear(res);
|
||||
return false;
|
||||
}
|
||||
@@ -2421,7 +2339,7 @@ get_node_info(PGconn *conn, char *cluster, int node_id)
|
||||
|
||||
if (res == 0)
|
||||
{
|
||||
log_warning(_("No record found for node %i\n"), node_id);
|
||||
log_warning(_("No record found record for node %i\n"), node_id);
|
||||
}
|
||||
|
||||
return node_info;
|
||||
|
||||
Reference in New Issue
Block a user