mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 15:16:29 +00:00
Compare commits
10 Commits
REL3_3_STA
...
REL3_4_STA
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3988653d6c | ||
|
|
3f9b10a02c | ||
|
|
df34e7e8c5 | ||
|
|
668b2c9b59 | ||
|
|
9629fb6eb5 | ||
|
|
967b7c6876 | ||
|
|
120dd5b82d | ||
|
|
243b5d2b48 | ||
|
|
24a354c0a7 | ||
|
|
a4f572a1ff |
5
HISTORY
5
HISTORY
@@ -1,5 +1,8 @@
|
||||
3.3.3 2017-06
|
||||
3.4.0 2019-02-
|
||||
default log level is now INFO (Ian)
|
||||
repmgr: fix `standby register --force` when updating existing node record (Ian)
|
||||
repmgrd: set LSN shared memory value at standby startup (Ian)
|
||||
repmgrd: improve logging during failover (Ian)
|
||||
|
||||
3.3.2 2017-06-01
|
||||
Add support for PostgreSQL 10 (Ian)
|
||||
|
||||
@@ -7,9 +7,13 @@ replication capabilities with utilities to set up standby servers, monitor
|
||||
replication, and perform administrative tasks such as failover or switchover
|
||||
operations.
|
||||
|
||||
The current `repmgr` version (3.3) supports all PostgreSQL versions from
|
||||
This `repmgr` version (3.4) supports PostgreSQL versions from
|
||||
9.3 to 9.6.
|
||||
|
||||
*NOTE*: we strongly recommend using the repmgr 4.x series, which contains
|
||||
many new features and usability enhancements and is being actively developed
|
||||
and maintained.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
@@ -210,7 +214,7 @@ See `PACKAGES.md` for details on building .deb and .rpm packages from the
|
||||
Release tarballs are also available:
|
||||
|
||||
https://github.com/2ndQuadrant/repmgr/releases
|
||||
http://repmgr.org/
|
||||
https://repmgr.org/
|
||||
|
||||
`repmgr` is compiled in the same way as a PostgreSQL extension using the PGXS
|
||||
infrastructure, e.g.:
|
||||
|
||||
2
config.c
2
config.c
@@ -59,7 +59,7 @@ progname(void)
|
||||
* added/changed in reload_config()
|
||||
*
|
||||
* NOTE: this function is called before the logger is set up, so we need
|
||||
* to handle the verbose option ourselves; also the default log level is NOTICE,
|
||||
* to handle the verbose option ourselves; also the default log level is INFO,
|
||||
* so we can't use DEBUG.
|
||||
*/
|
||||
bool
|
||||
|
||||
66
dbutils.c
66
dbutils.c
@@ -322,8 +322,6 @@ is_standby(PGconn *conn)
|
||||
bool
|
||||
is_pgup(PGconn *conn, int timeout)
|
||||
{
|
||||
char sqlquery[QUERY_STR_LEN];
|
||||
|
||||
/* Check the connection status twice in case it changes after reset */
|
||||
bool twice = false;
|
||||
|
||||
@@ -346,8 +344,7 @@ is_pgup(PGconn *conn, int timeout)
|
||||
if (wait_connection_availability(conn, timeout) != 1)
|
||||
goto failed;
|
||||
|
||||
sqlquery_snprintf(sqlquery, "SELECT 1");
|
||||
if (PQsendQuery(conn, sqlquery) == 0)
|
||||
if (PQsendQuery(conn, "SELECT 1") == 0)
|
||||
{
|
||||
log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"),
|
||||
PQerrorMessage(conn));
|
||||
@@ -2095,3 +2092,64 @@ get_data_checksum_version(const char *data_directory)
|
||||
|
||||
return (int)control_file.data_checksum_version;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* ========================== */
|
||||
/* backported from repmgr 4.x */
|
||||
/* ========================== */
|
||||
|
||||
XLogRecPtr
|
||||
parse_lsn(const char *str)
|
||||
{
|
||||
XLogRecPtr ptr = InvalidXLogRecPtr;
|
||||
uint32 high,
|
||||
low;
|
||||
|
||||
if (sscanf(str, "%x/%x", &high, &low) == 2)
|
||||
ptr = (((XLogRecPtr) high) << 32) + (XLogRecPtr) low;
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
||||
XLogRecPtr
|
||||
get_last_wal_receive_location(PGconn *conn)
|
||||
{
|
||||
PGresult *res = NULL;
|
||||
XLogRecPtr ptr = InvalidXLogRecPtr;
|
||||
|
||||
if (PQserverVersion(conn) >= 100000)
|
||||
{
|
||||
res = PQexec(conn, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
|
||||
}
|
||||
else
|
||||
{
|
||||
res = PQexec(conn, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
}
|
||||
|
||||
if (PQresultStatus(res) == PGRES_TUPLES_OK)
|
||||
{
|
||||
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
||||
bool
|
||||
is_server_available(const char *conninfo)
|
||||
{
|
||||
PGPing status = PQping(conninfo);
|
||||
|
||||
log_verbose(LOG_DEBUG, "is_server_available(): ping status for \"%s\" is %i\n", conninfo, (int)status);
|
||||
|
||||
if (status == PQPING_OK)
|
||||
return true;
|
||||
|
||||
log_warning("is_server_available(): ping status for \"%s\" is %i\n", conninfo, (int)status);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include "strutil.h"
|
||||
|
||||
|
||||
/*
 * Expand an LSN value into two uint32 arguments (high word, low word),
 * suitable for passing to a "%X/%X" format string.
 *
 * The argument is parenthesised so compound expressions are split
 * correctly; note that it is still evaluated twice.
 */
#define format_lsn(x) (uint32) ((x) >> 32), (uint32) (x)
|
||||
|
||||
typedef enum {
|
||||
UNKNOWN = 0,
|
||||
MASTER,
|
||||
@@ -140,4 +142,10 @@ void create_checkpoint(PGconn *conn);
|
||||
int get_node_replication_state(PGconn *conn, char *node_name, char *output);
|
||||
t_server_type parse_node_type(const char *type);
|
||||
int get_data_checksum_version(const char *data_directory);
|
||||
|
||||
/* backported from repmgr 4.x */
|
||||
XLogRecPtr parse_lsn(const char *str);
|
||||
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
|
||||
bool is_server_available(const char *conninfo);
|
||||
|
||||
#endif
|
||||
|
||||
4
log.c
4
log.c
@@ -44,8 +44,8 @@ static void _stderr_log_with_level(const char *level_name, int level, const char
|
||||
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0)));
|
||||
|
||||
int log_type = REPMGR_STDERR;
|
||||
int log_level = LOG_NOTICE;
|
||||
int last_log_level = LOG_NOTICE;
|
||||
int log_level = LOG_INFO;
|
||||
int last_log_level = LOG_INFO;
|
||||
int verbose_logging = false;
|
||||
int terse_logging = false;
|
||||
/*
|
||||
|
||||
26
repmgr.c
26
repmgr.c
@@ -3601,15 +3601,15 @@ do_standby_clone(void)
|
||||
/* Only from 9.4 */
|
||||
"pg_dynshmem", "pg_logical", "pg_logical/snapshots", "pg_logical/mappings", "pg_replslot",
|
||||
/* Already in 9.3 */
|
||||
"pg_notify", "pg_serial", "pg_snapshots", "pg_stat", "pg_stat_tmp", "pg_tblspc",
|
||||
"pg_twophase", "pg_xlog", 0
|
||||
"pg_notify", "pg_serial", "pg_snapshots", "pg_stat", "pg_stat_tmp",
|
||||
"pg_subtrans", "pg_tblspc", "pg_twophase", "pg_xlog", 0
|
||||
};
|
||||
const int vers[] = {
|
||||
100000,
|
||||
90500,
|
||||
90400, 90400, 90400, 90400, 90400,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
0, -100000, 0
|
||||
0, 0, 0, 0, 0,
|
||||
0, 0, 0, -100000
|
||||
};
|
||||
for (i = 0; dirs[i]; i++)
|
||||
{
|
||||
@@ -4201,27 +4201,13 @@ stop_backup:
|
||||
exit(retval);
|
||||
}
|
||||
|
||||
static void
|
||||
parse_lsn(XLogRecPtr *ptr, const char *str)
|
||||
{
|
||||
uint32 high, low;
|
||||
|
||||
if (sscanf(str, "%x/%x", &high, &low) != 2)
|
||||
return;
|
||||
|
||||
*ptr = (((XLogRecPtr)high) << 32) + (XLogRecPtr)low;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
parse_label_lsn(const char *label_key, const char *label_value)
|
||||
{
|
||||
XLogRecPtr ptr = InvalidXLogRecPtr;
|
||||
XLogRecPtr ptr = parse_lsn(label_value);
|
||||
|
||||
parse_lsn(&ptr, label_value);
|
||||
|
||||
/* parse_lsn() will not modify ptr if it can't parse the label value */
|
||||
/* parse_lsn() will return InvalidXLogRecPtr if it can't parse the label value */
|
||||
if (ptr == InvalidXLogRecPtr)
|
||||
{
|
||||
log_err(_("Couldn't parse backup label entry \"%s: %s\" as lsn"),
|
||||
|
||||
@@ -66,8 +66,8 @@
|
||||
# -------------------------------
|
||||
|
||||
# Log level: possible values are DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG
|
||||
# (default: NOTICE)
|
||||
#loglevel=NOTICE
|
||||
# (default: INFO)
|
||||
#loglevel=INFO
|
||||
|
||||
# Note that logging facility settings will only apply to `repmgrd` by default;
|
||||
# `repmgr` will always write to STDERR unless the switch `--log-to-file` is
|
||||
|
||||
109
repmgrd.c
109
repmgrd.c
@@ -514,6 +514,33 @@ main(int argc, char **argv)
|
||||
else if (node_info.type == STANDBY)
|
||||
{
|
||||
log_info(_("starting continuous standby node monitoring\n"));
|
||||
|
||||
/*
|
||||
* Call update_shared_memory() so it's not stuck at 0/0; this
|
||||
* will otherwise cause an infinite loop on other repmgrds if
|
||||
* this repmgrd does not enter failover.
|
||||
*
|
||||
* NOTE: this is a temporary workaround for a structural
|
||||
* issue resolved through architectural redesign in repmgr 4.
|
||||
*/
|
||||
if (local_options.failover == MANUAL_FAILOVER)
|
||||
{
|
||||
update_shared_memory(PASSIVE_NODE);
|
||||
}
|
||||
else
|
||||
{
|
||||
PQExpBufferData current_lsn;
|
||||
|
||||
XLogRecPtr last_wal_receive_location = get_last_wal_receive_location(my_local_conn);
|
||||
|
||||
initPQExpBuffer(&current_lsn);
|
||||
appendPQExpBuffer(&current_lsn, "%X/%X",
|
||||
format_lsn(last_wal_receive_location));
|
||||
|
||||
update_shared_memory(current_lsn.data);
|
||||
|
||||
termPQExpBuffer(&current_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
do
|
||||
@@ -847,6 +874,8 @@ standby_monitor(void)
|
||||
: "upstream";
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Check that the upstream node is still available
|
||||
* If not, initiate failover process
|
||||
@@ -855,9 +884,7 @@ standby_monitor(void)
|
||||
* local_options.reconnect_interval seconds
|
||||
*/
|
||||
|
||||
check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
if (!check_connection(&upstream_conn, upstream_node_type, upstream_conninfo))
|
||||
{
|
||||
int previous_master_node_id = master_options.node;
|
||||
|
||||
@@ -1372,7 +1399,7 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
total_active_nodes = PQntuples(res);
|
||||
log_debug(_("%d active nodes registered\n"), total_active_nodes);
|
||||
log_info(_("%d active nodes registered\n"), total_active_nodes);
|
||||
|
||||
/*
|
||||
* Build an array with the nodes and indicate which ones are visible and
|
||||
@@ -1421,7 +1448,7 @@ do_master_failover(void)
|
||||
*
|
||||
* If the master did come back at this point, the voting algorithm should decide
|
||||
* it's the "best candidate" anyway and no standby will promote itself or
|
||||
* attempt to follow* another server.
|
||||
* attempt to follow another server.
|
||||
*
|
||||
* If we don't try and connect to the master here (and the code generally
|
||||
* assumes it's failed anyway) but it does come back any time from here
|
||||
@@ -1455,8 +1482,8 @@ do_master_failover(void)
|
||||
}
|
||||
PQclear(res);
|
||||
|
||||
log_debug(_("total nodes counted: registered=%d, visible=%d\n"),
|
||||
total_active_nodes, visible_nodes);
|
||||
log_info(_("total nodes counted: registered=%d, visible=%d\n"),
|
||||
total_active_nodes, visible_nodes);
|
||||
|
||||
/*
|
||||
* Am I on the group that should keep alive? If I see less than half of
|
||||
@@ -1473,7 +1500,7 @@ do_master_failover(void)
|
||||
/* Query all available nodes to determine readiness and LSN */
|
||||
for (i = 0; i < total_active_nodes; i++)
|
||||
{
|
||||
log_debug("checking node %i...\n", nodes[i].node_id);
|
||||
log_info("checking node %i...\n", nodes[i].node_id);
|
||||
|
||||
/* if the node is not visible, skip it */
|
||||
if (!nodes[i].is_visible)
|
||||
@@ -1497,31 +1524,25 @@ do_master_failover(void)
|
||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
|
||||
log_detail("%s\n", PQerrorMessage(node_conn));
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
if (server_version_num >= 100000)
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
|
||||
else
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
xlog_recptr = get_last_wal_receive_location(node_conn);
|
||||
|
||||
res = PQexec(node_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
if (xlog_recptr == InvalidXLogRecPtr)
|
||||
{
|
||||
log_info(_("unable to retrieve node's last standby location: %s\n"),
|
||||
log_info(_("unable to retrieve last standby location for node %i: %s\n"),
|
||||
nodes[i].node_id,
|
||||
PQerrorMessage(node_conn));
|
||||
|
||||
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||
PQclear(res);
|
||||
log_detail(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||
PQfinish(node_conn);
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);
|
||||
log_info(_("current LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
|
||||
|
||||
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));
|
||||
|
||||
PQclear(res);
|
||||
PQfinish(node_conn);
|
||||
|
||||
/* If position is 0/0, error */
|
||||
@@ -1536,7 +1557,6 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
/* last we get info about this node, and update shared memory */
|
||||
|
||||
if (server_version_num >= 100000)
|
||||
sprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
|
||||
else
|
||||
@@ -1555,6 +1575,9 @@ do_master_failover(void)
|
||||
}
|
||||
/* write last location in shared memory */
|
||||
update_shared_memory(PQgetvalue(res, 0, 0));
|
||||
|
||||
log_info("local node's LSN is %s\n", PQgetvalue(res, 0, 0));
|
||||
|
||||
PQclear(res);
|
||||
|
||||
/* Wait for each node to come up and report a valid LSN */
|
||||
@@ -1591,6 +1614,9 @@ do_master_failover(void)
|
||||
*/
|
||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_err(_("connection to node %i has gone away:\n%s\n"),
|
||||
nodes[i].node_id,
|
||||
PQerrorMessage(node_conn));
|
||||
log_info(_("At this point, it could be some race conditions "
|
||||
"that are acceptable, assume the node is restarting "
|
||||
"and starting failover procedure\n"));
|
||||
@@ -1607,6 +1633,9 @@ do_master_failover(void)
|
||||
res = PQexec(node_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
/*
|
||||
* Note: in repmgr4 we handle this kind of situation much more gracefully.
|
||||
*/
|
||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not "
|
||||
"be considered as new master and exit.\n"),
|
||||
PQerrorMessage(node_conn));
|
||||
@@ -1639,8 +1668,8 @@ do_master_failover(void)
|
||||
*/
|
||||
if (strcmp(location_value, PASSIVE_NODE) == 0)
|
||||
{
|
||||
log_debug("node %i is passive mode\n", nodes[i].node_id);
|
||||
log_info(_("node %i will not be considered for promotion\n"), nodes[i].node_id);
|
||||
log_detail("node %i indicates it is a passive node\n", nodes[i].node_id);
|
||||
nodes[i].xlog_location = InvalidXLogRecPtr;
|
||||
continue_loop = false;
|
||||
}
|
||||
@@ -1650,7 +1679,8 @@ do_master_failover(void)
|
||||
*/
|
||||
else if (strcmp(location_value, LSN_QUERY_ERROR) == 0)
|
||||
{
|
||||
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"), nodes[i].node_id);
|
||||
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"),
|
||||
nodes[i].node_id);
|
||||
nodes[i].xlog_location = InvalidXLogRecPtr;
|
||||
continue_loop = false;
|
||||
}
|
||||
@@ -1658,12 +1688,8 @@ do_master_failover(void)
|
||||
/* Unable to parse value returned by `repmgr_get_last_standby_location()` */
|
||||
else if (*location_value == '\0')
|
||||
{
|
||||
log_crit(
|
||||
_("unable to obtain LSN from node %i"), nodes[i].node_id
|
||||
);
|
||||
log_hint(
|
||||
_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n")
|
||||
);
|
||||
log_crit(_("unable to obtain LSN from node %i"), nodes[i].node_id);
|
||||
log_hint(_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n"));
|
||||
|
||||
PQfinish(node_conn);
|
||||
/* XXX shouldn't we just ignore this node? */
|
||||
@@ -1675,14 +1701,14 @@ do_master_failover(void)
|
||||
* strategy keep checking
|
||||
*/
|
||||
else {
|
||||
log_warning(_("unable to parse LSN \"%s\"\n"),
|
||||
log_warning(_("unable to parse shared memory LSN \"%s\"\n"),
|
||||
location_value);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug(
|
||||
_("invalid LSN returned from node %i: '%s'\n"),
|
||||
_("invalid shared memory LSN returned from node %i: '%s'\n"),
|
||||
nodes[i].node_id,
|
||||
location_value);
|
||||
}
|
||||
@@ -1704,7 +1730,7 @@ do_master_failover(void)
|
||||
nodes[i].xlog_location = xlog_recptr;
|
||||
}
|
||||
|
||||
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
|
||||
log_info(_("shared memory LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
|
||||
|
||||
ready_nodes++;
|
||||
nodes[i].is_ready = true;
|
||||
@@ -1760,7 +1786,7 @@ do_master_failover(void)
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
log_debug("best candidate node id is %i\n", best_candidate.node_id);
|
||||
log_info("best candidate node id is %i\n", best_candidate.node_id);
|
||||
|
||||
/* if local node is the best candidate, promote it */
|
||||
if (best_candidate.node_id == local_options.node)
|
||||
@@ -1776,9 +1802,9 @@ do_master_failover(void)
|
||||
sleep(5);
|
||||
|
||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||
|
||||
log_debug("promote command is: \"%s\"\n",
|
||||
local_options.promote_command);
|
||||
log_detail(_("LSN is %X/%X\n"), format_lsn(best_candidate.xlog_location));
|
||||
log_info("promote command is: \"%s\"\n",
|
||||
local_options.promote_command);
|
||||
|
||||
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
||||
{
|
||||
@@ -1834,6 +1860,8 @@ do_master_failover(void)
|
||||
node_info.node_id,
|
||||
failed_master.node_id);
|
||||
|
||||
log_notice("%s\n", event_details.data);
|
||||
|
||||
/* my_local_conn is now the master */
|
||||
create_event_record(my_local_conn,
|
||||
&local_options,
|
||||
@@ -1894,7 +1922,7 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
|
||||
log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
||||
log_notice(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
||||
|
||||
r = system(local_options.follow_command);
|
||||
if (r != 0)
|
||||
@@ -2112,8 +2140,11 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
{
|
||||
int connection_retries;
|
||||
|
||||
if (conninfo != NULL && is_server_available(conninfo))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Check if the node is still available if after
|
||||
* Check if the node is still available; if after
|
||||
* local_options.reconnect_attempts * local_options.reconnect_interval
|
||||
* seconds of retries we cannot reconnect return false
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user