Compare commits

...

10 Commits

Author SHA1 Message Date
Ian Barwick
3988653d6c Add missing line break in logging output
Something else we don't have to worry about in repmgr4.
2019-02-27 09:52:45 +09:00
Ian Barwick
3f9b10a02c Fix directories to exclude in clone from Barman
Backport fix from repmgr4.
2019-02-22 16:31:58 +09:00
Ian Barwick
df34e7e8c5 Prevent "invalid LSN ..." infinite loop when node is passive 2019-02-22 15:49:02 +09:00
Ian Barwick
668b2c9b59 repmgrd: use PQping() as a first test of whether an upstream node is available
It's possible the upstream node may be temporarily not accepting connections
but is still running, so we only confirm that connections are not possible once
PQping() reports a negative result.

This feature has been adapted from repmgr4.
2019-02-22 14:04:37 +09:00
Ian Barwick
9629fb6eb5 repmgrd: remove superfluous query buffer
Query can be sent as-is, no need to copy it to a buffer.
2019-02-21 16:14:08 +09:00
Ian Barwick
967b7c6876 repmgrd: improve logging during failover
Ensure relevant decision making information is visible at the
default log level (INFO), and also that where log messages are
specific to a particular node, that node's ID is noted.
2019-02-21 15:12:23 +09:00
Ian Barwick
120dd5b82d Make the default log level INFO
This ensures that repmgrd outputs a reasonable amount of logging
information at the default log level.
2019-02-21 14:43:14 +09:00
Ian Barwick
243b5d2b48 doc: update README
Add note about repmgr4.1
2019-02-21 14:38:13 +09:00
Ian Barwick
24a354c0a7 Prevent "invalid LSN returned from node..." infinite loop
Currently in repmgrd3, if a repmgrd enters failover, but one or more other
repmgrds do not (e.g. partial primary invisibility), the repmgrd in failover
may enter an infinite loop waiting for the repmgrd(s) not in failover to
update shared memory.
2019-02-21 14:18:50 +09:00
Ian Barwick
a4f572a1ff Bump version
3.4.0
2019-02-21 13:07:48 +09:00
10 changed files with 162 additions and 72 deletions

View File

@@ -1,5 +1,8 @@
3.3.3 2017-06
3.4.0 2019-02-
default log level is now INFO (Ian)
repmgr: fix `standby register --force` when updating existing node record (Ian)
repmgrd: set LSN shared memory value at standby startup (Ian)
repmgrd: improve logging during failover (Ian)
3.3.2 2017-06-01
Add support for PostgreSQL 10 (Ian)

View File

@@ -7,9 +7,13 @@ replication capabilities with utilities to set up standby servers, monitor
replication, and perform administrative tasks such as failover or switchover
operations.
The current `repmgr` version (3.3) supports all PostgreSQL versions from
This `repmgr` version (3.4) supports PostgreSQL versions from
9.3 to 9.6.
*NOTE*: we strongly recommend using the repmgr 4.x series, which contains
many new features and usability enhancements and is being actively developed
and maintained.
Overview
--------
@@ -210,7 +214,7 @@ See `PACKAGES.md` for details on building .deb and .rpm packages from the
Release tarballs are also available:
https://github.com/2ndQuadrant/repmgr/releases
http://repmgr.org/
https://repmgr.org/
`repmgr` is compiled in the same way as a PostgreSQL extension using the PGXS
infrastructure, e.g.:

View File

@@ -59,7 +59,7 @@ progname(void)
* added/changed in reload_config()
*
* NOTE: this function is called before the logger is set up, so we need
* to handle the verbose option ourselves; also the default log level is NOTICE,
* to handle the verbose option ourselves; also the default log level is INFO,
* so we can't use DEBUG.
*/
bool

View File

@@ -322,8 +322,6 @@ is_standby(PGconn *conn)
bool
is_pgup(PGconn *conn, int timeout)
{
char sqlquery[QUERY_STR_LEN];
/* Check the connection status twice in case it changes after reset */
bool twice = false;
@@ -346,8 +344,7 @@ is_pgup(PGconn *conn, int timeout)
if (wait_connection_availability(conn, timeout) != 1)
goto failed;
sqlquery_snprintf(sqlquery, "SELECT 1");
if (PQsendQuery(conn, sqlquery) == 0)
if (PQsendQuery(conn, "SELECT 1") == 0)
{
log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"),
PQerrorMessage(conn));
@@ -2095,3 +2092,64 @@ get_data_checksum_version(const char *data_directory)
return (int)control_file.data_checksum_version;
}
/* ========================== */
/* backported from repmgr 4.x */
/* ========================== */
XLogRecPtr
parse_lsn(const char *str)
{
XLogRecPtr ptr = InvalidXLogRecPtr;
uint32 high,
low;
if (sscanf(str, "%x/%x", &high, &low) == 2)
ptr = (((XLogRecPtr) high) << 32) + (XLogRecPtr) low;
return ptr;
}
/*
 * get_last_wal_receive_location()
 *
 * Query the server behind "conn" for its last WAL receive location and
 * return it as an XLogRecPtr.
 *
 * The SQL function queried is pg_last_wal_receive_lsn() when
 * PQserverVersion() reports 100000 or later, otherwise
 * pg_last_xlog_receive_location().
 *
 * Returns InvalidXLogRecPtr if the query result is not PGRES_TUPLES_OK,
 * or if parse_lsn() cannot parse the returned value.
 */
XLogRecPtr
get_last_wal_receive_location(PGconn *conn)
{
	const char *query;
	PGresult   *result;
	XLogRecPtr	location = InvalidXLogRecPtr;

	query = (PQserverVersion(conn) >= 100000)
		? "SELECT pg_catalog.pg_last_wal_receive_lsn()"
		: "SELECT pg_catalog.pg_last_xlog_receive_location()";

	result = PQexec(conn, query);

	if (PQresultStatus(result) == PGRES_TUPLES_OK)
		location = parse_lsn(PQgetvalue(result, 0, 0));

	PQclear(result);

	return location;
}
/*
 * is_server_available()
 *
 * Ping the server described by "conninfo" using PQping() and report
 * whether the result was PQPING_OK.
 *
 * The ping status is always passed to log_verbose() at LOG_DEBUG; any
 * status other than PQPING_OK is additionally logged as a warning.
 */
bool
is_server_available(const char *conninfo)
{
	const PGPing ping_result = PQping(conninfo);
	const bool	available = (ping_result == PQPING_OK);

	log_verbose(LOG_DEBUG, "is_server_available(): ping status for \"%s\" is %i\n", conninfo, (int) ping_result);

	if (!available)
		log_warning("is_server_available(): ping status for \"%s\" is %i\n", conninfo, (int) ping_result);

	return available;
}

View File

@@ -28,6 +28,8 @@
#include "strutil.h"
/*
 * Expands to two comma-separated uint32 expressions, the upper and lower
 * 32 bits of "x", intended as the argument pair for a "%X/%X" format.
 *
 * The argument is parenthesized so that compound expressions such as
 * "a + b" are split correctly. Note that "x" is evaluated twice, so it
 * must not have side effects.
 */
#define format_lsn(x) (uint32) ((x) >> 32), (uint32) (x)
typedef enum {
UNKNOWN = 0,
MASTER,
@@ -140,4 +142,10 @@ void create_checkpoint(PGconn *conn);
int get_node_replication_state(PGconn *conn, char *node_name, char *output);
t_server_type parse_node_type(const char *type);
int get_data_checksum_version(const char *data_directory);
/* backported from repmgr 4.x */
XLogRecPtr parse_lsn(const char *str);
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
bool is_server_available(const char *conninfo);
#endif

4
log.c
View File

@@ -44,8 +44,8 @@ static void _stderr_log_with_level(const char *level_name, int level, const char
__attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 0)));
int log_type = REPMGR_STDERR;
int log_level = LOG_NOTICE;
int last_log_level = LOG_NOTICE;
int log_level = LOG_INFO;
int last_log_level = LOG_INFO;
int verbose_logging = false;
int terse_logging = false;
/*

View File

@@ -3601,15 +3601,15 @@ do_standby_clone(void)
/* Only from 9.4 */
"pg_dynshmem", "pg_logical", "pg_logical/snapshots", "pg_logical/mappings", "pg_replslot",
/* Already in 9.3 */
"pg_notify", "pg_serial", "pg_snapshots", "pg_stat", "pg_stat_tmp", "pg_tblspc",
"pg_twophase", "pg_xlog", 0
"pg_notify", "pg_serial", "pg_snapshots", "pg_stat", "pg_stat_tmp",
"pg_subtrans", "pg_tblspc", "pg_twophase", "pg_xlog", 0
};
const int vers[] = {
100000,
90500,
90400, 90400, 90400, 90400, 90400,
0, 0, 0, 0, 0, 0,
0, -100000, 0
0, 0, 0, 0, 0,
0, 0, 0, -100000
};
for (i = 0; dirs[i]; i++)
{
@@ -4201,27 +4201,13 @@ stop_backup:
exit(retval);
}
static void
parse_lsn(XLogRecPtr *ptr, const char *str)
{
uint32 high, low;
if (sscanf(str, "%x/%x", &high, &low) != 2)
return;
*ptr = (((XLogRecPtr)high) << 32) + (XLogRecPtr)low;
return;
}
static XLogRecPtr
parse_label_lsn(const char *label_key, const char *label_value)
{
XLogRecPtr ptr = InvalidXLogRecPtr;
XLogRecPtr ptr = parse_lsn(label_value);
parse_lsn(&ptr, label_value);
/* parse_lsn() will not modify ptr if it can't parse the label value */
/* parse_lsn() will return InvalidXLogRecPtr if it can't parse the label value */
if (ptr == InvalidXLogRecPtr)
{
log_err(_("Couldn't parse backup label entry \"%s: %s\" as lsn"),

View File

@@ -66,8 +66,8 @@
# -------------------------------
# Log level: possible values are DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG
# (default: NOTICE)
#loglevel=NOTICE
# (default: INFO)
#loglevel=INFO
# Note that logging facility settings will only apply to `repmgrd` by default;
# `repmgr` will always write to STDERR unless the switch `--log-to-file` is

109
repmgrd.c
View File

@@ -514,6 +514,33 @@ main(int argc, char **argv)
else if (node_info.type == STANDBY)
{
log_info(_("starting continuous standby node monitoring\n"));
/*
* Call update_shared_memory() so it's not stuck at 0/0; this
* will otherwise cause an infinite loop on other repmgrds if
* this repmgrd does not enter failover.
*
* NOTE: this is a temporary workaround for a structural
* issue resolved through architectural redesign in repmgr 4.
*/
if (local_options.failover == MANUAL_FAILOVER)
{
update_shared_memory(PASSIVE_NODE);
}
else
{
PQExpBufferData current_lsn;
XLogRecPtr last_wal_receive_location = get_last_wal_receive_location(my_local_conn);
initPQExpBuffer(&current_lsn);
appendPQExpBuffer(&current_lsn, "%X/%X",
format_lsn(last_wal_receive_location));
update_shared_memory(current_lsn.data);
termPQExpBuffer(&current_lsn);
}
}
do
@@ -847,6 +874,8 @@ standby_monitor(void)
: "upstream";
}
/*
* Check that the upstream node is still available
* If not, initiate failover process
@@ -855,9 +884,7 @@ standby_monitor(void)
* local_options.reconnect_interval seconds
*/
check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
if (PQstatus(upstream_conn) != CONNECTION_OK)
if (!check_connection(&upstream_conn, upstream_node_type, upstream_conninfo))
{
int previous_master_node_id = master_options.node;
@@ -1372,7 +1399,7 @@ do_master_failover(void)
}
total_active_nodes = PQntuples(res);
log_debug(_("%d active nodes registered\n"), total_active_nodes);
log_info(_("%d active nodes registered\n"), total_active_nodes);
/*
* Build an array with the nodes and indicate which ones are visible and
@@ -1421,7 +1448,7 @@ do_master_failover(void)
*
* If the master did come back at this point, the voting algorithm should decide
* it's the "best candidate" anyway and no standby will promote itself or
* attempt to follow* another server.
* attempt to follow another server.
*
* If we don't try and connect to the master here (and the code generally
* assumes it's failed anyway) but it does come back any time from here
@@ -1455,8 +1482,8 @@ do_master_failover(void)
}
PQclear(res);
log_debug(_("total nodes counted: registered=%d, visible=%d\n"),
total_active_nodes, visible_nodes);
log_info(_("total nodes counted: registered=%d, visible=%d\n"),
total_active_nodes, visible_nodes);
/*
* Am I on the group that should keep alive? If I see less than half of
@@ -1473,7 +1500,7 @@ do_master_failover(void)
/* Query all available nodes to determine readiness and LSN */
for (i = 0; i < total_active_nodes; i++)
{
log_debug("checking node %i...\n", nodes[i].node_id);
log_info("checking node %i...\n", nodes[i].node_id);
/* if the node is not visible, skip it */
if (!nodes[i].is_visible)
@@ -1497,31 +1524,25 @@ do_master_failover(void)
if (PQstatus(node_conn) != CONNECTION_OK)
{
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
log_detail("%s\n", PQerrorMessage(node_conn));
terminate(ERR_FAILOVER_FAIL);
}
if (server_version_num >= 100000)
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
else
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
xlog_recptr = get_last_wal_receive_location(node_conn);
res = PQexec(node_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
if (xlog_recptr == InvalidXLogRecPtr)
{
log_info(_("unable to retrieve node's last standby location: %s\n"),
log_info(_("unable to retrieve last standby location for node %i: %s\n"),
nodes[i].node_id,
PQerrorMessage(node_conn));
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
PQclear(res);
log_detail(_("connection details: %s\n"), nodes[i].conninfo_str);
PQfinish(node_conn);
terminate(ERR_FAILOVER_FAIL);
}
xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);
log_info(_("current LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));
PQclear(res);
PQfinish(node_conn);
/* If position is 0/0, error */
@@ -1536,7 +1557,6 @@ do_master_failover(void)
}
/* last we get info about this node, and update shared memory */
if (server_version_num >= 100000)
sprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
else
@@ -1555,6 +1575,9 @@ do_master_failover(void)
}
/* write last location in shared memory */
update_shared_memory(PQgetvalue(res, 0, 0));
log_info("local node's LSN is %s\n", PQgetvalue(res, 0, 0));
PQclear(res);
/* Wait for each node to come up and report a valid LSN */
@@ -1591,6 +1614,9 @@ do_master_failover(void)
*/
if (PQstatus(node_conn) != CONNECTION_OK)
{
log_err(_("connection to node %i has gone away:\n%s\n"),
nodes[i].node_id,
PQerrorMessage(node_conn));
log_info(_("At this point, it could be some race conditions "
"that are acceptable, assume the node is restarting "
"and starting failover procedure\n"));
@@ -1607,6 +1633,9 @@ do_master_failover(void)
res = PQexec(node_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
/*
* Note: in repmgr4 we handle this kind of situation much more gracefully.
*/
log_err(_("PQexec failed: %s.\nReport an invalid value to not "
"be considered as new master and exit.\n"),
PQerrorMessage(node_conn));
@@ -1639,8 +1668,8 @@ do_master_failover(void)
*/
if (strcmp(location_value, PASSIVE_NODE) == 0)
{
log_debug("node %i is passive mode\n", nodes[i].node_id);
log_info(_("node %i will not be considered for promotion\n"), nodes[i].node_id);
log_detail("node %i indicates it is a passive node\n", nodes[i].node_id);
nodes[i].xlog_location = InvalidXLogRecPtr;
continue_loop = false;
}
@@ -1650,7 +1679,8 @@ do_master_failover(void)
*/
else if (strcmp(location_value, LSN_QUERY_ERROR) == 0)
{
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"), nodes[i].node_id);
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"),
nodes[i].node_id);
nodes[i].xlog_location = InvalidXLogRecPtr;
continue_loop = false;
}
@@ -1658,12 +1688,8 @@ do_master_failover(void)
/* Unable to parse value returned by `repmgr_get_last_standby_location()` */
else if (*location_value == '\0')
{
log_crit(
_("unable to obtain LSN from node %i"), nodes[i].node_id
);
log_hint(
_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n")
);
log_crit(_("unable to obtain LSN from node %i"), nodes[i].node_id);
log_hint(_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n"));
PQfinish(node_conn);
/* XXX shouldn't we just ignore this node? */
@@ -1675,14 +1701,14 @@ do_master_failover(void)
* strategy keep checking
*/
else {
log_warning(_("unable to parse LSN \"%s\"\n"),
log_warning(_("unable to parse shared memory LSN \"%s\"\n"),
location_value);
}
}
else
{
log_debug(
_("invalid LSN returned from node %i: '%s'\n"),
_("invalid shared memory LSN returned from node %i: '%s'\n"),
nodes[i].node_id,
location_value);
}
@@ -1704,7 +1730,7 @@ do_master_failover(void)
nodes[i].xlog_location = xlog_recptr;
}
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
log_info(_("shared memory LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
ready_nodes++;
nodes[i].is_ready = true;
@@ -1760,7 +1786,7 @@ do_master_failover(void)
terminate(ERR_FAILOVER_FAIL);
}
log_debug("best candidate node id is %i\n", best_candidate.node_id);
log_info("best candidate node id is %i\n", best_candidate.node_id);
/* if local node is the best candidate, promote it */
if (best_candidate.node_id == local_options.node)
@@ -1776,9 +1802,9 @@ do_master_failover(void)
sleep(5);
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
log_debug("promote command is: \"%s\"\n",
local_options.promote_command);
log_detail(_("LSN is %X/%X\n"), format_lsn(best_candidate.xlog_location));
log_info("promote command is: \"%s\"\n",
local_options.promote_command);
if (log_type == REPMGR_STDERR && *local_options.logfile)
{
@@ -1834,6 +1860,8 @@ do_master_failover(void)
node_info.node_id,
failed_master.node_id);
log_notice("%s\n", event_details.data);
/* my_local_conn is now the master */
create_event_record(my_local_conn,
&local_options,
@@ -1894,7 +1922,7 @@ do_master_failover(void)
}
log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);
log_notice(_("executing follow command: \"%s\"\n"), local_options.follow_command);
r = system(local_options.follow_command);
if (r != 0)
@@ -2112,8 +2140,11 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
{
int connection_retries;
if (conninfo != NULL && is_server_available(conninfo))
return true;
/*
* Check if the node is still available if after
* Check if the node is still available; if after
* local_options.reconnect_attempts * local_options.reconnect_interval
* seconds of retries we cannot reconnect return false
*/

View File

@@ -1,6 +1,6 @@
#ifndef _VERSION_H_
#define _VERSION_H_
#define REPMGR_VERSION "3.3.2"
#define REPMGR_VERSION "3.4.0"
#endif