mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
Prevent "invalid LSN returned from node..." infinite loop
Currently in repmgrd3, if a repmgrd enters failover, but one or more other repmgrds do not (e.g. partial primary invisibility), the repmgrd in failover may enter an infinite loop waiting for the repmgrd(s) not in failover to update shared memory.
This commit is contained in:
45
dbutils.c
45
dbutils.c
@@ -2095,3 +2095,48 @@ get_data_checksum_version(const char *data_directory)
|
||||
|
||||
return (int)control_file.data_checksum_version;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* ========================== */
|
||||
/* backported from repmgr 4.x */
|
||||
/* ========================== */
|
||||
|
||||
XLogRecPtr
|
||||
parse_lsn(const char *str)
|
||||
{
|
||||
XLogRecPtr ptr = InvalidXLogRecPtr;
|
||||
uint32 high,
|
||||
low;
|
||||
|
||||
if (sscanf(str, "%x/%x", &high, &low) == 2)
|
||||
ptr = (((XLogRecPtr) high) << 32) + (XLogRecPtr) low;
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
||||
XLogRecPtr
|
||||
get_last_wal_receive_location(PGconn *conn)
|
||||
{
|
||||
PGresult *res = NULL;
|
||||
XLogRecPtr ptr = InvalidXLogRecPtr;
|
||||
|
||||
if (PQserverVersion(conn) >= 100000)
|
||||
{
|
||||
res = PQexec(conn, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
|
||||
}
|
||||
else
|
||||
{
|
||||
res = PQexec(conn, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
}
|
||||
|
||||
if (PQresultStatus(res) == PGRES_TUPLES_OK)
|
||||
{
|
||||
ptr = parse_lsn(PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include "strutil.h"
|
||||
|
||||
|
||||
/*
 * Expand an LSN into two uint32 arguments (high word, low word) for use
 * with a "%X/%X" format string.
 *
 * The argument is parenthesized so that an expression such as "a | b" is
 * shifted and cast as a whole. Note that it is evaluated twice, so it must
 * not have side effects.
 */
#define format_lsn(x) (uint32) ((x) >> 32), (uint32) (x)
|
||||
|
||||
typedef enum {
|
||||
UNKNOWN = 0,
|
||||
MASTER,
|
||||
@@ -140,4 +142,9 @@ void create_checkpoint(PGconn *conn);
|
||||
int get_node_replication_state(PGconn *conn, char *node_name, char *output);
|
||||
t_server_type parse_node_type(const char *type);
|
||||
int get_data_checksum_version(const char *data_directory);
|
||||
|
||||
/* backported from repmgr 4.x */
|
||||
XLogRecPtr parse_lsn(const char *str);
|
||||
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
|
||||
|
||||
#endif
|
||||
|
||||
18
repmgr.c
18
repmgr.c
@@ -4201,27 +4201,13 @@ stop_backup:
|
||||
exit(retval);
|
||||
}
|
||||
|
||||
static void
|
||||
parse_lsn(XLogRecPtr *ptr, const char *str)
|
||||
{
|
||||
uint32 high, low;
|
||||
|
||||
if (sscanf(str, "%x/%x", &high, &low) != 2)
|
||||
return;
|
||||
|
||||
*ptr = (((XLogRecPtr)high) << 32) + (XLogRecPtr)low;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
parse_label_lsn(const char *label_key, const char *label_value)
|
||||
{
|
||||
XLogRecPtr ptr = InvalidXLogRecPtr;
|
||||
XLogRecPtr ptr = parse_lsn(label_value);
|
||||
|
||||
parse_lsn(&ptr, label_value);
|
||||
|
||||
/* parse_lsn() will not modify ptr if it can't parse the label value */
|
||||
/* parse_lsn() will return InvalidXLogRecPtr if it can't parse the label value */
|
||||
if (ptr == InvalidXLogRecPtr)
|
||||
{
|
||||
log_err(_("Couldn't parse backup label entry \"%s: %s\" as lsn"),
|
||||
|
||||
41
repmgrd.c
41
repmgrd.c
@@ -514,6 +514,28 @@ main(int argc, char **argv)
|
||||
else if (node_info.type == STANDBY)
|
||||
{
|
||||
log_info(_("starting continuous standby node monitoring\n"));
|
||||
|
||||
/*
|
||||
* Call update_shared_memory() so it's not stuck at 0/0; this
|
||||
* will otherwise cause an infinite loop on other repmgrds if
|
||||
* this repmgrd does not enter failover.
|
||||
*
|
||||
* NOTE: this is a temporary workaround for a structural
|
||||
* issue resolved through architectural redesign in repmgr 4.
|
||||
*/
|
||||
{
	PQExpBufferData current_lsn;
	XLogRecPtr	last_wal_receive_location = get_last_wal_receive_location(my_local_conn);

	/* Render the LSN as "%X/%X" text, the form passed to update_shared_memory() */
	initPQExpBuffer(&current_lsn);
	appendPQExpBuffer(&current_lsn, "%X/%X",
					  format_lsn(last_wal_receive_location));

	update_shared_memory(current_lsn.data);

	termPQExpBuffer(&current_lsn);
}
|
||||
}
|
||||
|
||||
do
|
||||
@@ -847,6 +869,8 @@ standby_monitor(void)
|
||||
: "upstream";
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Check that the upstream node is still available
|
||||
* If not, initiate failover process
|
||||
@@ -1421,7 +1445,7 @@ do_master_failover(void)
|
||||
*
|
||||
* If the master did come back at this point, the voting algorithm should decide
|
||||
* it's the "best candidate" anyway and no standby will promote itself or
|
||||
* attempt to follow* another server.
|
||||
* attempt to follow another server.
|
||||
*
|
||||
* If we don't try and connect to the master here (and the code generally
|
||||
* assumes it's failed anyway) but it does come back any time from here
|
||||
@@ -1500,28 +1524,20 @@ do_master_failover(void)
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
if (server_version_num >= 100000)
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
|
||||
else
|
||||
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
|
||||
xlog_recptr = get_last_wal_receive_location(node_conn);
|
||||
|
||||
res = PQexec(node_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
if (xlog_recptr == InvalidXLogRecPtr)
|
||||
{
|
||||
log_info(_("unable to retrieve node's last standby location: %s\n"),
|
||||
PQerrorMessage(node_conn));
|
||||
|
||||
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||
PQclear(res);
|
||||
PQfinish(node_conn);
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);
|
||||
log_debug(_("LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
|
||||
|
||||
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));
|
||||
|
||||
PQclear(res);
|
||||
PQfinish(node_conn);
|
||||
|
||||
/* If position is 0/0, error */
|
||||
@@ -1536,7 +1552,6 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
/* last we get info about this node, and update shared memory */
|
||||
|
||||
if (server_version_num >= 100000)
|
||||
sprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user