Prevent "invalid LSN returned from node..." infinite loop

Currently in repmgrd3, if a repmgrd enters failover, but one or more other
repmgrds do not (e.g. partial primary invisibility), the repmgrd in failover
may enter an infinite loop waiting for the repmgrd(s) not in failover to
update shared memory.
This commit is contained in:
Ian Barwick
2019-02-21 14:18:50 +09:00
parent a4f572a1ff
commit 24a354c0a7
4 changed files with 82 additions and 29 deletions

View File

@@ -2095,3 +2095,48 @@ get_data_checksum_version(const char *data_directory)
return (int)control_file.data_checksum_version;
}
/* ========================== */
/* backported from repmgr 4.x */
/* ========================== */
/*
 * parse_lsn()
 *
 * Convert a textual LSN of the form "high/low" (two hexadecimal numbers
 * separated by a slash) into an XLogRecPtr. The first number supplies the
 * upper 32 bits of the result, the second the lower 32 bits.
 *
 * Returns InvalidXLogRecPtr if "str" is NULL or does not begin with two
 * hexadecimal fields in that form.
 */
XLogRecPtr
parse_lsn(const char *str)
{
	XLogRecPtr	ptr = InvalidXLogRecPtr;
	uint32		high = 0,
				low = 0;

	/* sscanf() must never be handed a NULL pointer; treat it as unparseable */
	if (str == NULL)
		return ptr;

	/* both fields must be converted, otherwise the value is rejected */
	if (sscanf(str, "%x/%x", &high, &low) == 2)
		ptr = (((XLogRecPtr) high) << 32) + (XLogRecPtr) low;

	return ptr;
}
/*
 * get_last_wal_receive_location()
 *
 * Ask the server behind "conn" for the last WAL location it has received
 * and return it as an XLogRecPtr.
 *
 * Servers reporting a version number of 100000 or later are queried with
 * pg_last_wal_receive_lsn(); earlier ones with
 * pg_last_xlog_receive_location().
 *
 * Returns InvalidXLogRecPtr if the query does not return tuples or the
 * returned value cannot be parsed by parse_lsn().
 */
XLogRecPtr
get_last_wal_receive_location(PGconn *conn)
{
	const char *query = (PQserverVersion(conn) >= 100000)
		? "SELECT pg_catalog.pg_last_wal_receive_lsn()"
		: "SELECT pg_catalog.pg_last_xlog_receive_location()";
	XLogRecPtr	location = InvalidXLogRecPtr;
	PGresult   *result = PQexec(conn, query);

	if (PQresultStatus(result) == PGRES_TUPLES_OK)
		location = parse_lsn(PQgetvalue(result, 0, 0));

	PQclear(result);

	return location;
}

View File

@@ -28,6 +28,8 @@
#include "strutil.h"
/*
 * Expand an LSN into its upper and lower 32-bit halves, as the two
 * arguments expected by a "%X/%X" format string. The argument is
 * parenthesized so compound expressions expand with the intended
 * precedence; note that it is evaluated twice.
 */
#define format_lsn(x) (uint32) ((x) >> 32), (uint32) (x)
typedef enum {
UNKNOWN = 0,
MASTER,
@@ -140,4 +142,9 @@ void create_checkpoint(PGconn *conn);
int get_node_replication_state(PGconn *conn, char *node_name, char *output);
t_server_type parse_node_type(const char *type);
int get_data_checksum_version(const char *data_directory);
/* backported from repmgr 4.x */
/* Parse a "high/low" hexadecimal LSN string; returns InvalidXLogRecPtr if it cannot be parsed. */
XLogRecPtr parse_lsn(const char *str);
/* Query the server for its last received WAL location; returns InvalidXLogRecPtr if the query does not return tuples or the value cannot be parsed. */
XLogRecPtr get_last_wal_receive_location(PGconn *conn);
#endif

View File

@@ -4201,27 +4201,13 @@ stop_backup:
exit(retval);
}
/*
 * Parse an LSN given as two hexadecimal numbers separated by a slash and
 * store the combined value in *ptr (first number in the upper 32 bits,
 * second in the lower 32 bits).
 *
 * If the string cannot be parsed in that form, *ptr is left untouched.
 */
static void
parse_lsn(XLogRecPtr *ptr, const char *str)
{
	uint32		upper,
				lower;

	if (sscanf(str, "%x/%x", &upper, &lower) == 2)
		*ptr = (((XLogRecPtr) upper) << 32) + (XLogRecPtr) lower;
}
static XLogRecPtr
parse_label_lsn(const char *label_key, const char *label_value)
{
XLogRecPtr ptr = InvalidXLogRecPtr;
XLogRecPtr ptr = parse_lsn(label_value);
parse_lsn(&ptr, label_value);
/* parse_lsn() will not modify ptr if it can't parse the label value */
/* parse_lsn() will return InvalidXLogRecPtr if it can't parse the label value */
if (ptr == InvalidXLogRecPtr)
{
log_err(_("Couldn't parse backup label entry \"%s: %s\" as lsn"),

View File

@@ -514,6 +514,28 @@ main(int argc, char **argv)
else if (node_info.type == STANDBY)
{
log_info(_("starting continuous standby node monitoring\n"));
/*
* Call update_shared_memory() so it's not stuck at 0/0; this
* will otherwise cause an infinite loop on other repmgrds if
* this repmgrd does not enter failover.
*
* NOTE: this is a temporary workaround for a structural
* issue resolved through architectural redesign in repmgr 4.
*/
{
PQExpBufferData current_lsn;
XLogRecPtr last_wal_receive_location = get_last_wal_receive_location(my_local_conn);
initPQExpBuffer(&current_lsn);
appendPQExpBuffer(&current_lsn, "%X/%X",
format_lsn(last_wal_receive_location));
update_shared_memory(current_lsn.data);
termPQExpBuffer(&current_lsn);
}
}
do
@@ -847,6 +869,8 @@ standby_monitor(void)
: "upstream";
}
/*
* Check that the upstream node is still available
* If not, initiate failover process
@@ -1421,7 +1445,7 @@ do_master_failover(void)
*
* If the master did come back at this point, the voting algorithm should decide
* it's the "best candidate" anyway and no standby will promote itself or
* attempt to follow* another server.
* attempt to follow another server.
*
* If we don't try and connect to the master here (and the code generally
* assumes it's failed anyway) but it does come back any time from here
@@ -1500,28 +1524,20 @@ do_master_failover(void)
terminate(ERR_FAILOVER_FAIL);
}
if (server_version_num >= 100000)
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
else
sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
xlog_recptr = get_last_wal_receive_location(node_conn);
res = PQexec(node_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
if (xlog_recptr == InvalidXLogRecPtr)
{
log_info(_("unable to retrieve node's last standby location: %s\n"),
PQerrorMessage(node_conn));
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
PQclear(res);
PQfinish(node_conn);
terminate(ERR_FAILOVER_FAIL);
}
xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);
log_debug(_("LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));
PQclear(res);
PQfinish(node_conn);
/* If position is 0/0, error */
@@ -1536,7 +1552,6 @@ do_master_failover(void)
}
/* last we get info about this node, and update shared memory */
if (server_version_num >= 100000)
sprintf(sqlquery, "SELECT pg_catalog.pg_last_wal_receive_lsn()");
else