"standby promote": don't promote if replay paused and in archive recovery

It does not appear feasible to predict if there is still WAL waiting to
be replayed from archive. In this case take no action.

GitHub #540.
This commit is contained in:
Ian Barwick
2019-02-05 14:37:26 +09:00
parent f62b3b2868
commit 2a529e7e8b
2 changed files with 49 additions and 13 deletions

View File

@@ -44,6 +44,10 @@
attempting to promote PostgreSQL in this state will leave PostgreSQL in a condition where the
promotion may occur at an unpredictable point in the future.
</para>
<para>
Note that if the standby is in archive recovery, &repmgr; will not be able to determine
if more WAL is pending replay, and will abort the promotion attempt if WAL replay is paused.
</para>
</note>
</refsect1>

View File

@@ -1997,27 +1997,59 @@ do_standby_promote(void)
* need to avoid leaving a "ticking timebomb" which might cause
* an unexpected status change in the replication cluster.
*/
if (is_wal_replay_paused(conn, true) == true)
{
ReplInfo replication_info;
ReplInfo replication_info;
bool replay_paused = false;
init_replication_info(&replication_info);
log_error(_("WAL replay is paused on this node but not all WAL has been replayed"));
if (get_replication_info(conn, &replication_info) == true)
if (get_replication_info(conn, &replication_info) == false)
{
log_detail(_("replay paused at %X/%X; last WAL received is %X/%X"),
format_lsn(replication_info.last_wal_replay_lsn),
format_lsn(replication_info.last_wal_receive_lsn));
log_error(_("unable to retrieve replication information from local node"));
PQfinish(conn);
exit(ERR_PROMOTION_FAIL);
}
if (PQserverVersion(conn) >= 100000)
log_hint(_("execute \"pg_wal_replay_resume()\" to unpause WAL replay"));
/*
* If the local node is recovering from archive, we can't tell
* whether there's still WAL which needs to be replayed, so
* we'll abort if WAL replay is paused.
*/
if (replication_info.receiving_streamed_wal == false)
{
/* just a simple check for paused WAL replay */
replay_paused = is_wal_replay_paused(conn, false);
if (replay_paused == true)
{
log_error(_("WAL replay is paused on this node"));
log_detail(_("node is in archive recovery and is not safe to promote in this state"));
log_detail(_("replay paused at %X/%X"),
format_lsn(replication_info.last_wal_replay_lsn));
}
}
else
log_hint(_("execute \"pg_xlog_replay_resume()\" to unpause WAL replay"));
PQfinish(conn);
exit(ERR_PROMOTION_FAIL);
{
/* check that replay is paused *and* WAL is pending replay */
replay_paused = is_wal_replay_paused(conn, true);
if (replay_paused == true)
{
log_error(_("WAL replay is paused on this node but not all WAL has been replayed"));
log_detail(_("replay paused at %X/%X; last WAL received is %X/%X"),
format_lsn(replication_info.last_wal_replay_lsn),
format_lsn(replication_info.last_wal_receive_lsn));
}
}
if (replay_paused == true)
{
if (PQserverVersion(conn) >= 100000)
log_hint(_("execute \"pg_wal_replay_resume()\" to unpause WAL replay"));
else
log_hint(_("execute \"pg_xlog_replay_resume()\" to unpause WAL replay"));
PQfinish(conn);
exit(ERR_PROMOTION_FAIL);
}
}
/* check that there's no existing primary */