mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
"standby promote": don't promote if replay paused and in archive recovery
It does not appear feasible to predict if there is still WAL waiting to be replayed from archive. In this case take no action. GitHub #540.
This commit is contained in:
@@ -44,6 +44,10 @@
|
||||
attempting to promote PostgreSQL in this state will leave PostgreSQL in a condition where the
|
||||
promotion may occur at a unpredictable point in the future.
|
||||
</para>
|
||||
<para>
|
||||
Note that if the standby is in archive recovery, &repmgr; will not be able to determine
|
||||
if more WAL is pending replay, and will abort the promotion attempt if WAL replay is paused.
|
||||
</para>
|
||||
</note>
|
||||
|
||||
</refsect1>
|
||||
|
||||
@@ -1997,27 +1997,59 @@ do_standby_promote(void)
|
||||
* need to avoid leaving a "ticking timebomb" which might cause
|
||||
* an unexpected status change in the replication cluster.
|
||||
*/
|
||||
if (is_wal_replay_paused(conn, true) == true)
|
||||
{
|
||||
ReplInfo replication_info;
|
||||
ReplInfo replication_info;
|
||||
bool replay_paused = false;
|
||||
|
||||
init_replication_info(&replication_info);
|
||||
|
||||
log_error(_("WAL replay is paused on this node but not all WAL has been replayed"));
|
||||
|
||||
if (get_replication_info(conn, &replication_info) == true)
|
||||
if (get_replication_info(conn, &replication_info) == false)
|
||||
{
|
||||
log_detail(_("replay paused at %X/%X; last WAL received is %X/%X"),
|
||||
format_lsn(replication_info.last_wal_replay_lsn),
|
||||
format_lsn(replication_info.last_wal_receive_lsn));
|
||||
log_error(_("unable to retrieve replication information from local node"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_PROMOTION_FAIL);
|
||||
}
|
||||
|
||||
if (PQserverVersion(conn) >= 100000)
|
||||
log_hint(_("execute \"pg_wal_replay_resume()\" to unpause WAL replay"));
|
||||
/*
|
||||
* If the local node is recovering from archive, we can't tell
|
||||
* whether there's still WAL which needs to be replayed, so
|
||||
* we'll abort if WAL replay is paused.
|
||||
*/
|
||||
if (replication_info.receiving_streamed_wal == false)
|
||||
{
|
||||
/* just a simple check for paused WAL replay */
|
||||
replay_paused = is_wal_replay_paused(conn, false);
|
||||
if (replay_paused == true)
|
||||
{
|
||||
log_error(_("WAL replay is paused on this node"));
|
||||
log_detail(_("node is in archive recovery and is not safe to promote in this state"));
|
||||
log_detail(_("replay paused at %X/%X"),
|
||||
format_lsn(replication_info.last_wal_replay_lsn));
|
||||
}
|
||||
}
|
||||
else
|
||||
log_hint(_("execute \"pg_xlog_replay_resume()\" to unpause WAL replay"));
|
||||
PQfinish(conn);
|
||||
exit(ERR_PROMOTION_FAIL);
|
||||
{
|
||||
/* check that replay is pause *and* WAL is pending replay */
|
||||
replay_paused = is_wal_replay_paused(conn, true);
|
||||
if (replay_paused == true)
|
||||
{
|
||||
log_error(_("WAL replay is paused on this node but not all WAL has been replayed"));
|
||||
log_detail(_("replay paused at %X/%X; last WAL received is %X/%X"),
|
||||
format_lsn(replication_info.last_wal_replay_lsn),
|
||||
format_lsn(replication_info.last_wal_receive_lsn));
|
||||
}
|
||||
}
|
||||
|
||||
if (replay_paused == true)
|
||||
{
|
||||
if (PQserverVersion(conn) >= 100000)
|
||||
log_hint(_("execute \"pg_wal_replay_resume()\" to unpause WAL replay"));
|
||||
else
|
||||
log_hint(_("execute \"pg_xlog_replay_resume()\" to unpause WAL replay"));
|
||||
|
||||
PQfinish(conn);
|
||||
exit(ERR_PROMOTION_FAIL);
|
||||
}
|
||||
}
|
||||
|
||||
/* check that there's no existing primary */
|
||||
|
||||
Reference in New Issue
Block a user