"daemon status": emit warning if WAL replay is paused

Specifically, if WAL replay is paused *and* there is WAL pending replay,
the node cannot be promoted until WAL replay is resumed. While in this
state, the node is not a suitable promotion candidate in a failover situation.
This commit is contained in:
Ian Barwick
2019-02-06 13:31:02 +09:00
parent 464ec6bec3
commit 984ce7420b
3 changed files with 23 additions and 2 deletions

View File

@@ -5046,7 +5046,7 @@ is_wal_replay_paused(PGconn *conn, bool check_pending_wal)
if (PQresultStatus(res) != PGRES_TUPLES_OK) if (PQresultStatus(res) != PGRES_TUPLES_OK)
{ {
log_db_error(conn, query.data, _("unable to execute \"%s\""), query.data); log_db_error(conn, query.data, _("unable to execute WAL replay pause query"));
} }
else else
{ {

View File

@@ -347,9 +347,11 @@ typedef struct RepmgrdInfo {
char pid_file[MAXLEN]; char pid_file[MAXLEN];
bool pg_running; bool pg_running;
char pg_running_text[MAXLEN]; char pg_running_text[MAXLEN];
RecoveryType recovery_type;
bool running; bool running;
char repmgrd_running[MAXLEN]; char repmgrd_running[MAXLEN];
bool paused; bool paused;
bool wal_paused_pending_wal;
} RepmgrdInfo; } RepmgrdInfo;

View File

@@ -66,6 +66,7 @@ do_daemon_status(void)
int i; int i;
RepmgrdInfo **repmgrd_info; RepmgrdInfo **repmgrd_info;
ItemList warnings = {NULL, NULL}; ItemList warnings = {NULL, NULL};
bool connection_error_found = false;
/* Connect to local database to obtain cluster connection data */ /* Connect to local database to obtain cluster connection data */
log_verbose(LOG_INFO, _("connecting to database")); log_verbose(LOG_INFO, _("connecting to database"));
@@ -108,14 +109,18 @@ do_daemon_status(void)
repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo)); repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
repmgrd_info[i]->node_id = cell->node_info->node_id; repmgrd_info[i]->node_id = cell->node_info->node_id;
repmgrd_info[i]->pid = UNKNOWN_PID; repmgrd_info[i]->pid = UNKNOWN_PID;
repmgrd_info[i]->recovery_type = RECTYPE_UNKNOWN;
repmgrd_info[i]->paused = false; repmgrd_info[i]->paused = false;
repmgrd_info[i]->running = false; repmgrd_info[i]->running = false;
repmgrd_info[i]->pg_running = true; repmgrd_info[i]->pg_running = true;
repmgrd_info[i]->wal_paused_pending_wal = false;
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo); cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK) if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{ {
connection_error_found = true;
if (runtime_options.verbose) if (runtime_options.verbose)
{ {
char error[MAXLEN]; char error[MAXLEN];
@@ -166,6 +171,20 @@ do_daemon_status(void)
repmgrd_info[i]->paused = repmgrd_is_paused(cell->node_info->conn); repmgrd_info[i]->paused = repmgrd_is_paused(cell->node_info->conn);
repmgrd_info[i]->recovery_type = get_recovery_type(cell->node_info->conn);
if (repmgrd_info[i]->recovery_type == RECTYPE_STANDBY)
{
repmgrd_info[i]->wal_paused_pending_wal = is_wal_replay_paused(cell->node_info->conn, true);
if (repmgrd_info[i]->wal_paused_pending_wal == true)
{
item_list_append_format(&warnings,
"WAL replay is paused on node \"%s\" (ID: %i) with WAL replay pending; this node cannot be promoted",
cell->node_info->node_name, cell->node_info->node_id);
}
}
PQfinish(cell->node_info->conn); PQfinish(cell->node_info->conn);
} }
@@ -244,7 +263,7 @@ do_daemon_status(void)
printf(_(" - %s\n"), cell->string); printf(_(" - %s\n"), cell->string);
} }
if (runtime_options.verbose == false) if (runtime_options.verbose == false && connection_error_found == true)
{ {
log_hint(_("execute with --verbose option to see connection error messages")); log_hint(_("execute with --verbose option to see connection error messages"));
} }