From 984ce7420b2ed41ee0033579dc3af2f50ae4b9d1 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 6 Feb 2019 13:31:02 +0900 Subject: [PATCH] "daemon status": emit warning if WAL replay is paused Specifically, if WAL replay is paused *and* WAL is pending replay, this node cannot be promoted until WAL replay is unpaused. In this state it is not a suitable promotion candidate in a failover situation. --- dbutils.c | 2 +- dbutils.h | 2 ++ repmgr-action-daemon.c | 21 ++++++++++++++++++++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/dbutils.c b/dbutils.c index e2da25de..b6b9a24f 100644 --- a/dbutils.c +++ b/dbutils.c @@ -5046,7 +5046,7 @@ is_wal_replay_paused(PGconn *conn, bool check_pending_wal) if (PQresultStatus(res) != PGRES_TUPLES_OK) { - log_db_error(conn, query.data, _("unable to execute \"%s\""), query.data); + log_db_error(conn, query.data, _("unable to execute WAL replay pause query")); } else { diff --git a/dbutils.h b/dbutils.h index 43e193a9..d86432c6 100644 --- a/dbutils.h +++ b/dbutils.h @@ -347,9 +347,11 @@ typedef struct RepmgrdInfo { char pid_file[MAXLEN]; bool pg_running; char pg_running_text[MAXLEN]; + RecoveryType recovery_type; bool running; char repmgrd_running[MAXLEN]; bool paused; + bool wal_paused_pending_wal; } RepmgrdInfo; diff --git a/repmgr-action-daemon.c b/repmgr-action-daemon.c index b97ec8b9..0c724fc0 100644 --- a/repmgr-action-daemon.c +++ b/repmgr-action-daemon.c @@ -66,6 +66,7 @@ do_daemon_status(void) int i; RepmgrdInfo **repmgrd_info; ItemList warnings = {NULL, NULL}; + bool connection_error_found = false; /* Connect to local database to obtain cluster connection data */ log_verbose(LOG_INFO, _("connecting to database")); @@ -108,14 +109,18 @@ do_daemon_status(void) repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo)); repmgrd_info[i]->node_id = cell->node_info->node_id; repmgrd_info[i]->pid = UNKNOWN_PID; + repmgrd_info[i]->recovery_type = RECTYPE_UNKNOWN; repmgrd_info[i]->paused = false; repmgrd_info[i]->running = false; repmgrd_info[i]->pg_running = true; + repmgrd_info[i]->wal_paused_pending_wal = false; cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo); if (PQstatus(cell->node_info->conn) != CONNECTION_OK) { + connection_error_found = true; + if (runtime_options.verbose) { char error[MAXLEN]; @@ -166,6 +171,20 @@ do_daemon_status(void) repmgrd_info[i]->paused = repmgrd_is_paused(cell->node_info->conn); + repmgrd_info[i]->recovery_type = get_recovery_type(cell->node_info->conn); + + if (repmgrd_info[i]->recovery_type == RECTYPE_STANDBY) + { + repmgrd_info[i]->wal_paused_pending_wal = is_wal_replay_paused(cell->node_info->conn, true); + + if (repmgrd_info[i]->wal_paused_pending_wal == true) + { + item_list_append_format(&warnings, + "WAL replay is paused on node \"%s\" (ID: %i) with WAL replay pending; this node cannot be promoted", + cell->node_info->node_name, cell->node_info->node_id); + } + } + PQfinish(cell->node_info->conn); } @@ -244,7 +263,7 @@ do_daemon_status(void) printf(_(" - %s\n"), cell->string); } - if (runtime_options.verbose == false) + if (runtime_options.verbose == false && connection_error_found == true) { log_hint(_("execute with --verbose option to see connection error messages")); }