From d353fe2a9f484d5cd02bbfeab0f5ea91525b80a3 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 28 Oct 2015 16:05:35 +0900 Subject: [PATCH] Terminate repmgrd if standby is no longer connected to upstream --- README.md | 29 +++++++++++++++-------------- errcode.h | 1 + repmgrd.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index b39e4cc9..2ec5650a 100644 --- a/README.md +++ b/README.md @@ -355,6 +355,7 @@ Following event types currently exist: standby_promote witness_create repmgrd_start + repmgrd_monitor repmgrd_failover_promote repmgrd_failover_follow @@ -585,20 +586,20 @@ and one view: `repmgr` or `repmgrd` will return one of the following error codes on program exit: -* SUCCESS (0) Program ran successfully. -* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid -* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error -* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed -* ERR_DB_CON (6) Error when trying to connect to a database -* ERR_DB_QUERY (7) Error while executing a database query -* ERR_PROMOTED (8) Exiting program because the node has been promoted to master -* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected -* ERR_STR_OVERFLOW (10) String overflow error -* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only) -* ERR_BAD_SSH (12) Error when connecting to remote host via SSH -* ERR_SYS_FAILURE (13) Error when forking (repmgrd only) -* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup - +* SUCCESS (0) Program ran successfully. 
+* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid +* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error +* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed +* ERR_DB_CON (6) Error when trying to connect to a database +* ERR_DB_QUERY (7) Error while executing a database query +* ERR_PROMOTED (8) Exiting program because the node has been promoted to master +* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected +* ERR_STR_OVERFLOW (10) String overflow error +* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only) +* ERR_BAD_SSH (12) Error when connecting to remote host via SSH +* ERR_SYS_FAILURE (13) Error when forking (repmgrd only) +* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup +* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only) Support and Assistance ---------------------- diff --git a/errcode.h b/errcode.h index a67f513c..b6ebd736 100644 --- a/errcode.h +++ b/errcode.h @@ -36,5 +36,6 @@ #define ERR_SYS_FAILURE 13 #define ERR_BAD_BASEBACKUP 14 #define ERR_INTERNAL 15 +#define ERR_MONITORING_FAIL 16 #endif /* _ERRCODE_H_ */ diff --git a/repmgrd.c b/repmgrd.c index dc96ff3b..e977e3cb 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -677,6 +677,7 @@ standby_monitor(void) char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; char last_wal_standby_applied_timestamp[MAXLEN]; + bool last_wal_standby_received_gte_applied; char sqlquery[QUERY_STR_LEN]; XLogRecPtr lsn_master; @@ -956,7 +957,8 @@ standby_monitor(void) /* Get local xlog info */ sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " - "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp() "); + "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), " + "pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()"); res = PQexec(my_local_conn, sqlquery); if 
(PQresultStatus(res) != PGRES_TUPLES_OK) @@ -971,8 +973,47 @@ standby_monitor(void) strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN); strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN); + + last_wal_standby_received_gte_applied = (strcmp(PQgetvalue(res, 0, 4), "t") == 0) + ? true + : false; + PQclear(res); + /* + * Check that last WAL received is greater than or equal to last WAL applied + * + * A failure of this check can occur when the standby is no longer connected to + * the upstream node; in this case repmgrd should terminate itself + * as the node may no longer be capable of being promoted or following + * a new upstream node + * + * XXX check if we should (optionally) adopt other strategies to handle + * this situation + */ + if(last_wal_standby_received_gte_applied == false) + { + PQExpBufferData errmsg; + initPQExpBuffer(&errmsg); + + appendPQExpBuffer(&errmsg, + /* XXX improve message */ + _("This node is no longer connected to its upstream node - terminating")); + + log_crit("%s\n", errmsg.data); + + create_event_record(master_conn, + &local_options, + local_options.node, + "repmgrd_monitor", + false, + errmsg.data); + + // XXX use better code + terminate(ERR_MONITORING_FAIL); + } + + /* Get master xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location()");