From d353fe2a9f484d5cd02bbfeab0f5ea91525b80a3 Mon Sep 17 00:00:00 2001
From: Ian Barwick <barwick@gmail.com>
Date: Wed, 28 Oct 2015 16:05:35 +0900
Subject: [PATCH] Terminate repmgrd if standby is no longer connected to
 upstream

---
 README.md | 29 +++++++++++++++--------------
 errcode.h |  1 +
 repmgrd.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index b39e4cc9..2ec5650a 100644
--- a/README.md
+++ b/README.md
@@ -355,6 +355,7 @@ Following event types currently exist:
     standby_promote
     witness_create
     repmgrd_start
+    repmgrd_monitor
     repmgrd_failover_promote
     repmgrd_failover_follow
 
@@ -585,20 +586,20 @@ and one view:
 `repmgr` or `repmgrd` will return one of the following error codes on program
 exit:
 
-* SUCCESS (0)             Program ran successfully.
-* ERR_BAD_CONFIG (1)      Configuration file could not be parsed or was invalid
-* ERR_BAD_RSYNC (2)       An rsync call made by the program returned an error
-* ERR_NO_RESTART (4)      An attempt to restart a PostgreSQL instance failed
-* ERR_DB_CON (6)          Error when trying to connect to a database
-* ERR_DB_QUERY (7)        Error while executing a database query
-* ERR_PROMOTED (8)        Exiting program because the node has been promoted to master
-* ERR_BAD_PASSWORD (9)    Password used to connect to a database was rejected
-* ERR_STR_OVERFLOW (10)   String overflow error
-* ERR_FAILOVER_FAIL (11)  Error encountered during failover (repmgrd only)
-* ERR_BAD_SSH (12)        Error when connecting to remote host via SSH
-* ERR_SYS_FAILURE (13)    Error when forking (repmgrd only)
-* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
-
+* SUCCESS (0)              Program ran successfully.
+* ERR_BAD_CONFIG (1)       Configuration file could not be parsed or was invalid
+* ERR_BAD_RSYNC (2)        An rsync call made by the program returned an error
+* ERR_NO_RESTART (4)       An attempt to restart a PostgreSQL instance failed
+* ERR_DB_CON (6)           Error when trying to connect to a database
+* ERR_DB_QUERY (7)         Error while executing a database query
+* ERR_PROMOTED (8)         Exiting program because the node has been promoted to master
+* ERR_BAD_PASSWORD (9)     Password used to connect to a database was rejected
+* ERR_STR_OVERFLOW (10)    String overflow error
+* ERR_FAILOVER_FAIL (11)   Error encountered during failover (repmgrd only)
+* ERR_BAD_SSH (12)         Error when connecting to remote host via SSH
+* ERR_SYS_FAILURE (13)     Error when forking (repmgrd only)
+* ERR_BAD_BASEBACKUP (14)  Error when executing pg_basebackup
+* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only)
 
 Support and Assistance
 ----------------------
diff --git a/errcode.h b/errcode.h
index a67f513c..b6ebd736 100644
--- a/errcode.h
+++ b/errcode.h
@@ -36,5 +36,6 @@
 #define ERR_SYS_FAILURE 13
 #define ERR_BAD_BASEBACKUP 14
 #define ERR_INTERNAL 15
+#define ERR_MONITORING_FAIL 16
 
 #endif   /* _ERRCODE_H_ */
diff --git a/repmgrd.c b/repmgrd.c
index dc96ff3b..e977e3cb 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -677,6 +677,7 @@ standby_monitor(void)
 	char		last_wal_standby_received[MAXLEN];
 	char		last_wal_standby_applied[MAXLEN];
 	char		last_wal_standby_applied_timestamp[MAXLEN];
+	bool        last_wal_standby_received_gte_applied;
 	char		sqlquery[QUERY_STR_LEN];
 
 	XLogRecPtr	lsn_master;
@@ -956,7 +957,8 @@ standby_monitor(void)
 	/* Get local xlog info */
 	sqlquery_snprintf(sqlquery,
 					  "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
-					  "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp() ");
+					  "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), "
+					  "pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()");
 
 	res = PQexec(my_local_conn, sqlquery);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -971,8 +973,47 @@ standby_monitor(void)
 	strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN);
 	strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN);
 	strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
+
+	last_wal_standby_received_gte_applied =	(strcmp(PQgetvalue(res, 0, 4), "t") == 0)
+		? true
+		: false;
+
 	PQclear(res);
 
+	/*
+	 * Check that last WAL received is greater than or equal to last WAL applied
+	 *
+	 * If it is not (received < applied), the standby may no longer be connected
+	 * to the upstream node; in this case repmgrd should terminate itself,
+	 * as the node may no longer be capable of being promoted or following
+	 * a new upstream node.
+	 *
+	 * XXX check if we should (optionally) adopt other strategies to handle
+	 * this situation
+	 */
+	if(last_wal_standby_received_gte_applied == false)
+	{
+		PQExpBufferData errmsg;
+		initPQExpBuffer(&errmsg);
+
+		appendPQExpBuffer(&errmsg,
+						  /* XXX improve message */
+						  _("This node is no longer connected to its upstream node - terminating"));
+
+		log_crit("%s\n", errmsg.data);
+
+		create_event_record(master_conn,
+							&local_options,
+							local_options.node,
+							"repmgrd_monitor",
+							false,
+							errmsg.data);
+
+		// XXX use better code
+		terminate(ERR_MONITORING_FAIL);
+	}
+
+
 	/* Get master xlog info */
 	sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location()");