From ad988dcccea138480098cc2133b758019a0a695f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Marqu=C3=A9s?= Date: Mon, 7 Dec 2015 16:14:19 -0300 Subject: [PATCH] Fix bug discovered last week which prevents recovered standby from being used in the cluster. Main issue was that if the local repmgrd was not able to connect locally, it would set the local node as failed (active = false). This is fine, because we actually don't know if the node is active (actually, it's not active ATM) so it's best to keep it out of the cluster. The problem is that if the postgres service comes back up, and is able to recover by itself, then we should acknowledge that fact and set it as active. There was another issue related to repmgrd being terminated if the postgres service was down. This is not the correct thing to do: we should keep trying to connect to the local standby. --- repmgrd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/repmgrd.c b/repmgrd.c index adca5052..1c8ad566 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -691,18 +691,12 @@ standby_monitor(void) initPQExpBuffer(&errmsg); appendPQExpBuffer(&errmsg, - _("failed to connect to local node, node marked as failed and terminating!")); + _("failed to connect to local node, node marked as failed!")); log_err("%s\n", errmsg.data); - create_event_record(master_conn, - &local_options, - local_options.node, - "repmgrd_shutdown", - false, - errmsg.data); - - terminate(ERR_DB_CON); + //terminate(ERR_DB_CON); + goto continue_monitoring_standby; } upstream_conn = get_upstream_connection(my_local_conn, @@ -831,6 +825,7 @@ standby_monitor(void) PQfinish(upstream_conn); + continue_monitoring_standby: /* Check if we still are a standby, we could have been promoted */ do {