From 6bd1c6a36dfc3976da269ffeb13b0bc690825836 Mon Sep 17 00:00:00 2001
From: Ian Barwick <ian@2ndquadrant.com>
Date: Tue, 16 Aug 2016 13:25:39 +0900
Subject: [PATCH] Skip largely pointless master reconnection attempt.

Experimental - see notes in code.
---
 repmgrd.c | 51 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/repmgrd.c b/repmgrd.c
index 515fb29b..8820278e 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -799,14 +799,13 @@ standby_monitor(void)
 	/*
 	 * Check that the upstream node is still available
 	 * If not, initiate failover process
-	 */
-
-	check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
-	/*
+	 *
 	 * This takes up to local_options.reconnect_attempts *
 	 * local_options.reconnect_interval seconds
 	 */
 
+	check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
+
 	if (PQstatus(upstream_conn) != CONNECTION_OK)
 	{
 		int previous_master_node_id = master_options.node;
@@ -1330,21 +1329,43 @@ do_master_failover(void)
 		nodes[i].is_visible = false;
 		nodes[i].is_ready = false;
 
-		/* Copy details of the failed master node */
-		/* XXX only node_id is actually used later */
-		if (nodes[i].type == MASTER)
-		{
-			failed_master.node_id = nodes[i].node_id;
-			failed_master.xlog_location = nodes[i].xlog_location;
-			failed_master.is_ready = nodes[i].is_ready;
-		}
-
 		log_debug(_("node=%i conninfo=\"%s\" type=%s\n"),
 				  nodes[i].node_id,
 				  nodes[i].conninfo_str,
 				  node_type);
 
-		/* XXX do we need to try and connect to the master here?  */
+		/* Copy details of the failed master node */
+		if (nodes[i].type == MASTER)
+		{
+			/* XXX only node_id is currently used */
+			failed_master.node_id = nodes[i].node_id;
+
+			/*
+			 * XXX experimental
+			 *
+			 * Currently an attempt is made to connect to the master,
+			 * which is very likely to be a waste of time at this point, as we'll
+			 * have spent the last however many seconds trying to do just that
+			 * in check_connection() before deciding it's gone away.
+			 *
+			 * If the master did come back at this point, the voting algorithm should decide
+			 * it's the "best candidate" anyway and no standby will promote itself or
+			 * attempt to follow* another server.
+			 *
+			 * If we don't try and connect to the master here (and the code generally
+			 * assumes it's failed anyway) but it does come back any time from here
+			 * onwards, promotion will fail and the promotion candidate will
+			 * notice the reappearance.
+			 *
+			 * TLDR version: by skipping the master connection attempt (and the chances
+			 * the master would reappear between the last attempt in check_connection()
+			 * and now are minimal) we can remove useless cycles during the failover process;
+			 * if the master does reappear it will be caught before later anyway.
+			 */
+
+			continue;
+		}
+
 		node_conn = establish_db_connection(nodes[i].conninfo_str, false);
 
 		/* if we can't see the node just skip it */
@@ -1706,6 +1727,8 @@ do_master_failover(void)
 				{
 					log_notice(_("Original master reappeared before this standby was promoted - no action taken\n"));
 
+					/* XXX log an event here?  */
+
 					PQfinish(master_conn);
 					master_conn = NULL;