From bb842c398923e03a8757a33d8ca7dc781454fc1f Mon Sep 17 00:00:00 2001
From: Ian Barwick <barwick@gmail.com>
Date: Tue, 1 Nov 2016 16:34:32 +0900
Subject: [PATCH] repmgr: improve replication status checking during switchover

When checking the new standby's record in pg_stat_replication, keep
polling until the expected status is reported, and only give up
after a timeout was exceeded.

Previously repmgr would report an error if status was "startup",
even though this is not a problem.
---
 repmgr.c  | 122 +++++++++++++++++++++++++++++++++---------------------
 repmgrd.c |   1 -
 2 files changed, 74 insertions(+), 49 deletions(-)

diff --git a/repmgr.c b/repmgr.c
index da892aed..5622a89b 100644
--- a/repmgr.c
+++ b/repmgr.c
@@ -4180,8 +4180,8 @@ do_standby_promote(void)
 	/*
 	 * Promote standby to master.
 	 *
-	 * `pg_ctl promote` returns immediately and has no -w option, so we
-	 * can't be sure when or if the promotion completes.
+	 * `pg_ctl promote` returns immediately and (prior to 10.0) has no -w option
+	 * so we can't be sure when or if the promotion completes.
 	 * For now we'll poll the server until the default timeout (60 seconds)
 	 */
 
@@ -5275,69 +5275,95 @@ do_standby_switchover(void)
 		exit(ERR_SWITCHOVER_FAIL);
 	}
 
+	/* A connection was made and it was determined the standby is in recovery */
 	log_debug("new standby is in recovery\n");
 
-	/* Check for entry in pg_stat_replication */
+	/* Check for entry in the new master's pg_stat_replication */
 
-	local_conn = establish_db_connection(options.conninfo, true);
-
-	query_result = get_node_replication_state(local_conn, remote_node_record.name, remote_node_replication_state);
-
-	if (query_result == -1)
 	{
+		int			i,
+			replication_check_timeout  = 60,
+			replication_check_interval = 2;
+		bool replication_connection_ok = false;
 		PQExpBufferData event_details;
-		initPQExpBuffer(&event_details);
-		appendPQExpBuffer(&event_details,
-						  _("unable to retrieve replication status for node %i"),
-						  remote_node_id);
-		log_err("%s\n", event_details.data);
-		create_event_record(local_conn,
-							&options,
-							options.node,
-							"standby_switchover",
-							false,
-							event_details.data);
-		termPQExpBuffer(&event_details);
-		PQfinish(local_conn);
-		exit(ERR_SWITCHOVER_FAIL);
-	}
 
-	if (query_result == 0)
-	{
-		log_err(_("node %i not replicating\n"), remote_node_id);
-	}
-	else
-	{
-		/* XXX we should poll for a while in case the node takes time to connect to the primary */
-		if (strcmp(remote_node_replication_state, "streaming") == 0 ||
-			strcmp(remote_node_replication_state, "catchup")  == 0)
+		initPQExpBuffer(&event_details);
+
+		local_conn = establish_db_connection(options.conninfo, true);
+
+		i = 0;
+		for (;;)
 		{
-			log_verbose(LOG_NOTICE, _("node %i is replicating in state \"%s\"\n"), remote_node_id, remote_node_replication_state);
-		}
-		else
-		{
-			/*
-			 * Other possible replication states are:
-			 *  - startup
-			 *  - backup
-			 *  - UNKNOWN
-			 */
-			PQExpBufferData event_details;
+			query_result = get_node_replication_state(local_conn, remote_node_record.name, remote_node_replication_state);
+
+			if (query_result == -1)
+			{
+				appendPQExpBuffer(&event_details,
+								  _("unable to retrieve replication status for node %i"),
+								  remote_node_id);
+				log_warning("%s\n", event_details.data);
+			}
+			else if (query_result == 0)
+			{
+				log_warning(_("pg_stat_replication record for node %i not yet found\n"), remote_node_id);
+			}
+			else
+			{
+				if (strcmp(remote_node_replication_state, "streaming") == 0 ||
+					strcmp(remote_node_replication_state, "catchup")  == 0)
+				{
+					log_verbose(LOG_NOTICE, _("node %i is replicating in state \"%s\"\n"), remote_node_id, remote_node_replication_state);
+					replication_connection_ok = true;
+					break;
+				}
+				else if (strcmp(remote_node_replication_state, "startup") == 0)
+				{
+					log_verbose(LOG_NOTICE, _("node %i is starting up replication\n"), remote_node_id);
+				}
+				else
+				{
+					/*
+					 * Other possible replication states are:
+					 *  - backup
+					 *  - UNKNOWN
+					 */
+					appendPQExpBuffer(&event_details,
+									  _("node %i has unexpected replication state \"%s\""),
+									  remote_node_id, remote_node_replication_state);
+					log_warning("%s\n", event_details.data);
+				}
+			}
+
+			if (i >= replication_check_timeout)
+				break;
+
+			sleep(replication_check_interval);
+
+			i += replication_check_interval;
+
+			/* Reinitialise the string buffer */
+			termPQExpBuffer(&event_details);
 			initPQExpBuffer(&event_details);
-			appendPQExpBuffer(&event_details,
-							  _("node %i has unexpected replication state \"%s\""),
-							  remote_node_id, remote_node_replication_state);
-			log_err("%s\n", event_details.data);
+		}
+
+		/*
+		 * We were unable to establish that the new standby had a pg_stat_replication
+		 * record within the timeout period, so fail with whatever error message
+		 * was placed in the string buffer.
+		 */
+		if (replication_connection_ok == false)
+		{
 			create_event_record(local_conn,
 								&options,
 								options.node,
 								"standby_switchover",
 								false,
 								event_details.data);
-			termPQExpBuffer(&event_details);
 			PQfinish(local_conn);
 			exit(ERR_SWITCHOVER_FAIL);
 		}
+
+		termPQExpBuffer(&event_details);
 	}
 
 	/*
diff --git a/repmgrd.c b/repmgrd.c
index 3011254e..fbe76b22 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -1513,7 +1513,6 @@ do_master_failover(void)
 		 */
 		if (PQstatus(node_conn) != CONNECTION_OK)
 		{
-			/* XXX */
 			log_info(_("At this point, it could be some race conditions "
 					   "that are acceptable, assume the node is restarting "
 					   "and starting failover procedure\n"));