From 5113ab0274f714bcc77b61bd61a5284773d7a653 Mon Sep 17 00:00:00 2001
From: Ian Barwick <ian@2ndquadrant.com>
Date: Thu, 9 Aug 2018 16:36:18 +0900
Subject: [PATCH] repmgrd: fix startup on witness node when local data is stale

Previously, when running on a witness server, repmgrd didn't consider
that the local cache of the "repmgr.nodes" table might be outdated (e.g.
because repmgrd wasn't running on the witness server during a failover),
so it could potentially end up monitoring a former primary now running
as a standby.

When running on a witness server, at startup repmgrd will now scan
all nodes to determine the current primary, and refresh its local
cache from there. This will also ensure it can start up even if the
node currently registered as primary in the local cache is not available.

Implements GitHub #488 and #489.
---
 HISTORY            |  1 +
 repmgrd-physical.c | 36 +++++++++---------------------------
 2 files changed, 10 insertions(+), 27 deletions(-)

diff --git a/HISTORY b/HISTORY
index ca35e934..27adcd30 100644
--- a/HISTORY
+++ b/HISTORY
@@ -3,6 +3,7 @@
         repmgrd: ensure that sending SIGHUP always results in the log file
           being reopened; GitHub #485 (Ian)
         repmgrd: report version number *after* logger initialisation; GitHub #487 (Ian)
+        repmgrd: fix startup on witness node when local data is stale; GitHub #489 (Ian)
 
 4.1.0   2018-07-31
         repmgr: change default log_level to INFO, add documentation; GitHub #470 (Ian)
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index ec0eb4fc..e48253d8 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -1168,36 +1168,18 @@ monitor_streaming_witness(void)
 	PQExpBufferData event_details;
 	RecordStatus record_status;
 
+	int primary_node_id = UNKNOWN_NODE_ID;
+
 	reset_node_voting_status();
 
 	log_debug("monitor_streaming_witness()");
 
-	if (get_primary_node_record(local_conn, &upstream_node_info) == false)
-	{
-		PQExpBufferData event_details;
-
-		initPQExpBuffer(&event_details);
-
-		appendPQExpBuffer(&event_details,
-						  _("unable to retrieve record for primary node"));
-
-		log_error("%s", event_details.data);
-		log_hint(_("execute \"repmgr witness register --force\" to update the witness node "));
-		close_connection(&local_conn);
-
-		create_event_notification(NULL,
-								  &config_file_options,
-								  config_file_options.node_id,
-								  "repmgrd_shutdown",
-								  false,
-								  event_details.data);
-
-		termPQExpBuffer(&event_details);
-
-		terminate(ERR_BAD_CONFIG);
-	}
-
-	primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
+	/*
+	 * At this point we can't trust the local copy of "repmgr.nodes", as
+	 * it may not have been updated. We'll scan the cluster for the current
+	 * primary and refresh the copy from that before proceeding further.
+	 */
+	primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
 
 	/*
 	 * Primary node must be running at repmgrd startup.
@@ -1222,7 +1204,7 @@ monitor_streaming_witness(void)
 	 * refresh upstream node record from primary, so it's as up-to-date
 	 * as possible
 	 */
-	record_status = get_node_record(primary_conn, upstream_node_info.node_id, &upstream_node_info);
+	record_status = get_node_record(primary_conn, primary_node_id, &upstream_node_info);
 
 	/*
 	 * This is unlikely to happen; if it does emit a warning for diagnostic