From 829cf5cca44d798b0a6878938685c4e3873f9567 Mon Sep 17 00:00:00 2001
From: Ian Barwick <ian@2ndquadrant.com>
Date: Thu, 22 Feb 2018 11:19:37 +0900
Subject: [PATCH] repmgrd: improve detection of status change from primary to
 standby

If repmgrd is running in degraded mode on a primary which has been stopped,
then manually brought back online as a standby (e.g. by creating
recovery.conf and starting the server), ensure it not only detects the
change but automatically updates the node record so it can resume
monitoring the node as a standby.

Previously, repmgrd was looping waiting for the record to be updated
(as is done transparently when executing "repmgr node rejoin") but
if the record was not updated within the timeout period (e.g. by
"repmgr standby register") it would fail to resume monitoring as a
standby.

It seems reasonable to have repmgrd automatically update the node record,
as this will restore failover capability as quickly as possible. If this
is not desired, then the onus is on the user to shut down repmgrd while
making the desired changes.
---
 HISTORY                              |  3 +-
 dbutils.c                            | 37 ++++++++++-
 dbutils.h                            |  1 +
 doc/repmgrd-degraded-monitoring.sgml | 12 +++-
 repmgrd-physical.c                   | 98 ++++++++++++++++++----------
 5 files changed, 111 insertions(+), 40 deletions(-)
diff --git a/HISTORY b/HISTORY
index 79ed883e..a6b8c7d6 100644
--- a/HISTORY
+++ b/HISTORY
@@ -1,5 +1,6 @@
-4.0.4   2018-??-??
+4.0.4   2018-03-??
         repmgr: ensure "node rejoin" honours "--dry-run" option; GitHub #383 (Ian)
+        repmgrd: improve detection of status change from primary to standby (Ian)
 
 4.0.3   2018-02-15
         repmgr: improve switchover handling when "pg_ctl" used to control the
diff --git a/dbutils.c b/dbutils.c
index 42ffc347..8c62413b 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -2347,8 +2347,7 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active)
 
 	initPQExpBuffer(&query);
 
-	appendPQExpBuffer(
-					  &query,
+	appendPQExpBuffer(&query,
 					  "UPDATE repmgr.nodes SET active = %s "
 					  " WHERE node_id = %i",
 					  active == true ? "TRUE" : "FALSE",
@@ -2373,6 +2372,40 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active)
 }
 
 
+bool
+update_node_record_set_active_standby(PGconn *conn, int this_node_id)
+{
+	PQExpBufferData query;
+	PGresult   *res = NULL;
+	/* mark node "this_node_id" as an active standby in repmgr.nodes; returns true on success, false on failure */
+	initPQExpBuffer(&query);
+
+	appendPQExpBuffer(&query,
+					  "UPDATE repmgr.nodes "
+					  "   SET type = 'standby', "
+					  "       active = TRUE "
+					  " WHERE node_id = %i",
+					  this_node_id);
+
+	log_verbose(LOG_DEBUG, "update_node_record_set_active_standby():\n  %s", query.data);
+
+	res = PQexec(conn, query.data);
+	termPQExpBuffer(&query);	/* query text is not used again after execution */
+	/* any result status other than PGRES_COMMAND_OK is treated as failure: log the connection error and return false */
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	{
+		log_error(_("unable to update node record:\n  %s"),
+				  PQerrorMessage(conn));
+		PQclear(res);
+		return false;
+	}
+
+	PQclear(res);
+
+	return true;
+}
+
+
 bool
 update_node_record_set_primary(PGconn *conn, int this_node_id)
 {
diff --git a/dbutils.h b/dbutils.h
index 59bb4c1e..fe847a17 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -428,6 +428,7 @@ bool		truncate_node_records(PGconn *conn);
 
 bool		update_node_record_set_active(PGconn *conn, int this_node_id, bool active);
 bool		update_node_record_set_primary(PGconn *conn, int this_node_id);
+bool		update_node_record_set_active_standby(PGconn *conn, int this_node_id);
 bool		update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
 bool		update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active);
 bool		update_node_record_conn_priority(PGconn *conn, t_configuration_options *options);
diff --git a/doc/repmgrd-degraded-monitoring.sgml b/doc/repmgrd-degraded-monitoring.sgml
index b2f3ac71..923331ad 100644
--- a/doc/repmgrd-degraded-monitoring.sgml
+++ b/doc/repmgrd-degraded-monitoring.sgml
@@ -40,7 +40,7 @@
    </listitem>
 
    <listitem>
-    <simpara>repmgrd is monitoring the primary node, but it is not available</simpara>
+    <simpara>repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)</simpara>
    </listitem>
   </itemizedlist>
  </para>
@@ -69,7 +69,15 @@
   By default, <literal>repmgrd</literal> will continue in degraded monitoring mode indefinitely.
   However a timeout (in seconds) can be set with <varname>degraded_monitoring_timeout</varname>,
   after which <application>repmgrd</application> will terminate.
-
  </para>
 
+ <note>
+   <para>
+     If <application>repmgrd</application> is monitoring a primary node which has been stopped
+     and manually restarted as a standby attached to a new primary, it will automatically detect
+     the status change and update the node record to reflect the node's new status
+     as an active standby. It will then resume monitoring the node as a standby.
+   </para>
+ </note>
+
 </chapter>
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index 8fbb213a..94547621 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -325,7 +325,6 @@ monitor_streaming_primary(void)
 				else
 				{
 					local_node_info.node_status = NODE_STATUS_UP;
-					monitoring_state = MS_NORMAL;
 
 					initPQExpBuffer(&event_details);
 
@@ -353,54 +352,83 @@ monitor_streaming_primary(void)
 						else
 						{
 							RecordStatus record_status;
-							int i = 0;
 
 							log_debug("primary node id is now %i", primary_node_id);
 
-							/*
-							 * poll for a while until record type is returned as "STANDBY" - it's possible
-							 * that there's a gap between the server being restarted and the record
-							 * being updated
-							 */
-							for (i = 0; i < 30; i++)
-							{
-								/*
-								 * try and refresh the local node record from the primary, as the updated
-								 * local node record may not have been replicated yet
-								 */
-
-								record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
-
-								if (record_status == RECORD_FOUND)
-								{
-									log_debug("type = %s", get_node_type_string(local_node_info.type));
-
-									if (local_node_info.type == STANDBY)
-									{
-										PQfinish(new_primary_conn);
-
-										/* XXX add event notification */
-										return;
-									}
-								}
-								sleep(1);
-							}
-
-							PQfinish(new_primary_conn);
+							record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
 
 							if (record_status == RECORD_FOUND)
 							{
-								log_warning(_("repmgr node record is still %s"), get_node_type_string(local_node_info.type));
+								bool resume_monitoring = true;
+
+								log_debug("node %i is registered with type = %s",
+										  config_file_options.node_id,
+										  get_node_type_string(local_node_info.type));
+
+								/*
+								 * node has recovered but metadata not updated - we can do that ourselves
+								 */
+								if (local_node_info.type == PRIMARY)
+								{
+									log_notice(_("node \"%s\" (ID: %i) still registered as primary, setting to standby"),
+											   config_file_options.node_name,
+											   config_file_options.node_id);
+
+									if (update_node_record_set_active_standby(new_primary_conn, config_file_options.node_id) == false)
+									{
+										resume_monitoring = false;
+									}
+									else
+									{
+										record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
+
+										if (record_status != RECORD_FOUND)
+										{
+											resume_monitoring = false;
+										}
+									}
+								}
+
+								if (resume_monitoring == true)
+								{
+									monitoring_state = MS_NORMAL;
+
+									appendPQExpBuffer(&event_details,
+													  _("former primary has been restored as standby after %i seconds, updating node record and resuming monitoring"),
+													  degraded_monitoring_elapsed);
+
+									create_event_notification(new_primary_conn,
+															  &config_file_options,
+															  config_file_options.node_id,
+															  "repmgrd_standby_reconnect",
+															  true,
+															  event_details.data);
+									log_notice("%s", event_details.data);
+									termPQExpBuffer(&event_details);
+
+									PQfinish(new_primary_conn);
+
+									/* restart monitoring as standby */
+									return;
+								}
 							}
-							else
+							else if (record_status == RECORD_NOT_FOUND)
 							{
-								log_error(_("no metadata record found for this node"));
+								log_error(_("no metadata record found for this node on current primary %i"), primary_node_id);
 								log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
+
+								PQfinish(new_primary_conn);
+
+								/* add event notification */
+								terminate(ERR_BAD_CONFIG);
 							}
+
 						}
 					}
 					else
 					{
+						monitoring_state = MS_NORMAL;
+
 						appendPQExpBuffer(&event_details,
 										  _("reconnected to primary node after %i seconds, resuming monitoring"),
 										  degraded_monitoring_elapsed);