From a6d0ba07ed360b656f0b033b01df0d9402b2f510 Mon Sep 17 00:00:00 2001
From: Ian Barwick <barwick@gmail.com>
Date: Thu, 26 Apr 2018 19:19:42 +0900
Subject: [PATCH] repmgrd: handle pg_ctl timeout

It's possible "pg_ctl promote" will time out, causing "repmgr standby
promote" to return with an error; however, the promotion itself will usually
succeed, so detect this case and handle it accordingly.
---
 repmgrd-physical.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index af209222..a454147b 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -883,7 +883,31 @@ monitor_streaming_standby(void)
 				/* local node has been promoted */
 				if (get_recovery_type(local_conn) == RECTYPE_PRIMARY)
 				{
-					log_notice(_("local node is primary, checking local node record"));
+					log_notice(_("local node is primary, checking local node state"));
+
+					/*
+					 * It's possible the promote command timed out, but the promotion itself
+					 * succeeded. In this case failover state will be FAILOVER_STATE_PROMOTION_FAILED;
+					 * we can update the node record ourselves and resume primary monitoring.
+					 *
+					 * XXX check if other standbys follow
+					 */
+					if (failover_state == FAILOVER_STATE_PROMOTION_FAILED)
+					{
+						int			degraded_monitoring_elapsed;
+
+						update_node_record_set_primary(local_conn,  local_node_info.node_id);
+						record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
+
+						degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
+
+						log_notice(_("resuming monitoring as primary node after %i seconds"),
+								   degraded_monitoring_elapsed);
+
+						/* this will restart monitoring in primary mode */
+						monitoring_state = MS_NORMAL;
+						return;
+					}
 
 					/*
 					 * There may be a delay between the node being promoted