From 16048a879ea8ba6e81697ca8c6d4bee14e3c7a2a Mon Sep 17 00:00:00 2001
From: Ian Barwick <ian@2ndquadrant.com>
Date: Fri, 27 Apr 2018 11:54:21 +0900
Subject: [PATCH] repmgrd: notify sibling nodes to follow new primary after
 pg_ctl timeout

If "pg_ctl promote" fails due to a timeout, but the promotion itself succeeds,
have repmgrd on the new primary explicitly notify any sibling nodes to
follow it.

Previously the sibling nodes would wait "primary_notification_timeout" seconds
before attempting to discover the new primary.

This commit (and the preceding commit eac80ae) addresses GitHub #425.
---
 repmgrd-physical.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index a454147b..7709b614 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -889,12 +889,11 @@ monitor_streaming_standby(void)
 					 * It's possible the promote command timed out, but the promotion itself
 					 * succeeded. In this case failover state will be FAILOVER_STATE_PROMOTION_FAILED;
 					 * we can update the node record ourselves and resume primary monitoring.
-					 *
-					 * XXX check if other standbys follow
 					 */
 					if (failover_state == FAILOVER_STATE_PROMOTION_FAILED)
 					{
 						int			degraded_monitoring_elapsed;
+						int			former_upstream_node_id = local_node_info.upstream_node_id;
 
 						update_node_record_set_primary(local_conn,  local_node_info.node_id);
 						record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
@@ -904,6 +903,25 @@ monitor_streaming_standby(void)
 						log_notice(_("resuming monitoring as primary node after %i seconds"),
 								   degraded_monitoring_elapsed);
 
+						initPQExpBuffer(&event_details);
+						appendPQExpBuffer(&event_details,
+										  "promotion command failed but promotion completed successfully");
+						create_event_notification(local_conn,
+												  &config_file_options,
+												  local_node_info.node_id,
+												  "repmgrd_failover_promote",
+												  true,
+												  event_details.data);
+
+						termPQExpBuffer(&event_details);
+
+						/* notify former siblings that they should now follow this node */
+						get_active_sibling_node_records(local_conn,
+														local_node_info.node_id,
+														former_upstream_node_id,
+														&standby_nodes);
+						notify_followers(&standby_nodes, local_node_info.node_id);
+
 						/* this will restart monitoring in primary mode */
 						monitoring_state = MS_NORMAL;
 						return;