From 16048a879ea8ba6e81697ca8c6d4bee14e3c7a2a Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Fri, 27 Apr 2018 11:54:21 +0900 Subject: [PATCH] repmgrd: notify sibling nodes to follow new primary after pg_ctl timeout If "pg_ctl promote" fails due to a timeout, but the promotion itself succeeds, have repmgrd on the new primary explicitly notify any sibling nodes to follow it. Previously the sibling nodes would wait "primary_notification_timeout" seconds before attempting to discover the new primary. This (and preceding commit eac80ae) address GitHub #425. --- repmgrd-physical.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index a454147b..7709b614 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -889,12 +889,11 @@ monitor_streaming_standby(void) * It's possible the promote command timed out, but the promotion itself * succeeded. In this case failover state will be FAILOVER_STATE_PROMOTION_FAILED; * we can update the node record ourselves and resume primary monitoring. - * - * XXX check if other standbys follow */ if (failover_state == FAILOVER_STATE_PROMOTION_FAILED) { int degraded_monitoring_elapsed; + int former_upstream_node_id = local_node_info.upstream_node_id; update_node_record_set_primary(local_conn, local_node_info.node_id); record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info); @@ -904,6 +903,25 @@ monitor_streaming_standby(void) log_notice(_("resuming monitoring as primary node after %i seconds"), degraded_monitoring_elapsed); + initPQExpBuffer(&event_details); + appendPQExpBuffer(&event_details, + "promotion command failed but promotion completed successfully"); + create_event_notification(local_conn, + &config_file_options, + local_node_info.node_id, + "repmgrd_failover_promote", + true, + event_details.data); + + termPQExpBuffer(&event_details); + + /* notify former siblings that they should now follow this node */ + get_active_sibling_node_records(local_conn, + local_node_info.node_id, + former_upstream_node_id, + &standby_nodes); + notify_followers(&standby_nodes, local_node_info.node_id); + /* this will restart monitoring in primary mode */ monitoring_state = MS_NORMAL; return;