From a6d0ba07ed360b656f0b033b01df0d9402b2f510 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 26 Apr 2018 19:19:42 +0900 Subject: [PATCH] repmgrd: handle pg_ctl timeout It's possible "pg_ctl promote" will time out, causing "repmgr standby follow" to return with an error; however the promotion itself will usually succeed, so detect this case and handle accordingly. --- repmgrd-physical.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index af209222..a454147b 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -883,7 +883,31 @@ monitor_streaming_standby(void) /* local node has been promoted */ if (get_recovery_type(local_conn) == RECTYPE_PRIMARY) { - log_notice(_("local node is primary, checking local node record")); + log_notice(_("local node is primary, checking local node state")); + + /* + * It's possible the promote command timed out, but the promotion itself + * succeeded. In this case failover state will be FAILOVER_STATE_PROMOTION_FAILED; + * we can update the node record ourselves and resume primary monitoring. + * + * XXX check if other standbys follow + */ + if (failover_state == FAILOVER_STATE_PROMOTION_FAILED) + { + int degraded_monitoring_elapsed; + + update_node_record_set_primary(local_conn, local_node_info.node_id); + record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info); + + degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start); + + log_notice(_("resuming monitoring as primary node after %i seconds"), + degraded_monitoring_elapsed); + + /* this will restart monitoring in primary mode */ + monitoring_state = MS_NORMAL; + return; + } /* * There may be a delay between the node being promoted