From bc584d84f6ed68c83cc59b65f6ff57c7a6a8f6d9 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 16 Aug 2018 17:14:05 +0900 Subject: [PATCH] repmgrd: improve cascaded standby failover handling In particular, improve handling of the case where the standby follow command fails due to the primary not being available. GitHub #480. --- repmgrd-physical.c | 65 ++++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index d9ff23bd..9c65044a 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -1820,7 +1820,7 @@ do_upstream_standby_failover(void) t_node_info primary_node_info = T_NODE_INFO_INITIALIZER; RecordStatus record_status = RECORD_NOT_FOUND; RecoveryType primary_type = RECTYPE_UNKNOWN; - int i, r; + int i, standby_follow_result; char parsed_follow_command[MAXPGPATH] = ""; close_connection(&upstream_conn); @@ -1885,9 +1885,9 @@ do_upstream_standby_failover(void) */ parse_follow_command(parsed_follow_command, config_file_options.follow_command, primary_node_info.node_id); - r = system(parsed_follow_command); + standby_follow_result = system(parsed_follow_command); - if (r != 0) + if (standby_follow_result != 0) { initPQExpBuffer(&event_details); @@ -1914,6 +1914,10 @@ do_upstream_standby_failover(void) /* * It's possible that the standby is still starting up after the "follow_command" * completes, so poll for a while until we get a connection. + * + * NOTE: we've previously closed the local connection, so even if the follow command + * failed for whatever reason and the local node remained up, we can re-open + * the local connection. */ for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++) @@ -1923,7 +1927,7 @@ do_upstream_standby_failover(void) if (PQstatus(local_conn) == CONNECTION_OK) break; - log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node", + log_debug("sleeping 1 second; %i of %i (\"repmgrd_standby_startup_timeout\") attempts to reconnect to local node", i + 1, config_file_options.repmgrd_standby_startup_timeout); sleep(1); @@ -1939,30 +1943,47 @@ do_upstream_standby_failover(void) /* refresh shared memory settings which will have been zapped by the restart */ repmgrd_set_local_node_id(local_conn, config_file_options.node_id); - if (update_node_record_set_upstream(primary_conn, - local_node_info.node_id, - primary_node_info.node_id) == false) + /* + * + */ + + if (standby_follow_result != 0) { - initPQExpBuffer(&event_details); + monitoring_state = MS_DEGRADED; + INSTR_TIME_SET_CURRENT(degraded_monitoring_start); - appendPQExpBuffer(&event_details, - _("unable to set node %i's new upstream ID to %i"), - local_node_info.node_id, - primary_node_info.node_id); + return FAILOVER_STATE_FOLLOW_FAIL; + } - log_error("%s", event_details.data); + /* + * update upstream_node_id to primary node (but only if follow command + * was successful) + */ - create_event_notification( - NULL, - &config_file_options, - local_node_info.node_id, - "repmgrd_failover_follow", - false, - event_details.data); + { + if (update_node_record_set_upstream(primary_conn, + local_node_info.node_id, + primary_node_info.node_id) == false) + { + initPQExpBuffer(&event_details); + appendPQExpBuffer(&event_details, + _("unable to set node %i's new upstream ID to %i"), + local_node_info.node_id, + primary_node_info.node_id); - termPQExpBuffer(&event_details); + log_error("%s", event_details.data); - terminate(ERR_BAD_CONFIG); + create_event_notification(NULL, + &config_file_options, + local_node_info.node_id, + "repmgrd_failover_follow", + false, + event_details.data); + + termPQExpBuffer(&event_details); + + terminate(ERR_BAD_CONFIG); + } } /* refresh own internal node record */