From ad988dcccea138480098cc2133b758019a0a695f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mart=C3=ADn=20Marqu=C3=A9s?= Date: Mon, 7 Dec 2015 16:14:19 -0300 Subject: [PATCH] Fix bug discovered last week which prevents recovered standby from being used in the cluster. Main issue was that if the local repmgrd was not able to connect locally, it would set the local node as failed (active = false). This is fine, because we actually don't know if the node is active (actually, it's not active ATM) so it's best to keep it out of the cluster. The problem is that if the postgres service comes back up, and is able to recover by itself, then we should acknowledge that fact and set it as active. There was another issue related to repmgrd being terminated if the postgres service was down. This is not the correct thing to do: we should keep trying to connect to the local standby. --- repmgrd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/repmgrd.c b/repmgrd.c index adca5052..1c8ad566 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -691,18 +691,12 @@ standby_monitor(void) initPQExpBuffer(&errmsg); appendPQExpBuffer(&errmsg, - _("failed to connect to local node, node marked as failed and terminating!")); + _("failed to connect to local node, node marked as failed!")); log_err("%s\n", errmsg.data); - create_event_record(master_conn, - &local_options, - local_options.node, - "repmgrd_shutdown", - false, - errmsg.data); - - terminate(ERR_DB_CON); + //terminate(ERR_DB_CON); + goto continue_monitoring_standby; } upstream_conn = get_upstream_connection(my_local_conn, @@ -831,6 +825,7 @@ standby_monitor(void) PQfinish(upstream_conn); + continue_monitoring_standby: /* Check if we still are a standby, we could have been promoted */ do {