From 508acf5931c82ec0444b5a50495bfeb900710a1e Mon Sep 17 00:00:00 2001 From: Jaime Casanova Date: Thu, 28 Oct 2010 08:24:06 -0500 Subject: [PATCH] Fix segmentation fault caused when trying to close a lost connection. --- repmgrd.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/repmgrd.c b/repmgrd.c index 721a37aa..283bbc7e 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -67,6 +67,8 @@ static void setup_cancel_handler(void); sleep(3); \ } +#define MAX_RETRIES 3 + int main(int argc, char **argv) @@ -197,7 +199,31 @@ MonitorExecute(void) unsigned long long int lsn_standby_received; unsigned long long int lsn_standby_applied; - /* Check if we still are a standby, we could be promoted */ + int connection_retries; + + /* + * Check if the master is still available, if after 3 retries we cannot + * from the error, try to get a new master. If cannot find one then error + * and exit + */ + for (connection_retries = 0; connection_retries < MAX_RETRIES; connection_retries++) + { + if (PQstatus(primaryConn) != CONNECTION_OK) + { + fprintf(stderr, "Connection to master has been lost, trying to recover..."); + PQreset(primaryConn); + } + else + break; + } + if ((connection_retries = MAX_RETRIES) && (PQstatus(primaryConn) != CONNECTION_OK)) + { + primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId); + if (primaryConn == NULL) + exit(1); + } + + /* Check if we still are a standby, we could have been promoted */ if (!is_standby(myLocalConn)) { fprintf(stderr, "\n%s: seems like we have been promoted, so exit from monitoring...\n",