Make repmgrd retry connection to current master 3 times (every 5 min),

then try to find a new master 30 times (every 10 min) before exiting. It does not retry forever because, after a number of attempts, it fails with an error about too many open files.
2026-06-01 03:39:05 +00:00 · 2010-11-10 13:13:53 -05:00
parent 3172ad97cf
commit d9eee72bf0
2 changed files with 29 additions and 8 deletions
@@ -181,6 +181,8 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster, int *master_id)
 		*master_id = atoi(PQgetvalue(res1, i, 0));
 		strcpy(master_conninfo, PQgetvalue(res1, i, 2));
 		master_conn = establishDBConnection(master_conninfo, false);
 		if (PQstatus(master_conn) != CONNECTION_OK)
 			continue;
 		/* 
 		 * I can't use the is_standby() function here because on error that 
@@ -67,8 +67,6 @@ static void setup_cancel_handler(void);
 							sleep(3); \
 						} 
 #define MAX_RETRIES 3
 int
 main(int argc, char **argv)
@@ -206,21 +204,42 @@ MonitorExecute(void)
 	 * from the error, try to get a new master. If cannot find one then error
 	 * and exit
 	 */
-	for (connection_retries = 0; connection_retries < MAX_RETRIES; connection_retries++)
+	for (connection_retries = 0; connection_retries < 3; connection_retries++)
 	{
 		if (PQstatus(primaryConn) != CONNECTION_OK)
 		{
-			fprintf(stderr, "Connection to master has been lost, trying to recover...");
+			fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname);
 			/* wait 5 minutes between retries */
 			sleep(300);
 			PQreset(primaryConn);
 		}	
 		else
 		{
 			fprintf(stderr, "\n%s: Connection to master has been restored, continue monitoring.\n", progname);
 			break;
 		}
 	}
-	if ((connection_retries = MAX_RETRIES) && (PQstatus(primaryConn) != CONNECTION_OK))
+	if ((connection_retries = 3) && (PQstatus(primaryConn) != CONNECTION_OK))
 	{
-		primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
+		fprintf(stderr, "\n%s: We couldn't reconnect to master, searching if ", progname);
-		if (primaryConn == NULL)
+		fprintf(stderr, "another node has been promoted.\n", progname);
-			exit(1);
+		for (connection_retries = 0; connection_retries < 30; connection_retries++)
 		{
 			primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
 			if (PQstatus(primaryConn) == CONNECTION_OK)
 			{
 				/* Connected, we can continue the process so break the loop */
 				fprintf(stderr, "\n%s: Connected to node %d, continue monitoring.\n", progname, primaryId);
 				break;
 			}
 			else
 			{
 				fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname);
 				/* wait 10 minutes before retries, after 30 failures we stop trying */
 				sleep(600);
 			}
 		}
 	}
 	/* Check if we still are a standby, we could have been promoted */