Implement Martin's suggestion about how much we should try to

recover the connection instead of doing whatever I want.
This commit is contained in:
Jaime Casanova
2010-11-10 13:30:50 -05:00
parent d9eee72bf0
commit 3565fe1c3a

View File

@@ -200,17 +200,16 @@ MonitorExecute(void)
int connection_retries; int connection_retries;
/* /*
* Check if the master is still available, if after 3 retries we cannot * Check if the master is still available, if after 5 minutes of retries
* from the error, try to get a new master. If cannot find one then error * we cannot reconnect, try to get a new master.
* and exit
*/ */
for (connection_retries = 0; connection_retries < 3; connection_retries++) for (connection_retries = 0; connection_retries < 15; connection_retries++)
{ {
if (PQstatus(primaryConn) != CONNECTION_OK) if (PQstatus(primaryConn) != CONNECTION_OK)
{ {
fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname); fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname);
/* wait 5 minutes between retries */ /* wait 20 seconds between retries */
sleep(300); sleep(20);
PQreset(primaryConn); PQreset(primaryConn);
} }
@@ -220,11 +219,11 @@ MonitorExecute(void)
break; break;
} }
} }
if ((connection_retries = 3) && (PQstatus(primaryConn) != CONNECTION_OK)) if (PQstatus(primaryConn) != CONNECTION_OK)
{ {
fprintf(stderr, "\n%s: We couldn't reconnect to master, searching if ", progname); fprintf(stderr, "\n%s: We couldn't reconnect to master, searching if ", progname);
fprintf(stderr, "another node has been promoted.\n", progname); fprintf(stderr, "another node has been promoted.\n", progname);
for (connection_retries = 0; connection_retries < 30; connection_retries++) for (connection_retries = 0; connection_retries < 6; connection_retries++)
{ {
primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId); primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
if (PQstatus(primaryConn) == CONNECTION_OK) if (PQstatus(primaryConn) == CONNECTION_OK)
@@ -236,8 +235,8 @@ MonitorExecute(void)
else else
{ {
fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname); fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname);
/* wait 10 minutes before retries, after 30 failures we stop trying */ /* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
sleep(600); sleep(300);
} }
} }
} }