Make repmgrd retry connection to current master 3 times (every 5 min),

then try to get a new master 30 times (every 10 min) before exiting.
It does not retry forever because, after a number of attempts, it fails
with an error about too many files being open.
This commit is contained in:
Jaime Casanova
2010-11-10 13:13:53 -05:00
parent 3172ad97cf
commit d9eee72bf0
2 changed files with 29 additions and 8 deletions

View File

@@ -181,6 +181,8 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster, int *master_id)
*master_id = atoi(PQgetvalue(res1, i, 0)); *master_id = atoi(PQgetvalue(res1, i, 0));
strcpy(master_conninfo, PQgetvalue(res1, i, 2)); strcpy(master_conninfo, PQgetvalue(res1, i, 2));
master_conn = establishDBConnection(master_conninfo, false); master_conn = establishDBConnection(master_conninfo, false);
if (PQstatus(master_conn) != CONNECTION_OK)
continue;
/* /*
* I can't use the is_standby() function here because on error that * I can't use the is_standby() function here because on error that

View File

@@ -67,8 +67,6 @@ static void setup_cancel_handler(void);
sleep(3); \ sleep(3); \
} }
#define MAX_RETRIES 3
int int
main(int argc, char **argv) main(int argc, char **argv)
@@ -206,21 +204,42 @@ MonitorExecute(void)
* from the error, try to get a new master. If cannot find one then error * from the error, try to get a new master. If cannot find one then error
* and exit * and exit
*/ */
for (connection_retries = 0; connection_retries < MAX_RETRIES; connection_retries++) for (connection_retries = 0; connection_retries < 3; connection_retries++)
{ {
if (PQstatus(primaryConn) != CONNECTION_OK) if (PQstatus(primaryConn) != CONNECTION_OK)
{ {
fprintf(stderr, "Connection to master has been lost, trying to recover..."); fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname);
/* wait 5 minutes between retries */
sleep(300);
PQreset(primaryConn); PQreset(primaryConn);
} }
else else
{
fprintf(stderr, "\n%s: Connection to master has been restored, continue monitoring.\n", progname);
break; break;
}
} }
if ((connection_retries = MAX_RETRIES) && (PQstatus(primaryConn) != CONNECTION_OK)) if ((connection_retries = 3) && (PQstatus(primaryConn) != CONNECTION_OK))
{ {
primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId); fprintf(stderr, "\n%s: We couldn't reconnect to master, searching if ", progname);
if (primaryConn == NULL) fprintf(stderr, "another node has been promoted.\n", progname);
exit(1); for (connection_retries = 0; connection_retries < 30; connection_retries++)
{
primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
if (PQstatus(primaryConn) == CONNECTION_OK)
{
/* Connected, we can continue the process so break the loop */
fprintf(stderr, "\n%s: Connected to node %d, continue monitoring.\n", progname, primaryId);
break;
}
else
{
fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname);
/* wait 10 minutes before retries, after 30 failures we stop trying */
sleep(600);
}
}
} }
/* Check if we still are a standby, we could have been promoted */ /* Check if we still are a standby, we could have been promoted */