mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 16:46:28 +00:00
Make repmgrd retry the connection to the current master 3 times (every 5 min),
then try to find a new master 30 times (every 10 min) before exiting. It does not retry forever because, after a number of attempts, it fails with an error about too many open files.
This commit is contained in:
@@ -181,6 +181,8 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster, int *master_id)
|
|||||||
*master_id = atoi(PQgetvalue(res1, i, 0));
|
*master_id = atoi(PQgetvalue(res1, i, 0));
|
||||||
strcpy(master_conninfo, PQgetvalue(res1, i, 2));
|
strcpy(master_conninfo, PQgetvalue(res1, i, 2));
|
||||||
master_conn = establishDBConnection(master_conninfo, false);
|
master_conn = establishDBConnection(master_conninfo, false);
|
||||||
|
if (PQstatus(master_conn) != CONNECTION_OK)
|
||||||
|
continue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* I can't use the is_standby() function here because on error that
|
* I can't use the is_standby() function here because on error that
|
||||||
|
|||||||
35
repmgrd.c
35
repmgrd.c
@@ -67,8 +67,6 @@ static void setup_cancel_handler(void);
|
|||||||
sleep(3); \
|
sleep(3); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MAX_RETRIES 3
|
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char **argv)
|
main(int argc, char **argv)
|
||||||
@@ -206,21 +204,42 @@ MonitorExecute(void)
|
|||||||
* from the error, try to get a new master. If cannot find one then error
|
* from the error, try to get a new master. If cannot find one then error
|
||||||
* and exit
|
* and exit
|
||||||
*/
|
*/
|
||||||
for (connection_retries = 0; connection_retries < MAX_RETRIES; connection_retries++)
|
for (connection_retries = 0; connection_retries < 3; connection_retries++)
|
||||||
{
|
{
|
||||||
if (PQstatus(primaryConn) != CONNECTION_OK)
|
if (PQstatus(primaryConn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "Connection to master has been lost, trying to recover...");
|
fprintf(stderr, "\n%s: Connection to master has been lost, trying to recover...\n", progname);
|
||||||
|
/* wait 5 minutes between retries */
|
||||||
|
sleep(300);
|
||||||
|
|
||||||
PQreset(primaryConn);
|
PQreset(primaryConn);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n%s: Connection to master has been restored, continue monitoring.\n", progname);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if ((connection_retries = MAX_RETRIES) && (PQstatus(primaryConn) != CONNECTION_OK))
|
if ((connection_retries = 3) && (PQstatus(primaryConn) != CONNECTION_OK))
|
||||||
{
|
{
|
||||||
primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
|
fprintf(stderr, "\n%s: We couldn't reconnect to master, searching if ", progname);
|
||||||
if (primaryConn == NULL)
|
fprintf(stderr, "another node has been promoted.\n", progname);
|
||||||
exit(1);
|
for (connection_retries = 0; connection_retries < 30; connection_retries++)
|
||||||
|
{
|
||||||
|
primaryConn = getMasterConnection(myLocalConn, myLocalId, myClusterName, &primaryId);
|
||||||
|
if (PQstatus(primaryConn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
/* Connected, we can continue the process so break the loop */
|
||||||
|
fprintf(stderr, "\n%s: Connected to node %d, continue monitoring.\n", progname, primaryId);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n%s: We haven't found a new master, waiting before retry...\n", progname);
|
||||||
|
/* wait 10 minutes before retries, after 30 failures we stop trying */
|
||||||
|
sleep(600);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Check if we still are a standby, we could have been promoted */
|
/* Check if we still are a standby, we could have been promoted */
|
||||||
|
|||||||
Reference in New Issue
Block a user