mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Add tunables for the number of connection retries to the master and the interval between
connection retries; these parameters, along with master_response_timeout, determine the amount of time between master failure and failover
This commit is contained in:
34
config.c
34
config.c
@@ -45,6 +45,10 @@ parse_config(const char *config_file, t_configuration_options *options)
|
||||
/* if nothing has been provided defaults to 60 */
|
||||
options->master_response_timeout = 60;
|
||||
|
||||
/* it defaults to 6 retries with a time between retries of 10s */
|
||||
options->reconnect_attempts = 6;
|
||||
options->reconnect_intvl = 10;
|
||||
|
||||
/*
|
||||
* Since some commands don't require a config file at all, not
|
||||
* having one isn't necessarily a problem.
|
||||
@@ -103,6 +107,10 @@ parse_config(const char *config_file, t_configuration_options *options)
|
||||
strncpy(options->follow_command, value, MAXLEN);
|
||||
else if (strcmp(name, "master_response_timeout") == 0)
|
||||
options->master_response_timeout = atoi(value);
|
||||
else if (strcmp(name, "reconnect_attempts") == 0)
|
||||
options->reconnect_attempts = atoi(value);
|
||||
else if (strcmp(name, "reconnect_interval") == 0)
|
||||
options->reconnect_intvl = atoi(value);
|
||||
else
|
||||
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
|
||||
}
|
||||
@@ -128,6 +136,18 @@ parse_config(const char *config_file, t_configuration_options *options)
|
||||
log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (options->reconnect_attempts < 0)
|
||||
{
|
||||
log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (options->reconnect_intvl <= 0)
|
||||
{
|
||||
log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -232,6 +252,18 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (new_options.reconnect_attempts < 0)
|
||||
{
|
||||
log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (new_options.reconnect_intvl < 0)
|
||||
{
|
||||
log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Test conninfo string */
|
||||
conn = establishDBConnection(new_options.conninfo, false);
|
||||
if (!conn || (PQstatus(conn) != CONNECTION_OK))
|
||||
@@ -252,6 +284,8 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
|
||||
strcpy(orig_options->follow_command, new_options.follow_command);
|
||||
strcpy(orig_options->rsync_options, new_options.rsync_options);
|
||||
orig_options->master_response_timeout = new_options.master_response_timeout;
|
||||
orig_options->reconnect_attempts = new_options.reconnect_attempts;
|
||||
orig_options->reconnect_intvl = new_options.reconnect_intvl;
|
||||
/*
|
||||
* XXX These ones can change with a simple SIGHUP?
|
||||
|
||||
|
||||
2
config.h
2
config.h
@@ -37,6 +37,8 @@ typedef struct
|
||||
char logfacility[MAXLEN];
|
||||
char rsync_options[QUERY_STR_LEN];
|
||||
int master_response_timeout;
|
||||
int reconnect_attempts;
|
||||
int reconnect_intvl;
|
||||
} t_configuration_options;
|
||||
|
||||
void parse_config(const char *config_file, t_configuration_options *options);
|
||||
|
||||
@@ -16,6 +16,10 @@ rsync_options=--archive --checksum --compress --progress --rsh=ssh
|
||||
# How many seconds we wait for master response before declaring master failure
|
||||
master_response_timeout=60
|
||||
|
||||
# How many times we try to reconnect to master before starting failover procedure
|
||||
reconnect_attempts=6
|
||||
reconnect_interval=10
|
||||
|
||||
# Autofailover options
|
||||
failover=automatic
|
||||
priority=-1
|
||||
|
||||
4
repmgr.h
4
repmgr.h
@@ -69,9 +69,5 @@ typedef struct
|
||||
} t_runtime_options;
|
||||
|
||||
#define SLEEP_MONITOR 2
|
||||
#define SLEEP_RETRY 3
|
||||
#define NUM_RETRY 40
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
16
repmgrd.c
16
repmgrd.c
@@ -345,7 +345,7 @@ WitnessMonitor(void)
|
||||
* Check if the master is still available, if after 5 minutes of retries
|
||||
* we cannot reconnect, return false.
|
||||
*/
|
||||
CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
|
||||
CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
|
||||
|
||||
if (PQstatus(primaryConn) != CONNECTION_OK)
|
||||
{
|
||||
@@ -429,7 +429,7 @@ StandbyMonitor(void)
|
||||
* Check if the master is still available, if after 5 minutes of retries
|
||||
* we cannot reconnect, try to get a new master.
|
||||
*/
|
||||
CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
|
||||
CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
|
||||
|
||||
if (PQstatus(primaryConn) != CONNECTION_OK)
|
||||
{
|
||||
@@ -762,17 +762,19 @@ CheckPrimaryConnection(void)
|
||||
|
||||
/*
|
||||
* Check if the master is still available
|
||||
* if after NUM_RETRY * SLEEP_RETRY seconds of retries
|
||||
* if after local_options.reconnect_attempts * local_options.reconnect_intvl seconds of retries
|
||||
* we cannot reconnect
|
||||
* return false
|
||||
*/
|
||||
for (connection_retries = 0; connection_retries < NUM_RETRY; connection_retries++)
|
||||
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
||||
{
|
||||
if (!is_pgup(primaryConn, local_options.master_response_timeout))
|
||||
{
|
||||
log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), progname, (SLEEP_RETRY*(NUM_RETRY-connection_retries)));
|
||||
/* wait SLEEP_RETRY seconds between retries */
|
||||
sleep(SLEEP_RETRY);
|
||||
log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
|
||||
progname,
|
||||
(local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
|
||||
/* wait local_options.reconnect_intvl seconds between retries */
|
||||
sleep(local_options.reconnect_intvl);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user