Add tunables for connection retries to master and interval between

connection retries; these parameters, along with master_response_timeout,
determine the amount of time from failure to failover
This commit is contained in:
Jaime Casanova
2012-07-21 11:01:00 -05:00
parent 08ed0aa987
commit aaf35947ed
5 changed files with 49 additions and 11 deletions

View File

@@ -45,6 +45,10 @@ parse_config(const char *config_file, t_configuration_options *options)
/* if nothing has been provided defaults to 60 */
options->master_response_timeout = 60;
/* it defaults to 6 retries with a time between retries of 10s */
options->reconnect_attempts = 6;
options->reconnect_intvl = 10;
/*
* Since some commands don't require a config file at all, not
* having one isn't necessarily a problem.
@@ -103,6 +107,10 @@ parse_config(const char *config_file, t_configuration_options *options)
strncpy(options->follow_command, value, MAXLEN);
else if (strcmp(name, "master_response_timeout") == 0)
options->master_response_timeout = atoi(value);
else if (strcmp(name, "reconnect_attempts") == 0)
options->reconnect_attempts = atoi(value);
else if (strcmp(name, "reconnect_interval") == 0)
options->reconnect_intvl = atoi(value);
else
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
}
@@ -128,6 +136,18 @@ parse_config(const char *config_file, t_configuration_options *options)
log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
if (options->reconnect_attempts < 0)
{
log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
if (options->reconnect_intvl <= 0)
{
log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
}
@@ -232,6 +252,18 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
return false;
}
if (new_options.reconnect_attempts < 0)
{
log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
return false;
}
if (new_options.reconnect_intvl < 0)
{
log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
return false;
}
/* Test conninfo string */
conn = establishDBConnection(new_options.conninfo, false);
if (!conn || (PQstatus(conn) != CONNECTION_OK))
@@ -252,6 +284,8 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
strcpy(orig_options->follow_command, new_options.follow_command);
strcpy(orig_options->rsync_options, new_options.rsync_options);
orig_options->master_response_timeout = new_options.master_response_timeout;
orig_options->reconnect_attempts = new_options.reconnect_attempts;
orig_options->reconnect_intvl = new_options.reconnect_intvl;
/*
* XXX These ones can change with a simple SIGHUP?

View File

@@ -37,6 +37,8 @@ typedef struct
char logfacility[MAXLEN];
char rsync_options[QUERY_STR_LEN];
int master_response_timeout;
int reconnect_attempts;
int reconnect_intvl;
} t_configuration_options;
void parse_config(const char *config_file, t_configuration_options *options);

View File

@@ -16,6 +16,10 @@ rsync_options=--archive --checksum --compress --progress --rsh=ssh
# How many seconds we wait for master response before declaring master failure
master_response_timeout=60
# How many times we try to reconnect to master before starting failover procedure
reconnect_attempts=6
reconnect_interval=10
# Autofailover options
failover=automatic
priority=-1

View File

@@ -69,9 +69,5 @@ typedef struct
} t_runtime_options;
#define SLEEP_MONITOR 2
#define SLEEP_RETRY 3
#define NUM_RETRY 40
#endif

View File

@@ -345,7 +345,7 @@ WitnessMonitor(void)
* Check if the master is still available; if we cannot reconnect after the
* configured retries (reconnect_attempts * reconnect_intvl seconds), return false.
*/
CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK)
{
@@ -429,7 +429,7 @@ StandbyMonitor(void)
* Check if the master is still available; if we cannot reconnect after the
* configured retries (reconnect_attempts * reconnect_intvl seconds), try to get a new master.
*/
CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds
CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK)
{
@@ -762,17 +762,19 @@ CheckPrimaryConnection(void)
/*
* Check if the master is still available
* if after NUM_RETRY * SLEEP_RETRY seconds of retries
* if after local_options.reconnect_attempts * local_options.reconnect_intvl seconds of retries
* we cannot reconnect
* return false
*/
for (connection_retries = 0; connection_retries < NUM_RETRY; connection_retries++)
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
{
if (!is_pgup(primaryConn, local_options.master_response_timeout))
{
log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), progname, (SLEEP_RETRY*(NUM_RETRY-connection_retries)));
/* wait SLEEP_RETRY seconds between retries */
sleep(SLEEP_RETRY);
log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
progname,
(local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
/* wait local_options.reconnect_intvl seconds between retries */
sleep(local_options.reconnect_intvl);
}
else
{