Add tunables for connection retries to master and interval between

connection retries; these parameters, along with master_response_timeout,
determine how much time passes between a master failure and failover
This commit is contained in:
Jaime Casanova
2012-07-21 11:01:00 -05:00
parent 08ed0aa987
commit aaf35947ed
5 changed files with 49 additions and 11 deletions

View File

@@ -45,6 +45,10 @@ parse_config(const char *config_file, t_configuration_options *options)
/* if nothing has been provided defaults to 60 */ /* if nothing has been provided defaults to 60 */
options->master_response_timeout = 60; options->master_response_timeout = 60;
/* it defaults to 6 retries with a time between retries of 10s */
options->reconnect_attempts = 6;
options->reconnect_intvl = 10;
/* /*
* Since some commands don't require a config file at all, not * Since some commands don't require a config file at all, not
* having one isn't necessarily a problem. * having one isn't necessarily a problem.
@@ -103,6 +107,10 @@ parse_config(const char *config_file, t_configuration_options *options)
strncpy(options->follow_command, value, MAXLEN); strncpy(options->follow_command, value, MAXLEN);
else if (strcmp(name, "master_response_timeout") == 0) else if (strcmp(name, "master_response_timeout") == 0)
options->master_response_timeout = atoi(value); options->master_response_timeout = atoi(value);
else if (strcmp(name, "reconnect_attempts") == 0)
options->reconnect_attempts = atoi(value);
else if (strcmp(name, "reconnect_interval") == 0)
options->reconnect_intvl = atoi(value);
else else
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value); log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
} }
@@ -128,6 +136,18 @@ parse_config(const char *config_file, t_configuration_options *options)
log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n")); log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
if (options->reconnect_attempts < 0)
{
log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
if (options->reconnect_intvl <= 0)
{
log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
} }
@@ -232,6 +252,18 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
return false; return false;
} }
if (new_options.reconnect_attempts < 0)
{
log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
return false;
}
if (new_options.reconnect_intvl < 0)
{
log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
return false;
}
/* Test conninfo string */ /* Test conninfo string */
conn = establishDBConnection(new_options.conninfo, false); conn = establishDBConnection(new_options.conninfo, false);
if (!conn || (PQstatus(conn) != CONNECTION_OK)) if (!conn || (PQstatus(conn) != CONNECTION_OK))
@@ -252,6 +284,8 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
strcpy(orig_options->follow_command, new_options.follow_command); strcpy(orig_options->follow_command, new_options.follow_command);
strcpy(orig_options->rsync_options, new_options.rsync_options); strcpy(orig_options->rsync_options, new_options.rsync_options);
orig_options->master_response_timeout = new_options.master_response_timeout; orig_options->master_response_timeout = new_options.master_response_timeout;
orig_options->reconnect_attempts = new_options.reconnect_attempts;
orig_options->reconnect_intvl = new_options.reconnect_intvl;
/* /*
* XXX These ones can change with a simple SIGHUP? * XXX These ones can change with a simple SIGHUP?

View File

@@ -37,6 +37,8 @@ typedef struct
char logfacility[MAXLEN]; char logfacility[MAXLEN];
char rsync_options[QUERY_STR_LEN]; char rsync_options[QUERY_STR_LEN];
int master_response_timeout; int master_response_timeout;
int reconnect_attempts;
int reconnect_intvl;
} t_configuration_options; } t_configuration_options;
void parse_config(const char *config_file, t_configuration_options *options); void parse_config(const char *config_file, t_configuration_options *options);

View File

@@ -16,6 +16,10 @@ rsync_options=--archive --checksum --compress --progress --rsh=ssh
# How many seconds we wait for master response before declaring master failure # How many seconds we wait for master response before declaring master failure
master_response_timeout=60 master_response_timeout=60
# How many times we try to reconnect to master before starting failover procedure
reconnect_attempts=6
reconnect_interval=10
# Autofailover options # Autofailover options
failover=automatic failover=automatic
priority=-1 priority=-1

View File

@@ -69,9 +69,5 @@ typedef struct
} t_runtime_options; } t_runtime_options;
#define SLEEP_MONITOR 2 #define SLEEP_MONITOR 2
#define SLEEP_RETRY 3
#define NUM_RETRY 40
#endif #endif

View File

@@ -345,7 +345,7 @@ WitnessMonitor(void)
* Check if the master is still available, if after 5 minutes of retries * Check if the master is still available, if after 5 minutes of retries
* we cannot reconnect, return false. * we cannot reconnect, return false.
*/ */
CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK) if (PQstatus(primaryConn) != CONNECTION_OK)
{ {
@@ -429,7 +429,7 @@ StandbyMonitor(void)
* Check if the master is still available, if after 5 minutes of retries * Check if the master is still available, if after 5 minutes of retries
* we cannot reconnect, try to get a new master. * we cannot reconnect, try to get a new master.
*/ */
CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
if (PQstatus(primaryConn) != CONNECTION_OK) if (PQstatus(primaryConn) != CONNECTION_OK)
{ {
@@ -762,17 +762,19 @@ CheckPrimaryConnection(void)
/* /*
* Check if the master is still available * Check if the master is still available
* if after NUM_RETRY * SLEEP_RETRY seconds of retries * if after local_options.reconnect_attempts * local_options.reconnect_intvl seconds of retries
* we cannot reconnect * we cannot reconnect
* return false * return false
*/ */
for (connection_retries = 0; connection_retries < NUM_RETRY; connection_retries++) for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
{ {
if (!is_pgup(primaryConn, local_options.master_response_timeout)) if (!is_pgup(primaryConn, local_options.master_response_timeout))
{ {
log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), progname, (SLEEP_RETRY*(NUM_RETRY-connection_retries))); log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
/* wait SLEEP_RETRY seconds between retries */ progname,
sleep(SLEEP_RETRY); (local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries)));
/* wait local_options.reconnect_intvl seconds between retries */
sleep(local_options.reconnect_intvl);
} }
else else
{ {