diff --git a/config.c b/config.c index a3f6accb..7c4e4ca0 100644 --- a/config.c +++ b/config.c @@ -45,6 +45,10 @@ parse_config(const char *config_file, t_configuration_options *options) /* if nothing has been provided defaults to 60 */ options->master_response_timeout = 60; + /* it defaults to 6 retries with a time between retries of 10s */ + options->reconnect_attempts = 6; + options->reconnect_intvl = 10; + /* * Since some commands don't require a config file at all, not * having one isn't necessarily a problem. @@ -103,6 +107,10 @@ parse_config(const char *config_file, t_configuration_options *options) strncpy(options->follow_command, value, MAXLEN); else if (strcmp(name, "master_response_timeout") == 0) options->master_response_timeout = atoi(value); + else if (strcmp(name, "reconnect_attempts") == 0) + options->reconnect_attempts = atoi(value); + else if (strcmp(name, "reconnect_interval") == 0) + options->reconnect_intvl = atoi(value); else log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value); } @@ -128,6 +136,18 @@ parse_config(const char *config_file, t_configuration_options *options) log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n")); exit(ERR_BAD_CONFIG); } + + if (options->reconnect_attempts < 0) + { + log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n")); + exit(ERR_BAD_CONFIG); + } + + if (options->reconnect_intvl <= 0) + { + log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n")); + exit(ERR_BAD_CONFIG); + } } @@ -232,6 +252,18 @@ reload_configuration(char *config_file, t_configuration_options *orig_options) return false; } + if (new_options.reconnect_attempts < 0) + { + log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n")); + return false; + } + + if (new_options.reconnect_intvl < 0) + { + log_warning(_("\nNew value for reconnect_interval is not valid. 
Should be greater or equal than zero.\n")); + return false; + } + /* Test conninfo string */ conn = establishDBConnection(new_options.conninfo, false); if (!conn || (PQstatus(conn) != CONNECTION_OK)) @@ -252,6 +284,8 @@ reload_configuration(char *config_file, t_configuration_options *orig_options) strcpy(orig_options->follow_command, new_options.follow_command); strcpy(orig_options->rsync_options, new_options.rsync_options); orig_options->master_response_timeout = new_options.master_response_timeout; + orig_options->reconnect_attempts = new_options.reconnect_attempts; + orig_options->reconnect_intvl = new_options.reconnect_intvl; /* * XXX These ones can change with a simple SIGHUP? diff --git a/config.h b/config.h index 45435cc7..4e4cdaf3 100644 --- a/config.h +++ b/config.h @@ -37,6 +37,8 @@ typedef struct char logfacility[MAXLEN]; char rsync_options[QUERY_STR_LEN]; int master_response_timeout; + int reconnect_attempts; + int reconnect_intvl; } t_configuration_options; void parse_config(const char *config_file, t_configuration_options *options); diff --git a/repmgr.conf.sample b/repmgr.conf.sample index 73d8ed40..9e8ad42b 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -16,6 +16,10 @@ rsync_options=--archive --checksum --compress --progress --rsh=ssh # How many seconds we wait for master response before declaring master failure master_response_timeout=60 +# How many times we try to reconnect to master (waiting reconnect_interval seconds between attempts) before starting the failover procedure +reconnect_attempts=6 +reconnect_interval=10 + # Autofailover options failover=automatic priority=-1 diff --git a/repmgr.h b/repmgr.h index e0970445..dcab7d46 100644 --- a/repmgr.h +++ b/repmgr.h @@ -69,9 +69,5 @@ typedef struct } t_runtime_options; #define SLEEP_MONITOR 2 -#define SLEEP_RETRY 3 -#define NUM_RETRY 40 - - #endif diff --git a/repmgrd.c b/repmgrd.c index e07ec0df..aae9b169 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -345,7 +345,7 @@ WitnessMonitor(void) * Check if the master is still available, if after 5 
minutes of retries * we cannot reconnect, return false. */ - CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds + CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds if (PQstatus(primaryConn) != CONNECTION_OK) { @@ -429,7 +429,7 @@ StandbyMonitor(void) * Check if the master is still available, if after 5 minutes of retries * we cannot reconnect, try to get a new master. */ - CheckPrimaryConnection(); // this take up to NUM_RETRY * SLEEP_RETRY seconds + CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds if (PQstatus(primaryConn) != CONNECTION_OK) { @@ -762,17 +762,19 @@ CheckPrimaryConnection(void) /* * Check if the master is still available - * if after NUM_RETRY * SLEEP_RETRY seconds of retries + * if after local_options.reconnect_attempts * local_options.reconnect_intvl seconds of retries * we cannot reconnect * return false */ - for (connection_retries = 0; connection_retries < NUM_RETRY; connection_retries++) + for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++) { if (!is_pgup(primaryConn, local_options.master_response_timeout)) { - log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), progname, (SLEEP_RETRY*(NUM_RETRY-connection_retries))); - /* wait SLEEP_RETRY seconds between retries */ - sleep(SLEEP_RETRY); + log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"), + progname, + (local_options.reconnect_intvl * (local_options.reconnect_attempts - connection_retries))); + /* wait local_options.reconnect_intvl seconds between retries */ + sleep(local_options.reconnect_intvl); } else {