Make repmgrd failover settings configurable

This commit is contained in:
Ian Barwick
2017-07-07 21:11:22 +09:00
parent b08511ec79
commit 2787994a6e
7 changed files with 34 additions and 17 deletions

View File

@@ -239,11 +239,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
strncpy(options->location, DEFAULT_LOCATION, MAXLEN);
memset(options->promote_command, 0, sizeof(options->promote_command));
memset(options->follow_command, 0, sizeof(options->follow_command));
options->monitor_interval_secs = 2;
options->monitor_interval_secs = DEFAULT_STATS_REPORTING_INTERVAL;
options->primary_response_timeout = 60;
/* default to 6 reconnection attempts at intervals of 10 seconds */
options->reconnect_attempts = 6;
options->reconnect_interval = 10;
options->reconnect_attempts = DEFAULT_RECONNECTION_ATTEMPTS;
options->reconnect_interval = DEFAULT_RECONNECTION_INTERVAL;
options->retry_promote_interval_secs = 300;
options->monitoring_history = false; /* new in 4.0, replaces --monitoring-history */
options->degraded_monitoring_timeout = -1;

View File

@@ -125,7 +125,12 @@ typedef struct
/* standby clone settings */ \
false, "", "", "", "", { NULL, NULL }, \
/* repmgrd settings */ \
FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", 2, 60, 6, 10, 300, false, -1, \
FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \
DEFAULT_STATS_REPORTING_INTERVAL, \
60, \
DEFAULT_RECONNECTION_ATTEMPTS, \
DEFAULT_RECONNECTION_INTERVAL, \
300, false, -1, \
/* witness settings */ \
30, \
/* service settings */ \

View File

@@ -1202,7 +1202,7 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row)
strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN);
strncpy(node_info->conninfo, PQgetvalue(res, row, 4), MAXLEN);
strncpy(node_info->repluser, PQgetvalue(res, row, 5), MAXLEN);
strncpy(node_info->repluser, PQgetvalue(res, row, 5), NAMEDATALEN);
strncpy(node_info->slot_name, PQgetvalue(res, row, 6), MAXLEN);
strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN);
node_info->priority = atoi(PQgetvalue(res, row, 8));

View File

@@ -48,7 +48,7 @@ static char upstream_data_directory[MAXPGPATH];
static t_conninfo_param_list recovery_conninfo;
static char recovery_conninfo_str[MAXLEN];
static char upstream_repluser[MAXLEN];
static char upstream_repluser[NAMEDATALEN];
static t_configfile_list config_files = T_CONFIGFILE_LIST_INITIALIZER;
@@ -1755,12 +1755,11 @@ check_source_server()
upstream_node_id = runtime_options.upstream_node_id;
record_status = get_node_record(source_conn, upstream_node_id, &node_record);
if (record_status == RECORD_FOUND)
{
upstream_record_found = true;
strncpy(recovery_conninfo_str, node_record.conninfo, MAXLEN);
strncpy(upstream_repluser, node_record.repluser, MAXLEN);
strncpy(upstream_repluser, node_record.repluser, NAMEDATALEN);
}
/*

View File

@@ -33,8 +33,12 @@
#define BDR_MONITORING_LOCAL 1
#define BDR_MONITORING_PRIORITY 2
#define DEFAULT_LOCATION "default"
#define DEFAULT_PRIORITY 100
#define DEFAULT_LOCATION "default"
#define DEFAULT_PRIORITY 100
#define DEFAULT_RECONNECTION_ATTEMPTS 6
#define DEFAULT_RECONNECTION_INTERVAL 10
#define DEFAULT_STATS_REPORTING_INTERVAL 2
#define FAILOVER_NODES_MAX_CHECK 50

View File

@@ -583,9 +583,9 @@ monitor_streaming_primary(void)
goto loop;
}
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
}
}
@@ -894,6 +894,10 @@ monitor_streaming_standby(void)
if (PQstatus(upstream_conn) == CONNECTION_OK)
{
// XXX check here if upstream is still primary
// -> will be a problem if another node was promoted in the meantime
// and upstream is now former primary
// XXX scan other nodes to see if any has become primary
upstream_node_status = NODE_STATUS_UP;
monitoring_state = MS_NORMAL;
@@ -930,8 +934,9 @@ monitor_streaming_standby(void)
goto loop;
}
}
// unable to connect to former primary - check if another node has
// been promoted
// XXX scan other nodes to see if any has become primary
}
loop:
@@ -1218,9 +1223,13 @@ do_upstream_standby_failover(void)
PQfinish(upstream_conn);
upstream_conn = NULL;
// check status
record_status = get_primary_node_record(local_conn, &primary_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve primary node record"));
return false;
}
/*
* Verify that we can still talk to the cluster primary, even though
* the node's upstream is not available
@@ -2185,8 +2194,7 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
int i;
// XXX make this all configurable
int max_attempts = 5;
int max_attempts = config_file_options.reconnect_attempts;
for (i = 0; i < max_attempts; i++)
{
@@ -2207,7 +2215,9 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
PQfinish(conn);
log_notice(_("unable to reconnect to node"));
}
sleep(1);
log_info(_("sleeping %i seconds until next reconnection_attempt"),
config_file_options.reconnect_interval);
sleep(config_file_options.reconnect_interval);
}

View File

@@ -15,7 +15,6 @@
/* same as defined in src/include/replication/walreceiver.h */
#define MAXCONNINFO 1024
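/*
 * STR() passes its argument through a second macro so that MAXLEN is
 * expanded to its numeric value before being turned into a string literal
 * (assuming CppAsString(x) stringifies with `#x` - confirm against c.h).
 * Background: http://stackoverflow.com/a/5459929/398670
 */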
#define STR(x) CppAsString(x)
#define MAXLEN_STR STR(MAXLEN)