diff --git a/config.c b/config.c index e2a73f16..3c3f25b5 100644 --- a/config.c +++ b/config.c @@ -239,11 +239,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * strncpy(options->location, DEFAULT_LOCATION, MAXLEN); memset(options->promote_command, 0, sizeof(options->promote_command)); memset(options->follow_command, 0, sizeof(options->follow_command)); - options->monitor_interval_secs = 2; + options->monitor_interval_secs = DEFAULT_STATS_REPORTING_INTERVAL; options->primary_response_timeout = 60; /* default to 6 reconnection attempts at intervals of 10 seconds */ - options->reconnect_attempts = 6; - options->reconnect_interval = 10; + options->reconnect_attempts = DEFAULT_RECONNECTION_ATTEMPTS; + options->reconnect_interval = DEFAULT_RECONNECTION_INTERVAL; options->retry_promote_interval_secs = 300; options->monitoring_history = false; /* new in 4.0, replaces --monitoring-history */ options->degraded_monitoring_timeout = -1; diff --git a/config.h b/config.h index dc9c5947..0168bd01 100644 --- a/config.h +++ b/config.h @@ -125,7 +125,12 @@ typedef struct /* standby clone settings */ \ false, "", "", "", "", { NULL, NULL }, \ /* repmgrd settings */ \ - FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", 2, 60, 6, 10, 300, false, -1, \ + FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \ + DEFAULT_STATS_REPORTING_INTERVAL, \ + 60, \ + DEFAULT_RECONNECTION_ATTEMPTS, \ + DEFAULT_RECONNECTION_INTERVAL, \ + 300, false, -1, \ /* witness settings */ \ 30, \ /* service settings */ \ diff --git a/dbutils.c b/dbutils.c index 9a21ea49..c8c2b77a 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1202,7 +1202,7 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row) strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN); strncpy(node_info->conninfo, PQgetvalue(res, row, 4), MAXLEN); - strncpy(node_info->repluser, PQgetvalue(res, row, 5), MAXLEN); + strncpy(node_info->repluser, PQgetvalue(res, row, 5), NAMEDATALEN); strncpy(node_info->slot_name, PQgetvalue(res, row, 6), MAXLEN); strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN); node_info->priority = atoi(PQgetvalue(res, row, 8)); diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 89674d0d..66840464 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -48,7 +48,7 @@ static char upstream_data_directory[MAXPGPATH]; static t_conninfo_param_list recovery_conninfo; static char recovery_conninfo_str[MAXLEN]; -static char upstream_repluser[MAXLEN]; +static char upstream_repluser[NAMEDATALEN]; static t_configfile_list config_files = T_CONFIGFILE_LIST_INITIALIZER; @@ -1755,12 +1755,11 @@ check_source_server() upstream_node_id = runtime_options.upstream_node_id; record_status = get_node_record(source_conn, upstream_node_id, &node_record); - if (record_status == RECORD_FOUND) { upstream_record_found = true; strncpy(recovery_conninfo_str, node_record.conninfo, MAXLEN); - strncpy(upstream_repluser, node_record.repluser, MAXLEN); + strncpy(upstream_repluser, node_record.repluser, NAMEDATALEN); } /* diff --git a/repmgr.h b/repmgr.h index bc5d114f..f8013d6f 100644 --- a/repmgr.h +++ b/repmgr.h @@ -33,8 +33,12 @@ #define BDR_MONITORING_LOCAL 1 #define BDR_MONITORING_PRIORITY 2 -#define DEFAULT_LOCATION "default" -#define DEFAULT_PRIORITY 100 +#define DEFAULT_LOCATION "default" +#define DEFAULT_PRIORITY 100 +#define DEFAULT_RECONNECTION_ATTEMPTS 6 +#define DEFAULT_RECONNECTION_INTERVAL 10 +#define DEFAULT_STATS_REPORTING_INTERVAL 2 + #define FAILOVER_NODES_MAX_CHECK 50 diff --git a/repmgrd.c b/repmgrd.c index 2278f146..c8103059 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -583,9 +583,9 @@ monitor_streaming_primary(void) goto loop; } + monitoring_state = MS_DEGRADED; INSTR_TIME_SET_CURRENT(degraded_monitoring_start); - } } @@ -894,6 +894,10 @@ monitor_streaming_standby(void) if (PQstatus(upstream_conn) == CONNECTION_OK) { // XXX check here if upstream is still primary + // -> will be a problem if another node was promoted in the meantime + // and upstream is now former primary + // XXX scan other nodes to see if any has become primary + upstream_node_status = NODE_STATUS_UP; monitoring_state = MS_NORMAL; @@ -930,8 +934,9 @@ monitor_streaming_standby(void) goto loop; } } + // unable to connect to former primary - check if another node has + // been promoted - // XXX scan other nodes to see if any has become primary } loop: @@ -1218,9 +1223,13 @@ do_upstream_standby_failover(void) PQfinish(upstream_conn); upstream_conn = NULL; - // check status record_status = get_primary_node_record(local_conn, &primary_node_info); + if (record_status != RECORD_FOUND) + { + log_error(_("unable to retrieve primary node record")); + return false; + } /* * Verify that we can still talk to the cluster primary, even though * the node's upstream is not available @@ -2185,8 +2194,7 @@ try_reconnect(const char *conninfo, NodeStatus *node_status) int i; - // XXX make this all configurable - int max_attempts = 5; + int max_attempts = config_file_options.reconnect_attempts; for (i = 0; i < max_attempts; i++) { @@ -2207,7 +2215,9 @@ try_reconnect(const char *conninfo, NodeStatus *node_status) PQfinish(conn); log_notice(_("unable to reconnect to node")); } - sleep(1); + log_info(_("sleeping %i seconds until next reconnection_attempt"), + config_file_options.reconnect_interval); + sleep(config_file_options.reconnect_interval); } diff --git a/strutil.h b/strutil.h index 0771bb4b..75aeb0c8 100644 --- a/strutil.h +++ b/strutil.h @@ -15,7 +15,6 @@ /* same as defined in src/include/replication/walreceiver.h */ #define MAXCONNINFO 1024 -/* Why? http://stackoverflow.com/a/5459929/398670 */ #define STR(x) CppAsString(x) #define MAXLEN_STR STR(MAXLEN)