diff --git a/HISTORY b/HISTORY index e25b6d98..ba210e58 100644 --- a/HISTORY +++ b/HISTORY @@ -6,6 +6,7 @@ repmgrd: fix memory leaks in witness code (AndrzejNowicki, Martín) repmgrd: handle failover situation with only two nodes in the primary location, and at least one node in another location; GitHub #407 (Ian) + repmgrd: set "connect_timeout=2" when pinging a server (Ian) 4.0.4 2018-03-09 repmgr: add "standby clone --recovery-conf-only" option; GitHub #382 (Ian) diff --git a/dbutils.c b/dbutils.c index f7cf749b..6b564959 100644 --- a/dbutils.c +++ b/dbutils.c @@ -3873,6 +3873,28 @@ is_server_available(const char *conninfo) } +bool +is_server_available_params(t_conninfo_param_list *param_list) +{ + PGPing status = PQpingParams((const char **) param_list->keywords, + (const char **) param_list->values, + false); + + /* status is the PQpingParams() result for the keyword/value arrays held in param_list; only PQPING_OK is reported to the caller as "available". Deparsing the param_list adds overhead (param_list_to_string() returns a string which must then be pfree()d), so only do it when log_level is LOG_DEBUG, the level the message below is logged at */ + if (log_level == LOG_DEBUG) + { + char *conninfo_str = param_list_to_string(param_list); + log_verbose(LOG_DEBUG, "ping status for %s is %i", conninfo_str, (int)status); + pfree(conninfo_str); + } + + if (status == PQPING_OK) + return true; + + return false; +} + + /* ==================== */ /* monitoring functions */ /* ==================== */ diff --git a/dbutils.h b/dbutils.h index a79aee7b..2055637c 100644 --- a/dbutils.h +++ b/dbutils.h @@ -466,6 +466,7 @@ int wait_connection_availability(PGconn *conn, long long timeout); /* node availability functions */ bool is_server_available(const char *conninfo); +bool is_server_available_params(t_conninfo_param_list *param_list); /* monitoring functions */ void diff --git a/repmgrd.c b/repmgrd.c index e650f6c2..81f18489 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -705,17 +705,29 @@ PGconn * try_reconnect(t_node_info *node_info) { PGconn *conn; + t_conninfo_param_list conninfo_params = T_CONNINFO_PARAM_LIST_INITIALIZER; int i; int max_attempts = config_file_options.reconnect_attempts; + initialize_conninfo_params(&conninfo_params, false); 
+ + + /* we assume by now the conninfo string is parseable, so the result of parse_conninfo_string() is deliberately discarded; conninfo_params is built once here and reused for every ping and for the connection attempt below, and must be released with free_conninfo_params() on each return path */ + (void) parse_conninfo_string(node_info->conninfo, &conninfo_params, NULL, false); + + /* set some default values if not explicitly provided (param_set_ine() presumably leaves a value already present in the node's conninfo untouched - confirm against its definition); connect_timeout is a libpq connection parameter expressed in seconds, so "2" keeps each ping and reconnection attempt short */ + param_set_ine(&conninfo_params, "connect_timeout", "2"); + param_set_ine(&conninfo_params, "fallback_application_name", "repmgr"); + for (i = 0; i < max_attempts; i++) { log_info(_("checking state of node %i, %i of %i attempts"), node_info->node_id, i + 1, max_attempts); - if (is_server_available(node_info->conninfo) == true) + if (is_server_available_params(&conninfo_params) == true) { + log_notice(_("node has recovered, reconnecting")); /* @@ -723,9 +735,13 @@ try_reconnect(t_node_info *node_info) * connection denied due to connection exhaustion - fall back to * degraded monitoring? - make that configurable */ - conn = establish_db_connection(node_info->conninfo, false); + + conn = establish_db_connection_by_params(&conninfo_params, false); + if (PQstatus(conn) == CONNECTION_OK) { + free_conninfo_params(&conninfo_params); + node_info->node_status = NODE_STATUS_UP; return conn; } @@ -742,13 +758,14 @@ try_reconnect(t_node_info *node_info) } } - log_warning(_("unable to reconnect to node %i after %i attempts"), node_info->node_id, max_attempts); node_info->node_status = NODE_STATUS_DOWN; + free_conninfo_params(&conninfo_params); + return NULL; }