diff --git a/HISTORY b/HISTORY
index 6a629bf2..6ed98cc2 100644
--- a/HISTORY
+++ b/HISTORY
@@ -27,8 +27,9 @@
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
repmgrd: on a cascaded standby, don't fail over if "failover=manual";
GitHub #531 (Ian)
- repmgrd: don't consider nodes where repmgrd is not running as promotion
- candidates (Ian)
+ repmgrd: don't consider nodes where repmgrd is not running as promotion
+ candidates (Ian)
+ repmgrd: add option "connection_check_type" (Ian)
4.2.1 2018-??-??
repmgr: add sanity check for correct extension version (Ian)
diff --git a/configfile.c b/configfile.c
index 9bb66076..1b6dd1ea 100644
--- a/configfile.c
+++ b/configfile.c
@@ -358,6 +358,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
+ options->connection_check_type = CHECK_PING;
/*-------------
* witness settings
@@ -618,6 +619,22 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
else if (strcmp(name, "repmgrd_pid_file") == 0)
strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
+		else if (strcmp(name, "connection_check_type") == 0)
+		{
+			/* accepted values are matched case-insensitively */
+			if (strcasecmp(value, "ping") == 0)
+			{
+				options->connection_check_type = CHECK_PING;
+			}
+			else if (strcasecmp(value, "connection") == 0)
+			{
+				options->connection_check_type = CHECK_CONNECTION;
+			}
+			else
+			{
+				/* report exactly the values accepted by the branches above */
+				item_list_append(error_list,
+								 _("value for \"connection_check_type\" must be \"ping\" or \"connection\"\n"));
+			}
+		}
/* witness settings */
else if (strcmp(name, "witness_sync_interval") == 0)
@@ -1155,7 +1172,6 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
return false;
}
-
/*
* No configuration problems detected - copy any changed values
*
@@ -1330,6 +1346,14 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
config_changed = true;
}
+ if (orig_options->connection_check_type != new_options.connection_check_type)
+ {
+ orig_options->connection_check_type = new_options.connection_check_type;
+ log_info(_("\"connection_check_type\" is now \"%s\""),
+ new_options.connection_check_type == CHECK_PING ? "ping" : "connection");
+ config_changed = true;
+ }
+
/*
* Handle changes to logging configuration
*/
diff --git a/configfile.h b/configfile.h
index 095b813d..34e220f7 100644
--- a/configfile.h
+++ b/configfile.h
@@ -37,6 +37,12 @@ typedef enum
FAILOVER_AUTOMATIC
} failover_mode_opt;
+typedef enum
+{
+ CHECK_PING, /* "ping": check upstream availability with PQping() (default) */
+ CHECK_CONNECTION /* "connection": check by running a query on the existing connection */
+} ConnectionCheckType;
+
typedef struct EventNotificationListCell
{
struct EventNotificationListCell *next;
@@ -135,6 +141,7 @@ typedef struct
int primary_notification_timeout;
int repmgrd_standby_startup_timeout;
char repmgrd_pid_file[MAXPGPATH];
+ ConnectionCheckType connection_check_type;
/* BDR settings */
bool bdr_local_monitoring_only;
@@ -206,7 +213,7 @@ typedef struct
false, -1, \
DEFAULT_ASYNC_QUERY_TIMEOUT, \
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
- -1, "", \
+ -1, "", CHECK_PING, \
/* BDR settings */ \
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
/* service settings */ \
diff --git a/dbutils.c b/dbutils.c
index b873e430..5657cbe6 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -4132,7 +4132,8 @@ cancel_query(PGconn *conn, int timeout)
*/
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
{
- log_warning(_("unable to stop current query:\n %s"), errbuf);
+ log_warning(_("unable to cancel current query"));
+ log_detail("%s", errbuf);
PQfreeCancel(pgcancel);
return false;
}
@@ -4150,7 +4151,7 @@ cancel_query(PGconn *conn, int timeout)
* Returns 1 for success; 0 if any error ocurred; -1 if timeout reached.
*/
int
-wait_connection_availability(PGconn *conn, long long timeout)
+wait_connection_availability(PGconn *conn, int timeout)
{
PGresult *res = NULL;
fd_set read_set;
@@ -4159,16 +4160,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
before,
after;
struct timezone tz;
+ long long timeout_ms;
- /* recalc to microseconds */
- timeout *= 1000000;
+	/*
+	 * Calculate timeout in microseconds (despite its name, "timeout_ms"
+	 * holds microseconds). "timeout" is an int, so widen it before
+	 * multiplying; otherwise the product is computed as int and overflows
+	 * for timeouts above roughly 2147 seconds.
+	 */
+	timeout_ms = (long long) timeout * 1000000;
- while (timeout > 0)
+ while (timeout_ms > 0)
{
if (PQconsumeInput(conn) == 0)
{
- log_warning(_("wait_connection_availability(): could not receive data from connection:\n %s"),
- PQerrorMessage(conn));
+ log_warning(_("wait_connection_availability(): unable to receive data from connection"));
+ log_detail("%s", PQerrorMessage(conn));
return 0;
}
@@ -4199,17 +4201,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
gettimeofday(&after, &tz);
- timeout -= (after.tv_sec * 1000000 + after.tv_usec) -
+ timeout_ms -= (after.tv_sec * 1000000 + after.tv_usec) -
(before.tv_sec * 1000000 + before.tv_usec);
}
- if (timeout >= 0)
+ if (timeout_ms >= 0)
{
return 1;
}
- log_warning(_("wait_connection_availability(): timeout reached"));
+ log_warning(_("wait_connection_availability(): timeout (%i secs) reached"), timeout);
return -1;
}
diff --git a/dbutils.h b/dbutils.h
index 445324e4..268b3fb8 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -510,7 +510,7 @@ bool get_tablespace_name_by_location(PGconn *conn, const char *location, char *
/* asynchronous query functions */
bool cancel_query(PGconn *conn, int timeout);
-int wait_connection_availability(PGconn *conn, long long timeout);
+int wait_connection_availability(PGconn *conn, int timeout);
/* node availability functions */
bool is_server_available(const char *conninfo);
diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml
index ec68f6cb..6f1e1b64 100644
--- a/doc/appendix-release-notes.sgml
+++ b/doc/appendix-release-notes.sgml
@@ -160,7 +160,17 @@ REPMGRD_OPTS="--daemonize=false"
the absence of a running repmgrd.
-
+
+
+ Add option to enable selection of the method
+ repmgrd uses to determine whether the upstream node is available.
+
+
+ Possible values are ping (default; uses PQping() to
+ determine server availability); and connection (determines server availability
+ by executing an SQL statement on the node via the existing connection).
+
+
diff --git a/doc/repmgrd-configuration.sgml b/doc/repmgrd-configuration.sgml
index 993c0bc5..f87f953f 100644
--- a/doc/repmgrd-configuration.sgml
+++ b/doc/repmgrd-configuration.sgml
@@ -101,6 +101,32 @@
repmgr standby follow will result in the node continuing to follow
the original primary.
+
+
+
+ connection_check_type
+
+ The option connection_check_type selects the method
+ repmgrd uses to determine whether the upstream node is available.
+
+
+ Possible values are:
+
+
+
+ ping (default) - uses PQping() to
+ determine server availability
+
+
+
+
+ connection - determines server availability
+ by executing an SQL statement on the node via the existing connection
+
+
+
+
+
diff --git a/repmgr.conf.sample b/repmgr.conf.sample
index 4ed78b5f..c0f7edce 100644
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -285,6 +285,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# a value of zero prevents the node being promoted to primary
# (default: 100)
+#connection_check_type=ping # How to check availability of the upstream node; valid options:
+ # 'ping': use PQping() to check if the node is accepting connections
+ # 'connection': execute a throwaway query on the current connection
#reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable
# primary (or other upstream node)
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index 410342f4..0280217b 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -831,7 +831,7 @@ monitor_streaming_standby(void)
while (true)
{
log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
- if (is_server_available(upstream_node_info.conninfo) == true)
+ if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == true)
{
set_upstream_last_seen(local_conn);
}
@@ -1030,9 +1030,10 @@ monitor_streaming_standby(void)
upstream_node_info.node_id,
degraded_monitoring_elapsed);
- if (is_server_available(upstream_node_info.conninfo) == true)
+ if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == true)
{
- upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
+ if (config_file_options.connection_check_type == CHECK_PING)
+ upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
if (PQstatus(upstream_conn) == CONNECTION_OK)
{
@@ -1604,7 +1605,7 @@ monitor_streaming_witness(void)
while (true)
{
- if (is_server_available(upstream_node_info.conninfo) == false)
+		/* check primary_conn, the connection this function re-establishes for the upstream node */
+		if (check_upstream_connection(primary_conn, upstream_node_info.conninfo) == false)
{
if (upstream_node_info.node_status == NODE_STATUS_UP)
{
@@ -1693,9 +1694,10 @@ monitor_streaming_witness(void)
upstream_node_info.node_id,
degraded_monitoring_elapsed);
- if (is_server_available(upstream_node_info.conninfo) == true)
+ if (check_upstream_connection(primary_conn, upstream_node_info.conninfo) == true)
{
- primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
+ if (config_file_options.connection_check_type == CHECK_PING)
+ primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
if (PQstatus(primary_conn) == CONNECTION_OK)
{
diff --git a/repmgrd.c b/repmgrd.c
index 86f124b0..d62d3fa7 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -818,6 +818,58 @@ show_help(void)
}
+/*
+ * check_upstream_connection()
+ *
+ * Determine whether the upstream node is available, using the method
+ * selected by the "connection_check_type" configuration option:
+ *
+ *  - CHECK_PING: delegate to is_server_available() using "conninfo";
+ *    "conn" is not used.
+ *  - CHECK_CONNECTION: send a throwaway query over "conn".
+ *
+ * With CHECK_CONNECTION at most two attempts are made; after the first
+ * failed attempt the connection is reset with PQreset() before trying
+ * once more.
+ *
+ * Returns true if the check succeeded, false otherwise.
+ */
+bool
+check_upstream_connection(PGconn *conn, const char *conninfo)
+{
+	int			attempts_left = 2;
+
+	if (config_file_options.connection_check_type == CHECK_PING)
+		return is_server_available(conninfo);
+
+	while (attempts_left-- > 0)
+	{
+		if (PQstatus(conn) == CONNECTION_OK)
+		{
+			/* cancel any query in progress and wait for the connection to settle */
+			if (cancel_query(conn, config_file_options.async_query_timeout) &&
+				wait_connection_availability(conn, config_file_options.async_query_timeout) == 1)
+			{
+				/* execute a simple query to verify connection availability */
+				if (PQsendQuery(conn, "SELECT 1") == 0)
+				{
+					log_warning(_("unable to send query to upstream"));
+					log_detail("%s", PQerrorMessage(conn));
+				}
+				else if (wait_connection_availability(conn, config_file_options.async_query_timeout) == 1)
+				{
+					return true;
+				}
+			}
+		}
+
+		/* this attempt failed: reconnect, but only if another attempt follows */
+		if (attempts_left > 0)
+			PQreset(conn);
+	}
+
+	return false;
+}
+
+
void
try_reconnect(PGconn **conn, t_node_info *node_info)
{
diff --git a/repmgrd.h b/repmgrd.h
index 38b5122b..56c1df70 100644
--- a/repmgrd.h
+++ b/repmgrd.h
@@ -23,6 +23,7 @@ extern PGconn *local_conn;
extern bool startup_event_logged;
extern char pid_file[MAXPGPATH];
+bool check_upstream_connection(PGconn *conn, const char *conninfo);
void try_reconnect(PGconn **conn, t_node_info *node_info);
int calculate_elapsed(instr_time start_time);
@@ -31,5 +32,4 @@ const char *print_monitoring_state(MonitoringState monitoring_state);
void update_registration(PGconn *conn);
void terminate(int retval);
-
#endif /* _REPMGRD_H_ */