diff --git a/HISTORY b/HISTORY index 6a629bf2..6ed98cc2 100644 --- a/HISTORY +++ b/HISTORY @@ -27,8 +27,9 @@ repmgrd: check binary and extension major versions match; GitHub #515 (Ian) repmgrd: on a cascaded standby, don't fail over if "failover=manual"; GitHub #531 (Ian) - repmgrd: don't consider nodes where repmgrd is not running as promotion - candidates (Ian) + repmgrd: don't consider nodes where repmgrd is not running as promotion + candidates (Ian) + repmgrd: add option "connection_check_type" (Ian) 4.2.1 2018-??-?? repmgr: add sanity check for correct extension version (Ian) diff --git a/configfile.c b/configfile.c index 9bb66076..1b6dd1ea 100644 --- a/configfile.c +++ b/configfile.c @@ -358,6 +358,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT; options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */ memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file)); + options->connection_check_type = CHECK_PING; /*------------- * witness settings @@ -618,6 +619,22 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0); else if (strcmp(name, "repmgrd_pid_file") == 0) strncpy(options->repmgrd_pid_file, value, MAXPGPATH); + else if (strcmp(name, "connection_check_type") == 0) + { + if (strcasecmp(value, "ping") == 0) + { + options->connection_check_type = CHECK_PING; + } + else if (strcasecmp(value, "connection") == 0) + { + options->connection_check_type = CHECK_CONNECTION; + } + else + { + item_list_append(error_list, + _("value for \"connection_check_type\" must be \"ping\" or \"connection\"\n")); + } + } /* witness settings */ else if (strcmp(name, "witness_sync_interval") == 0) @@ -1155,7 +1172,6 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type) 
return false; } - /* * No configuration problems detected - copy any changed values * @@ -1330,6 +1346,14 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type) config_changed = true; } + if (orig_options->connection_check_type != new_options.connection_check_type) + { + orig_options->connection_check_type = new_options.connection_check_type; + log_info(_("\"connection_check_type\" is now \"%s\""), + new_options.connection_check_type == CHECK_PING ? "ping" : "connection"); + config_changed = true; + } + /* * Handle changes to logging configuration */ diff --git a/configfile.h b/configfile.h index 095b813d..34e220f7 100644 --- a/configfile.h +++ b/configfile.h @@ -37,6 +37,12 @@ typedef enum FAILOVER_AUTOMATIC } failover_mode_opt; +typedef enum +{ + CHECK_PING, + CHECK_CONNECTION +} ConnectionCheckType; + typedef struct EventNotificationListCell { struct EventNotificationListCell *next; @@ -135,6 +141,7 @@ typedef struct int primary_notification_timeout; int repmgrd_standby_startup_timeout; char repmgrd_pid_file[MAXPGPATH]; + ConnectionCheckType connection_check_type; /* BDR settings */ bool bdr_local_monitoring_only; @@ -206,7 +213,7 @@ typedef struct false, -1, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ - -1, "", \ + -1, "", CHECK_PING, \ /* BDR settings */ \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ /* service settings */ \ diff --git a/dbutils.c b/dbutils.c index b873e430..5657cbe6 100644 --- a/dbutils.c +++ b/dbutils.c @@ -4132,7 +4132,8 @@ cancel_query(PGconn *conn, int timeout) */ if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0) { - log_warning(_("unable to stop current query:\n %s"), errbuf); + log_warning(_("unable to cancel current query")); + log_detail("%s", errbuf); PQfreeCancel(pgcancel); return false; } @@ -4150,7 +4151,7 @@ cancel_query(PGconn *conn, int timeout) * Returns 1 for success; 0 if any error ocurred; -1 if timeout reached. 
*/ int -wait_connection_availability(PGconn *conn, long long timeout) +wait_connection_availability(PGconn *conn, int timeout) { PGresult *res = NULL; fd_set read_set; @@ -4159,16 +4160,17 @@ wait_connection_availability(PGconn *conn, long long timeout) before, after; struct timezone tz; + long long timeout_ms; - /* recalc to microseconds */ - timeout *= 1000000; + /* calculate timeout in microseconds */ + timeout_ms = (long long) timeout * 1000000; - while (timeout > 0) + while (timeout_ms > 0) { if (PQconsumeInput(conn) == 0) { - log_warning(_("wait_connection_availability(): could not receive data from connection:\n %s"), - PQerrorMessage(conn)); + log_warning(_("wait_connection_availability(): unable to receive data from connection")); + log_detail("%s", PQerrorMessage(conn)); return 0; } @@ -4199,17 +4201,17 @@ wait_connection_availability(PGconn *conn, long long timeout) gettimeofday(&after, &tz); - timeout -= (after.tv_sec * 1000000 + after.tv_usec) - + timeout_ms -= (after.tv_sec * 1000000 + after.tv_usec) - (before.tv_sec * 1000000 + before.tv_usec); } - if (timeout >= 0) + if (timeout_ms >= 0) { return 1; } - log_warning(_("wait_connection_availability(): timeout reached")); + log_warning(_("wait_connection_availability(): timeout (%i secs) reached"), timeout); return -1; } diff --git a/dbutils.h b/dbutils.h index 445324e4..268b3fb8 100644 --- a/dbutils.h +++ b/dbutils.h @@ -510,7 +510,7 @@ bool get_tablespace_name_by_location(PGconn *conn, const char *location, char * /* asynchronous query functions */ bool cancel_query(PGconn *conn, int timeout); -int wait_connection_availability(PGconn *conn, long long timeout); +int wait_connection_availability(PGconn *conn, int timeout); /* node availability functions */ bool is_server_available(const char *conninfo); diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index ec68f6cb..6f1e1b64 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -160,7 +160,17 @@ 
REPMGRD_OPTS="--daemonize=false" the absence of a running repmgrd. - + + + Add option to enable selection of the method + repmgrd uses to determine whether the upstream node is available. + + + Possible values are ping (default; uses PQping() to + determine server availability); and connection (determines server availability + by executing an SQL statement on the node via the existing connection). + + + diff --git a/doc/repmgrd-configuration.sgml b/doc/repmgrd-configuration.sgml index 993c0bc5..f87f953f 100644 --- a/doc/repmgrd-configuration.sgml +++ b/doc/repmgrd-configuration.sgml @@ -101,6 +101,32 @@ repmgr standby follow will result in the node continuing to follow the original primary. + + + + connection_check_type + + This option selects the method + repmgrd uses to determine whether the upstream node is available. + + + Possible values are: + + + + ping (default) - uses PQping() to + determine server availability + + + + + connection - determines server availability + by executing an SQL statement on the node via the existing connection + + + + + diff --git a/repmgr.conf.sample b/repmgr.conf.sample index 4ed78b5f..c0f7edce 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -285,6 +285,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh" # a value of zero prevents the node being promoted to primary # (default: 100) +#connection_check_type=ping # How to check availability of the upstream node; valid options: + # 'ping': use PQping() to check if the node is accepting connections + # 'connection': execute a throwaway query on the current connection #reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable # primary (or other upstream node) #reconnect_interval=10 # Interval between attempts to reconnect to an unreachable diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 410342f4..0280217b 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -831,7 +831,7 @@ 
monitor_streaming_standby(void) while (true) { log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo); - if (is_server_available(upstream_node_info.conninfo) == true) + if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == true) { set_upstream_last_seen(local_conn); } @@ -1030,9 +1030,10 @@ monitor_streaming_standby(void) upstream_node_info.node_id, degraded_monitoring_elapsed); - if (is_server_available(upstream_node_info.conninfo) == true) + if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == true) { - upstream_conn = establish_db_connection(upstream_node_info.conninfo, false); + if (config_file_options.connection_check_type == CHECK_PING) + upstream_conn = establish_db_connection(upstream_node_info.conninfo, false); if (PQstatus(upstream_conn) == CONNECTION_OK) { @@ -1604,7 +1605,7 @@ monitor_streaming_witness(void) while (true) { - if (is_server_available(upstream_node_info.conninfo) == false) + if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == false) { if (upstream_node_info.node_status == NODE_STATUS_UP) { @@ -1693,9 +1694,10 @@ monitor_streaming_witness(void) upstream_node_info.node_id, degraded_monitoring_elapsed); - if (is_server_available(upstream_node_info.conninfo) == true) + if (check_upstream_connection(primary_conn, upstream_node_info.conninfo) == true) { - primary_conn = establish_db_connection(upstream_node_info.conninfo, false); + if (config_file_options.connection_check_type == CHECK_PING) + primary_conn = establish_db_connection(upstream_node_info.conninfo, false); if (PQstatus(primary_conn) == CONNECTION_OK) { diff --git a/repmgrd.c b/repmgrd.c index 86f124b0..d62d3fa7 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -818,6 +818,58 @@ show_help(void) } +bool +check_upstream_connection(PGconn *conn, const char *conninfo) +{ + /* Check the connection status twice in case it changes after reset */ + bool twice = false; + + if (config_file_options.connection_check_type 
== CHECK_PING) + return is_server_available(conninfo); + + for (;;) + { + if (PQstatus(conn) != CONNECTION_OK) + { + if (twice) + return false; + PQreset(conn); /* reconnect */ + twice = true; + } + else + { + if (!cancel_query(conn, config_file_options.async_query_timeout)) + goto failed; + + if (wait_connection_availability(conn, config_file_options.async_query_timeout) != 1) + goto failed; + + /* execute a simple query to verify connection availability */ + if (PQsendQuery(conn, "SELECT 1") == 0) + { + log_warning(_("unable to send query to upstream")); + log_detail("%s", PQerrorMessage(conn)); + goto failed; + } + + if (wait_connection_availability(conn, config_file_options.async_query_timeout) != 1) + goto failed; + + break; + + failed: + /* retry once */ + if (twice) + return false; + PQreset(conn); /* reconnect */ + twice = true; + } + } + + return true; +} + + void try_reconnect(PGconn **conn, t_node_info *node_info) { diff --git a/repmgrd.h b/repmgrd.h index 38b5122b..56c1df70 100644 --- a/repmgrd.h +++ b/repmgrd.h @@ -23,6 +23,7 @@ extern PGconn *local_conn; extern bool startup_event_logged; extern char pid_file[MAXPGPATH]; +bool check_upstream_connection(PGconn *conn, const char *conninfo); void try_reconnect(PGconn **conn, t_node_info *node_info); int calculate_elapsed(instr_time start_time); @@ -31,5 +32,4 @@ const char *print_monitoring_state(MonitoringState monitoring_state); void update_registration(PGconn *conn); void terminate(int retval); - #endif /* _REPMGRD_H_ */