mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgrd: add option "connection_check_type"
This enables selection of the method repmgrd uses to check whether the upstream node is available. Possible values are: - "ping" (default): uses PQping() to check server availability - "connection": executes a query on the connection to check server availability (similar to repmgr 3.x).
This commit is contained in:
5
HISTORY
5
HISTORY
@@ -27,8 +27,9 @@
|
||||
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
||||
repmgrd: on a cascaded standby, don't fail over if "failover=manual";
|
||||
GitHub #531 (Ian)
|
||||
repmgrd: don't consider nodes where repmgrd is not running as promotion
|
||||
candidates (Ian)
|
||||
repmgrd: don't consider nodes where repmgrd is not running as promotion
|
||||
candidates (Ian)
|
||||
repmgrd: add option "connection_check_type" (Ian)
|
||||
|
||||
4.2.1 2018-??-??
|
||||
repmgr: add sanity check for correct extension version (Ian)
|
||||
|
||||
26
configfile.c
26
configfile.c
@@ -358,6 +358,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->primary_notification_timeout = DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT;
|
||||
options->repmgrd_standby_startup_timeout = -1; /* defaults to "standby_reconnect_timeout" if not set */
|
||||
memset(options->repmgrd_pid_file, 0, sizeof(options->repmgrd_pid_file));
|
||||
options->connection_check_type = CHECK_PING;
|
||||
|
||||
/*-------------
|
||||
* witness settings
|
||||
@@ -618,6 +619,22 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->repmgrd_standby_startup_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||
else if (strcmp(name, "repmgrd_pid_file") == 0)
|
||||
strncpy(options->repmgrd_pid_file, value, MAXPGPATH);
|
||||
else if (strcmp(name, "connection_check_type") == 0)
|
||||
{
|
||||
if (strcasecmp(value, "ping") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_PING;
|
||||
}
|
||||
else if (strcasecmp(value, "connection") == 0)
|
||||
{
|
||||
options->connection_check_type = CHECK_CONNECTION;
|
||||
}
|
||||
else
|
||||
{
|
||||
item_list_append(error_list,
|
||||
_("value for \"connection_check_type\" must be \"ping\" or \"connection\"\n"));
|
||||
}
|
||||
}
|
||||
|
||||
/* witness settings */
|
||||
else if (strcmp(name, "witness_sync_interval") == 0)
|
||||
@@ -1155,7 +1172,6 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* No configuration problems detected - copy any changed values
|
||||
*
|
||||
@@ -1330,6 +1346,14 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
if (orig_options->connection_check_type != new_options.connection_check_type)
|
||||
{
|
||||
orig_options->connection_check_type = new_options.connection_check_type;
|
||||
log_info(_("\"connection_check_type\" is now \"%s\""),
|
||||
new_options.connection_check_type == CHECK_PING ? "ping" : "connection");
|
||||
config_changed = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle changes to logging configuration
|
||||
*/
|
||||
|
||||
@@ -37,6 +37,12 @@ typedef enum
|
||||
FAILOVER_AUTOMATIC
|
||||
} failover_mode_opt;
|
||||
|
||||
/*
 * Method repmgrd uses to check whether the upstream node is available;
 * selected with the "connection_check_type" configuration option.
 *
 * CHECK_PING       - "ping" (the default): check server availability
 *                    with PQping()
 * CHECK_CONNECTION - "connection": execute a query on the existing
 *                    connection to the node
 */
typedef enum
|
||||
{
|
||||
CHECK_PING,
|
||||
CHECK_CONNECTION
|
||||
} ConnectionCheckType;
|
||||
|
||||
typedef struct EventNotificationListCell
|
||||
{
|
||||
struct EventNotificationListCell *next;
|
||||
@@ -135,6 +141,7 @@ typedef struct
|
||||
int primary_notification_timeout;
|
||||
int repmgrd_standby_startup_timeout;
|
||||
char repmgrd_pid_file[MAXPGPATH];
|
||||
ConnectionCheckType connection_check_type;
|
||||
|
||||
/* BDR settings */
|
||||
bool bdr_local_monitoring_only;
|
||||
@@ -206,7 +213,7 @@ typedef struct
|
||||
false, -1, \
|
||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
-1, "", \
|
||||
-1, "", CHECK_PING, \
|
||||
/* BDR settings */ \
|
||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||
/* service settings */ \
|
||||
|
||||
22
dbutils.c
22
dbutils.c
@@ -4132,7 +4132,8 @@ cancel_query(PGconn *conn, int timeout)
|
||||
*/
|
||||
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
||||
{
|
||||
log_warning(_("unable to stop current query:\n %s"), errbuf);
|
||||
log_warning(_("unable to cancel current query"));
|
||||
log_detail("%s", errbuf);
|
||||
PQfreeCancel(pgcancel);
|
||||
return false;
|
||||
}
|
||||
@@ -4150,7 +4151,7 @@ cancel_query(PGconn *conn, int timeout)
|
||||
* Returns 1 for success; 0 if any error occurred; -1 if timeout reached.
|
||||
*/
|
||||
int
|
||||
wait_connection_availability(PGconn *conn, long long timeout)
|
||||
wait_connection_availability(PGconn *conn, int timeout)
|
||||
{
|
||||
PGresult *res = NULL;
|
||||
fd_set read_set;
|
||||
@@ -4159,16 +4160,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
before,
|
||||
after;
|
||||
struct timezone tz;
|
||||
long long timeout_ms;
|
||||
|
||||
/* recalc to microseconds */
|
||||
timeout *= 1000000;
|
||||
/* calculate timeout in microseconds */
|
||||
timeout_ms = timeout * 1000000;
|
||||
|
||||
while (timeout > 0)
|
||||
while (timeout_ms > 0)
|
||||
{
|
||||
if (PQconsumeInput(conn) == 0)
|
||||
{
|
||||
log_warning(_("wait_connection_availability(): could not receive data from connection:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
log_warning(_("wait_connection_availability(): unable to receive data from connection"));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -4199,17 +4201,17 @@ wait_connection_availability(PGconn *conn, long long timeout)
|
||||
|
||||
gettimeofday(&after, &tz);
|
||||
|
||||
timeout -= (after.tv_sec * 1000000 + after.tv_usec) -
|
||||
timeout_ms -= (after.tv_sec * 1000000 + after.tv_usec) -
|
||||
(before.tv_sec * 1000000 + before.tv_usec);
|
||||
}
|
||||
|
||||
|
||||
if (timeout >= 0)
|
||||
if (timeout_ms >= 0)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
log_warning(_("wait_connection_availability(): timeout reached"));
|
||||
log_warning(_("wait_connection_availability(): timeout (%i secs) reached"), timeout);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
@@ -510,7 +510,7 @@ bool get_tablespace_name_by_location(PGconn *conn, const char *location, char *
|
||||
|
||||
/* asynchronous query functions */
|
||||
bool cancel_query(PGconn *conn, int timeout);
|
||||
int wait_connection_availability(PGconn *conn, long long timeout);
|
||||
int wait_connection_availability(PGconn *conn, int timeout);
|
||||
|
||||
/* node availability functions */
|
||||
bool is_server_available(const char *conninfo);
|
||||
|
||||
@@ -160,7 +160,17 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
|
||||
the absence of a running <application>repmgrd</application>.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Add option <option>connection_check_type</option> to enable selection of the method
|
||||
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||
</para>
|
||||
<para>
|
||||
Possible values are <literal>ping</literal> (default; uses <command>PQping()</command> to
|
||||
determine server availability); and <literal>connection</literal> (determines server availability
|
||||
by executing an SQL statement on the node via the existing connection).
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
@@ -101,6 +101,32 @@
|
||||
<command>repmgr standby follow</command> will result in the node continuing to follow
|
||||
the original primary.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<indexterm>
|
||||
<primary>connection_check_type</primary>
|
||||
</indexterm>
|
||||
Additionally, the option <option>connection_check_type</option> enables selection of the method
|
||||
<application>repmgrd</application> uses to determine whether the upstream node is available.
|
||||
</para>
|
||||
<para>
|
||||
Possible values are:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>ping</literal> (default) - uses <command>PQping()</command> to
|
||||
determine server availability
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
<literal>connection</literal> - determines server availability
|
||||
by executing an SQL statement on the node via the existing connection
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
|
||||
</sect2>
|
||||
|
||||
<sect2 id="postgresql-service-configuration">
|
||||
|
||||
@@ -285,6 +285,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
# a value of zero prevents the node being promoted to primary
|
||||
# (default: 100)
|
||||
|
||||
#connection_check_type=ping # How to check availability of the upstream node; valid options:
|
||||
# 'ping': use PQping() to check if the node is accepting connections
|
||||
# 'connection': execute a throwaway query on the current connection
|
||||
#reconnect_attempts=6 # Number of attempts which will be made to reconnect to an unreachable
|
||||
# primary (or other upstream node)
|
||||
#reconnect_interval=10 # Interval between attempts to reconnect to an unreachable
|
||||
|
||||
@@ -831,7 +831,7 @@ monitor_streaming_standby(void)
|
||||
while (true)
|
||||
{
|
||||
log_verbose(LOG_DEBUG, "checking %s", upstream_node_info.conninfo);
|
||||
if (is_server_available(upstream_node_info.conninfo) == true)
|
||||
if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == true)
|
||||
{
|
||||
set_upstream_last_seen(local_conn);
|
||||
}
|
||||
@@ -1030,9 +1030,10 @@ monitor_streaming_standby(void)
|
||||
upstream_node_info.node_id,
|
||||
degraded_monitoring_elapsed);
|
||||
|
||||
if (is_server_available(upstream_node_info.conninfo) == true)
|
||||
if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == true)
|
||||
{
|
||||
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||
if (config_file_options.connection_check_type == CHECK_PING)
|
||||
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||
|
||||
if (PQstatus(upstream_conn) == CONNECTION_OK)
|
||||
{
|
||||
@@ -1604,7 +1605,7 @@ monitor_streaming_witness(void)
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (is_server_available(upstream_node_info.conninfo) == false)
|
||||
if (check_upstream_connection(upstream_conn, upstream_node_info.conninfo) == false)
|
||||
{
|
||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||
{
|
||||
@@ -1693,9 +1694,10 @@ monitor_streaming_witness(void)
|
||||
upstream_node_info.node_id,
|
||||
degraded_monitoring_elapsed);
|
||||
|
||||
if (is_server_available(upstream_node_info.conninfo) == true)
|
||||
if (check_upstream_connection(primary_conn, upstream_node_info.conninfo) == true)
|
||||
{
|
||||
primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||
if (config_file_options.connection_check_type == CHECK_PING)
|
||||
primary_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
{
|
||||
|
||||
52
repmgrd.c
52
repmgrd.c
@@ -818,6 +818,58 @@ show_help(void)
|
||||
}
|
||||
|
||||
|
||||
/*
 * check_upstream_connection()
 *
 * Report whether the upstream node is available, using the method selected
 * by config_file_options.connection_check_type.
 *
 * conn     - existing connection to the upstream node; only used when the
 *            check type is not CHECK_PING
 * conninfo - conninfo string for the upstream node; only used when the
 *            check type is CHECK_PING
 *
 * With CHECK_PING the result of is_server_available(conninfo) is returned
 * directly. Otherwise a "SELECT 1" is sent asynchronously on "conn" and
 * must complete within "async_query_timeout". If the connection is not OK,
 * or any step of the check fails, the connection is reset with PQreset()
 * and the check is attempted one more time.
 *
 * Returns true if the node is considered available, false if the check
 * still fails after the single retry.
 */
bool
|
||||
check_upstream_connection(PGconn *conn, const char *conninfo)
|
||||
{
|
||||
/* Check the connection status twice in case it changes after reset */
|
||||
bool twice = false;
|
||||
|
||||
if (config_file_options.connection_check_type == CHECK_PING)
|
||||
return is_server_available(conninfo);
|
||||
|
||||
/* at most two passes: "twice" is set after the first reset */
for (;;)
|
||||
{
|
||||
if (PQstatus(conn) != CONNECTION_OK)
|
||||
{
|
||||
/* already reset once and the connection is still not OK: give up */
if (twice)
|
||||
return false;
|
||||
PQreset(conn); /* reconnect */
|
||||
twice = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* cancel any query still running on this connection before probing */
if (!cancel_query(conn, config_file_options.async_query_timeout))
|
||||
goto failed;
|
||||
|
||||
/* any return value other than 1 is an error (0) or a timeout (-1) */
if (wait_connection_availability(conn, config_file_options.async_query_timeout) != 1)
|
||||
goto failed;
|
||||
|
||||
/* execute a simple query to verify connection availability */
|
||||
if (PQsendQuery(conn, "SELECT 1") == 0)
|
||||
{
|
||||
log_warning(_("unable to send query to upstream"));
|
||||
log_detail("%s", PQerrorMessage(conn));
|
||||
goto failed;
|
||||
}
|
||||
|
||||
/* wait for the probe query itself to finish within the timeout */
if (wait_connection_availability(conn, config_file_options.async_query_timeout) != 1)
|
||||
goto failed;
|
||||
|
||||
/* every step succeeded: leave the loop and report success */
break;
|
||||
|
||||
/* reached only via "goto" from one of the failed steps above */
failed:
|
||||
/* retry once */
|
||||
if (twice)
|
||||
return false;
|
||||
PQreset(conn); /* reconnect */
|
||||
twice = true;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
try_reconnect(PGconn **conn, t_node_info *node_info)
|
||||
{
|
||||
|
||||
@@ -23,6 +23,7 @@ extern PGconn *local_conn;
|
||||
extern bool startup_event_logged;
|
||||
extern char pid_file[MAXPGPATH];
|
||||
|
||||
bool check_upstream_connection(PGconn *conn, const char *conninfo);
|
||||
void try_reconnect(PGconn **conn, t_node_info *node_info);
|
||||
|
||||
int calculate_elapsed(instr_time start_time);
|
||||
@@ -31,5 +32,4 @@ const char *print_monitoring_state(MonitoringState monitoring_state);
|
||||
void update_registration(PGconn *conn);
|
||||
void terminate(int retval);
|
||||
|
||||
|
||||
#endif /* _REPMGRD_H_ */
|
||||
|
||||
Reference in New Issue
Block a user