mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Bug #90 fix (autofailover with reconnect_attempts > 1).
The main change is that check_connection now requires a conninfo parameter, and the connection object has type (PGconn **) so that check_connection can replace it if needed. The bug was caused by the fact that the first failure resulted in *conn == NULL, so subsequent checks of the upstream connection failed irrespective of the actual state of the upstream node. Now, when *conn == NULL, check_connection uses conninfo to establish a new connection and places it into *conn. We introduce a new ERR_INTERNAL error code for the case when both *conn and conninfo are NULL. In passing, we also reworded a confusing error message, distinguishing a timeout from the actual elapsed time.
This commit is contained in:
@@ -35,5 +35,6 @@
|
||||
#define ERR_BAD_SSH 12
|
||||
#define ERR_SYS_FAILURE 13
|
||||
#define ERR_BAD_BASEBACKUP 14
|
||||
#define ERR_INTERNAL 15
|
||||
|
||||
#endif /* _ERRCODE_H_ */
|
||||
|
||||
47
repmgrd.c
47
repmgrd.c
@@ -88,7 +88,7 @@ static void check_node_configuration(void);
|
||||
|
||||
static void standby_monitor(void);
|
||||
static void witness_monitor(void);
|
||||
static bool check_connection(PGconn *conn, const char *type);
|
||||
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
|
||||
static bool set_local_node_failed(void);
|
||||
|
||||
static bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
|
||||
@@ -353,7 +353,7 @@ main(int argc, char **argv)
|
||||
*/
|
||||
do
|
||||
{
|
||||
if (check_connection(master_conn, "master"))
|
||||
if (check_connection(&master_conn, "master", NULL))
|
||||
{
|
||||
sleep(local_options.monitor_interval_secs);
|
||||
}
|
||||
@@ -536,7 +536,7 @@ witness_monitor(void)
|
||||
* of a missing master and promotion of a standby by that standby's
|
||||
* repmgrd, so we'll loop for a while before giving up.
|
||||
*/
|
||||
connection_ok = check_connection(master_conn, "master");
|
||||
connection_ok = check_connection(&master_conn, "master", NULL);
|
||||
|
||||
if(connection_ok == false)
|
||||
{
|
||||
@@ -693,6 +693,7 @@ standby_monitor(void)
|
||||
bool did_retry = false;
|
||||
|
||||
PGconn *upstream_conn;
|
||||
char upstream_conninfo[MAXCONNINFO];
|
||||
int upstream_node_id;
|
||||
t_node_info upstream_node;
|
||||
|
||||
@@ -704,7 +705,7 @@ standby_monitor(void)
|
||||
* no point in doing much else anyway
|
||||
*/
|
||||
|
||||
if (!check_connection(my_local_conn, "standby"))
|
||||
if (!check_connection(&my_local_conn, "standby", NULL))
|
||||
{
|
||||
PQExpBufferData errmsg;
|
||||
|
||||
@@ -730,7 +731,7 @@ standby_monitor(void)
|
||||
upstream_conn = get_upstream_connection(my_local_conn,
|
||||
local_options.cluster_name,
|
||||
local_options.node,
|
||||
&upstream_node_id, NULL);
|
||||
&upstream_node_id, upstream_conninfo);
|
||||
|
||||
type = upstream_node_id == master_options.node
|
||||
? "master"
|
||||
@@ -742,11 +743,11 @@ standby_monitor(void)
|
||||
* we cannot reconnect, try to get a new upstream node.
|
||||
*/
|
||||
|
||||
check_connection(upstream_conn, type); /* this takes up to
|
||||
* local_options.reconnect_attempts
|
||||
* local_options.reconnect_intvl seconds
|
||||
*/
|
||||
|
||||
check_connection(&upstream_conn, type, upstream_conninfo);
|
||||
/*
|
||||
* This takes up to local_options.reconnect_attempts *
|
||||
* local_options.reconnect_intvl seconds
|
||||
*/
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
{
|
||||
@@ -879,7 +880,7 @@ standby_monitor(void)
|
||||
log_err(_("standby node has disappeared, trying to reconnect...\n"));
|
||||
did_retry = true;
|
||||
|
||||
if (!check_connection(my_local_conn, "standby"))
|
||||
if (!check_connection(&my_local_conn, "standby", NULL))
|
||||
{
|
||||
set_local_node_failed();
|
||||
terminate(0);
|
||||
@@ -944,8 +945,9 @@ standby_monitor(void)
|
||||
master_conn = get_master_connection(my_local_conn,
|
||||
local_options.cluster_name,
|
||||
&master_options.node, NULL);
|
||||
|
||||
}
|
||||
if (PQstatus(master_conn) != CONNECTION_OK)
|
||||
PQreset(master_conn);
|
||||
|
||||
/*
|
||||
* Cancel any query that is still being executed, so i can insert the
|
||||
@@ -1592,7 +1594,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
* Verify that we can still talk to the cluster master even though
|
||||
* node upstream is not available
|
||||
*/
|
||||
if (!check_connection(master_conn, "master"))
|
||||
if (!check_connection(&master_conn, "master", NULL))
|
||||
{
|
||||
log_err(_("do_upstream_standby_failover(): Unable to connect to last known master node\n"));
|
||||
return false;
|
||||
@@ -1681,7 +1683,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
|
||||
|
||||
static bool
|
||||
check_connection(PGconn *conn, const char *type)
|
||||
check_connection(PGconn **conn, const char *type, const char *conninfo)
|
||||
{
|
||||
int connection_retries;
|
||||
|
||||
@@ -1692,7 +1694,16 @@ check_connection(PGconn *conn, const char *type)
|
||||
*/
|
||||
for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
|
||||
{
|
||||
if (!is_pgup(conn, local_options.master_response_timeout))
|
||||
if (*conn == NULL)
|
||||
{
|
||||
if (conninfo == NULL)
|
||||
{
|
||||
log_err("INTERNAL ERROR: *conn == NULL && conninfo == NULL");
|
||||
terminate(ERR_INTERNAL);
|
||||
}
|
||||
*conn = establish_db_connection(conninfo, false);
|
||||
}
|
||||
if (!is_pgup(*conn, local_options.master_response_timeout))
|
||||
{
|
||||
log_warning(_("connection to %s has been lost, trying to recover... %i seconds before failover decision\n"),
|
||||
type,
|
||||
@@ -1710,9 +1721,9 @@ check_connection(PGconn *conn, const char *type)
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_pgup(conn, local_options.master_response_timeout))
|
||||
if (!is_pgup(*conn, local_options.master_response_timeout))
|
||||
{
|
||||
log_err(_("unable to reconnect to %s after %i seconds...\n"),
|
||||
log_err(_("unable to reconnect to %s (timeout %i seconds)...\n"),
|
||||
type,
|
||||
local_options.master_response_timeout
|
||||
);
|
||||
@@ -1740,7 +1751,7 @@ set_local_node_failed(void)
|
||||
int active_master_node_id = NODE_NOT_FOUND;
|
||||
char master_conninfo[MAXLEN];
|
||||
|
||||
if (!check_connection(master_conn, "master"))
|
||||
if (!check_connection(&master_conn, "master", NULL))
|
||||
{
|
||||
log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
|
||||
return false;
|
||||
|
||||
Reference in New Issue
Block a user