From 462d4464771f14b919fea09dc4d7b1c044a979cd Mon Sep 17 00:00:00 2001 From: Gianni Ciolli Date: Mon, 10 Aug 2015 15:04:48 +0100 Subject: [PATCH] Bug #90 fix (autofailover with reconnect_attemps > 1). The main change is that now check_connection requires a conninfo parameter, and the connection object has type (PGconn **) so it can be replaced by check_connection if needed. The bug was caused by the fact that the first failure resulted in *conn == NULL, so that subsequent checks of the upstream connection were failing irrespectively of the actual state of the upstream node. Now, when *conn == NULL, check_connection will use conninfo to establish a new connection and place it into *conn. We introduce a new INTERNAL_ERROR code for the case when they are both NULL. In passing, we also reworded a confusing error message, distinguishing a timeout from the actual elapsed time. --- errcode.h | 1 + repmgrd.c | 47 +++++++++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/errcode.h b/errcode.h index 00758f12..a67f513c 100644 --- a/errcode.h +++ b/errcode.h @@ -35,5 +35,6 @@ #define ERR_BAD_SSH 12 #define ERR_SYS_FAILURE 13 #define ERR_BAD_BASEBACKUP 14 +#define ERR_INTERNAL 15 #endif /* _ERRCODE_H_ */ diff --git a/repmgrd.c b/repmgrd.c index 9cbca7db..e20ced43 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -88,7 +88,7 @@ static void check_node_configuration(void); static void standby_monitor(void); static void witness_monitor(void); -static bool check_connection(PGconn *conn, const char *type); +static bool check_connection(PGconn **conn, const char *type, const char *conninfo); static bool set_local_node_failed(void); static bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id); @@ -353,7 +353,7 @@ main(int argc, char **argv) */ do { - if (check_connection(master_conn, "master")) + if (check_connection(&master_conn, "master", NULL)) { sleep(local_options.monitor_interval_secs); } @@ -536,7 +536,7 @@ witness_monitor(void) * of a missing master and promotion of a standby by that standby's * repmgrd, so we'll loop for a while before giving up. */ - connection_ok = check_connection(master_conn, "master"); + connection_ok = check_connection(&master_conn, "master", NULL); if(connection_ok == false) { @@ -693,6 +693,7 @@ standby_monitor(void) bool did_retry = false; PGconn *upstream_conn; + char upstream_conninfo[MAXCONNINFO]; int upstream_node_id; t_node_info upstream_node; @@ -704,7 +705,7 @@ standby_monitor(void) * no point in doing much else anyway */ - if (!check_connection(my_local_conn, "standby")) + if (!check_connection(&my_local_conn, "standby", NULL)) { PQExpBufferData errmsg; @@ -730,7 +731,7 @@ standby_monitor(void) upstream_conn = get_upstream_connection(my_local_conn, local_options.cluster_name, local_options.node, - &upstream_node_id, NULL); + &upstream_node_id, upstream_conninfo); type = upstream_node_id == master_options.node ? "master" @@ -742,11 +743,11 @@ standby_monitor(void) * we cannot reconnect, try to get a new upstream node. */ - check_connection(upstream_conn, type); /* this takes up to - * local_options.reconnect_attempts - * local_options.reconnect_intvl seconds - */ - + check_connection(&upstream_conn, type, upstream_conninfo); + /* + * This takes up to local_options.reconnect_attempts * + * local_options.reconnect_intvl seconds + */ if (PQstatus(upstream_conn) != CONNECTION_OK) { @@ -879,7 +880,7 @@ standby_monitor(void) log_err(_("standby node has disappeared, trying to reconnect...\n")); did_retry = true; - if (!check_connection(my_local_conn, "standby")) + if (!check_connection(&my_local_conn, "standby", NULL)) { set_local_node_failed(); terminate(0); @@ -944,8 +945,9 @@ standby_monitor(void) master_conn = get_master_connection(my_local_conn, local_options.cluster_name, &master_options.node, NULL); - } + if (PQstatus(master_conn) != CONNECTION_OK) + PQreset(master_conn); /* * Cancel any query that is still being executed, so i can insert the @@ -1592,7 +1594,7 @@ do_upstream_standby_failover(t_node_info upstream_node) * Verify that we can still talk to the cluster master even though * node upstream is not available */ - if (!check_connection(master_conn, "master")) + if (!check_connection(&master_conn, "master", NULL)) { log_err(_("do_upstream_standby_failover(): Unable to connect to last known master node\n")); return false; @@ -1681,7 +1683,7 @@ do_upstream_standby_failover(t_node_info upstream_node) static bool -check_connection(PGconn *conn, const char *type) +check_connection(PGconn **conn, const char *type, const char *conninfo) { int connection_retries; @@ -1692,7 +1694,16 @@ check_connection(PGconn *conn, const char *type) */ for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++) { - if (!is_pgup(conn, local_options.master_response_timeout)) + if (*conn == NULL) + { + if (conninfo == NULL) + { + log_err("INTERNAL ERROR: *conn == NULL && conninfo == NULL"); + terminate(ERR_INTERNAL); + } + *conn = establish_db_connection(conninfo, false); + } + if (!is_pgup(*conn, local_options.master_response_timeout)) { log_warning(_("connection to %s has been lost, trying to recover... %i seconds before failover decision\n"), type, @@ -1710,9 +1721,9 @@ check_connection(PGconn *conn, const char *type) } } - if (!is_pgup(conn, local_options.master_response_timeout)) + if (!is_pgup(*conn, local_options.master_response_timeout)) { - log_err(_("unable to reconnect to %s after %i seconds...\n"), + log_err(_("unable to reconnect to %s (timeout %i seconds)...\n"), type, local_options.master_response_timeout ); @@ -1740,7 +1751,7 @@ set_local_node_failed(void) int active_master_node_id = NODE_NOT_FOUND; char master_conninfo[MAXLEN]; - if (!check_connection(master_conn, "master")) + if (!check_connection(&master_conn, "master", NULL)) { log_err(_("set_local_node_failed(): Unable to connect to last known master node\n")); return false;