From b0b44a157f2b546f9fc1f299a53d80f76cbd4a49 Mon Sep 17 00:00:00 2001 From: Jaime Casanova Date: Wed, 10 Jul 2013 09:53:45 -0500 Subject: [PATCH] If PQcancel() fails, consider it as if the master is failing. Because PQcancel() establish a new synchronous connection to the database, if it fails it means something wrong has happenned with master. So instead of just ignore the failure, CancelQuery() now reports a failure condition so we can detect master's death in that situation. This is very important specially when only postmaster crashes but other children/backend connections are still there. Because the children connection won't fail and CancelQuery() failure is our only indication of something wrong happenning. Currently we just ignore the PQcancel() failure which leads us to a situation in which we just loop forever trying to cancel the async query. Reported by: Martin Euser Problem analyzed and bug spotted by: Andres Freund Patch by: Jaime Casanova --- dbutils.c | 28 ++++++++++++++++++++++------ dbutils.h | 2 +- repmgrd.c | 8 +++++--- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/dbutils.c b/dbutils.c index 8ed67c2e..ca7ac4bc 100644 --- a/dbutils.c +++ b/dbutils.c @@ -146,7 +146,8 @@ is_pgup(PGconn *conn, int timeout) /* * Send a SELECT 1 just to check if the connection is OK */ - CancelQuery(conn, timeout); + if (!CancelQuery(conn, timeout)) + goto failed; if (wait_connection_availability(conn, timeout) != 1) goto failed; @@ -429,18 +430,33 @@ wait_connection_availability(PGconn *conn, int timeout) } -void +bool CancelQuery(PGconn *conn, int timeout) { char errbuf[ERRBUFF_SIZE]; PGcancel *pgcancel; - wait_connection_availability(conn, timeout); + if (wait_connection_availability(conn, timeout) != 1) + return false; pgcancel = PQgetCancel(conn); - if (!pgcancel || PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0) - log_warning(_("Can't stop current query: %s\n"), errbuf); + if (pgcancel != NULL) + { + /* + * PQcancel can only return 0 if socket()/connect()/send() + * fails, in any of those cases we can assume something + * bad happened to the connection + */ + if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0) + { + log_warning(_("Can't stop current query: %s\n"), errbuf); + PQfreeCancel(pgcancel); + return false; + } - PQfreeCancel(pgcancel); + PQfreeCancel(pgcancel); + } + + return true; } diff --git a/dbutils.h b/dbutils.h index 90a6d390..8d983048 100644 --- a/dbutils.h +++ b/dbutils.h @@ -37,5 +37,5 @@ PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster, int *master_id, char *master_conninfo_out); int wait_connection_availability(PGconn *conn, int timeout); -void CancelQuery(PGconn *conn, int timeout); +bool CancelQuery(PGconn *conn, int timeout); #endif diff --git a/repmgrd.c b/repmgrd.c index 5cd18e2a..a15cbef2 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -111,7 +111,7 @@ static void setup_event_handlers(void); #define CloseConnections() \ if (PQisBusy(primaryConn) == 1) \ - CancelQuery(primaryConn, local_options.master_response_timeout); \ + (void) CancelQuery(primaryConn, local_options.master_response_timeout); \ if (myLocalConn != NULL) \ PQfinish(myLocalConn); \ if (primaryConn != NULL && primaryConn != myLocalConn) \ @@ -376,7 +376,8 @@ WitnessMonitor(void) * Cancel any query that is still being executed, * so i can insert the current record */ - CancelQuery(primaryConn, local_options.master_response_timeout); + if (!CancelQuery(primaryConn, local_options.master_response_timeout)) + return; if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1) return; @@ -497,7 +498,8 @@ StandbyMonitor(void) * Cancel any query that is still being executed, * so i can insert the current record */ - CancelQuery(primaryConn, local_options.master_response_timeout); + if (!CancelQuery(primaryConn, local_options.master_response_timeout)) + return; if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1) return;