If PQcancel() fails, consider it as if the master is failing.

Because PQcancel() establish a new synchronous connection to the
database, if it fails it means something wrong has happenned with
master. So instead of just ignore the failure, CancelQuery() now
reports a failure condition so we can detect master's death in
that situation.

This is very important specially when only postmaster crashes but
other children/backend connections are still there. Because the
children connection won't fail and CancelQuery() failure is our
only indication of something wrong happenning.
Currently we just ignore the PQcancel() failure which leads us to
a situation in which we just loop forever
trying to cancel the async query.

Reported by: Martin Euser <martin.euser@nl.abnamro.com>
Problem analyzed and bug spotted by: Andres Freund <andres@2ndquadrant.com>
Patch by: Jaime Casanova <jaime@2ndquadrant.com>
This commit is contained in:
Jaime Casanova
2013-07-10 09:53:45 -05:00
parent 49a2531930
commit b0b44a157f
3 changed files with 28 additions and 10 deletions

View File

@@ -146,7 +146,8 @@ is_pgup(PGconn *conn, int timeout)
/*
* Send a SELECT 1 just to check if the connection is OK
*/
CancelQuery(conn, timeout);
if (!CancelQuery(conn, timeout))
goto failed;
if (wait_connection_availability(conn, timeout) != 1)
goto failed;
@@ -429,18 +430,33 @@ wait_connection_availability(PGconn *conn, int timeout)
}
void
bool
CancelQuery(PGconn *conn, int timeout)
{
char errbuf[ERRBUFF_SIZE];
PGcancel *pgcancel;
wait_connection_availability(conn, timeout);
if (wait_connection_availability(conn, timeout) != 1)
return false;
pgcancel = PQgetCancel(conn);
if (!pgcancel || PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
log_warning(_("Can't stop current query: %s\n"), errbuf);
if (pgcancel != NULL)
{
/*
* PQcancel can only return 0 if socket()/connect()/send()
* fails, in any of those cases we can assume something
* bad happened to the connection
*/
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
{
log_warning(_("Can't stop current query: %s\n"), errbuf);
PQfreeCancel(pgcancel);
return false;
}
PQfreeCancel(pgcancel);
PQfreeCancel(pgcancel);
}
return true;
}

View File

@@ -37,5 +37,5 @@ PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
int *master_id, char *master_conninfo_out);
int wait_connection_availability(PGconn *conn, int timeout);
void CancelQuery(PGconn *conn, int timeout);
bool CancelQuery(PGconn *conn, int timeout);
#endif

View File

@@ -111,7 +111,7 @@ static void setup_event_handlers(void);
#define CloseConnections() \
if (PQisBusy(primaryConn) == 1) \
CancelQuery(primaryConn, local_options.master_response_timeout); \
(void) CancelQuery(primaryConn, local_options.master_response_timeout); \
if (myLocalConn != NULL) \
PQfinish(myLocalConn); \
if (primaryConn != NULL && primaryConn != myLocalConn) \
@@ -376,7 +376,8 @@ WitnessMonitor(void)
* Cancel any query that is still being executed,
* so i can insert the current record
*/
CancelQuery(primaryConn, local_options.master_response_timeout);
if (!CancelQuery(primaryConn, local_options.master_response_timeout))
return;
if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
return;
@@ -497,7 +498,8 @@ StandbyMonitor(void)
* Cancel any query that is still being executed,
* so i can insert the current record
*/
CancelQuery(primaryConn, local_options.master_response_timeout);
if (!CancelQuery(primaryConn, local_options.master_response_timeout))
return;
if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
return;