mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 07:06:30 +00:00
If PQcancel() fails, consider it as if the master is failing.
Because PQcancel() establish a new synchronous connection to the database, if it fails it means something wrong has happenned with master. So instead of just ignore the failure, CancelQuery() now reports a failure condition so we can detect master's death in that situation. This is very important specially when only postmaster crashes but other children/backend connections are still there. Because the children connection won't fail and CancelQuery() failure is our only indication of something wrong happenning. Currently we just ignore the PQcancel() failure which leads us to a situation in which we just loop forever trying to cancel the async query. Reported by: Martin Euser <martin.euser@nl.abnamro.com> Problem analyzed and bug spotted by: Andres Freund <andres@2ndquadrant.com> Patch by: Jaime Casanova <jaime@2ndquadrant.com>
This commit is contained in:
28
dbutils.c
28
dbutils.c
@@ -146,7 +146,8 @@ is_pgup(PGconn *conn, int timeout)
|
||||
/*
|
||||
* Send a SELECT 1 just to check if the connection is OK
|
||||
*/
|
||||
CancelQuery(conn, timeout);
|
||||
if (!CancelQuery(conn, timeout))
|
||||
goto failed;
|
||||
if (wait_connection_availability(conn, timeout) != 1)
|
||||
goto failed;
|
||||
|
||||
@@ -429,18 +430,33 @@ wait_connection_availability(PGconn *conn, int timeout)
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
bool
|
||||
CancelQuery(PGconn *conn, int timeout)
|
||||
{
|
||||
char errbuf[ERRBUFF_SIZE];
|
||||
PGcancel *pgcancel;
|
||||
|
||||
wait_connection_availability(conn, timeout);
|
||||
if (wait_connection_availability(conn, timeout) != 1)
|
||||
return false;
|
||||
|
||||
pgcancel = PQgetCancel(conn);
|
||||
|
||||
if (!pgcancel || PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
||||
log_warning(_("Can't stop current query: %s\n"), errbuf);
|
||||
if (pgcancel != NULL)
|
||||
{
|
||||
/*
|
||||
* PQcancel can only return 0 if socket()/connect()/send()
|
||||
* fails, in any of those cases we can assume something
|
||||
* bad happened to the connection
|
||||
*/
|
||||
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
|
||||
{
|
||||
log_warning(_("Can't stop current query: %s\n"), errbuf);
|
||||
PQfreeCancel(pgcancel);
|
||||
return false;
|
||||
}
|
||||
|
||||
PQfreeCancel(pgcancel);
|
||||
PQfreeCancel(pgcancel);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -37,5 +37,5 @@ PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
|
||||
int *master_id, char *master_conninfo_out);
|
||||
|
||||
int wait_connection_availability(PGconn *conn, int timeout);
|
||||
void CancelQuery(PGconn *conn, int timeout);
|
||||
bool CancelQuery(PGconn *conn, int timeout);
|
||||
#endif
|
||||
|
||||
@@ -111,7 +111,7 @@ static void setup_event_handlers(void);
|
||||
|
||||
#define CloseConnections() \
|
||||
if (PQisBusy(primaryConn) == 1) \
|
||||
CancelQuery(primaryConn, local_options.master_response_timeout); \
|
||||
(void) CancelQuery(primaryConn, local_options.master_response_timeout); \
|
||||
if (myLocalConn != NULL) \
|
||||
PQfinish(myLocalConn); \
|
||||
if (primaryConn != NULL && primaryConn != myLocalConn) \
|
||||
@@ -376,7 +376,8 @@ WitnessMonitor(void)
|
||||
* Cancel any query that is still being executed,
|
||||
* so i can insert the current record
|
||||
*/
|
||||
CancelQuery(primaryConn, local_options.master_response_timeout);
|
||||
if (!CancelQuery(primaryConn, local_options.master_response_timeout))
|
||||
return;
|
||||
if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
|
||||
return;
|
||||
|
||||
@@ -497,7 +498,8 @@ StandbyMonitor(void)
|
||||
* Cancel any query that is still being executed,
|
||||
* so i can insert the current record
|
||||
*/
|
||||
CancelQuery(primaryConn, local_options.master_response_timeout);
|
||||
if (!CancelQuery(primaryConn, local_options.master_response_timeout))
|
||||
return;
|
||||
if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
|
||||
return;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user