Health check delay (#118)

* initial commit of server check delay implementation

* fmt

* spelling

* Update name to last_healthcheck and some comments

* Moved server tested stat to after require_healthcheck check

* Make health check delay configurable

* Rename to last_activity

* Fix typo

* Add debug log for healthcheck

* Add address to debug log
This commit is contained in:
zainkabani
2022-08-11 17:42:40 -04:00
committed by GitHub
parent a262337ba5
commit f963b12821
8 changed files with 165 additions and 41 deletions

View File

@@ -251,7 +251,7 @@ impl ConnectionPool {
/// Get a connection from the pool.
pub async fn get(
&mut self,
&self,
shard: usize, // shard number
role: Option<Role>, // primary or replica
process_id: i32, // client id
@@ -283,6 +283,9 @@ impl ConnectionPool {
return Err(Error::BadConfig);
}
let healthcheck_timeout = get_config().general.healthcheck_timeout;
let healthcheck_delay = get_config().general.healthcheck_delay as u128;
while allowed_attempts > 0 {
// Round-robin replicas.
round_robin += 1;
@@ -312,7 +315,7 @@ impl ConnectionPool {
Ok(conn) => conn,
Err(err) => {
error!("Banning replica {}, error: {:?}", index, err);
self.ban(address, shard);
self.ban(address, shard, process_id);
self.stats.client_disconnecting(process_id, address.id);
self.stats
.checkout_time(now.elapsed().as_micros(), process_id, address.id);
@@ -322,8 +325,19 @@ impl ConnectionPool {
// // Check if this server is alive with a health check.
let server = &mut *conn;
let healthcheck_timeout = get_config().general.healthcheck_timeout;
// Will return error if timestamp is greater than current system time, which it should never be set to
let require_healthcheck =
server.last_activity().elapsed().unwrap().as_millis() > healthcheck_delay;
if !require_healthcheck {
self.stats
.checkout_time(now.elapsed().as_micros(), process_id, address.id);
self.stats.server_idle(conn.process_id(), address.id);
return Ok((conn, address.clone()));
}
debug!("Running health check for replica {}, {:?}", index, address);
self.stats.server_tested(server.process_id(), address.id);
match tokio::time::timeout(
@@ -348,10 +362,7 @@ impl ConnectionPool {
// Don't leave a bad connection in the pool.
server.mark_bad();
self.ban(address, shard);
self.stats.client_disconnecting(process_id, address.id);
self.stats
.checkout_time(now.elapsed().as_micros(), process_id, address.id);
self.ban(address, shard, process_id);
continue;
}
},
@@ -362,10 +373,7 @@ impl ConnectionPool {
// Don't leave a bad connection in the pool.
server.mark_bad();
self.ban(address, shard);
self.stats.client_disconnecting(process_id, address.id);
self.stats
.checkout_time(now.elapsed().as_micros(), process_id, address.id);
self.ban(address, shard, process_id);
continue;
}
}
@@ -377,7 +385,11 @@ impl ConnectionPool {
/// Ban an address (i.e. replica). It no longer will serve
/// traffic for any new transactions. Existing transactions on that replica
/// will finish successfully or error out to the clients.
pub fn ban(&self, address: &Address, shard: usize) {
pub fn ban(&self, address: &Address, shard: usize, process_id: i32) {
self.stats.client_disconnecting(process_id, address.id);
self.stats
.checkout_time(Instant::now().elapsed().as_micros(), process_id, address.id);
error!("Banning {:?}", address);
let now = chrono::offset::Utc::now().naive_utc();
let mut guard = self.banlist.write();