Random instance selection (#136)

* wip

* revert some

* revert more

* poor-man's integration test

* remove test

* fmt

* --workspace

* fix build

* fix integration test

* another stab

* log

* run after integration

* cargo test after integration

* revert

* revert more

* Refactor + clean up

* more clean up
This commit is contained in:
Mostafa Abdelraouf
2022-08-22 00:15:20 -05:00
committed by GitHub
parent 5948fef6cf
commit 5f5b5e2543
3 changed files with 46 additions and 83 deletions

View File

@@ -499,7 +499,6 @@ where
// The query router determines where the query is going to go, // The query router determines where the query is going to go,
// e.g. primary, replica, which shard. // e.g. primary, replica, which shard.
let mut query_router = QueryRouter::new(); let mut query_router = QueryRouter::new();
let mut round_robin = rand::random();
// Our custom protocol loop. // Our custom protocol loop.
// We expect the client to either start a transaction with regular queries // We expect the client to either start a transaction with regular queries
@@ -631,12 +630,7 @@ where
// Grab a server from the pool. // Grab a server from the pool.
let connection = match pool let connection = match pool
.get( .get(query_router.shard(), query_router.role(), self.process_id)
query_router.shard(),
query_router.role(),
self.process_id,
round_robin,
)
.await .await
{ {
Ok(conn) => { Ok(conn) => {
@@ -655,8 +649,6 @@ where
let address = connection.1; let address = connection.1;
let server = &mut *reference; let server = &mut *reference;
round_robin += 1;
// Server is assigned to the client in case the client wants to // Server is assigned to the client in case the client wants to
// cancel a query later. // cancel a query later.
server.claim(self.process_id, self.secret_key); server.claim(self.process_id, self.secret_key);

View File

@@ -63,7 +63,7 @@ pub struct Address {
pub shard: usize, pub shard: usize,
pub database: String, pub database: String,
pub role: Role, pub role: Role,
pub replica_number: usize, pub instance_index: usize,
pub username: String, pub username: String,
pub poolname: String, pub poolname: String,
} }
@@ -75,7 +75,7 @@ impl Default for Address {
host: String::from("127.0.0.1"), host: String::from("127.0.0.1"),
port: String::from("5432"), port: String::from("5432"),
shard: 0, shard: 0,
replica_number: 0, instance_index: 0,
database: String::from("database"), database: String::from("database"),
role: Role::Replica, role: Role::Replica,
username: String::from("username"), username: String::from("username"),
@@ -92,7 +92,7 @@ impl Address {
Role::Replica => format!( Role::Replica => format!(
"{}_shard_{}_replica_{}", "{}_shard_{}_replica_{}",
self.poolname, self.shard, self.replica_number self.poolname, self.shard, self.instance_index
), ),
} }
} }

View File

@@ -6,6 +6,8 @@ use chrono::naive::NaiveDateTime;
use log::{debug, error, info, warn}; use log::{debug, error, info, warn};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use parking_lot::{Mutex, RwLock}; use parking_lot::{Mutex, RwLock};
use rand::seq::SliceRandom;
use rand::thread_rng;
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::time::Instant; use std::time::Instant;
@@ -118,7 +120,7 @@ impl ConnectionPool {
host: server.0.clone(), host: server.0.clone(),
port: server.1.to_string(), port: server.1.to_string(),
role: role, role: role,
replica_number, instance_index: replica_number,
shard: shard_idx.parse::<usize>().unwrap(), shard: shard_idx.parse::<usize>().unwrap(),
username: user_info.username.clone(), username: user_info.username.clone(),
poolname: pool_name.clone(), poolname: pool_name.clone(),
@@ -201,16 +203,9 @@ impl ConnectionPool {
/// the pooler starts up. /// the pooler starts up.
async fn validate(&mut self) -> Result<(), Error> { async fn validate(&mut self) -> Result<(), Error> {
let mut server_infos = Vec::new(); let mut server_infos = Vec::new();
let stats = self.stats.clone();
for shard in 0..self.shards() { for shard in 0..self.shards() {
let mut round_robin = 0; for index in 0..self.servers(shard) {
let connection = match self.databases[shard][index].get().await {
for _ in 0..self.servers(shard) {
// To keep stats consistent.
let fake_process_id = 0;
let connection = match self.get(shard, None, fake_process_id, round_robin).await {
Ok(conn) => conn, Ok(conn) => conn,
Err(err) => { Err(err) => {
error!("Shard {} down or misconfigured: {:?}", shard, err); error!("Shard {} down or misconfigured: {:?}", shard, err);
@@ -218,25 +213,20 @@ impl ConnectionPool {
} }
}; };
let proxy = connection.0; let proxy = connection;
let address = connection.1;
let server = &*proxy; let server = &*proxy;
let server_info = server.server_info(); let server_info = server.server_info();
stats.client_disconnecting(fake_process_id, address.id);
if server_infos.len() > 0 { if server_infos.len() > 0 {
// Compare against the last server checked. // Compare against the last server checked.
if server_info != server_infos[server_infos.len() - 1] { if server_info != server_infos[server_infos.len() - 1] {
warn!( warn!(
"{:?} has different server configuration than the last server", "{:?} has different server configuration than the last server",
address proxy.address()
); );
} }
} }
server_infos.push(server_info); server_infos.push(server_info);
round_robin += 1;
} }
} }
@@ -254,58 +244,31 @@ impl ConnectionPool {
/// Get a connection from the pool. /// Get a connection from the pool.
pub async fn get( pub async fn get(
&self, &self,
shard: usize, // shard number shard: usize, // shard number
role: Option<Role>, // primary or replica role: Option<Role>, // primary or replica
process_id: i32, // client id process_id: i32, // client id
mut round_robin: usize, // round robin offset
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> { ) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
let now = Instant::now(); let now = Instant::now();
let addresses = &self.addresses[shard]; let mut candidates: Vec<Address> = self.addresses[shard]
.clone()
.into_iter()
.filter(|address| address.role == role)
.collect();
let mut allowed_attempts = match role { // Random load balancing
// Primary-specific queries get one attempt, if the primary is down, candidates.shuffle(&mut thread_rng());
// nothing we should do about it I think. It's dangerous to retry
// write queries.
Some(Role::Primary) => 1,
// Replicas get to try as many times as there are replicas
// and connections in the pool.
_ => addresses.len(),
};
debug!("Allowed attempts for {:?}: {}", role, allowed_attempts);
let exists = match role {
Some(role) => addresses.iter().filter(|addr| addr.role == role).count() > 0,
None => true,
};
if !exists {
error!("Requested role {:?}, but none are configured", role);
return Err(Error::BadConfig);
}
let healthcheck_timeout = get_config().general.healthcheck_timeout; let healthcheck_timeout = get_config().general.healthcheck_timeout;
let healthcheck_delay = get_config().general.healthcheck_delay as u128; let healthcheck_delay = get_config().general.healthcheck_delay as u128;
while allowed_attempts > 0 { while !candidates.is_empty() {
// Round-robin replicas. // Get the next candidate
round_robin += 1; let address = match candidates.pop() {
Some(address) => address,
None => break,
};
let index = round_robin % addresses.len(); if self.is_banned(&address, address.shard, role) {
let address = &addresses[index];
// Make sure you're getting a primary or a replica
// as per request. If no specific role is requested, the first
// available will be chosen.
if address.role != role {
continue;
}
allowed_attempts -= 1;
// Don't attempt to connect to banned servers.
if self.is_banned(address, shard, role) {
continue; continue;
} }
@@ -313,11 +276,14 @@ impl ConnectionPool {
self.stats.client_waiting(process_id, address.id); self.stats.client_waiting(process_id, address.id);
// Check if we can connect // Check if we can connect
let mut conn = match self.databases[shard][index].get().await { let mut conn = match self.databases[address.shard][address.instance_index]
.get()
.await
{
Ok(conn) => conn, Ok(conn) => conn,
Err(err) => { Err(err) => {
error!("Banning replica {}, error: {:?}", index, err); error!("Banning instance {:?}, error: {:?}", address, err);
self.ban(address, shard, process_id); self.ban(&address, address.shard, process_id);
self.stats.client_disconnecting(process_id, address.id); self.stats.client_disconnecting(process_id, address.id);
self.stats self.stats
.checkout_time(now.elapsed().as_micros(), process_id, address.id); .checkout_time(now.elapsed().as_micros(), process_id, address.id);
@@ -359,29 +325,34 @@ impl ConnectionPool {
} }
// Health check failed. // Health check failed.
Err(_) => { Err(err) => {
error!("Banning replica {} because of failed health check", index); error!(
"Banning instance {:?} because of failed health check, {:?}",
address, err
);
// Don't leave a bad connection in the pool. // Don't leave a bad connection in the pool.
server.mark_bad(); server.mark_bad();
self.ban(address, shard, process_id); self.ban(&address, address.shard, process_id);
continue; continue;
} }
}, },
// Health check timed out. // Health check timed out.
Err(_) => { Err(err) => {
error!("Banning replica {} because of health check timeout", index); error!(
"Banning instance {:?} because of health check timeout, {:?}",
address, err
);
// Don't leave a bad connection in the pool. // Don't leave a bad connection in the pool.
server.mark_bad(); server.mark_bad();
self.ban(address, shard, process_id); self.ban(&address, address.shard, process_id);
continue; continue;
} }
} }
} }
return Err(Error::AllServersDown); return Err(Error::AllServersDown);
} }