mirror of
https://github.com/postgresml/pgcat.git
synced 2026-03-27 18:56:30 +00:00
Random instance selection (#136)
* wip * revert some' * revert more * poor-man's integration test * remove test * fmt * --workspace * fix build * fix integration test * another stab * log * run after integration * cargo test after integration * revert * revert more * Refactor + clean up * more clean up
This commit is contained in:
committed by
GitHub
parent
5948fef6cf
commit
5f5b5e2543
@@ -499,7 +499,6 @@ where
|
|||||||
// The query router determines where the query is going to go,
|
// The query router determines where the query is going to go,
|
||||||
// e.g. primary, replica, which shard.
|
// e.g. primary, replica, which shard.
|
||||||
let mut query_router = QueryRouter::new();
|
let mut query_router = QueryRouter::new();
|
||||||
let mut round_robin = rand::random();
|
|
||||||
|
|
||||||
// Our custom protocol loop.
|
// Our custom protocol loop.
|
||||||
// We expect the client to either start a transaction with regular queries
|
// We expect the client to either start a transaction with regular queries
|
||||||
@@ -631,12 +630,7 @@ where
|
|||||||
|
|
||||||
// Grab a server from the pool.
|
// Grab a server from the pool.
|
||||||
let connection = match pool
|
let connection = match pool
|
||||||
.get(
|
.get(query_router.shard(), query_router.role(), self.process_id)
|
||||||
query_router.shard(),
|
|
||||||
query_router.role(),
|
|
||||||
self.process_id,
|
|
||||||
round_robin,
|
|
||||||
)
|
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(conn) => {
|
Ok(conn) => {
|
||||||
@@ -655,8 +649,6 @@ where
|
|||||||
let address = connection.1;
|
let address = connection.1;
|
||||||
let server = &mut *reference;
|
let server = &mut *reference;
|
||||||
|
|
||||||
round_robin += 1;
|
|
||||||
|
|
||||||
// Server is assigned to the client in case the client wants to
|
// Server is assigned to the client in case the client wants to
|
||||||
// cancel a query later.
|
// cancel a query later.
|
||||||
server.claim(self.process_id, self.secret_key);
|
server.claim(self.process_id, self.secret_key);
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ pub struct Address {
|
|||||||
pub shard: usize,
|
pub shard: usize,
|
||||||
pub database: String,
|
pub database: String,
|
||||||
pub role: Role,
|
pub role: Role,
|
||||||
pub replica_number: usize,
|
pub instance_index: usize,
|
||||||
pub username: String,
|
pub username: String,
|
||||||
pub poolname: String,
|
pub poolname: String,
|
||||||
}
|
}
|
||||||
@@ -75,7 +75,7 @@ impl Default for Address {
|
|||||||
host: String::from("127.0.0.1"),
|
host: String::from("127.0.0.1"),
|
||||||
port: String::from("5432"),
|
port: String::from("5432"),
|
||||||
shard: 0,
|
shard: 0,
|
||||||
replica_number: 0,
|
instance_index: 0,
|
||||||
database: String::from("database"),
|
database: String::from("database"),
|
||||||
role: Role::Replica,
|
role: Role::Replica,
|
||||||
username: String::from("username"),
|
username: String::from("username"),
|
||||||
@@ -92,7 +92,7 @@ impl Address {
|
|||||||
|
|
||||||
Role::Replica => format!(
|
Role::Replica => format!(
|
||||||
"{}_shard_{}_replica_{}",
|
"{}_shard_{}_replica_{}",
|
||||||
self.poolname, self.shard, self.replica_number
|
self.poolname, self.shard, self.instance_index
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
113
src/pool.rs
113
src/pool.rs
@@ -6,6 +6,8 @@ use chrono::naive::NaiveDateTime;
|
|||||||
use log::{debug, error, info, warn};
|
use log::{debug, error, info, warn};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use parking_lot::{Mutex, RwLock};
|
use parking_lot::{Mutex, RwLock};
|
||||||
|
use rand::seq::SliceRandom;
|
||||||
|
use rand::thread_rng;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
@@ -118,7 +120,7 @@ impl ConnectionPool {
|
|||||||
host: server.0.clone(),
|
host: server.0.clone(),
|
||||||
port: server.1.to_string(),
|
port: server.1.to_string(),
|
||||||
role: role,
|
role: role,
|
||||||
replica_number,
|
instance_index: replica_number,
|
||||||
shard: shard_idx.parse::<usize>().unwrap(),
|
shard: shard_idx.parse::<usize>().unwrap(),
|
||||||
username: user_info.username.clone(),
|
username: user_info.username.clone(),
|
||||||
poolname: pool_name.clone(),
|
poolname: pool_name.clone(),
|
||||||
@@ -201,16 +203,9 @@ impl ConnectionPool {
|
|||||||
/// the pooler starts up.
|
/// the pooler starts up.
|
||||||
async fn validate(&mut self) -> Result<(), Error> {
|
async fn validate(&mut self) -> Result<(), Error> {
|
||||||
let mut server_infos = Vec::new();
|
let mut server_infos = Vec::new();
|
||||||
let stats = self.stats.clone();
|
|
||||||
|
|
||||||
for shard in 0..self.shards() {
|
for shard in 0..self.shards() {
|
||||||
let mut round_robin = 0;
|
for index in 0..self.servers(shard) {
|
||||||
|
let connection = match self.databases[shard][index].get().await {
|
||||||
for _ in 0..self.servers(shard) {
|
|
||||||
// To keep stats consistent.
|
|
||||||
let fake_process_id = 0;
|
|
||||||
|
|
||||||
let connection = match self.get(shard, None, fake_process_id, round_robin).await {
|
|
||||||
Ok(conn) => conn,
|
Ok(conn) => conn,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
error!("Shard {} down or misconfigured: {:?}", shard, err);
|
error!("Shard {} down or misconfigured: {:?}", shard, err);
|
||||||
@@ -218,25 +213,20 @@ impl ConnectionPool {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let proxy = connection.0;
|
let proxy = connection;
|
||||||
let address = connection.1;
|
|
||||||
let server = &*proxy;
|
let server = &*proxy;
|
||||||
let server_info = server.server_info();
|
let server_info = server.server_info();
|
||||||
|
|
||||||
stats.client_disconnecting(fake_process_id, address.id);
|
|
||||||
|
|
||||||
if server_infos.len() > 0 {
|
if server_infos.len() > 0 {
|
||||||
// Compare against the last server checked.
|
// Compare against the last server checked.
|
||||||
if server_info != server_infos[server_infos.len() - 1] {
|
if server_info != server_infos[server_infos.len() - 1] {
|
||||||
warn!(
|
warn!(
|
||||||
"{:?} has different server configuration than the last server",
|
"{:?} has different server configuration than the last server",
|
||||||
address
|
proxy.address()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
server_infos.push(server_info);
|
server_infos.push(server_info);
|
||||||
round_robin += 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -254,58 +244,31 @@ impl ConnectionPool {
|
|||||||
/// Get a connection from the pool.
|
/// Get a connection from the pool.
|
||||||
pub async fn get(
|
pub async fn get(
|
||||||
&self,
|
&self,
|
||||||
shard: usize, // shard number
|
shard: usize, // shard number
|
||||||
role: Option<Role>, // primary or replica
|
role: Option<Role>, // primary or replica
|
||||||
process_id: i32, // client id
|
process_id: i32, // client id
|
||||||
mut round_robin: usize, // round robin offset
|
|
||||||
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
|
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
let addresses = &self.addresses[shard];
|
let mut candidates: Vec<Address> = self.addresses[shard]
|
||||||
|
.clone()
|
||||||
|
.into_iter()
|
||||||
|
.filter(|address| address.role == role)
|
||||||
|
.collect();
|
||||||
|
|
||||||
let mut allowed_attempts = match role {
|
// Random load balancing
|
||||||
// Primary-specific queries get one attempt, if the primary is down,
|
candidates.shuffle(&mut thread_rng());
|
||||||
// nothing we should do about it I think. It's dangerous to retry
|
|
||||||
// write queries.
|
|
||||||
Some(Role::Primary) => 1,
|
|
||||||
|
|
||||||
// Replicas get to try as many times as there are replicas
|
|
||||||
// and connections in the pool.
|
|
||||||
_ => addresses.len(),
|
|
||||||
};
|
|
||||||
|
|
||||||
debug!("Allowed attempts for {:?}: {}", role, allowed_attempts);
|
|
||||||
|
|
||||||
let exists = match role {
|
|
||||||
Some(role) => addresses.iter().filter(|addr| addr.role == role).count() > 0,
|
|
||||||
None => true,
|
|
||||||
};
|
|
||||||
|
|
||||||
if !exists {
|
|
||||||
error!("Requested role {:?}, but none are configured", role);
|
|
||||||
return Err(Error::BadConfig);
|
|
||||||
}
|
|
||||||
|
|
||||||
let healthcheck_timeout = get_config().general.healthcheck_timeout;
|
let healthcheck_timeout = get_config().general.healthcheck_timeout;
|
||||||
let healthcheck_delay = get_config().general.healthcheck_delay as u128;
|
let healthcheck_delay = get_config().general.healthcheck_delay as u128;
|
||||||
|
|
||||||
while allowed_attempts > 0 {
|
while !candidates.is_empty() {
|
||||||
// Round-robin replicas.
|
// Get the next candidate
|
||||||
round_robin += 1;
|
let address = match candidates.pop() {
|
||||||
|
Some(address) => address,
|
||||||
|
None => break,
|
||||||
|
};
|
||||||
|
|
||||||
let index = round_robin % addresses.len();
|
if self.is_banned(&address, address.shard, role) {
|
||||||
let address = &addresses[index];
|
|
||||||
|
|
||||||
// Make sure you're getting a primary or a replica
|
|
||||||
// as per request. If no specific role is requested, the first
|
|
||||||
// available will be chosen.
|
|
||||||
if address.role != role {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
allowed_attempts -= 1;
|
|
||||||
|
|
||||||
// Don't attempt to connect to banned servers.
|
|
||||||
if self.is_banned(address, shard, role) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -313,11 +276,14 @@ impl ConnectionPool {
|
|||||||
self.stats.client_waiting(process_id, address.id);
|
self.stats.client_waiting(process_id, address.id);
|
||||||
|
|
||||||
// Check if we can connect
|
// Check if we can connect
|
||||||
let mut conn = match self.databases[shard][index].get().await {
|
let mut conn = match self.databases[address.shard][address.instance_index]
|
||||||
|
.get()
|
||||||
|
.await
|
||||||
|
{
|
||||||
Ok(conn) => conn,
|
Ok(conn) => conn,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
error!("Banning replica {}, error: {:?}", index, err);
|
error!("Banning instance {:?}, error: {:?}", address, err);
|
||||||
self.ban(address, shard, process_id);
|
self.ban(&address, address.shard, process_id);
|
||||||
self.stats.client_disconnecting(process_id, address.id);
|
self.stats.client_disconnecting(process_id, address.id);
|
||||||
self.stats
|
self.stats
|
||||||
.checkout_time(now.elapsed().as_micros(), process_id, address.id);
|
.checkout_time(now.elapsed().as_micros(), process_id, address.id);
|
||||||
@@ -359,29 +325,34 @@ impl ConnectionPool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Health check failed.
|
// Health check failed.
|
||||||
Err(_) => {
|
Err(err) => {
|
||||||
error!("Banning replica {} because of failed health check", index);
|
error!(
|
||||||
|
"Banning instance {:?} because of failed health check, {:?}",
|
||||||
|
address, err
|
||||||
|
);
|
||||||
|
|
||||||
// Don't leave a bad connection in the pool.
|
// Don't leave a bad connection in the pool.
|
||||||
server.mark_bad();
|
server.mark_bad();
|
||||||
|
|
||||||
self.ban(address, shard, process_id);
|
self.ban(&address, address.shard, process_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
// Health check timed out.
|
// Health check timed out.
|
||||||
Err(_) => {
|
Err(err) => {
|
||||||
error!("Banning replica {} because of health check timeout", index);
|
error!(
|
||||||
|
"Banning instance {:?} because of health check timeout, {:?}",
|
||||||
|
address, err
|
||||||
|
);
|
||||||
// Don't leave a bad connection in the pool.
|
// Don't leave a bad connection in the pool.
|
||||||
server.mark_bad();
|
server.mark_bad();
|
||||||
|
|
||||||
self.ban(address, shard, process_id);
|
self.ban(&address, address.shard, process_id);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Err(Error::AllServersDown);
|
return Err(Error::AllServersDown);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user