mirror of
https://github.com/postgresml/pgcat.git
synced 2026-03-26 10:26:30 +00:00
Allow configuring routing decision when no shard is selected (#578)
The TL;DR for the change is that we allow QueryRouter to set the active shard to None. This signals to the Pool::get method that we have no shard selected. The get method follows a no_shard_specified_behavior config to know how to route the query. Original PR description: The ruby-pg library makes a startup query to SET client_encoding to ... if the Encoding.default_internal value is set (Code). This query is troublesome because we cannot possibly attach a routing comment to it. PgCat, by default, will route that query to the default shard. Everything is fine until shard 0 has issues; then all clients will be attempting to send this query to shard 0, which increases the connection latency significantly for all clients, even those not interested in shard 0. This PR introduces no_shard_specified_behavior, which defines the behavior in case we have routing-by-comment enabled but we get a query without a comment. The allowed behaviors are: random: picks a shard at random; random_healthy: picks a shard at random, favoring shards with the least number of recent connection/checkout errors; shard_<number>: e.g. shard_0, shard_4, etc., picks a specific shard every time. In order to achieve this, this PR introduces an error_count on the Address object that tracks the number of errors since the last checkout and uses that metric to sort shards by error count before making a routing decision. I didn't want to use address stats, to avoid introducing a routing dependency on internal stats (we might do that in the future, but I prefer to avoid this for the time being). I also made changes to the test environment to replace Ruby's TOML reader library. It appears to be abandoned and does not support mixed arrays (which we use in the config TOML), and it also does not play nicely with single-quoted regular expressions. I opted for using yj, which is a CLI tool that can convert from TOML to JSON and back, so I refactored the tests to use that tool.
This commit is contained in:
committed by
GitHub
parent
33db0dffa8
commit
0b01d70b55
83
src/pool.rs
83
src/pool.rs
@@ -10,6 +10,7 @@ use rand::thread_rng;
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::{Display, Formatter};
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
@@ -18,7 +19,7 @@ use std::time::Instant;
|
||||
use tokio::sync::Notify;
|
||||
|
||||
use crate::config::{
|
||||
get_config, Address, General, LoadBalancingMode, Plugins, PoolMode, Role, User,
|
||||
get_config, Address, DefaultShard, General, LoadBalancingMode, Plugins, PoolMode, Role, User,
|
||||
};
|
||||
use crate::errors::Error;
|
||||
|
||||
@@ -140,6 +141,9 @@ pub struct PoolSettings {
|
||||
// Regex for searching for the shard id in SQL statements
|
||||
pub shard_id_regex: Option<Regex>,
|
||||
|
||||
// What to do when no shard is selected in a sharded system
|
||||
pub default_shard: DefaultShard,
|
||||
|
||||
// Limit how much of each query is searched for a potential shard regex match
|
||||
pub regex_search_limit: usize,
|
||||
|
||||
@@ -173,6 +177,7 @@ impl Default for PoolSettings {
|
||||
sharding_key_regex: None,
|
||||
shard_id_regex: None,
|
||||
regex_search_limit: 1000,
|
||||
default_shard: DefaultShard::Shard(0),
|
||||
auth_query: None,
|
||||
auth_query_user: None,
|
||||
auth_query_password: None,
|
||||
@@ -299,6 +304,7 @@ impl ConnectionPool {
|
||||
pool_name: pool_name.clone(),
|
||||
mirrors: vec![],
|
||||
stats: Arc::new(AddressStats::default()),
|
||||
error_count: Arc::new(AtomicU64::new(0)),
|
||||
});
|
||||
address_id += 1;
|
||||
}
|
||||
@@ -317,6 +323,7 @@ impl ConnectionPool {
|
||||
pool_name: pool_name.clone(),
|
||||
mirrors: mirror_addresses,
|
||||
stats: Arc::new(AddressStats::default()),
|
||||
error_count: Arc::new(AtomicU64::new(0)),
|
||||
};
|
||||
|
||||
address_id += 1;
|
||||
@@ -482,6 +489,7 @@ impl ConnectionPool {
|
||||
.clone()
|
||||
.map(|regex| Regex::new(regex.as_str()).unwrap()),
|
||||
regex_search_limit: pool_config.regex_search_limit.unwrap_or(1000),
|
||||
default_shard: pool_config.default_shard.clone(),
|
||||
auth_query: pool_config.auth_query.clone(),
|
||||
auth_query_user: pool_config.auth_query_user.clone(),
|
||||
auth_query_password: pool_config.auth_query_password.clone(),
|
||||
@@ -603,19 +611,51 @@ impl ConnectionPool {
|
||||
/// Get a connection from the pool.
|
||||
pub async fn get(
|
||||
&self,
|
||||
shard: usize, // shard number
|
||||
shard: Option<usize>, // shard number
|
||||
role: Option<Role>, // primary or replica
|
||||
client_stats: &ClientStats, // client id
|
||||
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
|
||||
let mut candidates: Vec<&Address> = self.addresses[shard]
|
||||
.iter()
|
||||
.filter(|address| address.role == role)
|
||||
.collect();
|
||||
let effective_shard_id = if self.shards() == 1 {
|
||||
// The base, unsharded case
|
||||
Some(0)
|
||||
} else {
|
||||
if !self.valid_shard_id(shard) {
|
||||
// `None` is always a valid shard ID, so an invalid shard must be `Some` and it is safe to unwrap here
|
||||
return Err(Error::InvalidShardId(shard.unwrap()));
|
||||
}
|
||||
shard
|
||||
};
|
||||
|
||||
// We shuffle even if least_outstanding_queries is used to avoid imbalance
|
||||
// in cases where all candidates have more or less the same number of outstanding
|
||||
// queries
|
||||
let mut candidates = self
|
||||
.addresses
|
||||
.iter()
|
||||
.flatten()
|
||||
.filter(|address| address.role == role)
|
||||
.collect::<Vec<&Address>>();
|
||||
|
||||
// We start with a shuffled list of addresses even if we end up re-sorting;
|
||||
// this is meant to avoid hitting instance 0 every time if the sorting metric
|
||||
// ends up being the same for all instances
|
||||
candidates.shuffle(&mut thread_rng());
|
||||
|
||||
match effective_shard_id {
|
||||
Some(shard_id) => candidates.retain(|address| address.shard == shard_id),
|
||||
None => match self.settings.default_shard {
|
||||
DefaultShard::Shard(shard_id) => {
|
||||
candidates.retain(|address| address.shard == shard_id)
|
||||
}
|
||||
DefaultShard::Random => (),
|
||||
DefaultShard::RandomHealthy => {
|
||||
candidates.sort_by(|a, b| {
|
||||
b.error_count
|
||||
.load(Ordering::Relaxed)
|
||||
.partial_cmp(&a.error_count.load(Ordering::Relaxed))
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
if self.settings.load_balancing_mode == LoadBalancingMode::LeastOutstandingConnections {
|
||||
candidates.sort_by(|a, b| {
|
||||
self.busy_connection_count(b)
|
||||
@@ -651,7 +691,10 @@ impl ConnectionPool {
|
||||
.get()
|
||||
.await
|
||||
{
|
||||
Ok(conn) => conn,
|
||||
Ok(conn) => {
|
||||
address.reset_error_count();
|
||||
conn
|
||||
}
|
||||
Err(err) => {
|
||||
error!(
|
||||
"Connection checkout error for instance {:?}, error: {:?}",
|
||||
@@ -766,6 +809,18 @@ impl ConnectionPool {
|
||||
/// traffic for any new transactions. Existing transactions on that replica
|
||||
/// will finish successfully or error out to the clients.
|
||||
pub fn ban(&self, address: &Address, reason: BanReason, client_info: Option<&ClientStats>) {
|
||||
// Count the number of errors since the last successful checkout
|
||||
// This is used to determine if the shard is down
|
||||
match reason {
|
||||
BanReason::FailedHealthCheck
|
||||
| BanReason::FailedCheckout
|
||||
| BanReason::MessageSendFailed
|
||||
| BanReason::MessageReceiveFailed => {
|
||||
address.increment_error_count();
|
||||
}
|
||||
_ => (),
|
||||
};
|
||||
|
||||
// Primary can never be banned
|
||||
if address.role == Role::Primary {
|
||||
return;
|
||||
@@ -920,6 +975,7 @@ impl ConnectionPool {
|
||||
self.original_server_parameters.read().clone()
|
||||
}
|
||||
|
||||
/// Get the number of checked-out connections for an address
|
||||
fn busy_connection_count(&self, address: &Address) -> u32 {
|
||||
let state = self.pool_state(address.shard, address.address_index);
|
||||
let idle = state.idle_connections;
|
||||
@@ -933,6 +989,13 @@ impl ConnectionPool {
|
||||
debug!("{:?} has {:?} busy connections", address, busy);
|
||||
return busy;
|
||||
}
|
||||
|
||||
/// Reports whether `shard` is an acceptable shard selection for this pool.
///
/// `None` (no shard selected) is always accepted; `Some(id)` is accepted
/// only when `id` is strictly less than `self.shards()`.
fn valid_shard_id(&self, shard: Option<usize>) -> bool {
|
||||
match shard {
|
||||
None => true,
|
||||
Some(shard) => shard < self.shards(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for the bb8 connection pool.
|
||||
|
||||
Reference in New Issue
Block a user