Merge pull request #5 from levkk/levkk-replica-primary

#4 Primary/replica selection
This commit is contained in:
Lev Kokotov
2022-02-10 10:40:53 -08:00
committed by GitHub
11 changed files with 427 additions and 45 deletions

View File

@@ -34,8 +34,9 @@ See [sharding README](./tests/sharding/README.md) for sharding logic testing.
3. `COPY` protocol support.
4. Query cancellation.
5. Round-robin load balancing of replicas.
6. Banlist & failover
6. Banlist & failover.
7. Sharding!
8. Explicit query routing to primary or replicas.
### Session mode
Each client owns its own server for the duration of the session. Commands like `SET` are allowed.
@@ -56,7 +57,8 @@ this might be relevant given that this is a transactional pooler but if you're n
### Round-robin load balancing
This is the novel part. PgBouncer doesn't support it and suggests we use DNS or a TCP proxy instead.
We prefer to have everything as part of one package; arguably, it's easier to understand and optimize.
This pooler will round-robin between multiple replicas keeping load reasonably even.
This pooler will round-robin between multiple replicas keeping load reasonably even. If the primary is in
the pool as well, it'll be treated as a replica for read-only queries.
### Banlist & failover
This is where it gets even more interesting. If we fail to connect to one of the replicas or it fails a health check,
@@ -82,6 +84,19 @@ SET SHARDING KEY TO '1234';
This sharding key will be hashed and the pooler will select a shard to use for the next transaction. If the pooler is in session mode, this sharding key has to be set as the first query on startup & cannot be changed until the client re-connects.
### Explicit read/write query routing
If you want to have the primary and replicas in the same pooler, you'd probably want to
route queries explicitly to the primary or replicas, depending on whether they are reads or writes (e.g. `SELECT`s or `INSERT`/`UPDATE`, etc). To help with this, we introduce some more custom syntax:
```sql
SET SERVER ROLE TO 'primary';
SET SERVER ROLE TO 'replica';
```
After executing this, the next transaction will be routed to the primary or replica respectively. By default, all queries will be load-balanced between all servers, so if the client wants to write or talk to the primary, they have to explicitly select it using the syntax above.
## Missing

View File

@@ -43,26 +43,29 @@ password = "sharding_user"
# Shard 0
[shards.0]
# [ host, port ]
# [ host, port, role ]
servers = [
[ "127.0.0.1", 5432 ],
[ "localhost", 5432 ],
[ "127.0.0.1", 5432, "primary" ],
[ "localhost", 5432, "replica" ],
# [ "127.0.1.1", 5432, "replica" ],
]
# Database name (e.g. "postgres")
database = "shard0"
[shards.1]
# [ host, port ]
# [ host, port, role ]
servers = [
[ "127.0.0.1", 5432 ],
[ "localhost", 5432 ],
[ "127.0.0.1", 5432, "primary" ],
[ "localhost", 5432, "replica" ],
# [ "127.0.1.1", 5432, "replica" ],
]
database = "shard1"
[shards.2]
# [ host, port ]
# [ host, port, role ]
servers = [
[ "127.0.0.1", 5432 ],
[ "localhost", 5432 ],
[ "127.0.0.1", 5432, "primary" ],
[ "localhost", 5432, "replica" ],
# [ "127.0.1.1", 5432, "replica" ],
]
database = "shard2"
database = "shard2"

View File

@@ -7,6 +7,7 @@ use tokio::io::{AsyncReadExt, BufReader};
use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf};
use tokio::net::TcpStream;
use crate::config::Role;
use crate::errors::Error;
use crate::messages::*;
use crate::pool::{ClientServerMap, ConnectionPool};
@@ -14,6 +15,7 @@ use crate::server::Server;
use crate::sharding::Sharder;
const SHARDING_REGEX: &str = r"SET SHARDING KEY TO '[0-9]+';";
const ROLE_REGEX: &str = r"SET SERVER ROLE TO '(PRIMARY|REPLICA)';";
/// The client state. One of these is created per client.
pub struct Client {
@@ -45,6 +47,9 @@ pub struct Client {
// sharding regex
sharding_regex: Regex,
// role detection regex
role_regex: Regex,
}
impl Client {
@@ -57,6 +62,7 @@ impl Client {
transaction_mode: bool,
) -> Result<Client, Error> {
let sharding_regex = Regex::new(SHARDING_REGEX).unwrap();
let role_regex = Regex::new(ROLE_REGEX).unwrap();
loop {
// Could be StartupMessage or SSLRequest
@@ -114,6 +120,7 @@ impl Client {
secret_key: secret_key,
client_server_map: client_server_map,
sharding_regex: sharding_regex,
role_regex: role_regex,
});
}
@@ -134,6 +141,7 @@ impl Client {
secret_key: secret_key,
client_server_map: client_server_map,
sharding_regex: sharding_regex,
role_regex: role_regex,
});
}
@@ -145,7 +153,7 @@ impl Client {
}
/// Client loop. We handle all messages between the client and the database here.
pub async fn handle(&mut self, pool: ConnectionPool) -> Result<(), Error> {
pub async fn handle(&mut self, mut pool: ConnectionPool) -> Result<(), Error> {
// Special: cancelling existing running query
if self.cancel_mode {
let (process_id, secret_key, address, port) = {
@@ -172,6 +180,9 @@ impl Client {
// - if in transaction mode, this lives for the duration of one transaction.
let mut shard: Option<usize> = None;
// Active database role we want to talk to, e.g. primary or replica.
let mut role: Option<Role> = None;
loop {
// Read a complete message from the client, which normally would be
// either a `Q` (query) or `P` (prepare, extended protocol).
@@ -182,18 +193,36 @@ impl Client {
// Parse for special select shard command.
// SET SHARDING KEY TO 'bigint';
match self.select_shard(message.clone(), pool.shards()).await {
match self.select_shard(message.clone(), pool.shards()) {
Some(s) => {
set_sharding_key(&mut self.write).await?;
custom_protocol_response_ok(&mut self.write, "SET SHARDING KEY").await?;
shard = Some(s);
continue;
}
None => (),
};
// Parse for special server role selection command.
//
match self.select_role(message.clone()) {
Some(r) => {
custom_protocol_response_ok(&mut self.write, "SET SERVER ROLE").await?;
role = Some(r);
continue;
}
None => (),
};
// Grab a server from the pool.
// None = any shard
let connection = pool.get(shard).await.unwrap();
let connection = match pool.get(shard, role).await {
Ok(conn) => conn,
Err(err) => {
println!(">> Could not get connection from pool: {:?}", err);
return Err(err);
}
};
let mut proxy = connection.0;
let _address = connection.1;
let server = &mut *proxy;
@@ -232,10 +261,13 @@ impl Client {
match code {
'Q' => {
// TODO: implement retries here for read-only transactions.
server.send(original).await?;
loop {
// TODO: implement retries here for read-only transactions.
let response = server.recv().await?;
match write_all_half(&mut self.write, response).await {
Ok(_) => (),
Err(err) => {
@@ -252,6 +284,7 @@ impl Client {
// Release server
if !server.in_transaction() && self.transaction_mode {
shard = None;
role = None;
break;
}
}
@@ -290,10 +323,13 @@ impl Client {
'S' => {
// Extended protocol, client requests sync
self.buffer.put(&original[..]);
// TODO: retries for read-only transactions
server.send(self.buffer.clone()).await?;
self.buffer.clear();
loop {
// TODO: retries for read-only transactions
let response = server.recv().await?;
match write_all_half(&mut self.write, response).await {
Ok(_) => (),
@@ -311,6 +347,7 @@ impl Client {
// Release server
if !server.in_transaction() && self.transaction_mode {
shard = None;
role = None;
break;
}
}
@@ -338,6 +375,7 @@ impl Client {
if !server.in_transaction() && self.transaction_mode {
println!("Releasing after copy done");
shard = None;
role = None;
break;
}
}
@@ -361,7 +399,7 @@ impl Client {
/// Determine if the query is part of our special syntax, extract
/// the shard key, and return the shard to query based on Postgres'
/// PARTITION BY HASH function.
async fn select_shard(&mut self, mut buf: BytesMut, shards: usize) -> Option<usize> {
fn select_shard(&mut self, mut buf: BytesMut, shards: usize) -> Option<usize> {
let code = buf.get_u8() as char;
// Only supporting simpe protocol here, so
@@ -390,4 +428,31 @@ impl Client {
None
}
}
/// Parse the client's message for our custom
/// `SET SERVER ROLE TO 'primary'|'replica';` command and, if it matches,
/// return the requested server role for the next transaction.
/// Returns `None` for any message that is not this command.
fn select_role(&mut self, mut buf: BytesMut) -> Option<Role> {
// First byte of the message is the protocol code.
let code = buf.get_u8() as char;
// Same story as select_shard() above: only the simple query
// protocol ('Q') is supported for this custom syntax.
match code {
'Q' => (),
_ => return None,
};
// The length field includes its own 4 bytes; the query string is
// NUL-terminated, hence the additional -1 when slicing it out.
let len = buf.get_i32();
let query = String::from_utf8_lossy(&buf[..len as usize - 4 - 1]).to_ascii_uppercase();
// Copy / paste from above. If we get one more of these use cases,
// it'll be time to abstract :).
if self.role_regex.is_match(&query) {
// Extract the text between the single quotes, e.g. PRIMARY or REPLICA
// (the query was uppercased above).
let role = query.split("'").collect::<Vec<&str>>()[1];
match role {
"PRIMARY" => Some(Role::Primary),
"REPLICA" => Some(Role::Replica),
// Unreachable in practice: the regex only admits the two roles above.
_ => return None,
}
} else {
None
}
}
}

View File

@@ -3,14 +3,21 @@ use tokio::fs::File;
use tokio::io::AsyncReadExt;
use toml;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use crate::errors::Error;
/// The role a database server plays in the cluster:
/// the writable primary or a read-only replica.
#[derive(Clone, PartialEq, Deserialize, Hash, std::cmp::Eq, Debug, Copy)]
pub enum Role {
Primary,
Replica,
}
/// Identifies a server in the pool by host, port, and configured role.
/// Used as a unique key (e.g. in the banlist), hence Hash/Eq.
#[derive(Clone, PartialEq, Hash, std::cmp::Eq, Debug)]
pub struct Address {
pub host: String,
pub port: String,
pub role: Role,
}
#[derive(Clone, PartialEq, Hash, std::cmp::Eq, Deserialize, Debug)]
@@ -32,7 +39,7 @@ pub struct General {
#[derive(Deserialize, Debug, Clone)]
pub struct Shard {
pub servers: Vec<(String, u16)>,
pub servers: Vec<(String, u16, String)>,
pub database: String,
}
@@ -70,6 +77,47 @@ pub async fn parse(path: &str) -> Result<Config, Error> {
}
};
// Quick config sanity check.
for shard in &config.shards {
// We use addresses as unique identifiers,
// let's make sure they are unique in the config as well.
let mut dup_check = HashSet::new();
let mut primary_count = 0;
for server in &shard.1.servers {
dup_check.insert(server);
// Check that we define only zero or one primary.
match server.2.as_ref() {
"primary" => primary_count += 1,
_ => (),
};
// Check role spelling.
match server.2.as_ref() {
"primary" => (),
"replica" => (),
_ => {
println!(
"> Shard {} server role must be either 'primary' or 'replica', got: '{}'",
shard.0, server.2
);
return Err(Error::BadConfig);
}
};
}
if primary_count > 1 {
println!("> Shard {} has more than on primary configured.", &shard.0);
return Err(Error::BadConfig);
}
if dup_check.len() != shard.1.servers.len() {
println!("> Shard {} contains duplicate server configs.", &shard.0);
return Err(Error::BadConfig);
}
}
Ok(config)
}
@@ -83,5 +131,6 @@ mod test {
assert_eq!(config.general.pool_size, 15);
assert_eq!(config.shards.len(), 3);
assert_eq!(config.shards["1"].servers[0].0, "127.0.0.1");
assert_eq!(config.shards["0"].servers[0].2, "primary");
}
}

View File

@@ -8,4 +8,5 @@ pub enum Error {
// ServerTimeout,
// DirtyServer,
BadConfig,
AllServersDown,
}

View File

@@ -73,6 +73,7 @@ async fn main() {
"> Healthcheck timeout: {}ms",
config.general.healthcheck_timeout
);
println!("> Connection timeout: {}ms", config.general.connect_timeout);
let pool = ConnectionPool::from_config(config.clone(), client_server_map.clone()).await;
let transaction_mode = config.general.pool_mode == "transaction";

View File

@@ -141,12 +141,16 @@ pub async fn md5_password(
Ok(write_all(stream, message).await?)
}
/// Implements a response to our custom `SET SHARDING KEY` command.
/// Implements a response to our custom `SET SHARDING KEY`
/// and `SET SERVER ROLE` commands.
/// This tells the client we're ready for the next query.
pub async fn set_sharding_key(stream: &mut OwnedWriteHalf) -> Result<(), Error> {
pub async fn custom_protocol_response_ok(
stream: &mut OwnedWriteHalf,
message: &str,
) -> Result<(), Error> {
let mut res = BytesMut::with_capacity(25);
let set_complete = BytesMut::from(&"SET SHARDING KEY\0"[..]);
let set_complete = BytesMut::from(&format!("{}\0", message)[..]);
let len = (set_complete.len() + 4) as i32;
// CommandComplete

View File

@@ -3,29 +3,31 @@ use async_trait::async_trait;
use bb8::{ManageConnection, Pool, PooledConnection};
use chrono::naive::NaiveDateTime;
use crate::config::{Address, Config, User};
use crate::config::{Address, Config, Role, User};
use crate::errors::Error;
use crate::server::Server;
use std::collections::HashMap;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex,
// atomic::{AtomicUsize, Ordering},
Arc,
Mutex,
};
// Banlist: bad servers go in here.
pub type BanList = Arc<Mutex<Vec<HashMap<Address, NaiveDateTime>>>>;
pub type Counter = Arc<AtomicUsize>;
// pub type Counter = Arc<AtomicUsize>;
pub type ClientServerMap = Arc<Mutex<HashMap<(i32, i32), (i32, i32, String, String)>>>;
#[derive(Clone, Debug)]
pub struct ConnectionPool {
databases: Vec<Vec<Pool<ServerPool>>>,
addresses: Vec<Vec<Address>>,
round_robin: Counter,
round_robin: usize,
banlist: BanList,
healthcheck_timeout: u64,
ban_time: i64,
pool_size: u32,
}
impl ConnectionPool {
@@ -48,9 +50,19 @@ impl ConnectionPool {
let mut replica_addresses = Vec::new();
for server in &shard.servers {
let role = match server.2.as_ref() {
"primary" => Role::Primary,
"replica" => Role::Replica,
_ => {
println!("> Config error: server role can be 'primary' or 'replica', have: '{}'. Defaulting to 'replica'.", server.2);
Role::Replica
}
};
let address = Address {
host: server.0.clone(),
port: server.1.to_string(),
role: role,
};
let manager = ServerPool::new(
@@ -79,20 +91,25 @@ impl ConnectionPool {
banlist.push(HashMap::new());
}
assert_eq!(shards.len(), addresses.len());
let address_len = addresses.len();
ConnectionPool {
databases: shards,
addresses: addresses,
round_robin: Arc::new(AtomicUsize::new(0)),
round_robin: rand::random::<usize>() % address_len, // Start at a random replica
banlist: Arc::new(Mutex::new(banlist)),
healthcheck_timeout: config.general.healthcheck_timeout,
ban_time: config.general.ban_time,
pool_size: config.general.pool_size,
}
}
/// Get a connection from the pool.
pub async fn get(
&self,
&mut self,
shard: Option<usize>,
role: Option<Role>,
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
// Set this to false to gain ~3-4% speed.
let with_health_check = true;
@@ -102,28 +119,82 @@ impl ConnectionPool {
None => 0, // TODO: pick a shard at random
};
loop {
let index =
self.round_robin.fetch_add(1, Ordering::SeqCst) % self.databases[shard].len();
let address = self.addresses[shard][index].clone();
let addresses = &self.addresses[shard];
if self.is_banned(&address, shard) {
// Make sure if a specific role is requested, it's available in the pool.
match role {
Some(role) => {
let role_count = addresses
.iter()
.filter(|&db| db.role == Role::Primary)
.count();
if role_count == 0 {
println!(
">> Error: Role '{:?}' requested, but none are configured.",
role
);
return Err(Error::AllServersDown);
}
}
// Any role should be present.
_ => (),
};
let mut allowed_attempts = match role {
// Primary-specific queries get one attempt, if the primary is down,
// nothing we should do about it I think. It's dangerous to retry
// write queries.
Some(Role::Primary) => 1,
// Replicas get to try as many times as there are replicas
// and connections in the pool.
_ => self.databases[shard].len() * self.pool_size as usize,
};
while allowed_attempts > 0 {
// Round-robin each client's queries.
// If a client only sends one query and then disconnects, it doesn't matter
// which replica it'll go to.
self.round_robin += 1;
let index = self.round_robin % addresses.len();
let address = &addresses[index];
// Make sure you're getting a primary or a replica
// as per request.
match role {
Some(role) => {
// If the client wants a specific role,
// we'll do our best to pick it, but if we only
// have one server in the cluster, it's probably only a primary
// (or only a replica), so the client will just get what we have.
if address.role != role && addresses.len() > 1 {
continue;
}
}
None => (),
};
if self.is_banned(address, shard, role) {
continue;
}
allowed_attempts -= 1;
// Check if we can connect
// TODO: implement query wait timeout, i.e. time to get a conn from the pool
let mut conn = match self.databases[shard][index].get().await {
Ok(conn) => conn,
Err(err) => {
println!(">> Banning replica {}, error: {:?}", index, err);
self.ban(&address, shard);
self.ban(address, shard);
continue;
}
};
if !with_health_check {
return Ok((conn, address));
return Ok((conn, address.clone()));
}
// // Check if this server is alive with a health check
@@ -137,13 +208,16 @@ impl ConnectionPool {
{
// Check if health check succeeded
Ok(res) => match res {
Ok(_) => return Ok((conn, address)),
Ok(_) => return Ok((conn, address.clone())),
Err(_) => {
println!(
">> Banning replica {} because of failed health check",
index
);
self.ban(&address, shard);
// Don't leave a bad connection in the pool.
server.mark_bad();
self.ban(address, shard);
continue;
}
},
@@ -153,11 +227,16 @@ impl ConnectionPool {
">> Banning replica {} because of health check timeout",
index
);
self.ban(&address, shard);
// Don't leave a bad connection in the pool.
server.mark_bad();
self.ban(address, shard);
continue;
}
}
}
return Err(Error::AllServersDown);
}
/// Ban an address (i.e. replica). It no longer will serve
@@ -179,7 +258,14 @@ impl ConnectionPool {
/// Check if a replica can serve traffic. If all replicas are banned,
/// we unban all of them. Better to try then not to.
pub fn is_banned(&self, address: &Address, shard: usize) -> bool {
pub fn is_banned(&self, address: &Address, shard: usize, role: Option<Role>) -> bool {
// If primary is requested explicitely, it can never be banned.
if Some(Role::Primary) == role {
return false;
}
// If you're not asking for the primary,
// all databases are treated as replicas.
let mut guard = self.banlist.lock().unwrap();
// Everything is banned = nothing is banned.
@@ -251,6 +337,7 @@ impl ManageConnection for ServerPool {
&self.user.password,
&self.database,
self.client_server_map.clone(),
self.address.role,
)
.await
}

View File

@@ -8,7 +8,7 @@ use tokio::io::{AsyncReadExt, BufReader};
use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf};
use tokio::net::TcpStream;
use crate::config::Address;
use crate::config::{Address, Role};
use crate::errors::Error;
use crate::messages::*;
use crate::ClientServerMap;
@@ -48,6 +48,8 @@ pub struct Server {
// Mapping of clients and servers used for query cancellation.
client_server_map: ClientServerMap,
role: Role,
}
impl Server {
@@ -60,6 +62,7 @@ impl Server {
password: &str,
database: &str,
client_server_map: ClientServerMap,
role: Role,
) -> Result<Server, Error> {
let mut stream = match TcpStream::connect(&format!("{}:{}", host, port)).await {
Ok(stream) => stream,
@@ -189,6 +192,7 @@ impl Server {
data_available: false,
bad: false,
client_server_map: client_server_map,
role: role,
});
}
@@ -409,6 +413,7 @@ impl Server {
Address {
host: self.host.to_string(),
port: self.port.to_string(),
role: self.role,
}
}
}

View File

@@ -1,12 +1,19 @@
#/bin/bash
set -e
# Setup all the shards.
sudo service postgresql restart
# sudo service postgresql restart
psql -f query_routing_setup.sql
echo "Giving Postgres 5 seconds to start up..."
# sleep 5
# psql -f query_routing_setup.sql
psql -h 127.0.0.1 -p 6432 -f query_routing_test_insert.sql
psql -h 127.0.0.1 -p 6432 -f query_routing_test_select.sql
psql -f query_routing_test_validate.sql
psql -e -h 127.0.0.1 -p 6432 -f query_routing_test_primary_replica.sql
psql -f query_routing_test_validate.sql

View File

@@ -0,0 +1,145 @@
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '1';
INSERT INTO data (id, value) VALUES (1, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '1';
SELECT * FROM data WHERE id = 1;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '2';
INSERT INTO data (id, value) VALUES (2, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '2';
SELECT * FROM data WHERE id = 2;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '3';
INSERT INTO data (id, value) VALUES (3, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '3';
SELECT * FROM data WHERE id = 3;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '4';
INSERT INTO data (id, value) VALUES (4, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '4';
SELECT * FROM data WHERE id = 4;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '5';
INSERT INTO data (id, value) VALUES (5, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '5';
SELECT * FROM data WHERE id = 5;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '6';
INSERT INTO data (id, value) VALUES (6, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '6';
SELECT * FROM data WHERE id = 6;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '7';
INSERT INTO data (id, value) VALUES (7, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '7';
SELECT * FROM data WHERE id = 7;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '8';
INSERT INTO data (id, value) VALUES (8, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '8';
SELECT * FROM data WHERE id = 8;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '9';
INSERT INTO data (id, value) VALUES (9, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '9';
SELECT * FROM data WHERE id = 9;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '10';
INSERT INTO data (id, value) VALUES (10, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '10';
SELECT * FROM data WHERE id = 10;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '11';
INSERT INTO data (id, value) VALUES (11, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '11';
SELECT * FROM data WHERE id = 11;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '12';
INSERT INTO data (id, value) VALUES (12, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '12';
SELECT * FROM data WHERE id = 12;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '13';
INSERT INTO data (id, value) VALUES (13, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '13';
SELECT * FROM data WHERE id = 13;
---
SET SERVER ROLE TO 'primary';
SET SHARDING KEY TO '14';
INSERT INTO data (id, value) VALUES (14, 'value_1');
SET SERVER ROLE TO 'replica';
SET SHARDING KEY TO '14';
SELECT * FROM data WHERE id = 14;
---
SET SERVER ROLE TO 'primary';
SELECT 1;
SET SERVER ROLE TO 'replica';
SELECT 1;