mirror of
https://github.com/postgresml/pgcat.git
synced 2026-03-28 03:06:29 +00:00
Merge pull request #5 from levkk/levkk-replica-primary
#4 Primary/replica selection
This commit is contained in:
19
README.md
19
README.md
@@ -34,8 +34,9 @@ See [sharding README](./tests/sharding/README.md) for sharding logic testing.
|
|||||||
3. `COPY` protocol support.
|
3. `COPY` protocol support.
|
||||||
4. Query cancellation.
|
4. Query cancellation.
|
||||||
5. Round-robin load balancing of replicas.
|
5. Round-robin load balancing of replicas.
|
||||||
6. Banlist & failover
|
6. Banlist & failover.
|
||||||
7. Sharding!
|
7. Sharding!
|
||||||
|
8. Explicit query routing to primary or replicas.
|
||||||
|
|
||||||
### Session mode
|
### Session mode
|
||||||
Each client owns its own server for the duration of the session. Commands like `SET` are allowed.
|
Each client owns its own server for the duration of the session. Commands like `SET` are allowed.
|
||||||
@@ -56,7 +57,8 @@ this might be relevant given than this is a transactional pooler but if you're n
|
|||||||
### Round-robin load balancing
|
### Round-robin load balancing
|
||||||
This is the novel part. PgBouncer doesn't support it and suggests we use DNS or a TCP proxy instead.
|
This is the novel part. PgBouncer doesn't support it and suggests we use DNS or a TCP proxy instead.
|
||||||
We prefer to have everything as part of one package; arguably, it's easier to understand and optimize.
|
We prefer to have everything as part of one package; arguably, it's easier to understand and optimize.
|
||||||
This pooler will round-robin between multiple replicas keeping load reasonably even.
|
This pooler will round-robin between multiple replicas keeping load reasonably even. If the primary is in
|
||||||
|
the pool as well, it'll be treated as a replica for read-only queries.
|
||||||
|
|
||||||
### Banlist & failover
|
### Banlist & failover
|
||||||
This is where it gets even more interesting. If we fail to connect to one of the replicas or it fails a health check,
|
This is where it gets even more interesting. If we fail to connect to one of the replicas or it fails a health check,
|
||||||
@@ -82,6 +84,19 @@ SET SHARDING KEY TO '1234';
|
|||||||
|
|
||||||
This sharding key will be hashed and the pooler will select a shard to use for the next transaction. If the pooler is in session mode, this sharding key has to be set as the first query on startup & cannot be changed until the client re-connects.
|
This sharding key will be hashed and the pooler will select a shard to use for the next transaction. If the pooler is in session mode, this sharding key has to be set as the first query on startup & cannot be changed until the client re-connects.
|
||||||
|
|
||||||
|
### Explicit read/write query routing
|
||||||
|
|
||||||
|
If you want to have the primary and replicas in the same pooler, you'd probably want to
|
||||||
|
route queries explicitly to the primary or replicas, depending on whether they are reads or writes (e.g. `SELECT`s or `INSERT`/`UPDATE`, etc). To help with this, we introduce some more custom syntax:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
```
|
||||||
|
|
||||||
|
After executing this, the next transaction will be routed to the primary or replica respectively. By default, all queries will be load-balanced between all servers, so if the client wants to write or talk to the primary, they have to explicitly select it using the syntax above.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Missing
|
## Missing
|
||||||
|
|
||||||
|
|||||||
21
pgcat.toml
21
pgcat.toml
@@ -43,26 +43,29 @@ password = "sharding_user"
|
|||||||
# Shard 0
|
# Shard 0
|
||||||
[shards.0]
|
[shards.0]
|
||||||
|
|
||||||
# [ host, port ]
|
# [ host, port, role ]
|
||||||
servers = [
|
servers = [
|
||||||
[ "127.0.0.1", 5432 ],
|
[ "127.0.0.1", 5432, "primary" ],
|
||||||
[ "localhost", 5432 ],
|
[ "localhost", 5432, "replica" ],
|
||||||
|
# [ "127.0.1.1", 5432, "replica" ],
|
||||||
]
|
]
|
||||||
# Database name (e.g. "postgres")
|
# Database name (e.g. "postgres")
|
||||||
database = "shard0"
|
database = "shard0"
|
||||||
|
|
||||||
[shards.1]
|
[shards.1]
|
||||||
# [ host, port ]
|
# [ host, port, role ]
|
||||||
servers = [
|
servers = [
|
||||||
[ "127.0.0.1", 5432 ],
|
[ "127.0.0.1", 5432, "primary" ],
|
||||||
[ "localhost", 5432 ],
|
[ "localhost", 5432, "replica" ],
|
||||||
|
# [ "127.0.1.1", 5432, "replica" ],
|
||||||
]
|
]
|
||||||
database = "shard1"
|
database = "shard1"
|
||||||
|
|
||||||
[shards.2]
|
[shards.2]
|
||||||
# [ host, port ]
|
# [ host, port, role ]
|
||||||
servers = [
|
servers = [
|
||||||
[ "127.0.0.1", 5432 ],
|
[ "127.0.0.1", 5432, "primary" ],
|
||||||
[ "localhost", 5432 ],
|
[ "localhost", 5432, "replica" ],
|
||||||
|
# [ "127.0.1.1", 5432, "replica" ],
|
||||||
]
|
]
|
||||||
database = "shard2"
|
database = "shard2"
|
||||||
@@ -7,6 +7,7 @@ use tokio::io::{AsyncReadExt, BufReader};
|
|||||||
use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf};
|
use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf};
|
||||||
use tokio::net::TcpStream;
|
use tokio::net::TcpStream;
|
||||||
|
|
||||||
|
use crate::config::Role;
|
||||||
use crate::errors::Error;
|
use crate::errors::Error;
|
||||||
use crate::messages::*;
|
use crate::messages::*;
|
||||||
use crate::pool::{ClientServerMap, ConnectionPool};
|
use crate::pool::{ClientServerMap, ConnectionPool};
|
||||||
@@ -14,6 +15,7 @@ use crate::server::Server;
|
|||||||
use crate::sharding::Sharder;
|
use crate::sharding::Sharder;
|
||||||
|
|
||||||
const SHARDING_REGEX: &str = r"SET SHARDING KEY TO '[0-9]+';";
|
const SHARDING_REGEX: &str = r"SET SHARDING KEY TO '[0-9]+';";
|
||||||
|
const ROLE_REGEX: &str = r"SET SERVER ROLE TO '(PRIMARY|REPLICA)';";
|
||||||
|
|
||||||
/// The client state. One of these is created per client.
|
/// The client state. One of these is created per client.
|
||||||
pub struct Client {
|
pub struct Client {
|
||||||
@@ -45,6 +47,9 @@ pub struct Client {
|
|||||||
|
|
||||||
// sharding regex
|
// sharding regex
|
||||||
sharding_regex: Regex,
|
sharding_regex: Regex,
|
||||||
|
|
||||||
|
// role detection regex
|
||||||
|
role_regex: Regex,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Client {
|
impl Client {
|
||||||
@@ -57,6 +62,7 @@ impl Client {
|
|||||||
transaction_mode: bool,
|
transaction_mode: bool,
|
||||||
) -> Result<Client, Error> {
|
) -> Result<Client, Error> {
|
||||||
let sharding_regex = Regex::new(SHARDING_REGEX).unwrap();
|
let sharding_regex = Regex::new(SHARDING_REGEX).unwrap();
|
||||||
|
let role_regex = Regex::new(ROLE_REGEX).unwrap();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// Could be StartupMessage or SSLRequest
|
// Could be StartupMessage or SSLRequest
|
||||||
@@ -114,6 +120,7 @@ impl Client {
|
|||||||
secret_key: secret_key,
|
secret_key: secret_key,
|
||||||
client_server_map: client_server_map,
|
client_server_map: client_server_map,
|
||||||
sharding_regex: sharding_regex,
|
sharding_regex: sharding_regex,
|
||||||
|
role_regex: role_regex,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,6 +141,7 @@ impl Client {
|
|||||||
secret_key: secret_key,
|
secret_key: secret_key,
|
||||||
client_server_map: client_server_map,
|
client_server_map: client_server_map,
|
||||||
sharding_regex: sharding_regex,
|
sharding_regex: sharding_regex,
|
||||||
|
role_regex: role_regex,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -145,7 +153,7 @@ impl Client {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Client loop. We handle all messages between the client and the database here.
|
/// Client loop. We handle all messages between the client and the database here.
|
||||||
pub async fn handle(&mut self, pool: ConnectionPool) -> Result<(), Error> {
|
pub async fn handle(&mut self, mut pool: ConnectionPool) -> Result<(), Error> {
|
||||||
// Special: cancelling existing running query
|
// Special: cancelling existing running query
|
||||||
if self.cancel_mode {
|
if self.cancel_mode {
|
||||||
let (process_id, secret_key, address, port) = {
|
let (process_id, secret_key, address, port) = {
|
||||||
@@ -172,6 +180,9 @@ impl Client {
|
|||||||
// - if in transaction mode, this lives for the duration of one transaction.
|
// - if in transaction mode, this lives for the duration of one transaction.
|
||||||
let mut shard: Option<usize> = None;
|
let mut shard: Option<usize> = None;
|
||||||
|
|
||||||
|
// Active database role we want to talk to, e.g. primary or replica.
|
||||||
|
let mut role: Option<Role> = None;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// Read a complete message from the client, which normally would be
|
// Read a complete message from the client, which normally would be
|
||||||
// either a `Q` (query) or `P` (prepare, extended protocol).
|
// either a `Q` (query) or `P` (prepare, extended protocol).
|
||||||
@@ -182,18 +193,36 @@ impl Client {
|
|||||||
|
|
||||||
// Parse for special select shard command.
|
// Parse for special select shard command.
|
||||||
// SET SHARDING KEY TO 'bigint';
|
// SET SHARDING KEY TO 'bigint';
|
||||||
match self.select_shard(message.clone(), pool.shards()).await {
|
match self.select_shard(message.clone(), pool.shards()) {
|
||||||
Some(s) => {
|
Some(s) => {
|
||||||
set_sharding_key(&mut self.write).await?;
|
custom_protocol_response_ok(&mut self.write, "SET SHARDING KEY").await?;
|
||||||
shard = Some(s);
|
shard = Some(s);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
None => (),
|
None => (),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Parse for special server role selection command.
|
||||||
|
//
|
||||||
|
match self.select_role(message.clone()) {
|
||||||
|
Some(r) => {
|
||||||
|
custom_protocol_response_ok(&mut self.write, "SET SERVER ROLE").await?;
|
||||||
|
role = Some(r);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
None => (),
|
||||||
|
};
|
||||||
|
|
||||||
// Grab a server from the pool.
|
// Grab a server from the pool.
|
||||||
// None = any shard
|
// None = any shard
|
||||||
let connection = pool.get(shard).await.unwrap();
|
let connection = match pool.get(shard, role).await {
|
||||||
|
Ok(conn) => conn,
|
||||||
|
Err(err) => {
|
||||||
|
println!(">> Could not get connection from pool: {:?}", err);
|
||||||
|
return Err(err);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let mut proxy = connection.0;
|
let mut proxy = connection.0;
|
||||||
let _address = connection.1;
|
let _address = connection.1;
|
||||||
let server = &mut *proxy;
|
let server = &mut *proxy;
|
||||||
@@ -232,10 +261,13 @@ impl Client {
|
|||||||
|
|
||||||
match code {
|
match code {
|
||||||
'Q' => {
|
'Q' => {
|
||||||
|
// TODO: implement retries here for read-only transactions.
|
||||||
server.send(original).await?;
|
server.send(original).await?;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
|
// TODO: implement retries here for read-only transactions.
|
||||||
let response = server.recv().await?;
|
let response = server.recv().await?;
|
||||||
|
|
||||||
match write_all_half(&mut self.write, response).await {
|
match write_all_half(&mut self.write, response).await {
|
||||||
Ok(_) => (),
|
Ok(_) => (),
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
@@ -252,6 +284,7 @@ impl Client {
|
|||||||
// Release server
|
// Release server
|
||||||
if !server.in_transaction() && self.transaction_mode {
|
if !server.in_transaction() && self.transaction_mode {
|
||||||
shard = None;
|
shard = None;
|
||||||
|
role = None;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -290,10 +323,13 @@ impl Client {
|
|||||||
'S' => {
|
'S' => {
|
||||||
// Extended protocol, client requests sync
|
// Extended protocol, client requests sync
|
||||||
self.buffer.put(&original[..]);
|
self.buffer.put(&original[..]);
|
||||||
|
|
||||||
|
// TODO: retries for read-only transactions
|
||||||
server.send(self.buffer.clone()).await?;
|
server.send(self.buffer.clone()).await?;
|
||||||
self.buffer.clear();
|
self.buffer.clear();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
|
// TODO: retries for read-only transactions
|
||||||
let response = server.recv().await?;
|
let response = server.recv().await?;
|
||||||
match write_all_half(&mut self.write, response).await {
|
match write_all_half(&mut self.write, response).await {
|
||||||
Ok(_) => (),
|
Ok(_) => (),
|
||||||
@@ -311,6 +347,7 @@ impl Client {
|
|||||||
// Release server
|
// Release server
|
||||||
if !server.in_transaction() && self.transaction_mode {
|
if !server.in_transaction() && self.transaction_mode {
|
||||||
shard = None;
|
shard = None;
|
||||||
|
role = None;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -338,6 +375,7 @@ impl Client {
|
|||||||
if !server.in_transaction() && self.transaction_mode {
|
if !server.in_transaction() && self.transaction_mode {
|
||||||
println!("Releasing after copy done");
|
println!("Releasing after copy done");
|
||||||
shard = None;
|
shard = None;
|
||||||
|
role = None;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -361,7 +399,7 @@ impl Client {
|
|||||||
/// Determine if the query is part of our special syntax, extract
|
/// Determine if the query is part of our special syntax, extract
|
||||||
/// the shard key, and return the shard to query based on Postgres'
|
/// the shard key, and return the shard to query based on Postgres'
|
||||||
/// PARTITION BY HASH function.
|
/// PARTITION BY HASH function.
|
||||||
async fn select_shard(&mut self, mut buf: BytesMut, shards: usize) -> Option<usize> {
|
fn select_shard(&mut self, mut buf: BytesMut, shards: usize) -> Option<usize> {
|
||||||
let code = buf.get_u8() as char;
|
let code = buf.get_u8() as char;
|
||||||
|
|
||||||
// Only supporting simpe protocol here, so
|
// Only supporting simpe protocol here, so
|
||||||
@@ -390,4 +428,31 @@ impl Client {
|
|||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pick a primary or a replica from the pool.
|
||||||
|
fn select_role(&mut self, mut buf: BytesMut) -> Option<Role> {
|
||||||
|
let code = buf.get_u8() as char;
|
||||||
|
|
||||||
|
// Same story as select_shard() above.
|
||||||
|
match code {
|
||||||
|
'Q' => (),
|
||||||
|
_ => return None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let len = buf.get_i32();
|
||||||
|
let query = String::from_utf8_lossy(&buf[..len as usize - 4 - 1]).to_ascii_uppercase();
|
||||||
|
|
||||||
|
// Copy / paste from above. If we get one more of these use cases,
|
||||||
|
// it'll be time to abstract :).
|
||||||
|
if self.role_regex.is_match(&query) {
|
||||||
|
let role = query.split("'").collect::<Vec<&str>>()[1];
|
||||||
|
match role {
|
||||||
|
"PRIMARY" => Some(Role::Primary),
|
||||||
|
"REPLICA" => Some(Role::Replica),
|
||||||
|
_ => return None,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,14 +3,21 @@ use tokio::fs::File;
|
|||||||
use tokio::io::AsyncReadExt;
|
use tokio::io::AsyncReadExt;
|
||||||
use toml;
|
use toml;
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
use crate::errors::Error;
|
use crate::errors::Error;
|
||||||
|
|
||||||
|
#[derive(Clone, PartialEq, Deserialize, Hash, std::cmp::Eq, Debug, Copy)]
|
||||||
|
pub enum Role {
|
||||||
|
Primary,
|
||||||
|
Replica,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Hash, std::cmp::Eq, Debug)]
|
#[derive(Clone, PartialEq, Hash, std::cmp::Eq, Debug)]
|
||||||
pub struct Address {
|
pub struct Address {
|
||||||
pub host: String,
|
pub host: String,
|
||||||
pub port: String,
|
pub port: String,
|
||||||
|
pub role: Role,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Hash, std::cmp::Eq, Deserialize, Debug)]
|
#[derive(Clone, PartialEq, Hash, std::cmp::Eq, Deserialize, Debug)]
|
||||||
@@ -32,7 +39,7 @@ pub struct General {
|
|||||||
|
|
||||||
#[derive(Deserialize, Debug, Clone)]
|
#[derive(Deserialize, Debug, Clone)]
|
||||||
pub struct Shard {
|
pub struct Shard {
|
||||||
pub servers: Vec<(String, u16)>,
|
pub servers: Vec<(String, u16, String)>,
|
||||||
pub database: String,
|
pub database: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,6 +77,47 @@ pub async fn parse(path: &str) -> Result<Config, Error> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Quick config sanity check.
|
||||||
|
for shard in &config.shards {
|
||||||
|
// We use addresses as unique identifiers,
|
||||||
|
// let's make sure they are unique in the config as well.
|
||||||
|
let mut dup_check = HashSet::new();
|
||||||
|
let mut primary_count = 0;
|
||||||
|
|
||||||
|
for server in &shard.1.servers {
|
||||||
|
dup_check.insert(server);
|
||||||
|
|
||||||
|
// Check that we define only zero or one primary.
|
||||||
|
match server.2.as_ref() {
|
||||||
|
"primary" => primary_count += 1,
|
||||||
|
_ => (),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Check role spelling.
|
||||||
|
match server.2.as_ref() {
|
||||||
|
"primary" => (),
|
||||||
|
"replica" => (),
|
||||||
|
_ => {
|
||||||
|
println!(
|
||||||
|
"> Shard {} server role must be either 'primary' or 'replica', got: '{}'",
|
||||||
|
shard.0, server.2
|
||||||
|
);
|
||||||
|
return Err(Error::BadConfig);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if primary_count > 1 {
|
||||||
|
println!("> Shard {} has more than on primary configured.", &shard.0);
|
||||||
|
return Err(Error::BadConfig);
|
||||||
|
}
|
||||||
|
|
||||||
|
if dup_check.len() != shard.1.servers.len() {
|
||||||
|
println!("> Shard {} contains duplicate server configs.", &shard.0);
|
||||||
|
return Err(Error::BadConfig);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(config)
|
Ok(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -83,5 +131,6 @@ mod test {
|
|||||||
assert_eq!(config.general.pool_size, 15);
|
assert_eq!(config.general.pool_size, 15);
|
||||||
assert_eq!(config.shards.len(), 3);
|
assert_eq!(config.shards.len(), 3);
|
||||||
assert_eq!(config.shards["1"].servers[0].0, "127.0.0.1");
|
assert_eq!(config.shards["1"].servers[0].0, "127.0.0.1");
|
||||||
|
assert_eq!(config.shards["0"].servers[0].2, "primary");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,4 +8,5 @@ pub enum Error {
|
|||||||
// ServerTimeout,
|
// ServerTimeout,
|
||||||
// DirtyServer,
|
// DirtyServer,
|
||||||
BadConfig,
|
BadConfig,
|
||||||
|
AllServersDown,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -73,6 +73,7 @@ async fn main() {
|
|||||||
"> Healthcheck timeout: {}ms",
|
"> Healthcheck timeout: {}ms",
|
||||||
config.general.healthcheck_timeout
|
config.general.healthcheck_timeout
|
||||||
);
|
);
|
||||||
|
println!("> Connection timeout: {}ms", config.general.connect_timeout);
|
||||||
|
|
||||||
let pool = ConnectionPool::from_config(config.clone(), client_server_map.clone()).await;
|
let pool = ConnectionPool::from_config(config.clone(), client_server_map.clone()).await;
|
||||||
let transaction_mode = config.general.pool_mode == "transaction";
|
let transaction_mode = config.general.pool_mode == "transaction";
|
||||||
|
|||||||
@@ -141,12 +141,16 @@ pub async fn md5_password(
|
|||||||
Ok(write_all(stream, message).await?)
|
Ok(write_all(stream, message).await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Implements a response to our custom `SET SHARDING KEY` command.
|
/// Implements a response to our custom `SET SHARDING KEY`
|
||||||
|
/// and `SET SERVER ROLE` commands.
|
||||||
/// This tells the client we're ready for the next query.
|
/// This tells the client we're ready for the next query.
|
||||||
pub async fn set_sharding_key(stream: &mut OwnedWriteHalf) -> Result<(), Error> {
|
pub async fn custom_protocol_response_ok(
|
||||||
|
stream: &mut OwnedWriteHalf,
|
||||||
|
message: &str,
|
||||||
|
) -> Result<(), Error> {
|
||||||
let mut res = BytesMut::with_capacity(25);
|
let mut res = BytesMut::with_capacity(25);
|
||||||
|
|
||||||
let set_complete = BytesMut::from(&"SET SHARDING KEY\0"[..]);
|
let set_complete = BytesMut::from(&format!("{}\0", message)[..]);
|
||||||
let len = (set_complete.len() + 4) as i32;
|
let len = (set_complete.len() + 4) as i32;
|
||||||
|
|
||||||
// CommandComplete
|
// CommandComplete
|
||||||
|
|||||||
125
src/pool.rs
125
src/pool.rs
@@ -3,29 +3,31 @@ use async_trait::async_trait;
|
|||||||
use bb8::{ManageConnection, Pool, PooledConnection};
|
use bb8::{ManageConnection, Pool, PooledConnection};
|
||||||
use chrono::naive::NaiveDateTime;
|
use chrono::naive::NaiveDateTime;
|
||||||
|
|
||||||
use crate::config::{Address, Config, User};
|
use crate::config::{Address, Config, Role, User};
|
||||||
use crate::errors::Error;
|
use crate::errors::Error;
|
||||||
use crate::server::Server;
|
use crate::server::Server;
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::sync::{
|
use std::sync::{
|
||||||
atomic::{AtomicUsize, Ordering},
|
// atomic::{AtomicUsize, Ordering},
|
||||||
Arc, Mutex,
|
Arc,
|
||||||
|
Mutex,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Banlist: bad servers go in here.
|
// Banlist: bad servers go in here.
|
||||||
pub type BanList = Arc<Mutex<Vec<HashMap<Address, NaiveDateTime>>>>;
|
pub type BanList = Arc<Mutex<Vec<HashMap<Address, NaiveDateTime>>>>;
|
||||||
pub type Counter = Arc<AtomicUsize>;
|
// pub type Counter = Arc<AtomicUsize>;
|
||||||
pub type ClientServerMap = Arc<Mutex<HashMap<(i32, i32), (i32, i32, String, String)>>>;
|
pub type ClientServerMap = Arc<Mutex<HashMap<(i32, i32), (i32, i32, String, String)>>>;
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct ConnectionPool {
|
pub struct ConnectionPool {
|
||||||
databases: Vec<Vec<Pool<ServerPool>>>,
|
databases: Vec<Vec<Pool<ServerPool>>>,
|
||||||
addresses: Vec<Vec<Address>>,
|
addresses: Vec<Vec<Address>>,
|
||||||
round_robin: Counter,
|
round_robin: usize,
|
||||||
banlist: BanList,
|
banlist: BanList,
|
||||||
healthcheck_timeout: u64,
|
healthcheck_timeout: u64,
|
||||||
ban_time: i64,
|
ban_time: i64,
|
||||||
|
pool_size: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ConnectionPool {
|
impl ConnectionPool {
|
||||||
@@ -48,9 +50,19 @@ impl ConnectionPool {
|
|||||||
let mut replica_addresses = Vec::new();
|
let mut replica_addresses = Vec::new();
|
||||||
|
|
||||||
for server in &shard.servers {
|
for server in &shard.servers {
|
||||||
|
let role = match server.2.as_ref() {
|
||||||
|
"primary" => Role::Primary,
|
||||||
|
"replica" => Role::Replica,
|
||||||
|
_ => {
|
||||||
|
println!("> Config error: server role can be 'primary' or 'replica', have: '{}'. Defaulting to 'replica'.", server.2);
|
||||||
|
Role::Replica
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let address = Address {
|
let address = Address {
|
||||||
host: server.0.clone(),
|
host: server.0.clone(),
|
||||||
port: server.1.to_string(),
|
port: server.1.to_string(),
|
||||||
|
role: role,
|
||||||
};
|
};
|
||||||
|
|
||||||
let manager = ServerPool::new(
|
let manager = ServerPool::new(
|
||||||
@@ -79,20 +91,25 @@ impl ConnectionPool {
|
|||||||
banlist.push(HashMap::new());
|
banlist.push(HashMap::new());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
assert_eq!(shards.len(), addresses.len());
|
||||||
|
let address_len = addresses.len();
|
||||||
|
|
||||||
ConnectionPool {
|
ConnectionPool {
|
||||||
databases: shards,
|
databases: shards,
|
||||||
addresses: addresses,
|
addresses: addresses,
|
||||||
round_robin: Arc::new(AtomicUsize::new(0)),
|
round_robin: rand::random::<usize>() % address_len, // Start at a random replica
|
||||||
banlist: Arc::new(Mutex::new(banlist)),
|
banlist: Arc::new(Mutex::new(banlist)),
|
||||||
healthcheck_timeout: config.general.healthcheck_timeout,
|
healthcheck_timeout: config.general.healthcheck_timeout,
|
||||||
ban_time: config.general.ban_time,
|
ban_time: config.general.ban_time,
|
||||||
|
pool_size: config.general.pool_size,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get a connection from the pool.
|
/// Get a connection from the pool.
|
||||||
pub async fn get(
|
pub async fn get(
|
||||||
&self,
|
&mut self,
|
||||||
shard: Option<usize>,
|
shard: Option<usize>,
|
||||||
|
role: Option<Role>,
|
||||||
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
|
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
|
||||||
// Set this to false to gain ~3-4% speed.
|
// Set this to false to gain ~3-4% speed.
|
||||||
let with_health_check = true;
|
let with_health_check = true;
|
||||||
@@ -102,28 +119,82 @@ impl ConnectionPool {
|
|||||||
None => 0, // TODO: pick a shard at random
|
None => 0, // TODO: pick a shard at random
|
||||||
};
|
};
|
||||||
|
|
||||||
loop {
|
let addresses = &self.addresses[shard];
|
||||||
let index =
|
|
||||||
self.round_robin.fetch_add(1, Ordering::SeqCst) % self.databases[shard].len();
|
|
||||||
let address = self.addresses[shard][index].clone();
|
|
||||||
|
|
||||||
if self.is_banned(&address, shard) {
|
// Make sure if a specific role is requested, it's available in the pool.
|
||||||
|
match role {
|
||||||
|
Some(role) => {
|
||||||
|
let role_count = addresses
|
||||||
|
.iter()
|
||||||
|
.filter(|&db| db.role == Role::Primary)
|
||||||
|
.count();
|
||||||
|
|
||||||
|
if role_count == 0 {
|
||||||
|
println!(
|
||||||
|
">> Error: Role '{:?}' requested, but none are configured.",
|
||||||
|
role
|
||||||
|
);
|
||||||
|
|
||||||
|
return Err(Error::AllServersDown);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Any role should be present.
|
||||||
|
_ => (),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut allowed_attempts = match role {
|
||||||
|
// Primary-specific queries get one attempt, if the primary is down,
|
||||||
|
// nothing we should do about it I think. It's dangerous to retry
|
||||||
|
// write queries.
|
||||||
|
Some(Role::Primary) => 1,
|
||||||
|
|
||||||
|
// Replicas get to try as many times as there are replicas
|
||||||
|
// and connections in the pool.
|
||||||
|
_ => self.databases[shard].len() * self.pool_size as usize,
|
||||||
|
};
|
||||||
|
|
||||||
|
while allowed_attempts > 0 {
|
||||||
|
// Round-robin each client's queries.
|
||||||
|
// If a client only sends one query and then disconnects, it doesn't matter
|
||||||
|
// which replica it'll go to.
|
||||||
|
self.round_robin += 1;
|
||||||
|
let index = self.round_robin % addresses.len();
|
||||||
|
let address = &addresses[index];
|
||||||
|
|
||||||
|
// Make sure you're getting a primary or a replica
|
||||||
|
// as per request.
|
||||||
|
match role {
|
||||||
|
Some(role) => {
|
||||||
|
// If the client wants a specific role,
|
||||||
|
// we'll do our best to pick it, but if we only
|
||||||
|
// have one server in the cluster, it's probably only a primary
|
||||||
|
// (or only a replica), so the client will just get what we have.
|
||||||
|
if address.role != role && addresses.len() > 1 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => (),
|
||||||
|
};
|
||||||
|
|
||||||
|
if self.is_banned(address, shard, role) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
allowed_attempts -= 1;
|
||||||
|
|
||||||
// Check if we can connect
|
// Check if we can connect
|
||||||
// TODO: implement query wait timeout, i.e. time to get a conn from the pool
|
|
||||||
let mut conn = match self.databases[shard][index].get().await {
|
let mut conn = match self.databases[shard][index].get().await {
|
||||||
Ok(conn) => conn,
|
Ok(conn) => conn,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
println!(">> Banning replica {}, error: {:?}", index, err);
|
println!(">> Banning replica {}, error: {:?}", index, err);
|
||||||
self.ban(&address, shard);
|
self.ban(address, shard);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if !with_health_check {
|
if !with_health_check {
|
||||||
return Ok((conn, address));
|
return Ok((conn, address.clone()));
|
||||||
}
|
}
|
||||||
|
|
||||||
// // Check if this server is alive with a health check
|
// // Check if this server is alive with a health check
|
||||||
@@ -137,13 +208,16 @@ impl ConnectionPool {
|
|||||||
{
|
{
|
||||||
// Check if health check succeeded
|
// Check if health check succeeded
|
||||||
Ok(res) => match res {
|
Ok(res) => match res {
|
||||||
Ok(_) => return Ok((conn, address)),
|
Ok(_) => return Ok((conn, address.clone())),
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
println!(
|
println!(
|
||||||
">> Banning replica {} because of failed health check",
|
">> Banning replica {} because of failed health check",
|
||||||
index
|
index
|
||||||
);
|
);
|
||||||
self.ban(&address, shard);
|
// Don't leave a bad connection in the pool.
|
||||||
|
server.mark_bad();
|
||||||
|
|
||||||
|
self.ban(address, shard);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -153,11 +227,16 @@ impl ConnectionPool {
|
|||||||
">> Banning replica {} because of health check timeout",
|
">> Banning replica {} because of health check timeout",
|
||||||
index
|
index
|
||||||
);
|
);
|
||||||
self.ban(&address, shard);
|
// Don't leave a bad connection in the pool.
|
||||||
|
server.mark_bad();
|
||||||
|
|
||||||
|
self.ban(address, shard);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return Err(Error::AllServersDown);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Ban an address (i.e. replica). It no longer will serve
|
/// Ban an address (i.e. replica). It no longer will serve
|
||||||
@@ -179,7 +258,14 @@ impl ConnectionPool {
|
|||||||
|
|
||||||
/// Check if a replica can serve traffic. If all replicas are banned,
|
/// Check if a replica can serve traffic. If all replicas are banned,
|
||||||
/// we unban all of them. Better to try then not to.
|
/// we unban all of them. Better to try then not to.
|
||||||
pub fn is_banned(&self, address: &Address, shard: usize) -> bool {
|
pub fn is_banned(&self, address: &Address, shard: usize, role: Option<Role>) -> bool {
|
||||||
|
// If primary is requested explicitely, it can never be banned.
|
||||||
|
if Some(Role::Primary) == role {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If you're not asking for the primary,
|
||||||
|
// all databases are treated as replicas.
|
||||||
let mut guard = self.banlist.lock().unwrap();
|
let mut guard = self.banlist.lock().unwrap();
|
||||||
|
|
||||||
// Everything is banned = nothing is banned.
|
// Everything is banned = nothing is banned.
|
||||||
@@ -251,6 +337,7 @@ impl ManageConnection for ServerPool {
|
|||||||
&self.user.password,
|
&self.user.password,
|
||||||
&self.database,
|
&self.database,
|
||||||
self.client_server_map.clone(),
|
self.client_server_map.clone(),
|
||||||
|
self.address.role,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use tokio::io::{AsyncReadExt, BufReader};
|
|||||||
use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf};
|
use tokio::net::tcp::{OwnedReadHalf, OwnedWriteHalf};
|
||||||
use tokio::net::TcpStream;
|
use tokio::net::TcpStream;
|
||||||
|
|
||||||
use crate::config::Address;
|
use crate::config::{Address, Role};
|
||||||
use crate::errors::Error;
|
use crate::errors::Error;
|
||||||
use crate::messages::*;
|
use crate::messages::*;
|
||||||
use crate::ClientServerMap;
|
use crate::ClientServerMap;
|
||||||
@@ -48,6 +48,8 @@ pub struct Server {
|
|||||||
|
|
||||||
// Mapping of clients and servers used for query cancellation.
|
// Mapping of clients and servers used for query cancellation.
|
||||||
client_server_map: ClientServerMap,
|
client_server_map: ClientServerMap,
|
||||||
|
|
||||||
|
role: Role,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Server {
|
impl Server {
|
||||||
@@ -60,6 +62,7 @@ impl Server {
|
|||||||
password: &str,
|
password: &str,
|
||||||
database: &str,
|
database: &str,
|
||||||
client_server_map: ClientServerMap,
|
client_server_map: ClientServerMap,
|
||||||
|
role: Role,
|
||||||
) -> Result<Server, Error> {
|
) -> Result<Server, Error> {
|
||||||
let mut stream = match TcpStream::connect(&format!("{}:{}", host, port)).await {
|
let mut stream = match TcpStream::connect(&format!("{}:{}", host, port)).await {
|
||||||
Ok(stream) => stream,
|
Ok(stream) => stream,
|
||||||
@@ -189,6 +192,7 @@ impl Server {
|
|||||||
data_available: false,
|
data_available: false,
|
||||||
bad: false,
|
bad: false,
|
||||||
client_server_map: client_server_map,
|
client_server_map: client_server_map,
|
||||||
|
role: role,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -409,6 +413,7 @@ impl Server {
|
|||||||
Address {
|
Address {
|
||||||
host: self.host.to_string(),
|
host: self.host.to_string(),
|
||||||
port: self.port.to_string(),
|
port: self.port.to_string(),
|
||||||
|
role: self.role,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,12 +1,19 @@
|
|||||||
#/bin/bash
|
#/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
# Setup all the shards.
|
# Setup all the shards.
|
||||||
sudo service postgresql restart
|
# sudo service postgresql restart
|
||||||
|
|
||||||
psql -f query_routing_setup.sql
|
echo "Giving Postgres 5 seconds to start up..."
|
||||||
|
|
||||||
|
# sleep 5
|
||||||
|
|
||||||
|
# psql -f query_routing_setup.sql
|
||||||
|
|
||||||
psql -h 127.0.0.1 -p 6432 -f query_routing_test_insert.sql
|
psql -h 127.0.0.1 -p 6432 -f query_routing_test_insert.sql
|
||||||
|
|
||||||
psql -h 127.0.0.1 -p 6432 -f query_routing_test_select.sql
|
psql -h 127.0.0.1 -p 6432 -f query_routing_test_select.sql
|
||||||
|
|
||||||
|
psql -e -h 127.0.0.1 -p 6432 -f query_routing_test_primary_replica.sql
|
||||||
|
|
||||||
psql -f query_routing_test_validate.sql
|
psql -f query_routing_test_validate.sql
|
||||||
145
tests/sharding/query_routing_test_primary_replica.sql
Normal file
145
tests/sharding/query_routing_test_primary_replica.sql
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '1';
|
||||||
|
INSERT INTO data (id, value) VALUES (1, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '1';
|
||||||
|
SELECT * FROM data WHERE id = 1;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '2';
|
||||||
|
INSERT INTO data (id, value) VALUES (2, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '2';
|
||||||
|
SELECT * FROM data WHERE id = 2;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '3';
|
||||||
|
INSERT INTO data (id, value) VALUES (3, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '3';
|
||||||
|
SELECT * FROM data WHERE id = 3;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '4';
|
||||||
|
INSERT INTO data (id, value) VALUES (4, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '4';
|
||||||
|
SELECT * FROM data WHERE id = 4;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '5';
|
||||||
|
INSERT INTO data (id, value) VALUES (5, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '5';
|
||||||
|
SELECT * FROM data WHERE id = 5;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '6';
|
||||||
|
INSERT INTO data (id, value) VALUES (6, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '6';
|
||||||
|
SELECT * FROM data WHERE id = 6;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '7';
|
||||||
|
INSERT INTO data (id, value) VALUES (7, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '7';
|
||||||
|
SELECT * FROM data WHERE id = 7;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '8';
|
||||||
|
INSERT INTO data (id, value) VALUES (8, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '8';
|
||||||
|
SELECT * FROM data WHERE id = 8;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '9';
|
||||||
|
INSERT INTO data (id, value) VALUES (9, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '9';
|
||||||
|
SELECT * FROM data WHERE id = 9;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '10';
|
||||||
|
INSERT INTO data (id, value) VALUES (10, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '10';
|
||||||
|
SELECT * FROM data WHERE id = 10;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '11';
|
||||||
|
INSERT INTO data (id, value) VALUES (11, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '11';
|
||||||
|
SELECT * FROM data WHERE id = 11;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '12';
|
||||||
|
INSERT INTO data (id, value) VALUES (12, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '12';
|
||||||
|
SELECT * FROM data WHERE id = 12;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '13';
|
||||||
|
INSERT INTO data (id, value) VALUES (13, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '13';
|
||||||
|
SELECT * FROM data WHERE id = 13;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SET SHARDING KEY TO '14';
|
||||||
|
INSERT INTO data (id, value) VALUES (14, 'value_1');
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SET SHARDING KEY TO '14';
|
||||||
|
SELECT * FROM data WHERE id = 14;
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'primary';
|
||||||
|
SELECT 1;
|
||||||
|
|
||||||
|
SET SERVER ROLE TO 'replica';
|
||||||
|
SELECT 1;
|
||||||
Reference in New Issue
Block a user