mirror of
https://github.com/postgresml/pgcat.git
synced 2026-03-22 17:06:29 +00:00
Add checkout_failure_limit config/feature (#911)
In a high-availability deployment of PgCat, it is possible that a client may land on a PgCat container that is already very busy with clients, and as such the new client might be perpetually stuck in a checkout-failure loop because all connections are used by other clients. This is especially true in session-mode pools with long-lived client connections (e.g. FDW connections). One way to fix this issue is to close client connections after they encounter some number of checkout failures. This forces the client to hit the network load balancer again, land on a different process/container, and try to check out a connection on the new process/container. If it fails there as well, it is disconnected and tries another one. This mechanism is guaranteed to eventually reach a balanced state where all clients are able to find connections, provided that the overall number of connections across all containers matches the number of clients. I was able to reproduce this issue in a controlled environment and to show that this PR fixes it.
This commit is contained in:
13
CONFIG.md
13
CONFIG.md
@@ -298,6 +298,19 @@ Load balancing mode
|
|||||||
`random` selects the server at random
|
`random` selects the server at random
|
||||||
`loc` selects the server with the least outstanding busy connections
|
`loc` selects the server with the least outstanding busy connections
|
||||||
|
|
||||||
|
### checkout_failure_limit
|
||||||
|
```
|
||||||
|
path: pools.<pool_name>.checkout_failure_limit
|
||||||
|
default: 0 (disabled)
|
||||||
|
```
|
||||||
|
|
||||||
|
`Maximum number of checkout failures a client is allowed before it
|
||||||
|
gets disconnected. This is needed to prevent persistent client/server
|
||||||
|
imbalance in high availability setups where multiple PgCat instances are placed
|
||||||
|
behind a single load balancer. If for any reason a client lands on a PgCat instance that has
|
||||||
|
a large number of connected clients, it might get stuck in perpetual checkout failure loop especially
|
||||||
|
in session mode
|
||||||
|
`
|
||||||
### default_role
|
### default_role
|
||||||
```
|
```
|
||||||
path: pools.<pool_name>.default_role
|
path: pools.<pool_name>.default_role
|
||||||
|
|||||||
@@ -859,6 +859,8 @@ where
|
|||||||
// e.g. primary, replica, which shard.
|
// e.g. primary, replica, which shard.
|
||||||
let mut query_router = QueryRouter::new();
|
let mut query_router = QueryRouter::new();
|
||||||
|
|
||||||
|
let mut checkout_failure_count: u64 = 0;
|
||||||
|
|
||||||
self.stats.register(self.stats.clone());
|
self.stats.register(self.stats.clone());
|
||||||
|
|
||||||
// Result returned by one of the plugins.
|
// Result returned by one of the plugins.
|
||||||
@@ -1108,7 +1110,25 @@ where
|
|||||||
query_router.role(),
|
query_router.role(),
|
||||||
err
|
err
|
||||||
);
|
);
|
||||||
|
checkout_failure_count += 1;
|
||||||
|
if let Some(limit) = pool.settings.checkout_failure_limit {
|
||||||
|
if checkout_failure_count >= limit {
|
||||||
|
error!(
|
||||||
|
"Checkout failure limit reached ({} / {}) - disconnecting client",
|
||||||
|
checkout_failure_count, limit
|
||||||
|
);
|
||||||
|
error_response_terminal(
|
||||||
|
&mut self.write,
|
||||||
|
&format!(
|
||||||
|
"checkout failure limit reached ({} / {})",
|
||||||
|
checkout_failure_count, limit
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
self.stats.disconnect();
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -558,6 +558,14 @@ pub struct Pool {
|
|||||||
/// Close idle connections that have been opened for longer than this.
|
/// Close idle connections that have been opened for longer than this.
|
||||||
pub idle_timeout: Option<u64>,
|
pub idle_timeout: Option<u64>,
|
||||||
|
|
||||||
|
/// Maximum number of checkout failures a client is allowed before it
|
||||||
|
/// gets disconnected. This is needed to prevent persistent client/server
|
||||||
|
/// imbalance in high availability setups where multiple PgCat instances are placed
|
||||||
|
/// behind a single load balancer. If for any reason a client lands on a PgCat instance that has
|
||||||
|
/// a large number of connected clients, it might get stuck in perpetual checkout failure loop especially
|
||||||
|
/// in session mode
|
||||||
|
pub checkout_failure_limit: Option<u64>,
|
||||||
|
|
||||||
/// Close server connections that have been opened for longer than this.
|
/// Close server connections that have been opened for longer than this.
|
||||||
/// Only applied to idle connections. If the connection is actively used for
|
/// Only applied to idle connections. If the connection is actively used for
|
||||||
/// longer than this period, the pool will not interrupt it.
|
/// longer than this period, the pool will not interrupt it.
|
||||||
@@ -782,6 +790,7 @@ impl Default for Pool {
|
|||||||
Pool {
|
Pool {
|
||||||
pool_mode: Self::default_pool_mode(),
|
pool_mode: Self::default_pool_mode(),
|
||||||
load_balancing_mode: Self::default_load_balancing_mode(),
|
load_balancing_mode: Self::default_load_balancing_mode(),
|
||||||
|
checkout_failure_limit: None,
|
||||||
default_role: String::from("any"),
|
default_role: String::from("any"),
|
||||||
query_parser_enabled: false,
|
query_parser_enabled: false,
|
||||||
query_parser_max_length: None,
|
query_parser_max_length: None,
|
||||||
@@ -1298,6 +1307,17 @@ impl Config {
|
|||||||
None => self.general.idle_timeout,
|
None => self.general.idle_timeout,
|
||||||
};
|
};
|
||||||
info!("[pool: {}] Idle timeout: {}ms", pool_name, idle_timeout);
|
info!("[pool: {}] Idle timeout: {}ms", pool_name, idle_timeout);
|
||||||
|
match pool_config.checkout_failure_limit {
|
||||||
|
Some(checkout_failure_limit) => {
|
||||||
|
info!(
|
||||||
|
"[pool: {}] Checkout failure limit: {}",
|
||||||
|
pool_name, checkout_failure_limit
|
||||||
|
);
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
info!("[pool: {}] Checkout failure limit: not set", pool_name);
|
||||||
|
}
|
||||||
|
};
|
||||||
info!(
|
info!(
|
||||||
"[pool: {}] Sharding function: {}",
|
"[pool: {}] Sharding function: {}",
|
||||||
pool_name,
|
pool_name,
|
||||||
|
|||||||
10
src/pool.rs
10
src/pool.rs
@@ -152,6 +152,14 @@ pub struct PoolSettings {
|
|||||||
/// Random or LeastOutstandingConnections.
|
/// Random or LeastOutstandingConnections.
|
||||||
pub load_balancing_mode: LoadBalancingMode,
|
pub load_balancing_mode: LoadBalancingMode,
|
||||||
|
|
||||||
|
/// Maximum number of checkout failures a client is allowed before it
|
||||||
|
/// gets disconnected. This is needed to prevent persistent client/server
|
||||||
|
/// imbalance in high availability setups where multiple PgCat instances are placed
|
||||||
|
/// behind a single load balancer. If for any reason a client lands on a PgCat instance that has
|
||||||
|
/// a large number of connected clients, it might get stuck in perpetual checkout failure loop especially
|
||||||
|
/// in session mode
|
||||||
|
pub checkout_failure_limit: Option<u64>,
|
||||||
|
|
||||||
// Number of shards.
|
// Number of shards.
|
||||||
pub shards: usize,
|
pub shards: usize,
|
||||||
|
|
||||||
@@ -227,6 +235,7 @@ impl Default for PoolSettings {
|
|||||||
PoolSettings {
|
PoolSettings {
|
||||||
pool_mode: PoolMode::Transaction,
|
pool_mode: PoolMode::Transaction,
|
||||||
load_balancing_mode: LoadBalancingMode::Random,
|
load_balancing_mode: LoadBalancingMode::Random,
|
||||||
|
checkout_failure_limit: None,
|
||||||
shards: 1,
|
shards: 1,
|
||||||
user: User::default(),
|
user: User::default(),
|
||||||
db: String::default(),
|
db: String::default(),
|
||||||
@@ -537,6 +546,7 @@ impl ConnectionPool {
|
|||||||
None => pool_config.pool_mode,
|
None => pool_config.pool_mode,
|
||||||
},
|
},
|
||||||
load_balancing_mode: pool_config.load_balancing_mode,
|
load_balancing_mode: pool_config.load_balancing_mode,
|
||||||
|
checkout_failure_limit: pool_config.checkout_failure_limit,
|
||||||
// shards: pool_config.shards.clone(),
|
// shards: pool_config.shards.clone(),
|
||||||
shards: shard_ids.len(),
|
shards: shard_ids.len(),
|
||||||
user: user.clone(),
|
user: user.clone(),
|
||||||
|
|||||||
@@ -1617,6 +1617,7 @@ mod test {
|
|||||||
let pool_settings = PoolSettings {
|
let pool_settings = PoolSettings {
|
||||||
pool_mode: PoolMode::Transaction,
|
pool_mode: PoolMode::Transaction,
|
||||||
load_balancing_mode: crate::config::LoadBalancingMode::Random,
|
load_balancing_mode: crate::config::LoadBalancingMode::Random,
|
||||||
|
checkout_failure_limit: None,
|
||||||
shards: 2,
|
shards: 2,
|
||||||
user: crate::config::User::default(),
|
user: crate::config::User::default(),
|
||||||
default_role: Some(Role::Replica),
|
default_role: Some(Role::Replica),
|
||||||
@@ -1699,6 +1700,7 @@ mod test {
|
|||||||
let pool_settings = PoolSettings {
|
let pool_settings = PoolSettings {
|
||||||
pool_mode: PoolMode::Transaction,
|
pool_mode: PoolMode::Transaction,
|
||||||
load_balancing_mode: crate::config::LoadBalancingMode::Random,
|
load_balancing_mode: crate::config::LoadBalancingMode::Random,
|
||||||
|
checkout_failure_limit: Some(10),
|
||||||
shards: 5,
|
shards: 5,
|
||||||
user: crate::config::User::default(),
|
user: crate::config::User::default(),
|
||||||
default_role: Some(Role::Replica),
|
default_role: Some(Role::Replica),
|
||||||
|
|||||||
@@ -188,6 +188,102 @@ describe "Miscellaneous" do
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
describe "Checkout failure limit" do
|
||||||
|
context "when no checkout failure limit is set" do
|
||||||
|
before do
|
||||||
|
new_configs = processes.pgcat.current_config
|
||||||
|
new_configs["general"]["connect_timeout"] = 200
|
||||||
|
new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
|
||||||
|
processes.pgcat.update_config(new_configs)
|
||||||
|
processes.pgcat.reload_config
|
||||||
|
sleep 0.5
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does not disconnect client" do
|
||||||
|
Array.new(5) do
|
||||||
|
Thread.new do
|
||||||
|
conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
|
||||||
|
for i in 0..4
|
||||||
|
begin
|
||||||
|
conn.async_exec("SELECT pg_sleep(0.5);")
|
||||||
|
expect(conn.status).to eq(PG::CONNECTION_OK)
|
||||||
|
rescue PG::SystemError
|
||||||
|
expect(conn.status).to eq(PG::CONNECTION_OK)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
conn.close
|
||||||
|
end
|
||||||
|
end.each(&:join)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context "when checkout failure limit is set high" do
|
||||||
|
before do
|
||||||
|
new_configs = processes.pgcat.current_config
|
||||||
|
new_configs["general"]["connect_timeout"] = 200
|
||||||
|
new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
|
||||||
|
new_configs["pools"]["sharded_db"]["checkout_failure_limit"] = 10000
|
||||||
|
processes.pgcat.update_config(new_configs)
|
||||||
|
processes.pgcat.reload_config
|
||||||
|
sleep 0.5
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does not disconnect client" do
|
||||||
|
Array.new(5) do
|
||||||
|
Thread.new do
|
||||||
|
conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
|
||||||
|
for i in 0..4
|
||||||
|
begin
|
||||||
|
conn.async_exec("SELECT pg_sleep(0.5);")
|
||||||
|
expect(conn.status).to eq(PG::CONNECTION_OK)
|
||||||
|
rescue PG::SystemError
|
||||||
|
expect(conn.status).to eq(PG::CONNECTION_OK)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
conn.close
|
||||||
|
end
|
||||||
|
end.each(&:join)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context "when checkout failure limit is set low" do
|
||||||
|
before do
|
||||||
|
new_configs = processes.pgcat.current_config
|
||||||
|
new_configs["general"]["connect_timeout"] = 200
|
||||||
|
new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
|
||||||
|
new_configs["pools"]["sharded_db"]["checkout_failure_limit"] = 2
|
||||||
|
processes.pgcat.update_config(new_configs)
|
||||||
|
processes.pgcat.reload_config
|
||||||
|
sleep 0.5
|
||||||
|
end
|
||||||
|
|
||||||
|
it "disconnects client after reaching limit" do
|
||||||
|
Array.new(5) do
|
||||||
|
Thread.new do
|
||||||
|
conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
|
||||||
|
checkout_failure_count = 0
|
||||||
|
for i in 0..4
|
||||||
|
begin
|
||||||
|
conn.async_exec("SELECT pg_sleep(1);")
|
||||||
|
expect(conn.status).to eq(PG::CONNECTION_OK)
|
||||||
|
rescue PG::SystemError
|
||||||
|
checkout_failure_count += 1
|
||||||
|
expect(conn.status).to eq(PG::CONNECTION_OK)
|
||||||
|
rescue PG::ConnectionBad
|
||||||
|
expect(checkout_failure_count).to eq(2)
|
||||||
|
expect(conn.status).to eq(PG::CONNECTION_BAD)
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
|
conn.close
|
||||||
|
end
|
||||||
|
end.each(&:join)
|
||||||
|
puts processes.pgcat.logs
|
||||||
|
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
describe "Server version reporting" do
|
describe "Server version reporting" do
|
||||||
it "reports correct version for normal and admin databases" do
|
it "reports correct version for normal and admin databases" do
|
||||||
server_conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
|
server_conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
|
||||||
|
|||||||
Reference in New Issue
Block a user