Add checkout_failure_limit config/feature (#911)

In a high availability deployment of PgCat, it is possible that a client may land on a container of PgCat that is very busy with clients and as such the new client might be perpetually stuck in checkout failure loop because all connections are used by other clients. This is specially true in session mode pools with long-lived client connections (e.g. FDW connections).

One way to fix this issue is to close client connections after they encounter some number of checkout failure. This will force the client to hit the Network load balancer again, land on a different process/container, try to checkout a connection on the new process/container. if it fails, it is disconnected and tries with another one.

This mechanism is guaranteed to eventually land on a balanced state where all clients are able to find connections provided that the overall number of connections across all containers matches the number of clients.

I was able to reproduce this issue in a control environment and was able to show this PR is able to fix it.
This commit is contained in:
Mostafa
2025-02-27 13:17:00 -06:00
committed by GitHub
parent f8e2fcd0ed
commit 3349cecc18
6 changed files with 162 additions and 1 deletions

View File

@@ -188,6 +188,102 @@ describe "Miscellaneous" do
end
end
describe "Checkout failure limit" do
context "when no checkout failure limit is set" do
before do
new_configs = processes.pgcat.current_config
new_configs["general"]["connect_timeout"] = 200
new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
processes.pgcat.update_config(new_configs)
processes.pgcat.reload_config
sleep 0.5
end
it "does not disconnect client" do
Array.new(5) do
Thread.new do
conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
for i in 0..4
begin
conn.async_exec("SELECT pg_sleep(0.5);")
expect(conn.status).to eq(PG::CONNECTION_OK)
rescue PG::SystemError
expect(conn.status).to eq(PG::CONNECTION_OK)
end
end
conn.close
end
end.each(&:join)
end
end
context "when checkout failure limit is set high" do
before do
new_configs = processes.pgcat.current_config
new_configs["general"]["connect_timeout"] = 200
new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
new_configs["pools"]["sharded_db"]["checkout_failure_limit"] = 10000
processes.pgcat.update_config(new_configs)
processes.pgcat.reload_config
sleep 0.5
end
it "does not disconnect client" do
Array.new(5) do
Thread.new do
conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
for i in 0..4
begin
conn.async_exec("SELECT pg_sleep(0.5);")
expect(conn.status).to eq(PG::CONNECTION_OK)
rescue PG::SystemError
expect(conn.status).to eq(PG::CONNECTION_OK)
end
end
conn.close
end
end.each(&:join)
end
end
context "when checkout failure limit is set low" do
before do
new_configs = processes.pgcat.current_config
new_configs["general"]["connect_timeout"] = 200
new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
new_configs["pools"]["sharded_db"]["checkout_failure_limit"] = 2
processes.pgcat.update_config(new_configs)
processes.pgcat.reload_config
sleep 0.5
end
it "disconnects client after reaching limit" do
Array.new(5) do
Thread.new do
conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
checkout_failure_count = 0
for i in 0..4
begin
conn.async_exec("SELECT pg_sleep(1);")
expect(conn.status).to eq(PG::CONNECTION_OK)
rescue PG::SystemError
checkout_failure_count += 1
expect(conn.status).to eq(PG::CONNECTION_OK)
rescue PG::ConnectionBad
expect(checkout_failure_count).to eq(2)
expect(conn.status).to eq(PG::CONNECTION_BAD)
break
end
end
conn.close
end
end.each(&:join)
puts processes.pgcat.logs
end
end
end
describe "Server version reporting" do
it "reports correct version for normal and admin databases" do
server_conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))