Add checkout_failure_limit config/feature (#911)

In a high-availability deployment of PgCat, it is possible that a client may land on a PgCat container that is very busy with clients, and as such the new client might be perpetually stuck in a checkout failure loop because all connections are used by other clients. This is especially true in session mode pools with long-lived client connections (e.g. FDW connections). One way to fix this issue is to close client connections after they encounter some number of checkout failures. This will force the client to hit the network load balancer again, land on a different process/container, and try to check out a connection on the new process/container. If it fails, it is disconnected and tries with another one. This mechanism is guaranteed to eventually land on a balanced state where all clients are able to find connections, provided that the overall number of connections across all containers matches the number of clients. I was able to reproduce this issue in a controlled environment and was able to show that this PR fixes it.
2026-07-16 09:29:05 +00:00 · 2025-02-27 13:17:00 -06:00
parent f8e2fcd0ed
commit 3349cecc18
6 changed files with 162 additions and 1 deletions
@@ -188,6 +188,102 @@ describe "Miscellaneous" do
    end
  end

+  describe "Checkout failure limit" do
+    context "when no checkout failure limit is set" do
+      before do 
+        new_configs = processes.pgcat.current_config
+        new_configs["general"]["connect_timeout"] = 200
+        new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
+        processes.pgcat.update_config(new_configs)
+        processes.pgcat.reload_config
+        sleep 0.5
+      end
+  
+      it "does not disconnect client" do
+        Array.new(5) do
+          Thread.new do
+            conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
+            for i in 0..4
+              begin
+                conn.async_exec("SELECT pg_sleep(0.5);")
+                expect(conn.status).to eq(PG::CONNECTION_OK)
+              rescue PG::SystemError
+                expect(conn.status).to eq(PG::CONNECTION_OK)
+              end
+            end
+            conn.close
+          end
+        end.each(&:join)
+      end
+    end
+
+    context "when checkout failure limit is set high" do
+      before do 
+        new_configs = processes.pgcat.current_config
+        new_configs["general"]["connect_timeout"] = 200
+        new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
+        new_configs["pools"]["sharded_db"]["checkout_failure_limit"] = 10000
+        processes.pgcat.update_config(new_configs)
+        processes.pgcat.reload_config
+        sleep 0.5
+      end
+  
+      it "does not disconnect client" do
+        Array.new(5) do
+          Thread.new do
+            conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
+            for i in 0..4
+              begin
+                conn.async_exec("SELECT pg_sleep(0.5);")
+                expect(conn.status).to eq(PG::CONNECTION_OK)
+              rescue PG::SystemError
+                expect(conn.status).to eq(PG::CONNECTION_OK)
+              end
+            end
+            conn.close
+          end
+        end.each(&:join)
+      end
+    end
+
+    context "when checkout failure limit is set low" do
+      before do 
+        new_configs = processes.pgcat.current_config
+        new_configs["general"]["connect_timeout"] = 200
+        new_configs["pools"]["sharded_db"]["users"]["0"]["pool_size"] = 1
+        new_configs["pools"]["sharded_db"]["checkout_failure_limit"] = 2
+        processes.pgcat.update_config(new_configs)
+        processes.pgcat.reload_config
+        sleep 0.5
+      end
+  
+      it "disconnects client after reaching limit" do
+        Array.new(5) do
+          Thread.new do
+            conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
+            checkout_failure_count = 0
+            for i in 0..4
+              begin
+                conn.async_exec("SELECT pg_sleep(1);")
+                expect(conn.status).to eq(PG::CONNECTION_OK)
+              rescue PG::SystemError
+                checkout_failure_count += 1
+                expect(conn.status).to eq(PG::CONNECTION_OK)
+              rescue PG::ConnectionBad
+                expect(checkout_failure_count).to eq(2)
+                expect(conn.status).to eq(PG::CONNECTION_BAD)
+                break
+              end
+            end
+            conn.close
+          end
+        end.each(&:join)
+        puts processes.pgcat.logs
+
+      end
+    end
+  end
+  
  describe "Server version reporting" do
    it "reports correct version for normal and admin databases" do
      server_conn = PG::connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))