Refactor stats to use atomics (#375)

* Refactor stats to use atomics. When we are dealing with a high number of connections, generated stats cannot be consumed fast enough by the stats collector loop. This makes the stats subsystem inconsistent and a lot of warning messages are thrown due to unregistered server/clients. This change refactors the stats subsystem so it uses atomics: - Now counters are handled using U64 atomics - Event system is dropped and averages are calculated using a loop every 15 seconds. - Now, instead of snapshots being generated every second we keep track of servers/clients that have registered. Each pool/server/client has its own instance of the counter and makes changes directly, instead of adding an event that gets processed later. * Manually implement Hash/Eq in `config::Address` ignoring stats * Add tests for client connection counters * Allow connecting to dockerized dev pgcat from the host * stats: Decrease cl_idle when idle socket disconnects
2026-07-16 17:39:06 +00:00 · 2023-03-28 17:19:37 +02:00
parent 9a2076a9eb
commit 58ce76d9b9
19 changed files with 1303 additions and 1182 deletions
@@ -176,6 +176,47 @@ describe "Admin" do
      end
    end

+    context "clients connects and disconnect normally" do
+      let(:processes) { Helpers::Pgcat.single_instance_setup("sharded_db", 2) }
+
+      it 'shows the same number of clients before and after' do
+        clients_before = clients_connected_to_pool(processes: processes)
+        threads = []
+        connections = Array.new(4) { PG::connect("#{pgcat_conn_str}?application_name=one_query") }
+        connections.each do |c|
+          threads << Thread.new { c.async_exec("SELECT 1") }
+        end
+        clients_between = clients_connected_to_pool(processes: processes)
+        expect(clients_before).not_to eq(clients_between)
+        connections.each(&:close)
+        clients_after = clients_connected_to_pool(processes: processes)
+        expect(clients_before).to eq(clients_after)
+      end
+    end
+
+    context "clients connects and disconnect abruptly" do
+      let(:processes) { Helpers::Pgcat.single_instance_setup("sharded_db", 10) }
+
+      it 'shows the same number of clients before and after' do
+        threads = []
+        connections = Array.new(2) { PG::connect("#{pgcat_conn_str}?application_name=one_query") }
+        connections.each do |c|
+          threads << Thread.new { c.async_exec("SELECT 1") }
+        end
+        clients_before = clients_connected_to_pool(processes: processes)
+        random_string = (0...8).map { (65 + rand(26)).chr }.join
+        connection_string = "#{pgcat_conn_str}?application_name=#{random_string}"
+        faulty_client = Process.spawn("psql -Atx #{connection_string} >/dev/null")
+        sleep(1)
+        # psql starts two processes, but we only know the pid of the parent;
+        # pkill -f on the random application name ensures both are killed
+        `pkill -9 -f '#{random_string}'`
+        Process.wait(faulty_client)
+        clients_after = clients_connected_to_pool(processes: processes)
+        expect(clients_before).to eq(clients_after)
+      end
+    end
+
    context "clients overwhelm server pools" do
      let(:processes) { Helpers::Pgcat.single_instance_setup("sharded_db", 2) }

@@ -199,7 +240,7 @@ describe "Admin" do

        sleep(2.5) # Allow time for stats to update
        results = admin_conn.async_exec("SHOW POOLS")[0]
-        %w[cl_active cl_waiting cl_cancel_req sv_active sv_used sv_tested sv_login maxwait].each do |s|
+        %w[cl_active cl_waiting cl_cancel_req sv_active sv_used sv_tested sv_login].each do |s|
          raise StandardError, "Field #{s} was expected to be 0 but found to be #{results[s]}" if results[s] != "0"
        end
        expect(results["cl_idle"]).to eq("4")
@@ -19,3 +19,10 @@ ensure
  STDOUT.reopen(sout)
  STDERR.reopen(serr)
 end
+
+def clients_connected_to_pool(pool_index: 0, processes:)
+  admin_conn = PG::connect(processes.pgcat.admin_connection_string)
+  results = admin_conn.async_exec("SHOW POOLS")[pool_index]
+  admin_conn.close
+  results['cl_idle'].to_i + results['cl_active'].to_i + results['cl_waiting'].to_i
+end