# frozen_string_literal: true

require_relative 'spec_helper'
# Verifies pgcat's `random` load-balancing mode: query volume should be
# spread roughly evenly across all replicas, and traffic should route
# around replicas that are taken down.
describe "Random Load Balancing" do
  let(:processes) { Helpers::Pgcat.single_shard_setup("sharded_db", 5) }

  after do
    # Clear per-instance query counters, then stop the pooler.
    processes.all_databases.map(&:reset)
    processes.pgcat.shutdown
  end

  context "under regular circumstances" do
    it "balances query volume between all instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))

      query_count = QUERY_COUNT
      # With uniform random routing each instance should see ~1/N of the load.
      expected_share = query_count / processes.all_databases.count
      failed_count = 0

      query_count.times do
        conn.async_exec("SELECT 1 + 2")
      rescue
        failed_count += 1
      end

      # No replica is down, so every query should succeed.
      expect(failed_count).to eq(0)
      processes.all_databases.map(&:count_select_1_plus_2).each do |instance_share|
        expect(instance_share).to be_within(expected_share * MARGIN_OF_ERROR).of(expected_share)
      end
    end
  end

  context "when some replicas are down" do
    it "balances query volume between working instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
      # Two replicas are taken down below, so the survivors split the load.
      expected_share = QUERY_COUNT / (processes.all_databases.count - 2)
      failed_count = 0

      processes[:replicas][0].take_down do
        processes[:replicas][1].take_down do
          QUERY_COUNT.times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            # An in-flight query can die with the replica; reconnect
            # through the pooler and count the failure.
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end
        end
      end

      # At most one failure per downed replica is tolerated.
      expect(failed_count).to be <= 2

      processes.all_databases.each do |instance|
        queries_routed = instance.count_select_1_plus_2
        if processes.replicas[0..1].include?(instance)
          # Downed replicas must receive no traffic at all.
          expect(queries_routed).to eq(0)
        else
          expect(queries_routed).to be_within(expected_share * MARGIN_OF_ERROR).of(expected_share)
        end
      end
    end
  end
end
# Verifies pgcat's least-outstanding-queries ("loc") load-balancing mode:
# even load spreads evenly, busy replicas are avoided, and downed replicas
# receive no traffic.
describe "Least Outstanding Queries Load Balancing" do
  let(:processes) { Helpers::Pgcat.single_shard_setup("sharded_db", 1, "transaction", "loc") }

  after do
    # Clear per-instance query counters, then stop the pooler.
    processes.all_databases.map(&:reset)
    processes.pgcat.shutdown
  end

  context "under homogenous load" do
    it "balances query volume between all instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))

      query_count = QUERY_COUNT
      # When all queries cost the same, LOQ degenerates to an even split.
      expected_share = query_count / processes.all_databases.count
      failed_count = 0

      query_count.times do
        conn.async_exec("SELECT 1 + 2")
      rescue
        failed_count += 1
      end

      # No replica is down, so every query should succeed.
      expect(failed_count).to eq(0)
      processes.all_databases.map(&:count_select_1_plus_2).each do |instance_share|
        expect(instance_share).to be_within(expected_share * MARGIN_OF_ERROR).of(expected_share)
      end
    end
  end

  context "under heterogeneous load" do
    # NOTE(review): pending (xit) in the original — kept pending here.
    xit "balances query volume between all instances based on how busy they are" do
      slow_query_count = 2
      # Tie up `slow_query_count` server connections with open transactions
      # so those replicas look busy to the LOQ balancer.
      threads = Array.new(slow_query_count) do
        Thread.new do
          conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
          conn.async_exec("BEGIN")
        end
      end

      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))

      query_count = QUERY_COUNT
      # Quick queries should only land on the idle replicas.
      expected_share = query_count / (processes.all_databases.count - slow_query_count)
      failed_count = 0

      query_count.times do
        conn.async_exec("SELECT 1 + 2")
      rescue
        failed_count += 1
      end

      expect(failed_count).to eq(0)

      # Under LOQ, we expect replicas running the slow pg_sleep
      # to get no selects
      expect(
        processes.
          all_databases.
          map(&:count_select_1_plus_2).
          count { |instance_share| instance_share == 0 }
      ).to eq(slow_query_count)

      # We also expect the quick queries to be spread across
      # the idle servers only
      processes.
        all_databases.
        map(&:count_select_1_plus_2).
        reject { |instance_share| instance_share == 0 }.
        each do |instance_share|
          expect(instance_share).to be_within(expected_share * MARGIN_OF_ERROR).of(expected_share)
        end

      threads.map(&:join)
    end
  end

  context "when some replicas are down" do
    it "balances query volume between working instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
      # Two replicas are taken down below, so the survivors split the load.
      expected_share = QUERY_COUNT / (processes.all_databases.count - 2)
      failed_count = 0

      processes[:replicas][0].take_down do
        processes[:replicas][1].take_down do
          QUERY_COUNT.times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            # An in-flight query can die with the replica; reconnect
            # through the pooler and count the failure.
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end
        end
      end

      # Exactly one failure per downed replica is expected here.
      expect(failed_count).to eq(2)

      processes.all_databases.each do |instance|
        queries_routed = instance.count_select_1_plus_2
        if processes.replicas[0..1].include?(instance)
          # Downed replicas must receive no traffic at all.
          expect(queries_routed).to eq(0)
        else
          expect(queries_routed).to be_within(expected_share * MARGIN_OF_ERROR).of(expected_share)
        end
      end
    end
  end
end