# frozen_string_literal: true

require_relative "spec_helper"
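
# QUERY_COUNT and MARGIN_OF_ERROR are assumed to be defined in spec_helper:
# the number of queries each example issues and the tolerated deviation from
# a perfectly even split.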

describe "Random Load Balancing" do
  let(:processes) { Helpers::Pgcat.single_shard_setup("sharded_db", 5) }

  after do
    processes.all_databases.map(&:reset)
    processes.pgcat.shutdown
  end

  context("under regular circumstances") do
    it "balances query volume between all instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))

      query_count = QUERY_COUNT
      expected_share = query_count / processes.all_databases.count
      failed_count = 0
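
      # The inline rescue (allowed directly inside do/end blocks since
      # Ruby 2.6) counts a failed query without aborting the loop.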
      query_count.times do
        conn.async_exec("SELECT 1 + 2")
      rescue
        failed_count += 1
      end

      expect(failed_count).to(eq(0))
      processes.all_databases.map(&:count_select_1_plus_2).each do |instance_share|
        expect(instance_share).to(be_within(expected_share * MARGIN_OF_ERROR).of(expected_share))
      end
    end
  end

  context("when some replicas are down") do
    it "balances query volume between working instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
      expected_share = QUERY_COUNT / (processes.all_databases.count - 2)
      failed_count = 0
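
      # Two replicas are taken down, so the remaining instances should absorb
      # their share. A failed query can leave the client connection unusable,
      # hence the reconnect in the rescue branch.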
      processes[:replicas][0].take_down do
        processes[:replicas][1].take_down do
          QUERY_COUNT.times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end
        end
      end

      processes.all_databases.each do |instance|
        queries_routed = instance.count_select_1_plus_2

        if processes.replicas[0..1].include?(instance)
          expect(queries_routed).to(eq(0))
        else
          expect(queries_routed).to(be_within(expected_share * MARGIN_OF_ERROR).of(expected_share))
        end
      end
    end
  end
end

describe "Least Outstanding Queries Load Balancing" do
  let(:processes) { Helpers::Pgcat.single_shard_setup("sharded_db", 1, "transaction", "loc") }

  after do
    processes.all_databases.map(&:reset)
    processes.pgcat.shutdown
  end

  context("under homogeneous load") do
    it "balances query volume between all instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))

      query_count = QUERY_COUNT
      expected_share = query_count / processes.all_databases.count
      failed_count = 0

      query_count.times do
        conn.async_exec("SELECT 1 + 2")
      rescue
        failed_count += 1
      end

      expect(failed_count).to(eq(0))
      processes.all_databases.map(&:count_select_1_plus_2).each do |instance_share|
        expect(instance_share).to(be_within(expected_share * MARGIN_OF_ERROR).of(expected_share))
      end
    end
  end

  context("under heterogeneous load") do
    xit("balances query volume between all instances based on how busy they are") do
      slow_query_count = 2
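
      # Each thread opens a transaction and leaves it idle, pinning one
      # server connection per thread so those servers look busy to the
      # balancer.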
      threads = Array.new(slow_query_count) do
        Thread.new do
          conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
          conn.async_exec("BEGIN")
        end
      end

      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))

      query_count = QUERY_COUNT
      expected_share = query_count / (processes.all_databases.count - slow_query_count)
      failed_count = 0

      query_count.times do
        conn.async_exec("SELECT 1 + 2")
      rescue
        failed_count += 1
      end

      expect(failed_count).to(eq(0))

      # Under LOQ, we expect the replicas pinned by the idle transactions
      # above to receive no SELECTs
      expect(
        processes
          .all_databases
          .map(&:count_select_1_plus_2)
          .count { |instance_share| instance_share == 0 }
      ).to(eq(slow_query_count))

      # We also expect the quick queries to be spread across
      # the idle servers only
      processes
        .all_databases
        .map(&:count_select_1_plus_2)
        .reject { |instance_share| instance_share == 0 }
        .each do |instance_share|
          expect(instance_share).to(be_within(expected_share * MARGIN_OF_ERROR).of(expected_share))
        end

      threads.map(&:join)
    end
  end

  context("when some replicas are down") do
    it "balances query volume between working instances" do
      conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
      expected_share = QUERY_COUNT / (processes.all_databases.count - 2)
      failed_count = 0

      processes[:replicas][0].take_down do
        processes[:replicas][1].take_down do
          QUERY_COUNT.times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end
        end
      end
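
      # Presumably at most one in-flight query per downed replica fails
      # before the pooler stops routing to it.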
      expect(failed_count).to(be <= 2)

      processes.all_databases.each do |instance|
        queries_routed = instance.count_select_1_plus_2

        if processes.replicas[0..1].include?(instance)
          expect(queries_routed).to(eq(0))
        else
          expect(queries_routed).to(be_within(expected_share * MARGIN_OF_ERROR).of(expected_share))
        end
      end
    end
  end
end

describe "Candidate filtering based on `default_pool`" do
  let(:processes) do
    Helpers::Pgcat.single_shard_setup("sharded_db", 5, "transaction", "random", "debug", pool_settings)
  end

  after do
    processes.all_databases.map(&:reset)
    processes.pgcat.shutdown
  end

  context("with default_pool set to replicas") do
    context("when all replicas are down") do
      let(:pool_settings) do
        {
          "default_role" => "replica",
          "replica_to_primary_failover_enabled" => replica_to_primary_failover_enabled
        }
      end
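
      # When every replica is banned, pgcat's next move depends on whether it
      # may fail over to the primary: if not, it has to unban the replicas
      # itself or the pool would be left with no candidates at all.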

      context("with `replica_to_primary_failover_enabled` set to false") do
        let(:replica_to_primary_failover_enabled) { false }

        it(
          "unbans them automatically to prevent false positives in health checks that could make all replicas unavailable"
        ) do
          conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
          failed_count = 0
          number_of_replicas = processes[:replicas].length

          # Take down all replicas
          processes[:replicas].each(&:take_down)

          (number_of_replicas + 1).times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end

          expect(failed_count).to(eq(number_of_replicas + 1))

          failed_count = 0

          # ban_time is configured to 60, so this reset will only succeed
          # if the replicas are unbanned automatically
          processes[:replicas].each(&:reset)

          number_of_replicas.times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end

          expect(failed_count).to(eq(0))
        end
      end

      context("with `replica_to_primary_failover_enabled` set to true") do
        let(:replica_to_primary_failover_enabled) { true }

        it "does not unban them automatically" do
          conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
          failed_count = 0
          number_of_replicas = processes[:replicas].length

          # Warm up the pool first so pgcat opens connections to the replicas
          (number_of_replicas + 10).times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end
          expect(failed_count).to(eq(0))

          # Take down all replicas
          processes[:replicas].each(&:take_down)

          (number_of_replicas + 10).times do
            conn.async_exec("SELECT 1 + 2")
          rescue
            conn = PG.connect(processes.pgcat.connection_string("sharded_db", "sharding_user"))
            failed_count += 1
          end
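
          # One failure per replica is expected: each banned replica fails a
          # single query before traffic fails over to the primary, and the
          # ban is never lifted automatically in this mode.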
          expect(failed_count).to(eq(number_of_replicas))
        end
      end
    end
  end
end