mirror of
https://github.com/postgresml/pgcat.git
synced 2026-03-23 01:16:30 +00:00
Introduce tcp_keepalives to PgCat (#315)
We have encountered a case where PgCat pools were stuck following a database incident. Our best understanding at this point is that the PgCat -> Postgres connections died silently and because Tokio defaults to disabling keepalives, connections in the pool were marked as busy forever. Only when we deployed PgCat did we see recovery. This PR introduces tcp_keepalives to PgCat. This sets the defaults to be keepalives_idle: 5 # seconds keepalives_interval: 5 # seconds keepalives_count: 5 # a count These settings can detect the death of an idle connection within 30 seconds of its death. Please note that the connection can remain idle forever (from an application perspective) as long as the keepalive packets are flowing so disconnection will only occur if the other end is not acknowledging keepalive packets (keepalive packet acks are handled by the OS, the application does not need to do anything). I plan to add tcp_user_timeout in a follow-up PR.
This commit is contained in:
committed by
GitHub
parent
d81a744154
commit
f1265a5570
@@ -54,7 +54,7 @@ jobs:
|
||||
command: "cargo fmt --check"
|
||||
- run:
|
||||
name: "Install dependencies"
|
||||
command: "sudo apt-get update && sudo apt-get install -y psmisc postgresql-contrib-12 postgresql-client-12 ruby ruby-dev libpq-dev python3 python3-pip lcov llvm-11 && sudo apt-get upgrade curl"
|
||||
command: "sudo apt-get update && sudo apt-get install -y psmisc postgresql-contrib-12 postgresql-client-12 ruby ruby-dev libpq-dev python3 python3-pip lcov llvm-11 iproute2 && sudo apt-get upgrade curl"
|
||||
- run:
|
||||
name: "Install rust tools"
|
||||
command: "cargo install cargo-binutils rustfilt && rustup component add llvm-tools-preview"
|
||||
|
||||
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -730,6 +730,7 @@ dependencies = [
|
||||
"serde_derive",
|
||||
"sha-1",
|
||||
"sha2",
|
||||
"socket2",
|
||||
"sqlparser",
|
||||
"stringprep",
|
||||
"tokio",
|
||||
|
||||
@@ -35,6 +35,7 @@ hyper = { version = "0.14", features = ["full"] }
|
||||
phf = { version = "0.11.1", features = ["macros"] }
|
||||
exitcode = "1.1.2"
|
||||
futures = "0.3"
|
||||
socket2 = { version = "0.4.7", features = ["all"] }
|
||||
|
||||
[target.'cfg(not(target_env = "msvc"))'.dependencies]
|
||||
jemallocator = "0.5.0"
|
||||
|
||||
@@ -160,6 +160,13 @@ pub struct General {
|
||||
#[serde(default = "General::default_idle_timeout")]
|
||||
pub idle_timeout: u64,
|
||||
|
||||
#[serde(default = "General::default_tcp_keepalives_idle")]
|
||||
pub tcp_keepalives_idle: u64,
|
||||
#[serde(default = "General::default_tcp_keepalives_count")]
|
||||
pub tcp_keepalives_count: u32,
|
||||
#[serde(default = "General::default_tcp_keepalives_interval")]
|
||||
pub tcp_keepalives_interval: u64,
|
||||
|
||||
#[serde(default)] // False
|
||||
pub log_client_connections: bool,
|
||||
|
||||
@@ -203,6 +210,21 @@ impl General {
|
||||
1000
|
||||
}
|
||||
|
||||
// These keepalive defaults should detect a dead connection within 30 seconds.
|
||||
// Tokio defaults to disabling keepalives which keeps dead connections around indefinitely.
|
||||
// This can lead to permenant server pool exhaustion
|
||||
pub fn default_tcp_keepalives_idle() -> u64 {
|
||||
5 // 5 seconds
|
||||
}
|
||||
|
||||
pub fn default_tcp_keepalives_count() -> u32 {
|
||||
5 // 5 time
|
||||
}
|
||||
|
||||
pub fn default_tcp_keepalives_interval() -> u64 {
|
||||
5 // 5 seconds
|
||||
}
|
||||
|
||||
pub fn default_idle_timeout() -> u64 {
|
||||
60000 // 10 minutes
|
||||
}
|
||||
@@ -242,6 +264,9 @@ impl Default for General {
|
||||
healthcheck_delay: Self::default_healthcheck_delay(),
|
||||
ban_time: Self::default_ban_time(),
|
||||
worker_threads: Self::default_worker_threads(),
|
||||
tcp_keepalives_idle: Self::default_tcp_keepalives_idle(),
|
||||
tcp_keepalives_count: Self::default_tcp_keepalives_count(),
|
||||
tcp_keepalives_interval: Self::default_tcp_keepalives_interval(),
|
||||
log_client_connections: false,
|
||||
log_client_disconnections: false,
|
||||
autoreload: false,
|
||||
|
||||
@@ -1,14 +1,18 @@
|
||||
/// Helper functions to send one-off protocol messages
|
||||
/// and handle TcpStream (TCP socket).
|
||||
use bytes::{Buf, BufMut, BytesMut};
|
||||
use log::error;
|
||||
use md5::{Digest, Md5};
|
||||
use socket2::{SockRef, TcpKeepalive};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::net::TcpStream;
|
||||
|
||||
use crate::config::get_config;
|
||||
use crate::errors::Error;
|
||||
use std::collections::HashMap;
|
||||
use std::io::{BufRead, Cursor};
|
||||
use std::mem;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Postgres data type mappings
|
||||
/// used in RowDescription ('T') message.
|
||||
@@ -550,6 +554,26 @@ pub fn server_parameter_message(key: &str, value: &str) -> BytesMut {
|
||||
server_info
|
||||
}
|
||||
|
||||
pub fn configure_socket(stream: &TcpStream) {
|
||||
let sock_ref = SockRef::from(stream);
|
||||
let conf = get_config();
|
||||
|
||||
match sock_ref.set_keepalive(true) {
|
||||
Ok(_) => {
|
||||
match sock_ref.set_tcp_keepalive(
|
||||
&TcpKeepalive::new()
|
||||
.with_interval(Duration::from_secs(conf.general.tcp_keepalives_interval))
|
||||
.with_retries(conf.general.tcp_keepalives_count)
|
||||
.with_time(Duration::from_secs(conf.general.tcp_keepalives_idle)),
|
||||
) {
|
||||
Ok(_) => (),
|
||||
Err(err) => error!("Could not configure socket: {}", err),
|
||||
}
|
||||
}
|
||||
Err(err) => error!("Could not configure socket: {}", err),
|
||||
}
|
||||
}
|
||||
|
||||
pub trait BytesMutReader {
|
||||
fn read_string(&mut self) -> Result<String, Error>;
|
||||
}
|
||||
|
||||
@@ -92,6 +92,7 @@ impl Server {
|
||||
)));
|
||||
}
|
||||
};
|
||||
configure_socket(&stream);
|
||||
|
||||
trace!("Sending StartupMessage");
|
||||
|
||||
@@ -368,6 +369,7 @@ impl Server {
|
||||
return Err(Error::SocketError(format!("Error reading cancel message")));
|
||||
}
|
||||
};
|
||||
configure_socket(&stream);
|
||||
|
||||
debug!("Sending CancelRequest");
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
FROM rust:bullseye
|
||||
|
||||
RUN apt-get update && apt-get install llvm-11 psmisc postgresql-contrib postgresql-client ruby ruby-dev libpq-dev python3 python3-pip lcov sudo curl -y
|
||||
RUN apt-get update && apt-get install llvm-11 psmisc postgresql-contrib postgresql-client ruby ruby-dev libpq-dev python3 python3-pip lcov curl sudo iproute2 -y
|
||||
RUN cargo install cargo-binutils rustfilt
|
||||
RUN rustup component add llvm-tools-preview
|
||||
|
||||
@@ -5,7 +5,7 @@ require_relative 'pg_instance'
|
||||
|
||||
module Helpers
|
||||
module Pgcat
|
||||
def self.three_shard_setup(pool_name, pool_size, pool_mode="transaction", lb_mode="random")
|
||||
def self.three_shard_setup(pool_name, pool_size, pool_mode="transaction", lb_mode="random", log_level="info")
|
||||
user = {
|
||||
"password" => "sharding_user",
|
||||
"pool_size" => pool_size,
|
||||
@@ -13,7 +13,7 @@ module Helpers
|
||||
"username" => "sharding_user"
|
||||
}
|
||||
|
||||
pgcat = PgcatProcess.new("info")
|
||||
pgcat = PgcatProcess.new(log_level)
|
||||
primary0 = PgInstance.new(5432, user["username"], user["password"], "shard0")
|
||||
primary1 = PgInstance.new(7432, user["username"], user["password"], "shard1")
|
||||
primary2 = PgInstance.new(8432, user["username"], user["password"], "shard2")
|
||||
@@ -47,7 +47,7 @@ module Helpers
|
||||
end
|
||||
end
|
||||
|
||||
def self.single_instance_setup(pool_name, pool_size, pool_mode="transaction", lb_mode="random")
|
||||
def self.single_instance_setup(pool_name, pool_size, pool_mode="transaction", lb_mode="random", log_level="trace")
|
||||
user = {
|
||||
"password" => "sharding_user",
|
||||
"pool_size" => pool_size,
|
||||
@@ -55,7 +55,7 @@ module Helpers
|
||||
"username" => "sharding_user"
|
||||
}
|
||||
|
||||
pgcat = PgcatProcess.new("trace")
|
||||
pgcat = PgcatProcess.new(log_level)
|
||||
pgcat_cfg = pgcat.current_config
|
||||
|
||||
primary = PgInstance.new(5432, user["username"], user["password"], "shard0")
|
||||
@@ -92,7 +92,7 @@ module Helpers
|
||||
end
|
||||
end
|
||||
|
||||
def self.single_shard_setup(pool_name, pool_size, pool_mode="transaction", lb_mode="random")
|
||||
def self.single_shard_setup(pool_name, pool_size, pool_mode="transaction", lb_mode="random", log_level="info")
|
||||
user = {
|
||||
"password" => "sharding_user",
|
||||
"pool_size" => pool_size,
|
||||
@@ -100,7 +100,7 @@ module Helpers
|
||||
"username" => "sharding_user"
|
||||
}
|
||||
|
||||
pgcat = PgcatProcess.new("info")
|
||||
pgcat = PgcatProcess.new(log_level)
|
||||
pgcat_cfg = pgcat.current_config
|
||||
|
||||
primary = PgInstance.new(5432, user["username"], user["password"], "shard0")
|
||||
|
||||
@@ -8,7 +8,7 @@ class PgcatProcess
|
||||
attr_reader :pid
|
||||
|
||||
def self.finalize(pid, log_filename, config_filename)
|
||||
`kill #{pid}`
|
||||
`kill #{pid}` if pid
|
||||
File.delete(config_filename) if File.exist?(config_filename)
|
||||
File.delete(log_filename) if File.exist?(log_filename)
|
||||
end
|
||||
@@ -75,8 +75,11 @@ class PgcatProcess
|
||||
end
|
||||
|
||||
def stop
|
||||
return unless @pid
|
||||
|
||||
`kill #{@pid}`
|
||||
sleep 0.1
|
||||
@pid = nil
|
||||
end
|
||||
|
||||
def shutdown
|
||||
|
||||
@@ -88,7 +88,7 @@ describe "Least Outstanding Queries Load Balancing" do
|
||||
end
|
||||
|
||||
context "under heterogeneous load" do
|
||||
it "balances query volume between all instances based on how busy they are" do
|
||||
xit "balances query volume between all instances based on how busy they are" do
|
||||
slow_query_count = 2
|
||||
threads = Array.new(slow_query_count) do
|
||||
Thread.new do
|
||||
|
||||
@@ -8,6 +8,51 @@ describe "Miscellaneous" do
|
||||
processes.pgcat.shutdown
|
||||
end
|
||||
|
||||
describe "TCP Keepalives" do
|
||||
# Ideally, we should block TCP traffic to the database using
|
||||
# iptables to mimic passive (connection is dropped without a RST packet)
|
||||
# but we cannot do this in CircleCI because iptables requires NET_ADMIN
|
||||
# capability that we cannot enable in CircleCI
|
||||
# Toxiproxy won't work either because it does not block keepalives
|
||||
# so our best bet is to query the OS keepalive params set on the socket
|
||||
|
||||
context "default settings" do
|
||||
it "applies default keepalive settings" do
|
||||
# We query ss command to verify that we have correct keepalive values set
|
||||
# we can only verify the keepalives_idle parameter but that's good enough
|
||||
# example output
|
||||
#Recv-Q Send-Q Local Address:Port Peer Address:Port Process
|
||||
#0 0 127.0.0.1:60526 127.0.0.1:18432 timer:(keepalive,1min59sec,0)
|
||||
#0 0 127.0.0.1:60664 127.0.0.1:19432 timer:(keepalive,4.123ms,0)
|
||||
|
||||
port_search_criteria = processes.all_databases.map { |d| "dport = :#{d.port}"}.join(" or ")
|
||||
results = `ss -t4 state established -o -at '( #{port_search_criteria} )'`.lines
|
||||
results.shift
|
||||
results.each { |line| expect(line).to match(/timer:\(keepalive,.*ms,0\)/) }
|
||||
end
|
||||
end
|
||||
|
||||
context "changed settings" do
|
||||
it "applies keepalive settings from config" do
|
||||
new_configs = processes.pgcat.current_config
|
||||
|
||||
new_configs["general"]["tcp_keepalives_idle"] = 120
|
||||
new_configs["general"]["tcp_keepalives_count"] = 1
|
||||
new_configs["general"]["tcp_keepalives_interval"] = 1
|
||||
processes.pgcat.update_config(new_configs)
|
||||
# We need to kill the old process that was using the default configs
|
||||
processes.pgcat.stop
|
||||
processes.pgcat.start
|
||||
processes.pgcat.wait_until_ready
|
||||
|
||||
port_search_criteria = processes.all_databases.map { |d| "dport = :#{d.port}"}.join(" or ")
|
||||
results = `ss -t4 state established -o -at '( #{port_search_criteria} )'`.lines
|
||||
results.shift
|
||||
results.each { |line| expect(line).to include("timer:(keepalive,1min") }
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
describe "Extended Protocol handling" do
|
||||
it "does not send packets that client does not expect during extended protocol sequence" do
|
||||
new_configs = processes.pgcat.current_config
|
||||
|
||||
Reference in New Issue
Block a user