mirror of
https://github.com/postgresml/pgcat.git
synced 2026-03-22 17:06:29 +00:00
Improve Prometheus stats + Add Grafana dashboard (#785)
We were missing some labels on metrics generated by the Prometheus exporter so I fixed that. There are still some gaps that I want to address with respect to the metrics we track but this seems like a good start. I also created a Grafana Dashboard and exported it to JSON. It is designed with the same metric names the Prometheus exporter uses.
This commit is contained in:
committed by
GitHub
parent
4a7a6a8e7a
commit
c05129018d
@@ -268,6 +268,8 @@ psql -h 127.0.0.1 -p 6432 -d pgbouncer -c 'SHOW DATABASES'
|
||||
|
||||
Additionally, Prometheus statistics are available at `/metrics` via HTTP.
|
||||
|
||||
We also have a [basic Grafana dashboard](https://github.com/postgresml/pgcat/blob/main/grafana_dashboard.json) based on Prometheus metrics that you can import into Grafana and build on it or use it for monitoring.
|
||||
|
||||
### Live configuration reloading
|
||||
|
||||
The config can be reloaded by sending a `kill -s SIGHUP` to the process or by querying `RELOAD` to the admin database. All settings except the `host` and `port` can be reloaded without restarting the pooler, including sharding and replicas configurations.
|
||||
|
||||
2124
grafana_dashboard.json
Normal file
2124
grafana_dashboard.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -12,19 +12,30 @@ use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use crate::config::Address;
|
||||
use crate::pool::{get_all_pools, PoolIdentifier};
|
||||
use crate::stats::get_server_stats;
|
||||
use crate::stats::pool::PoolStats;
|
||||
use crate::stats::{get_server_stats, ServerStats};
|
||||
|
||||
struct MetricHelpType {
|
||||
help: &'static str,
|
||||
ty: &'static str,
|
||||
}
|
||||
|
||||
struct ServerPrometheusStats {
|
||||
bytes_received: u64,
|
||||
bytes_sent: u64,
|
||||
transaction_count: u64,
|
||||
query_count: u64,
|
||||
error_count: u64,
|
||||
active_count: u64,
|
||||
idle_count: u64,
|
||||
login_count: u64,
|
||||
tested_count: u64,
|
||||
}
|
||||
|
||||
// reference for metric types: https://prometheus.io/docs/concepts/metric_types/
|
||||
// counters only increase
|
||||
// gauges can arbitrarily increase or decrease
|
||||
@@ -127,22 +138,46 @@ static METRIC_HELP_AND_TYPES_LOOKUP: phf::Map<&'static str, MetricHelpType> = ph
|
||||
},
|
||||
"servers_bytes_received" => MetricHelpType {
|
||||
help: "Volume in bytes of network traffic received by server",
|
||||
ty: "gauge",
|
||||
ty: "counter",
|
||||
},
|
||||
"servers_bytes_sent" => MetricHelpType {
|
||||
help: "Volume in bytes of network traffic sent by server",
|
||||
ty: "gauge",
|
||||
ty: "counter",
|
||||
},
|
||||
"servers_transaction_count" => MetricHelpType {
|
||||
help: "Number of transactions executed by server",
|
||||
ty: "gauge",
|
||||
ty: "counter",
|
||||
},
|
||||
"servers_query_count" => MetricHelpType {
|
||||
help: "Number of queries executed by server",
|
||||
ty: "gauge",
|
||||
ty: "counter",
|
||||
},
|
||||
"servers_error_count" => MetricHelpType {
|
||||
help: "Number of errors",
|
||||
ty: "counter",
|
||||
},
|
||||
"servers_idle_count" => MetricHelpType {
|
||||
help: "Number of server connection in idle state",
|
||||
ty: "gauge",
|
||||
},
|
||||
"servers_active_count" => MetricHelpType {
|
||||
help: "Number of server connection in active state",
|
||||
ty: "gauge",
|
||||
},
|
||||
"servers_tested_count" => MetricHelpType {
|
||||
help: "Number of server connection in tested state",
|
||||
ty: "gauge",
|
||||
},
|
||||
"servers_login_count" => MetricHelpType {
|
||||
help: "Number of server connection in login state",
|
||||
ty: "gauge",
|
||||
},
|
||||
"servers_is_banned" => MetricHelpType {
|
||||
help: "0 if server is not banned, 1 if server is banned",
|
||||
ty: "gauge",
|
||||
},
|
||||
"servers_is_paused" => MetricHelpType {
|
||||
help: "0 if server is not paused, 1 if server is paused",
|
||||
ty: "gauge",
|
||||
},
|
||||
"databases_pool_size" => MetricHelpType {
|
||||
@@ -210,7 +245,9 @@ impl<Value: fmt::Display> PrometheusMetric<Value> {
|
||||
labels.insert("shard", address.shard.to_string());
|
||||
labels.insert("role", address.role.to_string());
|
||||
labels.insert("pool", address.pool_name.clone());
|
||||
labels.insert("index", address.address_index.to_string());
|
||||
labels.insert("database", address.database.to_string());
|
||||
labels.insert("user", address.username.clone());
|
||||
|
||||
Self::from_name(&format!("databases_{}", name), value, labels)
|
||||
}
|
||||
@@ -225,8 +262,9 @@ impl<Value: fmt::Display> PrometheusMetric<Value> {
|
||||
labels.insert("shard", address.shard.to_string());
|
||||
labels.insert("role", address.role.to_string());
|
||||
labels.insert("pool", address.pool_name.clone());
|
||||
labels.insert("index", address.address_index.to_string());
|
||||
labels.insert("database", address.database.to_string());
|
||||
|
||||
labels.insert("user", address.username.clone());
|
||||
Self::from_name(&format!("servers_{}", name), value, labels)
|
||||
}
|
||||
|
||||
@@ -236,7 +274,9 @@ impl<Value: fmt::Display> PrometheusMetric<Value> {
|
||||
labels.insert("shard", address.shard.to_string());
|
||||
labels.insert("pool", address.pool_name.clone());
|
||||
labels.insert("role", address.role.to_string());
|
||||
labels.insert("index", address.address_index.to_string());
|
||||
labels.insert("database", address.database.to_string());
|
||||
labels.insert("user", address.username.clone());
|
||||
|
||||
Self::from_name(&format!("stats_{}", name), value, labels)
|
||||
}
|
||||
@@ -338,34 +378,51 @@ fn push_database_stats(lines: &mut Vec<String>) {
|
||||
// Adds relevant metrics shown in a SHOW SERVERS admin command.
|
||||
fn push_server_stats(lines: &mut Vec<String>) {
|
||||
let server_stats = get_server_stats();
|
||||
let mut server_stats_by_addresses = HashMap::<String, Arc<ServerStats>>::new();
|
||||
let mut prom_stats = HashMap::<String, ServerPrometheusStats>::new();
|
||||
for (_, stats) in server_stats {
|
||||
server_stats_by_addresses.insert(stats.address_name(), stats);
|
||||
let entry = prom_stats
|
||||
.entry(stats.address_name())
|
||||
.or_insert(ServerPrometheusStats {
|
||||
bytes_received: 0,
|
||||
bytes_sent: 0,
|
||||
transaction_count: 0,
|
||||
query_count: 0,
|
||||
error_count: 0,
|
||||
active_count: 0,
|
||||
idle_count: 0,
|
||||
login_count: 0,
|
||||
tested_count: 0,
|
||||
});
|
||||
entry.bytes_received += stats.bytes_received.load(Ordering::Relaxed);
|
||||
entry.bytes_sent += stats.bytes_sent.load(Ordering::Relaxed);
|
||||
entry.transaction_count += stats.transaction_count.load(Ordering::Relaxed);
|
||||
entry.query_count += stats.query_count.load(Ordering::Relaxed);
|
||||
entry.error_count += stats.error_count.load(Ordering::Relaxed);
|
||||
match stats.state.load(Ordering::Relaxed) {
|
||||
crate::stats::ServerState::Login => entry.login_count += 1,
|
||||
crate::stats::ServerState::Active => entry.active_count += 1,
|
||||
crate::stats::ServerState::Tested => entry.tested_count += 1,
|
||||
crate::stats::ServerState::Idle => entry.idle_count += 1,
|
||||
}
|
||||
}
|
||||
|
||||
for (_, pool) in get_all_pools() {
|
||||
for shard in 0..pool.shards() {
|
||||
for server in 0..pool.servers(shard) {
|
||||
let address = pool.address(shard, server);
|
||||
if let Some(server_info) = server_stats_by_addresses.get(&address.name()) {
|
||||
if let Some(server_info) = prom_stats.get(&address.name()) {
|
||||
let metrics = [
|
||||
(
|
||||
"bytes_received",
|
||||
server_info.bytes_received.load(Ordering::Relaxed),
|
||||
),
|
||||
("bytes_sent", server_info.bytes_sent.load(Ordering::Relaxed)),
|
||||
(
|
||||
"transaction_count",
|
||||
server_info.transaction_count.load(Ordering::Relaxed),
|
||||
),
|
||||
(
|
||||
"query_count",
|
||||
server_info.query_count.load(Ordering::Relaxed),
|
||||
),
|
||||
(
|
||||
"error_count",
|
||||
server_info.error_count.load(Ordering::Relaxed),
|
||||
),
|
||||
("bytes_received", server_info.bytes_received),
|
||||
("bytes_sent", server_info.bytes_sent),
|
||||
("transaction_count", server_info.transaction_count),
|
||||
("query_count", server_info.query_count),
|
||||
("error_count", server_info.error_count),
|
||||
("idle_count", server_info.idle_count),
|
||||
("active_count", server_info.active_count),
|
||||
("login_count", server_info.login_count),
|
||||
("tested_count", server_info.tested_count),
|
||||
("is_banned", if pool.is_banned(address) { 1 } else { 0 }),
|
||||
("is_paused", if pool.paused() { 1 } else { 0 }),
|
||||
];
|
||||
for (key, value) in metrics {
|
||||
if let Some(prometheus_metric) =
|
||||
|
||||
Reference in New Issue
Block a user