From a127e8faceb0e984ef53415e7af006e843840462 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 24 Aug 2017 12:58:36 +0900 Subject: [PATCH] Provisionally add view "repmgr.replication_status" --- README.md | 50 +++++++++++++++++++++++++++++++++++-- repmgr--4.0.sql | 34 +++++++++++++++++++------ repmgr--unpackaged--4.0.sql | 32 +++++++++++------------- 3 files changed, 90 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 4abd97b8..fbdc1c11 100644 --- a/README.md +++ b/README.md @@ -115,8 +115,8 @@ tables: - `repmgr.events`: records events of interest - `repmgr.nodes`: connection and status information for each server in the replication cluster - - `repmgr.monitor`: historical standby monitoring information written by `repmgrd` - XXX not yet implemented + - `repmgr.monitoring_history`: historical standby monitoring information written by `repmgrd` + views: - `repmgr.show_nodes`: based on the table `repl_nodes`, additionally showing the @@ -323,6 +323,52 @@ The following replication settings may need to be adjusted: # wal_keep_segments = 5000 +### Monitoring with `repmgrd` + +When `repmgrd` is running with the option `monitoring_history=true`, it will +constantly write standby node status information to the `monitoring_history` +table, providing a near-real time overview of replication status on all nodes +in the cluster. 
+ +The view `replication_status` shows the most recent state for each node, e.g.: + + repmgr=# SELECT * FROM repmgr.replication_status; + -[ RECORD 1 ]-------------+----------------------------- + primary_node_id | 1 + standby_node_id | 2 + standby_name | node2 + node_type | standby + active | t + last_monitor_time | 2016-01-05 14:02:34.51713+09 + last_wal_primary_location | 0/3012AF0 + last_wal_standby_location | 0/3012AF0 + replication_lag | 0 bytes + replication_time_lag | 00:00:03.463085 + apply_lag | 0 bytes + communication_time_lag | 00:00:00.955385 + +The interval at which monitoring history is written is controlled by the +configuration parameter `monitor_interval_secs`; default is 2. + +As this can generate a large amount of monitoring data in the `monitoring_history` +table, it's advisable to regularly purge historical data with +`repmgr cluster cleanup`; use the `-k/--keep-history` option to specify how +many days' worth of data should be retained. *XXX not yet implemented* + +It's possible to use `repmgrd` to provide monitoring only for some or all +nodes by setting `failover=manual` in the node's `repmgr.conf` file. In the +event of the node's upstream failing, no failover action will be taken +and the node will require manual intervention to be reattached to replication. +If this occurs, an event notification `standby_disconnect_manual` will be +created. + +Note that when a standby node is not streaming directly from its upstream +node, e.g. recovering WAL from an archive, `apply_lag` will always appear as +`0 bytes`. + +XXX ALTER TABLE monitoring_history SET UNLOGGED ; + + Reference --------- diff --git a/repmgr--4.0.sql b/repmgr--4.0.sql index 405cb4f8..b24c82b6 100644 --- a/repmgr--4.0.sql +++ b/repmgr--4.0.sql @@ -1,7 +1,7 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION repmgr" to load this file. 
\quit -CREATE TABLE nodes ( +CREATE TABLE repmgr.nodes ( node_id INTEGER PRIMARY KEY, upstream_node_id INTEGER NULL REFERENCES nodes (node_id) DEFERRABLE, active BOOLEAN NOT NULL DEFAULT TRUE, @@ -15,7 +15,7 @@ CREATE TABLE nodes ( config_file TEXT NOT NULL ); -CREATE TABLE events ( +CREATE TABLE repmgr.events ( node_id INTEGER NOT NULL, event TEXT NOT NULL, successful BOOLEAN NOT NULL DEFAULT TRUE, @@ -23,7 +23,7 @@ CREATE TABLE events ( details TEXT NULL ); -CREATE TABLE monitoring_history ( +CREATE TABLE repmgr.monitoring_history ( primary_node_id INTEGER NOT NULL, standby_node_id INTEGER NOT NULL, last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL, @@ -34,9 +34,9 @@ CREATE TABLE monitoring_history ( apply_lag BIGINT NOT NULL ); CREATE INDEX idx_monitoring_history_time - ON monitoring_history (last_monitor_time, standby_node_id); + ON repmgr.monitoring_history (last_monitor_time, standby_node_id); -CREATE VIEW show_nodes AS +CREATE VIEW repmgr.show_nodes AS SELECT n.node_id, n.node_name, n.active, @@ -45,10 +45,30 @@ CREATE VIEW show_nodes AS n.type, n.priority, n.conninfo - FROM nodes n -LEFT JOIN nodes un + FROM repmgr.nodes n +LEFT JOIN repmgr.nodes un ON un.node_id = n.upstream_node_id; +-- repmgr.repmgr_get_last_updated() -- NOTE(review): purpose of this reference is unclear (confirm); the view below returns, per standby node, the monitoring_history row(s) with the newest last_monitor_time +CREATE VIEW repmgr.replication_status AS + SELECT m.primary_node_id, m.standby_node_id, n.node_name AS standby_name, + n.type AS node_type, n.active, m.last_monitor_time, + CASE WHEN n.type='standby' THEN m.last_wal_primary_location ELSE NULL END AS last_wal_primary_location, + m.last_wal_standby_location, + CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.replication_lag) ELSE NULL END AS replication_lag, + CASE WHEN n.type='standby' THEN + CASE WHEN m.replication_lag > 0 THEN age(now(), m.last_apply_time) ELSE '0'::INTERVAL END + ELSE NULL + END AS replication_time_lag, + CASE WHEN n.type='standby' THEN pg_catalog.pg_size_pretty(m.apply_lag) ELSE NULL END AS apply_lag, + AGE(NOW(), CASE WHEN pg_catalog.pg_is_in_recovery() THEN 
NOW() ELSE m.last_monitor_time END) AS communication_time_lag + FROM repmgr.monitoring_history m + JOIN repmgr.nodes n ON m.standby_node_id = n.node_id + WHERE (m.standby_node_id, m.last_monitor_time) IN ( + SELECT m1.standby_node_id, MAX(m1.last_monitor_time) + FROM repmgr.monitoring_history m1 GROUP BY m1.standby_node_id + ); + /* repmgrd functions */ CREATE FUNCTION request_vote(INT,INT) diff --git a/repmgr--unpackaged--4.0.sql b/repmgr--unpackaged--4.0.sql index 9012e109..1c06783c 100644 --- a/repmgr--unpackaged--4.0.sql +++ b/repmgr--unpackaged--4.0.sql @@ -27,7 +27,7 @@ BEGIN END$repmgr$; -- convert "repmgr_$cluster.repl_nodes" to "repmgr.nodes" -CREATE TABLE nodes ( +CREATE TABLE repmgr.nodes ( node_id INTEGER PRIMARY KEY, upstream_node_id INTEGER NULL REFERENCES repmgr.nodes (node_id) DEFERRABLE, active BOOLEAN NOT NULL DEFAULT TRUE, @@ -41,22 +41,22 @@ CREATE TABLE nodes ( config_file TEXT NOT NULL ); -INSERT INTO nodes +INSERT INTO repmgr.nodes (node_id, upstream_node_id, active, node_name, type, location, priority, conninfo, repluser, slot_name, config_file) SELECT id, upstream_node_id, active, name, CASE WHEN type = 'master' THEN 'primary' ELSE type END, 'default', priority, conninfo, 'unknown', slot_name, 'unknown' - FROM repl_nodes + FROM repmgr.repl_nodes ORDER BY id; -- convert "repmgr_$cluster.repl_event" to "event" -ALTER TABLE repl_events RENAME TO events; +ALTER TABLE repmgr.repl_events RENAME TO events; -- convert "repmgr_$cluster.repl_monitor" to "monitoring_history" -CREATE TABLE monitoring_history ( +CREATE TABLE repmgr.monitoring_history ( primary_node_id INTEGER NOT NULL, standby_node_id INTEGER NOT NULL, last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL, @@ -67,20 +67,20 @@ CREATE TABLE monitoring_history ( apply_lag BIGINT NOT NULL ); -INSERT INTO monitoring_history +INSERT INTO repmgr.monitoring_history (primary_node_id, standby_node_id, last_monitor_time, last_apply_time, last_wal_primary_location, last_wal_standby_location, replication_lag, 
apply_lag) SELECT primary_node_id, standby_node_id, last_monitor_time, last_apply_time, last_wal_primary_location, last_wal_standby_location, replication_lag, apply_lag - FROM repl_monitor; + FROM repmgr.repl_monitor; CREATE INDEX idx_monitoring_history_time - ON monitoring_history (last_monitor_time, standby_node_id); + ON repmgr.monitoring_history (last_monitor_time, standby_node_id); -- recreate VIEW DROP VIEW IF EXISTS repl_show_nodes; -CREATE VIEW show_nodes AS +CREATE VIEW repmgr.show_nodes AS SELECT n.node_id, n.node_name, n.active, @@ -89,17 +89,17 @@ CREATE VIEW show_nodes AS n.type, n.priority, n.conninfo - FROM nodes n -LEFT JOIN nodes un + FROM repmgr.nodes n +LEFT JOIN repmgr.nodes un ON un.node_id = n.upstream_node_id; -DROP VIEW IF EXISTS repl_status; +DROP VIEW IF EXISTS repmgr.repl_status; --- CREATE VIEW status ... ; +-- XXX CREATE VIEW repmgr.replication_status ... ; /* drop old tables */ -DROP TABLE repl_nodes; -DROP TABLE repl_monitor; +DROP TABLE repmgr.repl_nodes; +DROP TABLE repmgr.repl_monitor; /* repmgrd functions */ @@ -139,13 +139,11 @@ CREATE FUNCTION reset_voting_status() AS '$libdir/repmgr', 'reset_voting_status' LANGUAGE C STRICT; - CREATE FUNCTION am_bdr_failover_handler(INT) RETURNS BOOL AS '$libdir/repmgr', 'am_bdr_failover_handler' LANGUAGE C STRICT; - CREATE FUNCTION unset_bdr_failover_handler() RETURNS VOID AS '$libdir/repmgr', 'unset_bdr_failover_handler'