Provisionally add view "repmgr.replication_status"

This commit is contained in:
Ian Barwick
2017-08-24 12:58:36 +09:00
parent a659132ea4
commit a127e8face
3 changed files with 90 additions and 26 deletions

View File

@@ -115,8 +115,8 @@ tables:
- `repmgr.events`: records events of interest
- `repmgr.nodes`: connection and status information for each server in the
replication cluster
- `repmgr.monitor`: historical standby monitoring information written by `repmgrd`
XXX not yet implemented
- `repmgr.monitoring_history`: historical standby monitoring information written by `repmgrd`
views:
- `repmgr.show_nodes`: based on the table `repmgr.nodes`, additionally showing the
@@ -323,6 +323,52 @@ The following replication settings may need to be adjusted:
# wal_keep_segments = 5000
### Monitoring with `repmgrd`
When `repmgrd` is running with the option `monitoring_history=true`, it will
constantly write standby node status information to the `monitoring_history`
table, providing a near-real-time overview of replication status on all nodes
in the cluster.
The view `replication_status` shows the most recent state for each node, e.g.:
repmgr=# SELECT * FROM repmgr.replication_status;
-[ RECORD 1 ]-------------+-----------------------------
primary_node | 1
standby_node | 2
standby_name | node2
node_type | standby
active | t
last_monitor_time | 2016-01-05 14:02:34.51713+09
last_wal_primary_location | 0/3012AF0
last_wal_standby_location | 0/3012AF0
replication_lag | 0 bytes
replication_time_lag | 00:00:03.463085
apply_lag | 0 bytes
communication_time_lag | 00:00:00.955385
The interval in which monitoring history is written is controlled by the
configuration parameter `monitor_interval_secs`; default is 2.
As this can generate a large amount of monitoring data in the `monitoring_history`
table, it's advisable to regularly purge historical data with
`repmgr cluster cleanup`; use the `-k/--keep-history` option to specify how
many days' worth of data should be retained. *XXX not yet implemented*
It's possible to use `repmgrd` to provide monitoring only for some or all
nodes by setting `failover=manual` in the node's `repmgr.conf` file. In the
event of the node's upstream failing, no failover action will be taken
and the node will require manual intervention to be reattached to replication.
If this occurs, an event notification `standby_disconnect_manual` will be
created.
Note that when a standby node is not streaming directly from its upstream
node, e.g. recovering WAL from an archive, `apply_lag` will always appear as
`0 bytes`.
XXX ALTER TABLE monitoring_history SET UNLOGGED ;
Reference
---------

View File

@@ -1,7 +1,7 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION repmgr" to load this file. \quit
CREATE TABLE nodes (
CREATE TABLE repmgr.nodes (
node_id INTEGER PRIMARY KEY,
upstream_node_id INTEGER NULL REFERENCES nodes (node_id) DEFERRABLE,
active BOOLEAN NOT NULL DEFAULT TRUE,
@@ -15,7 +15,7 @@ CREATE TABLE nodes (
config_file TEXT NOT NULL
);
CREATE TABLE events (
CREATE TABLE repmgr.events (
node_id INTEGER NOT NULL,
event TEXT NOT NULL,
successful BOOLEAN NOT NULL DEFAULT TRUE,
@@ -23,7 +23,7 @@ CREATE TABLE events (
details TEXT NULL
);
CREATE TABLE monitoring_history (
CREATE TABLE repmgr.monitoring_history (
primary_node_id INTEGER NOT NULL,
standby_node_id INTEGER NOT NULL,
last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL,
@@ -34,9 +34,9 @@ CREATE TABLE monitoring_history (
apply_lag BIGINT NOT NULL
);
CREATE INDEX idx_monitoring_history_time
ON monitoring_history (last_monitor_time, standby_node_id);
ON repmgr.monitoring_history (last_monitor_time, standby_node_id);
CREATE VIEW show_nodes AS
CREATE VIEW repmgr.show_nodes AS
SELECT n.node_id,
n.node_name,
n.active,
@@ -45,10 +45,30 @@ CREATE VIEW show_nodes AS
n.type,
n.priority,
n.conninfo
FROM nodes n
LEFT JOIN nodes un
FROM repmgr.nodes n
LEFT JOIN repmgr.nodes un
ON un.node_id = n.upstream_node_id;
-- repmgr.repmgr_get_last_updated()
-- View "repmgr.replication_status": for every standby_node_id recorded in
-- repmgr.monitoring_history, expose the row(s) carrying that node's latest
-- last_monitor_time, joined to the node's entry in repmgr.nodes.
--
-- The WAL primary location and the three lag columns (replication_lag,
-- replication_time_lag, apply_lag) are only populated when the node's type
-- is 'standby'; for any other node type they are NULL.
CREATE VIEW repmgr.replication_status AS
SELECT
    m.primary_node_id,
    m.standby_node_id,
    n.node_name AS standby_name,
    n.type AS node_type,
    n.active,
    m.last_monitor_time,
    CASE
        WHEN n.type = 'standby' THEN m.last_wal_primary_location
    END AS last_wal_primary_location,
    m.last_wal_standby_location,
    -- byte counts are rendered human-readable, e.g. "0 bytes"
    CASE
        WHEN n.type = 'standby' THEN pg_catalog.pg_size_pretty(m.replication_lag)
    END AS replication_lag,
    -- time lag is only computed while there is outstanding byte lag;
    -- otherwise a standby reports a zero interval
    CASE
        WHEN n.type = 'standby' AND m.replication_lag > 0
            THEN age(now(), m.last_apply_time)
        WHEN n.type = 'standby'
            THEN CAST('0' AS INTERVAL)
    END AS replication_time_lag,
    CASE
        WHEN n.type = 'standby' THEN pg_catalog.pg_size_pretty(m.apply_lag)
    END AS apply_lag,
    -- when the executing server is in recovery the reference time is now()
    -- itself, so the reported lag is zero there; otherwise it is the time
    -- elapsed since the last monitoring record was written
    age(
        now(),
        CASE
            WHEN pg_catalog.pg_is_in_recovery() THEN now()
            ELSE m.last_monitor_time
        END
    ) AS communication_time_lag
FROM repmgr.monitoring_history AS m
INNER JOIN repmgr.nodes AS n
    ON n.node_id = m.standby_node_id
-- keep only the newest monitoring record per standby node
WHERE (m.standby_node_id, m.last_monitor_time) IN (
    SELECT
        m1.standby_node_id,
        MAX(m1.last_monitor_time)
    FROM repmgr.monitoring_history AS m1
    GROUP BY m1.standby_node_id
);
/* repmgrd functions */
CREATE FUNCTION request_vote(INT,INT)

View File

@@ -27,7 +27,7 @@ BEGIN
END$repmgr$;
-- convert "repmgr_$cluster.repl_nodes" to "repmgr.nodes"
CREATE TABLE nodes (
CREATE TABLE repmgr.nodes (
node_id INTEGER PRIMARY KEY,
upstream_node_id INTEGER NULL REFERENCES repmgr.nodes (node_id) DEFERRABLE,
active BOOLEAN NOT NULL DEFAULT TRUE,
@@ -41,22 +41,22 @@ CREATE TABLE nodes (
config_file TEXT NOT NULL
);
INSERT INTO nodes
INSERT INTO repmgr.nodes
(node_id, upstream_node_id, active, node_name, type, location, priority, conninfo, repluser, slot_name, config_file)
SELECT id, upstream_node_id, active, name,
CASE WHEN type = 'master' THEN 'primary' ELSE type END,
'default', priority, conninfo, 'unknown', slot_name, 'unknown'
FROM repl_nodes
FROM repmgr.repl_nodes
ORDER BY id;
-- convert "repmgr_$cluster.repl_events" to "repmgr.events"
ALTER TABLE repl_events RENAME TO events;
-- RENAME TO accepts only a bare table name (a schema-qualified target is a
-- syntax error); the renamed table remains in schema "repmgr".
ALTER TABLE repmgr.repl_events RENAME TO events;
-- convert "repmgr_$cluster.repl_monitor" to "monitoring_history"
CREATE TABLE monitoring_history (
CREATE TABLE repmgr.monitoring_history (
primary_node_id INTEGER NOT NULL,
standby_node_id INTEGER NOT NULL,
last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL,
@@ -67,20 +67,20 @@ CREATE TABLE monitoring_history (
apply_lag BIGINT NOT NULL
);
INSERT INTO monitoring_history
INSERT INTO repmgr.monitoring_history
(primary_node_id, standby_node_id, last_monitor_time, last_apply_time, last_wal_primary_location, last_wal_standby_location, replication_lag, apply_lag)
SELECT primary_node_id, standby_node_id, last_monitor_time, last_apply_time, last_wal_primary_location, last_wal_standby_location, replication_lag, apply_lag
FROM repl_monitor;
FROM repmgr.repl_monitor;
CREATE INDEX idx_monitoring_history_time
ON monitoring_history (last_monitor_time, standby_node_id);
ON repmgr.monitoring_history (last_monitor_time, standby_node_id);
-- recreate VIEW
DROP VIEW IF EXISTS repl_show_nodes;
CREATE VIEW show_nodes AS
CREATE VIEW repmgr.show_nodes AS
SELECT n.node_id,
n.node_name,
n.active,
@@ -89,17 +89,17 @@ CREATE VIEW show_nodes AS
n.type,
n.priority,
n.conninfo
FROM nodes n
LEFT JOIN nodes un
FROM repmgr.nodes n
LEFT JOIN repmgr.nodes un
ON un.node_id = n.upstream_node_id;
DROP VIEW IF EXISTS repl_status;
DROP VIEW IF EXISTS repmgr.repl_status;
-- CREATE VIEW status ... ;
-- XXX CREATE VIEW repmgr.replication_status ... ;
/* drop old tables */
DROP TABLE repl_nodes;
DROP TABLE repl_monitor;
DROP TABLE repmgr.repl_nodes;
DROP TABLE repmgr.repl_monitor;
/* repmgrd functions */
@@ -139,13 +139,11 @@ CREATE FUNCTION reset_voting_status()
AS '$libdir/repmgr', 'reset_voting_status'
LANGUAGE C STRICT;
CREATE FUNCTION am_bdr_failover_handler(INT)
RETURNS BOOL
AS '$libdir/repmgr', 'am_bdr_failover_handler'
LANGUAGE C STRICT;
CREATE FUNCTION unset_bdr_failover_handler()
RETURNS VOID
AS '$libdir/repmgr', 'unset_bdr_failover_handler'