From 87ea7850cab3597407f211c83429682272bccdf6 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 5 Oct 2017 15:21:22 +0900 Subject: [PATCH] More updates --- doc/event-notifications.sgml | 181 ++++++++++++++++++++++++++++++++++ doc/filelist.sgml | 4 +- doc/repmgr-cluster-event.sgml | 37 +++++++ doc/repmgr.sgml | 3 + doc/repmgrd-monitoring.sgml | 71 +++++++++++++ 5 files changed, 295 insertions(+), 1 deletion(-) create mode 100644 doc/event-notifications.sgml create mode 100644 doc/repmgr-cluster-event.sgml create mode 100644 doc/repmgrd-monitoring.sgml diff --git a/doc/event-notifications.sgml b/doc/event-notifications.sgml new file mode 100644 index 00000000..46b327d3 --- /dev/null +++ b/doc/event-notifications.sgml @@ -0,0 +1,181 @@ + + Event Notifications + + Each time `repmgr` or `repmgrd` performs a significant event, a record + of that event is written into the `repmgr.events` table together with + a timestamp, an indication of failure or success, and further details + if appropriate. This is useful for gaining an overview of events + affecting the replication cluster. Note, however, that this table is + advisory in character and should be used in combination with the `repmgr` + and PostgreSQL logs to obtain details of any events. + + + Example output after a primary was registered and a standby cloned + and registered: + + repmgr=# SELECT * from repmgr.events ; + node_id | event | successful | event_timestamp | details + ---------+------------------+------------+-------------------------------+------------------------------------------------------------------------------------- + 1 | primary_register | t | 2016-01-08 15:04:39.781733+09 | + 2 | standby_clone | t | 2016-01-08 15:04:49.530001+09 | Cloned from host 'repmgr_node1', port 5432; backup method: pg_basebackup; --force: N + 2 | standby_register | t | 2016-01-08 15:04:50.621292+09 | + (3 rows) + + + Alternatively, use repmgr cluster event to output a + formatted list of events. 
+ + + Additionally, event notifications can be passed to a user-defined program + or script which can take further action, e.g. send email notifications. + This is done by setting the `event_notification_command` parameter in + `repmgr.conf`. + + + This parameter accepts the following format placeholders: + + + + + %n + + + node ID + + + + + + %e + + + event type + + + + + + %s + + + success (1 or 0) + + + + + %t + + + timestamp + + + + + + %d + + + details + + + + + + The values provided for %t and %d + will probably contain spaces, so should be quoted in the provided command + configuration, e.g.: + + event_notification_command='/path/to/some/script %n %e %s "%t" "%d"' + + + + Additionally the following format placeholders are available for the event + type bdr_failover and optionally bdr_recovery: + + + + %c + + + conninfo string of the next available node + + + + + %a + + + name of the next available node + + + + + + These should always be quoted. + + + By default, all notification types will be passed to the designated script; + the notification types can be filtered to explicitly named ones: + + + + primary_register + + + standby_register + + + standby_unregister + + + standby_clone + + + standby_promote + + + standby_follow + + + standby_disconnect_manual + + + repmgrd_start + + + repmgrd_shutdown + + + repmgrd_failover_promote + + + repmgrd_failover_follow + + + bdr_failover + + + bdr_reconnect + + + bdr_recovery + + + bdr_register + + + bdr_unregister + + + + + + Note that under some circumstances (e.g. when no replication cluster primary + could be located), it will not be possible to write an entry into the + repmgr.events + table, in which case executing a script via event_notification_command + can serve as a fallback by generating some form of notification. 
+ + + + diff --git a/doc/filelist.sgml b/doc/filelist.sgml index 940cf68b..246b7504 100644 --- a/doc/filelist.sgml +++ b/doc/filelist.sgml @@ -43,10 +43,12 @@ + + @@ -62,9 +64,9 @@ + - diff --git a/doc/repmgr-cluster-event.sgml b/doc/repmgr-cluster-event.sgml new file mode 100644 index 00000000..f1f24fb7 --- /dev/null +++ b/doc/repmgr-cluster-event.sgml @@ -0,0 +1,37 @@ + + + repmgr cluster event + + repmgr cluster event + + This outputs a formatted list of cluster events, as stored in the + repmgr.events table. Output is in reverse chronological order, and + can be filtered with the following options: + + + --all: outputs all entries + + + --limit: set the maximum number of entries to output (default: 20) + + + --node-id: restrict entries to node with this ID + + + --node-name: restrict entries to node with this name + + + --event: filter specific event + + + + + Example: + + $ repmgr -f /etc/repmgr.conf cluster event --event=standby_register + Node ID | Name | Event | OK | Timestamp | Details + ---------+-------+------------------+----+---------------------+-------------------------------- + 3 | node3 | standby_register | t | 2017-08-17 10:28:55 | standby registration succeeded + 2 | node2 | standby_register | t | 2017-08-17 10:28:53 | standby registration succeeded + + diff --git a/doc/repmgr.sgml b/doc/repmgr.sgml index 11bb2b5d..475f42f6 100644 --- a/doc/repmgr.sgml +++ b/doc/repmgr.sgml @@ -73,6 +73,7 @@ &promoting-standby; &follow-new-primary; &switchover; + &event-notifications; @@ -80,6 +81,7 @@ &repmgrd-automatic-failover; &repmgrd-configuration; &repmgrd-demonstration; + &repmgrd-monitoring; @@ -99,6 +101,7 @@ &repmgr-cluster-show; &repmgr-cluster-matrix; &repmgr-cluster-crosscheck; + &repmgr-cluster-event; &repmgr-cluster-cleanup; diff --git a/doc/repmgrd-monitoring.sgml b/doc/repmgrd-monitoring.sgml new file mode 100644 index 00000000..7daaac0a --- /dev/null +++ b/doc/repmgrd-monitoring.sgml @@ -0,0 +1,71 @@ + + Monitoring with repmgrd + + When 
`repmgrd` is running with the option monitoring_history=true, + it will constantly write standby node status information to the + monitoring_history table, providing a near-real-time + overview of replication status on all nodes + in the cluster. + + + The view replication_status shows the most recent state + for each node, e.g.: + + repmgr=# select * from repmgr.replication_status; + -[ RECORD 1 ]-------------+------------------------------ + primary_node_id | 1 + standby_node_id | 2 + standby_name | node2 + node_type | standby + active | t + last_monitor_time | 2017-08-24 16:28:41.260478+09 + last_wal_primary_location | 0/6D57A00 + last_wal_standby_location | 0/5000000 + replication_lag | 29 MB + replication_time_lag | 00:00:11.736163 + apply_lag | 15 MB + communication_time_lag | 00:00:01.365643 + + + The interval in which monitoring history is written is controlled by the + configuration parameter monitor_interval_secs; + default is 2. + + + As this can generate a large amount of monitoring data in the table + repmgr.monitoring_history, it's advisable to regularly + purge historical data using the repmgr cluster cleanup + command; use the -k/--keep-history option to + specify how many days' worth of data should be retained. + + + It's possible to run repmgrd in monitoring + mode only (without automatic failover capability) for some or all + nodes by setting failover=manual in the node's + repmgr.conf file. In the event of the node's upstream failing, + no failover action will be taken and the node will require manual intervention to + be reattached to replication. If this occurs, an + event notification + standby_disconnect_manual will be created. + + + Note that when a standby node is not streaming directly from its upstream + node, e.g. recovering WAL from an archive, apply_lag will always appear as + 0 bytes. + + + + If monitoring history is enabled, the contents of the repmgr.monitoring_history + table will be replicated to attached standbys. 
This means there will be a small but + constant stream of replication activity which may not be desirable. To prevent + this, convert the table to an UNLOGGED one with: + + ALTER TABLE repmgr.monitoring_history SET UNLOGGED; + + + This will however mean that monitoring history will not be available on + another node following a failover, and the view repmgr.replication_status + will not work on standbys. + + +