From 71d151ca8766b38b4722c4afad7f39a5bf528cb6 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Sat, 23 Feb 2019 09:53:29 +0900 Subject: [PATCH] Don't check status of logical replication slots We only want to check the status of physical replication slots to determine whether a streaming replication standby has become detached and there is therefore a risk of uncontrolled WAL buildup on the local node. It's not feasible to second-guess the state of logical replication slots. --- HISTORY | 1 + dbutils.c | 10 ++++++---- doc/appendix-release-notes.sgml | 11 ++++++++++- doc/repmgr-node-check.sgml | 18 +++++++++++++----- repmgr-action-node.c | 18 +++++++++--------- 5 files changed, 39 insertions(+), 19 deletions(-) diff --git a/HISTORY b/HISTORY index 7138b9e8..d7ee6288 100644 --- a/HISTORY +++ b/HISTORY @@ -21,6 +21,7 @@ repmgr: fix long node ID display in "cluster show" (Ian) repmgr: check for primary server before executing "witness register"; GitHub #538 (Ian) + repmgr: "node check" will only consider physical replication slots (Ian) repmgrd: check binary and extension major versions match; GitHub #515 (Ian) repmgrd: on a cascaded standby, don't fail over if "failover=manual"; GitHub #531 (Ian) diff --git a/dbutils.c b/dbutils.c index bafbd3cb..92071f51 100644 --- a/dbutils.c +++ b/dbutils.c @@ -3982,7 +3982,8 @@ get_free_replication_slot_count(PGconn *conn) appendPQExpBufferStr(&query, " SELECT pg_catalog.current_setting('max_replication_slots')::INT - " " pg_catalog.count(*) AS free_slots" - " FROM pg_catalog.pg_replication_slots"); + " FROM pg_catalog.pg_replication_slots s" + " WHERE s.slot_type = 'physical'"); res = PQexec(conn, query.data); @@ -4022,6 +4023,7 @@ get_inactive_replication_slots(PGconn *conn, KeyValueList *list) " SELECT slot_name, slot_type " " FROM pg_catalog.pg_replication_slots " " WHERE active IS FALSE " + " AND slot_type = 'physical' " " ORDER BY slot_name "); res = PQexec(conn, query.data); @@ -4952,9 +4954,9 @@ get_node_replication_stats(PGconn *conn, t_node_info *node_info) { appendPQExpBufferStr(&query, " current_setting('max_replication_slots')::INT AS max_replication_slots, " - " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots) AS total_replication_slots, " - " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE) AS active_replication_slots, " - " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE) AS inactive_replication_slots, "); + " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE slot_type='physical') AS total_replication_slots, " + " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS TRUE AND slot_type='physical') AS active_replication_slots, " + " (SELECT pg_catalog.count(*) FROM pg_catalog.pg_replication_slots WHERE active IS FALSE AND slot_type='physical') AS inactive_replication_slots, "); } diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 0114d95e..9412c62b 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -17,7 +17,7 @@ Release 4.3 - Feb ???, 2019 + Mar ???, 2019 &repmgr; 4.3 is a major release. @@ -214,6 +214,15 @@ REPMGRD_OPTS="--daemonize=false" + + + repmgr node check + will only consider physical replication slots, as the purpose + of slot checks is to warn about potential issues with + streaming replication standbys which are no longer attached. + + + diff --git a/doc/repmgr-node-check.sgml b/doc/repmgr-node-check.sgml index 371b1eb9..4c1d908b 100644 --- a/doc/repmgr-node-check.sgml +++ b/doc/repmgr-node-check.sgml @@ -18,6 +18,14 @@ Performs some health checks on a node from a replication perspective. This command must be run on the local node. + + + Currently &repmgr; performs health checks on physical replication + slots only, with the aim of warning about streaming replication standbys which + have become detached and the associated risk of uncontrolled WAL file + growth. + + @@ -30,8 +38,8 @@ Replication lag: OK (N/A - node is primary) WAL archiving: OK (0 pending files) Downstream servers: OK (2 of 2 downstream nodes attached) - Replication slots: OK (node has no replication slots) - Missing replication slots: OK (node has no missing replication slots) + Replication slots: OK (node has no physical replication slots) + Missing replication slots: OK (node has no missing physical replication slots) @@ -44,7 +52,7 @@ OK (node is primary) - Parameters for individual checks are as follows: + Parameters for individual checks are as follows: @@ -76,13 +84,13 @@ - --slots: checks there are no inactive replication slots + --slots: checks there are no inactive physical replication slots - --missing-slots: checks there are no missing replication slots + --missing-slots: checks there are no missing physical replication slots diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 9943338e..63b98ee2 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -368,7 +368,7 @@ do_node_status(void) initPQExpBuffer(&slotinfo); appendPQExpBuffer(&slotinfo, - "%i (of maximal %i; %i missing)", + "%i physical (of maximal %i; %i missing)", node_info.active_replication_slots + node_info.inactive_replication_slots, node_info.max_replication_slots, missing_slots.node_count); @@ -385,13 +385,13 @@ do_node_status(void) node_info.inactive_replication_slots); item_list_append_format(&warnings, - _("- node has %i inactive replication slots"), + _("- node has %i inactive physical replication slots"), node_info.inactive_replication_slots); for (cell = inactive_replication_slots.head; cell; cell = cell->next) { item_list_append_format(&warnings, - " - %s (%s)", cell->key, cell->value); + " - %s", cell->key); } key_value_list_free(&inactive_replication_slots); @@ -1639,12 +1639,12 @@ do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, Check else if (node_info->total_replication_slots == 0) { appendPQExpBufferStr(&details, - _("node has no replication slots")); + _("node has no physical replication slots")); } else if (node_info->inactive_replication_slots == 0) { appendPQExpBuffer(&details, - _("%i of %i replication slots are active"), + _("%i of %i physical replication slots are active"), node_info->total_replication_slots, node_info->total_replication_slots); } @@ -1653,7 +1653,7 @@ do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, Check status = CHECK_STATUS_CRITICAL; appendPQExpBuffer(&details, - _("%i of %i replication slots are inactive"), + _("%i of %i physical replication slots are inactive"), node_info->inactive_replication_slots, node_info->total_replication_slots); } @@ -1721,7 +1721,7 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf if (missing_slots.node_count == 0) { appendPQExpBufferStr(&details, - _("node has no missing replication slots")); + _("node has no missing physical replication slots")); } else { @@ -1731,7 +1731,7 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf status = CHECK_STATUS_CRITICAL; appendPQExpBuffer(&details, - _("%i replication slots are missing"), + _("%i physical replication slots are missing"), missing_slots.node_count); if (missing_slots.node_count) @@ -1792,7 +1792,7 @@ do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_inf if (list_output != NULL) { check_status_list_set(list_output, - "Missing replication slots", + "Missing physical replication slots", status, details.data); }