From f1667a7e98ece9301e21f25cffad648c7b6d3364 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Thu, 7 Feb 2019 16:53:48 +0900 Subject: [PATCH] repmgrd: don't consider nodes where repmgrd is not running If, for whatever reason, repmgrd is not running on a node, but that node qualifies as a promotion candidate, failover will not take place as that node will never promote itself. We therefore discount nodes where repmgrd is not running as promotion candidates, which will ensure one node is always promoted. There is a slight risk here that the node(s) where repmgrd is not running are further ahead, leading to a timeline fork. It might be possible to mitigate that by having the "election" leader perform the promote (or follow) operation. --- HISTORY | 2 ++ doc/appendix-release-notes.sgml | 21 +++++++++++++++++++++ repmgrd-physical.c | 17 +++++++++++++++++ 3 files changed, 40 insertions(+) diff --git a/HISTORY b/HISTORY index f74f1b7e..6fbe8b15 100644 --- a/HISTORY +++ b/HISTORY @@ -20,6 +20,8 @@ repmgrd: check binary and extension major versions match; GitHub #515 (Ian) repmgrd: on a cascaded standby, don't fail over if "failover=manual"; GitHub #531 (Ian) + repmgrd: don't consider nodes where repmgrd is not running as promotion + candidates (Ian) 4.2.1 2018-??-?? repmgr: add sanity check for correct extension version (Ian) diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 9e054392..91672880 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -124,6 +124,27 @@ REPMGRD_OPTS="--daemonize=false" + + repmgrd enhancements + + + + + + repmgrd will no longer consider nodes where repmgrd + is not running as promotion candidates. + + + Previously, if repmgrd was not running on a node, but + that node qualified as the promotion candidate, it would never be promoted due to + the absence of a running repmgrd. 
+ + + + + + + Bug fixes diff --git a/repmgrd-physical.c b/repmgrd-physical.c index ebe0c839..e827c042 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -3257,6 +3257,7 @@ do_election(void) log_debug("node %i is witness, not querying state", cell->node_info->node_id); continue; } + /* don't check 0-priority nodes */ if (cell->node_info->priority == 0) { @@ -3265,6 +3266,22 @@ do_election(void) continue; } + + /* + * check if repmgrd running - skip if not + * + * TODO: include pid query in replication info query? + * + * NOTE: from Pg12 we could execute "pg_promote()" from a running repmgrd; + * here we'll need to find a way of ensuring only one repmgrd does this + */ + if (repmgrd_get_pid(cell->node_info->conn) == UNKNOWN_PID) + { + log_warning(_("repmgrd not running on node %i, skipping"), + cell->node_info->node_id); + continue; + } + if (get_replication_info(cell->node_info->conn, &sibling_replication_info) == false) { log_warning(_("unable to retrieve replication information for node %i, skipping"),