From b7f20ee1f7bea649329b57ead74c2228237b1499 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 28 Sep 2016 10:50:38 +0900 Subject: [PATCH] repmgrd: don't start if node is inactive and failover=automatic If failover=automatic, it would be reasonable to expect repmgrd to consider this node as a promotion candidate; however, this will not happen if it is marked inactive. This often happens when a failed primary is recloned as a standby but not re-registered, and if repmgrd were to run, it would give the incorrect impression that failover capability is available. Addresses GitHub #153. --- FAQ.md | 11 +++++++++++ repmgrd.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/FAQ.md b/FAQ.md index 2abbfdce..046129af 100644 --- a/FAQ.md +++ b/FAQ.md @@ -151,6 +151,9 @@ General In `repmgr.conf`, set its priority to a value of 0 or less. + Additionally, if `failover` is set to `manual`, the node will never + be considered as a promotion candidate. + - Does `repmgrd` support delayed standbys? `repmgrd` can monitor delayed standbys - those set up with @@ -169,3 +172,11 @@ General Configure your system's `logrotate` service to do this; see example in README.md + +- I've recloned a failed master as a standby, but `repmgrd` refuses to start? + + Check that you registered the standby after recloning. If unregistered, the standby + cannot be considered as a promotion candidate even if `failover` is set to + `automatic`, which is probably not what you want. `repmgrd` will start if + `failover` is set to `manual` so the node's replication status can still + be monitored, if desired. 
diff --git a/repmgrd.c b/repmgrd.c index 62cc37f1..7ffe1b86 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -311,6 +311,41 @@ main(int argc, char **argv) log_debug("node id is %i, upstream is %i\n", node_info.node_id, node_info.upstream_node_id); + /* + * Check if node record is active - if not, and `failover=automatic`, the node + * won't be considered as a promotion candidate; this often happens when + * a failed primary is recloned and the node was not re-registered, giving + * the impression failover capability is there when it's not. In this case + * abort with an error and a hint about registering. + * + * If `failover=manual`, repmgrd can continue to passively monitor the node, but + * we should nevertheless issue a warning and the same hint. + */ + + if (node_info.active == false) + { + char *hint = "Check that 'repmgr (master|standby) register' was executed for this node"; + + switch (local_options.failover) + { + case AUTOMATIC_FAILOVER: + log_err(_("This node is marked as inactive and cannot be used for failover\n")); + log_hint(_("%s\n"), hint); + terminate(ERR_BAD_CONFIG); + + case MANUAL_FAILOVER: + log_warning(_("This node is marked as inactive and will be passively monitored only\n")); + log_hint(_("%s\n"), hint); + break; + + default: + /* This should never happen */ + log_err(_("Unknown failover mode %i\n"), local_options.failover); + terminate(ERR_BAD_CONFIG); + } + + } + /* * MAIN LOOP This loops cycles at startup and once per failover and * Requisites: