From c0a53471e10917327120527020b9aaaaf6a63c7e Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 31 Jan 2018 10:25:15 +0900 Subject: [PATCH] "standby switchover": improve log messages and add new exit code Previously, if an issue was encountered with the old primary, but user provided -F/--force to have repmgr promote the standby anyway, repmgr would exit with the log message "STANDBY SWITCHOVER is complete" and exit code 0 (SUCCESS). To better report this partial completion, repmgr will now emit the message "STANDBY SWITCHOVER has completed with issues" (and a HINT to check preceding log messages) and new exit code 22 (ERR_SWITCHOVER_INCOMPLETE). --- HISTORY | 4 +- doc/repmgr-standby-switchover.sgml | 66 +++++++++++++++++++++++++++++- errcode.h | 2 +- repmgr-action-standby.c | 40 ++++++++++++++---- 4 files changed, 100 insertions(+), 12 deletions(-) diff --git a/HISTORY b/HISTORY index db7118c0..d5039b8c 100644 --- a/HISTORY +++ b/HISTORY @@ -1,8 +1,10 @@ 4.0.3 2018-02- repmgr: improve switchover handling when "pg_ctl" used to control the server and logging output is not explicitly redirected (Ian) + repmgr: improve switchover log messages and exit code when old primary could + not be shut down cleanly (Ian) -4.0.2 2018-01- +4.0.2 2018-01-18 repmgr: add missing -W option to getopt_long() invocation; GitHub #350 (Ian) repmgr: automatically create slot name if missing; GitHub #343 (Ian) repmgr: fixes to parsing output of remote repmgr invocations; GitHub #349 (Ian) diff --git a/doc/repmgr-standby-switchover.sgml b/doc/repmgr-standby-switchover.sgml index dada6a6b..e551ce04 100644 --- a/doc/repmgr-standby-switchover.sgml +++ b/doc/repmgr-standby-switchover.sgml @@ -22,9 +22,17 @@ If other standbys are connected to the demotion candidate, &repmgr; can instruct - these to follow the new primary if the option --siblings-follow + these to follow the new primary if the option --siblings-follow is specified. + + + Performing a switchover is a non-trivial operation. 
In particular it + relies on the current primary being able to shut down cleanly and quickly. + &repmgr; will attempt to check for potential issues but cannot guarantee + a successful switchover. + + @@ -47,6 +55,13 @@ Check prerequisites but don't actually execute a switchover. + + + Success of --dry-run does not imply the switchover will + complete successfully, only that + the prerequisites for performing the operation are met. + + @@ -57,6 +72,12 @@ Ignore warnings and continue anyway. + + Specifically, if a problem is encountered when shutting down the current primary, + using -F/--force will cause &repmgr; to continue by promoting + the standby to be the new primary, and if --siblings-follow is + specified, attach any other standbys to the new primary. + @@ -103,6 +124,11 @@ repmgrd should not be active on any nodes while a switchover is being executed. This restriction may be lifted in a later version. + + External database connections, e.g. from an application, should not be permitted while + the switchover is taking place. In particular, active transactions on the primary + can potentially disrupt the shutdown process. + @@ -119,6 +145,44 @@ + + Exit codes + + The following exit codes can be emitted by repmgr standby switchover: + + + + + + + + The switchover completed successfully. + + + + + + + + + The switchover could not be executed. + + + + + + + + + The switchover was executed but a problem was encountered. + Typically this means the former primary could not be reattached + as a standby.
+ + + + + + See also diff --git a/errcode.h b/errcode.h index 4d137603..c152967a 100644 --- a/errcode.h +++ b/errcode.h @@ -43,6 +43,6 @@ #define ERR_BARMAN 19 #define ERR_REGISTRATION_SYNC 20 #define ERR_OUT_OF_MEMORY 21 -#define ERR_REJOIN_FAIL 22 +#define ERR_SWITCHOVER_INCOMPLETE 22 #endif /* _ERRCODE_H_ */ diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 465e2510..fe82bbc7 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -2036,6 +2036,10 @@ do_standby_switchover(void) i; bool command_success = false; bool shutdown_success = false; + + /* this flag will use to generate the final message generated */ + bool switchover_success = true; + XLogRecPtr remote_last_checkpoint_lsn = InvalidXLogRecPtr; ReplInfo replication_info = T_REPLINFO_INTIALIZER; @@ -2894,12 +2898,17 @@ do_standby_switchover(void) /* clean up remote node */ remote_conn = establish_db_connection(remote_node_record.conninfo, false); - /* check replication status */ + /* check new standby (old primary) is reachable */ if (PQstatus(remote_conn) != CONNECTION_OK) { - log_error(_("unable to reestablish connection to remote node \"%s\""), - remote_node_record.node_name); - /* log_hint(_("")); // depends on replication status */ + switchover_success = false; + + /* TODO: double-check whether new standby has attached */ + + log_warning(_("switchover did not fully complete")); + log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"), + local_node_record.node_name, + remote_node_record.node_name); } else { @@ -2910,17 +2919,20 @@ do_standby_switchover(void) local_node_record.slot_name); } /* TODO warn about any inactive replication slots */ + + log_notice(_("switchover was successful")); + log_detail(_("node \"%s\" is now primary and node \"%s\" is attached as standby"), + local_node_record.node_name, + remote_node_record.node_name); + } PQfinish(remote_conn); - log_notice(_("switchover was successful")); - log_detail(_("node \"%s\" is now 
primary"), - local_node_record.node_name); /* * If --siblings-follow specified, attempt to make them follow the new - * standby + * primary */ if (runtime_options.siblings_follow == true && sibling_nodes.node_count > 0) @@ -2993,7 +3005,17 @@ do_standby_switchover(void) PQfinish(local_conn); - log_notice(_("STANDBY SWITCHOVER is complete")); + if (switchover_success == true) + { + log_notice(_("STANDBY SWITCHOVER has completed successfully")); + } + else + { + log_notice(_("STANDBY SWITCHOVER has completed with issues")); + log_hint(_("see preceding log message(s) for details")); + exit(ERR_SWITCHOVER_INCOMPLETE); + } + return; }