From 061932d02324a28b39edbcd1d4871afb6afc8297 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 23 Jan 2019 16:59:14 +0900 Subject: [PATCH] "node rejoin": verify status of rejoin target This adapts the code previously added to "standby follow" to verify whether the rejoin target can actually be rejoined. --- HISTORY | 2 + doc/appendix-release-notes.sgml | 24 ++++++++++- repmgr-action-node.c | 20 ++++++++- repmgr-action-standby.c | 11 +++-- repmgr-client-global.h | 2 +- repmgr-client.c | 75 +++++++++++++++++++++++---------- 6 files changed, 104 insertions(+), 30 deletions(-) diff --git a/HISTORY b/HISTORY index 5a9ac271..c681bc50 100644 --- a/HISTORY +++ b/HISTORY @@ -5,6 +5,8 @@ repmgr: add "node check --data-directory-config"; GitHub #523 (Ian) repmgr: ensure "standby switchover" verifies repmgr can read the data directory on the demotion candidate; GitHub #523 (Ian) + repmgr: when executing "standby follow" and "node rejoin", check that + it will actually be possible to stream from the target node (Ian) repmgr: "standby switchover": improve handling of connection URIs when executing "node rejoin" on the demotion candidate; GitHub #525 (Ian) repmgrd: check binary and extension major versions match; GitHub #515 (Ian) diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index b5cd7e17..1defe51d 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -24,10 +24,32 @@ repmgr enhancements - + + repmgr standby follow: + option can now be used to specify another standby + to follow. + + + + + + repmgr standby follow: + verify that it is actually possible to follow another node. + + + + + + repmgr node rejoin: + verify that it is actually possible to attach the node to the current primary. + + + + + Add to repmgr cluster show (GitHub #521). 
diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 9686cfed..bd2166eb 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -2240,8 +2240,24 @@ do_node_rejoin(void) exit(ERR_BAD_CONFIG); } - // sanity-check that it will actually be possible to stream from the new upstream - + /* + * sanity-check that it will actually be possible to stream from the new upstream + */ + { + bool can_follow; + + can_follow = check_node_can_attach(get_timeline(config_file_options.data_directory), + get_min_recovery_location(config_file_options.data_directory), + upstream_conn, + &primary_node_record, + true); + + if (can_follow == false) + { + PQfinish(upstream_conn); + exit(ERR_BAD_CONFIG); + } + } /* diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 267fb714..eb1aee2d 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -2489,8 +2489,10 @@ do_standby_follow(void) if (PQstatus(local_repl_conn) != CONNECTION_OK) { log_error(_("unable to establish a replication connection to the local node")); + PQfinish(local_conn); PQfinish(follow_target_conn); + exit(ERR_FOLLOW_FAIL); } else if (runtime_options.dry_run == true) @@ -2499,21 +2501,24 @@ do_standby_follow(void) } success = identify_system(local_repl_conn, &local_identification); + PQfinish(local_repl_conn); if (success == false) { log_error(_("unable to query the local node's system identification")); + PQfinish(local_conn); + PQfinish(follow_target_conn); + exit(ERR_FOLLOW_FAIL); } - PQfinish(local_repl_conn); - can_follow = check_node_can_attach(local_identification.timeline, local_xlogpos, follow_target_conn, - &follow_target_node_record); + &follow_target_node_record, + false); if (can_follow == false) { diff --git a/repmgr-client-global.h b/repmgr-client-global.h index d8829dd4..caaf70fd 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -253,6 +253,6 @@ extern void init_node_record(t_node_info *node_record); extern bool can_use_pg_rewind(PGconn *conn, const 
char *data_directory, PQExpBufferData *reason); extern void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name); -extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record); +extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin); #endif /* _REPMGR_CLIENT_GLOBAL_H_ */ diff --git a/repmgr-client.c b/repmgr-client.c index b4935d3d..25ebf6b9 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -3173,14 +3173,14 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name) * can actually be followed. */ bool -check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record) +check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin) { uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER; t_conninfo_param_list follow_target_repl_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER; PGconn *follow_target_repl_conn = NULL; t_system_identification follow_target_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER; TimeLineHistoryEntry *follow_target_history = NULL; - bool success; + bool success = true; /* check replication connection */ initialize_conninfo_params(&follow_target_repl_conninfo, false); @@ -3210,10 +3210,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo } /* check system_identifiers match */ - local_system_identifier = get_system_identifier(config_file_options.data_directory); - success = identify_system(follow_target_repl_conn, &follow_target_identification); - - if (success == false) + if (identify_system(follow_target_repl_conn, &follow_target_identification) == false) { log_error(_("unable to query the follow target 
node's system identification")); @@ -3221,6 +3218,11 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo return false; } + local_system_identifier = get_system_identifier(config_file_options.data_directory); + + /* + * Check for things that should never happen, but expect the unexpected anyway. + */ if (follow_target_identification.system_identifier != local_system_identifier) { log_error(_("this node is not part of the follow target node's replication cluster")); @@ -3230,13 +3232,13 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo PQfinish(follow_target_repl_conn); return false; } - else if (runtime_options.dry_run == true) + + if (runtime_options.dry_run == true) { log_info(_("local and follow target system identifiers match")); log_detail(_("system identifier is %lu"), local_system_identifier); } - /* check timelines */ log_verbose(LOG_DEBUG, "local timeline: %i; follow target timeline: %i", @@ -3275,12 +3277,21 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo } else { - log_error(_("this node is ahead of the follow target")); + const char *error_msg = _("this node is ahead of the follow target"); + + if (is_rejoin == true && runtime_options.force_rewind_used == true) + { + log_warning("%s", error_msg); + } + else + { + log_error("%s", error_msg); + success = false; + } + log_detail(_("local node lsn is %X/%X, follow target lsn is %X/%X"), format_lsn(local_xlogpos), format_lsn(follow_target_xlogpos)); - PQfinish(follow_target_repl_conn); - return false; } } else @@ -3306,26 +3317,44 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo */ if (local_xlogpos > follow_target_history->end) { - log_error(_("this node cannot attach to follow target node %i"), - follow_target_node_record->node_id); - log_detail(_("follow target server's timeline %i forked off current database system timeline %i before current recovery point %X/%X\n"), + if
(is_rejoin == true && runtime_options.force_rewind_used == true) + { + log_notice(_("pg_rewind execution required for this node to attach to follow target node %i"), + follow_target_node_record->node_id); + } + else + { + log_error(_("this node cannot attach to follow target node %i"), + follow_target_node_record->node_id); + success = false; + } + + log_detail(_("follow target server's timeline %i forked off current database system timeline %i before current recovery point %X/%X"), local_tli + 1, local_tli, format_lsn(local_xlogpos)); - return false; + + if (is_rejoin == true && runtime_options.force_rewind_used == false) + { + log_hint(_("use --force-rewind to execute pg_rewind")); + } } - if (runtime_options.dry_run == true) + if (success == true && runtime_options.dry_run == true) { - log_info(_("local node %i can attach to target node %i"), - config_file_options.node_id, - follow_target_node_record->node_id); + if (is_rejoin == false || (is_rejoin == true && runtime_options.force_rewind_used == false)) + { + log_info(_("local node %i can attach to target node %i"), + config_file_options.node_id, + follow_target_node_record->node_id); - log_detail(_("local node's recovery point: %X/%X; follow target node's fork point: %X/%X"), - format_lsn(local_xlogpos), - format_lsn(follow_target_history->end)); + log_detail(_("local node's recovery point: %X/%X; follow target node's fork point: %X/%X"), + format_lsn(local_xlogpos), + format_lsn(follow_target_history->end)); + } } } - return true; + PQfinish(follow_target_repl_conn); + return success; }