From 70061c51aaf3be40e3777de177baca0414030a22 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Mon, 28 Sep 2020 13:54:18 +0900 Subject: [PATCH] Further improve handling of possible pg_control read errors Builds on changes in commit 147f454, and ensures appropriate action is taken if a value cannot be read from pg_control. --- HISTORY | 1 + controldata.c | 14 +++++---- controldata.h | 2 +- doc/appendix-release-notes.xml | 6 +++- repmgr-action-node.c | 53 ++++++++++++++++++++++++---------- repmgr-client.c | 30 +++++++++++++++---- repmgr.h | 1 + 7 files changed, 79 insertions(+), 28 deletions(-) diff --git a/HISTORY b/HISTORY index 36f9a4ac..32548f5c 100644 --- a/HISTORY +++ b/HISTORY @@ -14,6 +14,7 @@ demotion candidate during "standby switchover" (Ian) repmgr: fix issue with tablespace mapping when cloning from Barman; GitHub #650 (Ian) + repmgr: improve handling of pg_control read errors (Ian) repmgrd: add additional optional parameters to "failover_validation command" (spaskalev; GitHub #651) repmgrd: ensure primary connection is reset if same as upstream; diff --git a/controldata.c b/controldata.c index e52d605f..a5251c70 100644 --- a/controldata.c +++ b/controldata.c @@ -100,19 +100,21 @@ get_system_identifier(const char *data_directory) } -DBState -get_db_state(const char *data_directory) +bool +get_db_state(const char *data_directory, DBState *state) { ControlFileInfo *control_file_info = NULL; - DBState state; + bool control_file_processed; control_file_info = get_controlfile(data_directory); + control_file_processed = control_file_info->control_file_processed; - state = control_file_info->state; + if (control_file_processed == true) + *state = control_file_info->state; pfree(control_file_info); - return state; + return control_file_processed; } @@ -137,7 +139,7 @@ int get_data_checksum_version(const char *data_directory) { ControlFileInfo *control_file_info = NULL; - int data_checksum_version = -1; + int data_checksum_version = UNKNOWN_DATA_CHECKSUM_VERSION; control_file_info = get_controlfile(data_directory); diff --git a/controldata.h b/controldata.h index c5554662..931d4ee6 100644 --- a/controldata.h +++ b/controldata.h @@ -377,7 +377,7 @@ typedef struct ControlFileData12 #endif extern int get_pg_version(const char *data_directory, char *version_string); -extern DBState get_db_state(const char *data_directory); +extern bool get_db_state(const char *data_directory, DBState *state); extern const char *describe_db_state(DBState state); extern int get_data_checksum_version(const char *data_directory); extern uint64 get_system_identifier(const char *data_directory); diff --git a/doc/appendix-release-notes.xml b/doc/appendix-release-notes.xml index b108e1be..940bb7cd 100644 --- a/doc/appendix-release-notes.xml +++ b/doc/appendix-release-notes.xml @@ -92,7 +92,11 @@ report database connection error if the was provided. - + + + Improve handling of pg_control read errors. + + diff --git a/repmgr-action-node.c b/repmgr-action-node.c index 316e23d5..74a0f9ff 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -134,13 +134,22 @@ do_node_status(void) if (runtime_options.verbose == true) { - uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER; + uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory); - local_system_identifier = get_system_identifier(config_file_options.data_directory); - - key_value_list_set_format(&node_status, - "System identifier", - "%lu", local_system_identifier); + if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER) + { + key_value_list_set(&node_status, + "System identifier", + "unknown"); + item_list_append_format(&warnings, + _("unable to retrieve system identifier from pg_control")); + } + else + { + key_value_list_set_format(&node_status, + "System identifier", + "%lu", local_system_identifier); + } } key_value_list_set(&node_status, @@ -638,9 +647,17 @@ _do_node_status_is_shutdown_cleanly(void) break; } - /* check what pg_controldata says */ + /* check what pg_control says */ - db_state = get_db_state(config_file_options.data_directory); + if (get_db_state(config_file_options.data_directory, &db_state) == false) + { + /* + * Unable to retrieve the database state from pg_control + */ + node_status = NODE_STATUS_UNKNOWN; + log_verbose(LOG_DEBUG, "unable to determine db state"); + goto return_state; + } log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state)); @@ -659,21 +676,23 @@ _do_node_status_is_shutdown_cleanly(void) checkPoint = get_latest_checkpoint_location(config_file_options.data_directory); - /* unable to read pg_control, don't know what's happening */ if (checkPoint == InvalidXLogRecPtr) { + /* unable to read pg_control, don't know what's happening */ node_status = NODE_STATUS_UNKNOWN; } - - /* - * if still "UNKNOWN" at this point, then the node must be cleanly shut - * down - */ else if (node_status == NODE_STATUS_UNKNOWN) { + /* + * if still "UNKNOWN" at this point, then the node must be cleanly shut + * down + */ node_status = NODE_STATUS_DOWN; } + +return_state: + log_verbose(LOG_DEBUG, "node status determined as: %s", print_node_status(node_status)); @@ -2504,7 +2523,11 @@ do_node_rejoin(void) break; } - db_state = get_db_state(config_file_options.data_directory); + if (get_db_state(config_file_options.data_directory, &db_state) == false) + { + log_error(_("unable to determine database state from pg_control")); + exit(ERR_BAD_CONFIG); + } if (is_shutdown == false) { diff --git a/repmgr-client.c b/repmgr-client.c index bd860504..9902463a 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -3612,7 +3612,7 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea { int data_checksum_version = get_data_checksum_version(data_directory); - if (data_checksum_version < 0) + if (data_checksum_version == UNKNOWN_DATA_CHECKSUM_VERSION) { if (can_use == false) appendPQExpBuffer(reason, "; "); @@ -4081,10 +4081,31 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo local_system_identifier = get_system_identifier(config_file_options.data_directory); /* - * Check for thing that should never happen, but expect the unexpected anyway. + * Check for things that should never happen, but expect the unexpected anyway. */ - if (follow_target_identification.system_identifier != local_system_identifier) + + if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER) { + /* + * We don't return immediately here so subsequent checks can be + * made, but indicate the node will not be able to rejoin. + */ + success = false; + if (runtime_options.dry_run == true) + { + log_warning(_("unable to retrieve system identifier from pg_control")); + } + else + { + log_error(_("unable to retrieve system identifier from pg_control, aborting")); + } + } + else if (follow_target_identification.system_identifier != local_system_identifier) + { + /* + * It's never going to be possible to rejoin a node from another cluster, + * so no need to bother with further checks. + */ log_error(_("this node is not part of the %s target node's replication cluster"), action); log_detail(_("this node's system identifier is %lu, %s target node's system identifier is %lu"), local_system_identifier, @@ -4093,8 +4114,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo PQfinish(follow_target_repl_conn); return false; } - - if (runtime_options.dry_run == true) + else if (runtime_options.dry_run == true) { log_info(_("local and %s target system identifiers match"), action); log_detail(_("system identifier is %lu"), local_system_identifier); diff --git a/repmgr.h b/repmgr.h index 129eeda2..e54fc941 100644 --- a/repmgr.h +++ b/repmgr.h @@ -84,6 +84,7 @@ #define UNKNOWN_TIMELINE_ID -1 #define UNKNOWN_SYSTEM_IDENTIFIER 0 +#define UNKNOWN_DATA_CHECKSUM_VERSION -1 #define UNKNOWN_PID -1 #define UNKNOWN_REPLICATION_LAG -1 #define UNKNOWN_VALUE -1