diff --git a/HISTORY b/HISTORY
index 36f9a4ac..32548f5c 100644
--- a/HISTORY
+++ b/HISTORY
@@ -14,6 +14,7 @@
demotion candidate during "standby switchover" (Ian)
repmgr: fix issue with tablespace mapping when cloning from Barman;
GitHub #650 (Ian)
+ repmgr: improve handling of pg_control read errors (Ian)
repmgrd: add additional optional parameters to "failover_validation command"
(spaskalev; GitHub #651)
repmgrd: ensure primary connection is reset if same as upstream;
diff --git a/controldata.c b/controldata.c
index e52d605f..a5251c70 100644
--- a/controldata.c
+++ b/controldata.c
@@ -100,19 +100,21 @@ get_system_identifier(const char *data_directory)
}
-DBState
-get_db_state(const char *data_directory)
+bool
+get_db_state(const char *data_directory, DBState *state)
{
ControlFileInfo *control_file_info = NULL;
- DBState state;
+ bool control_file_processed;
control_file_info = get_controlfile(data_directory);
+ control_file_processed = control_file_info->control_file_processed;
- state = control_file_info->state;
+ if (control_file_processed == true)
+ *state = control_file_info->state;
pfree(control_file_info);
- return state;
+ return control_file_processed;
}
@@ -137,7 +139,7 @@ int
get_data_checksum_version(const char *data_directory)
{
ControlFileInfo *control_file_info = NULL;
- int data_checksum_version = -1;
+ int data_checksum_version = UNKNOWN_DATA_CHECKSUM_VERSION;
control_file_info = get_controlfile(data_directory);
diff --git a/controldata.h b/controldata.h
index c5554662..931d4ee6 100644
--- a/controldata.h
+++ b/controldata.h
@@ -377,7 +377,7 @@ typedef struct ControlFileData12
#endif
extern int get_pg_version(const char *data_directory, char *version_string);
-extern DBState get_db_state(const char *data_directory);
+extern bool get_db_state(const char *data_directory, DBState *state);
extern const char *describe_db_state(DBState state);
extern int get_data_checksum_version(const char *data_directory);
extern uint64 get_system_identifier(const char *data_directory);
diff --git a/doc/appendix-release-notes.xml b/doc/appendix-release-notes.xml
index b108e1be..940bb7cd 100644
--- a/doc/appendix-release-notes.xml
+++ b/doc/appendix-release-notes.xml
@@ -92,7 +92,11 @@
report database connection error if the was provided.
-
+
+
+ Improve handling of pg_control read errors.
+
+
diff --git a/repmgr-action-node.c b/repmgr-action-node.c
index 316e23d5..74a0f9ff 100644
--- a/repmgr-action-node.c
+++ b/repmgr-action-node.c
@@ -134,13 +134,22 @@ do_node_status(void)
if (runtime_options.verbose == true)
{
- uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
+ uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory);
- local_system_identifier = get_system_identifier(config_file_options.data_directory);
-
- key_value_list_set_format(&node_status,
- "System identifier",
- "%lu", local_system_identifier);
+ if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
+ {
+ key_value_list_set(&node_status,
+ "System identifier",
+ "unknown");
+ item_list_append_format(&warnings,
+ _("unable to retrieve system identifier from pg_control"));
+ }
+ else
+ {
+ key_value_list_set_format(&node_status,
+ "System identifier",
+ "%lu", local_system_identifier);
+ }
}
key_value_list_set(&node_status,
@@ -638,9 +647,17 @@ _do_node_status_is_shutdown_cleanly(void)
break;
}
- /* check what pg_controldata says */
+ /* check what pg_control says */
- db_state = get_db_state(config_file_options.data_directory);
+ if (get_db_state(config_file_options.data_directory, &db_state) == false)
+ {
+ /*
+ * Unable to retrieve the database state from pg_control
+ */
+ node_status = NODE_STATUS_UNKNOWN;
+ log_verbose(LOG_DEBUG, "unable to determine db state");
+ goto return_state;
+ }
log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
@@ -659,21 +676,23 @@ _do_node_status_is_shutdown_cleanly(void)
checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
- /* unable to read pg_control, don't know what's happening */
if (checkPoint == InvalidXLogRecPtr)
{
+ /* unable to read pg_control, don't know what's happening */
node_status = NODE_STATUS_UNKNOWN;
}
-
- /*
- * if still "UNKNOWN" at this point, then the node must be cleanly shut
- * down
- */
else if (node_status == NODE_STATUS_UNKNOWN)
{
+ /*
+ * if still "UNKNOWN" at this point, then the node must be cleanly shut
+ * down
+ */
node_status = NODE_STATUS_DOWN;
}
+
+return_state:
+
log_verbose(LOG_DEBUG, "node status determined as: %s",
print_node_status(node_status));
@@ -2504,7 +2523,11 @@ do_node_rejoin(void)
break;
}
- db_state = get_db_state(config_file_options.data_directory);
+ if (get_db_state(config_file_options.data_directory, &db_state) == false)
+ {
+ log_error(_("unable to determine database state from pg_control"));
+ exit(ERR_BAD_CONFIG);
+ }
if (is_shutdown == false)
{
diff --git a/repmgr-client.c b/repmgr-client.c
index bd860504..9902463a 100644
--- a/repmgr-client.c
+++ b/repmgr-client.c
@@ -3612,7 +3612,7 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
{
int data_checksum_version = get_data_checksum_version(data_directory);
- if (data_checksum_version < 0)
+ if (data_checksum_version == UNKNOWN_DATA_CHECKSUM_VERSION)
{
if (can_use == false)
appendPQExpBuffer(reason, "; ");
@@ -4081,10 +4081,31 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
local_system_identifier = get_system_identifier(config_file_options.data_directory);
/*
- * Check for thing that should never happen, but expect the unexpected anyway.
+ * Check for things that should never happen, but expect the unexpected anyway.
*/
- if (follow_target_identification.system_identifier != local_system_identifier)
+
+ if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
{
+ /*
+ * We don't return immediately here so subsequent checks can be
+ * made, but indicate the node will not be able to rejoin.
+ */
+ success = false;
+ if (runtime_options.dry_run == true)
+ {
+ log_warning(_("unable to retrieve system identifier from pg_control"));
+ }
+ else
+ {
+ log_error(_("unable to retrieve system identifier from pg_control, aborting"));
+ }
+ }
+ else if (follow_target_identification.system_identifier != local_system_identifier)
+ {
+ /*
+ * It's never going to be possible to rejoin a node from another cluster,
+ * so no need to bother with further checks.
+ */
log_error(_("this node is not part of the %s target node's replication cluster"), action);
log_detail(_("this node's system identifier is %lu, %s target node's system identifier is %lu"),
local_system_identifier,
@@ -4093,8 +4114,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
PQfinish(follow_target_repl_conn);
return false;
}
-
- if (runtime_options.dry_run == true)
+ else if (runtime_options.dry_run == true)
{
log_info(_("local and %s target system identifiers match"), action);
log_detail(_("system identifier is %lu"), local_system_identifier);
diff --git a/repmgr.h b/repmgr.h
index 129eeda2..e54fc941 100644
--- a/repmgr.h
+++ b/repmgr.h
@@ -84,6 +84,7 @@
#define UNKNOWN_TIMELINE_ID -1
#define UNKNOWN_SYSTEM_IDENTIFIER 0
+#define UNKNOWN_DATA_CHECKSUM_VERSION -1
#define UNKNOWN_PID -1
#define UNKNOWN_REPLICATION_LAG -1
#define UNKNOWN_VALUE -1