mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Further improve handling of possible pg_control read errors
Builds on changes in commit 147f454, and ensures appropriate
action is taken if a value cannot be read from pg_control.
This commit is contained in:
1
HISTORY
1
HISTORY
@@ -14,6 +14,7 @@
|
||||
demotion candidate during "standby switchover" (Ian)
|
||||
repmgr: fix issue with tablespace mapping when cloning from Barman;
|
||||
GitHub #650 (Ian)
|
||||
repmgr: improve handling of pg_control read errors (Ian)
|
||||
repmgrd: add additional optional parameters to "failover_validation command"
|
||||
(spaskalev; GitHub #651)
|
||||
repmgrd: ensure primary connection is reset if same as upstream;
|
||||
|
||||
@@ -100,19 +100,21 @@ get_system_identifier(const char *data_directory)
|
||||
}
|
||||
|
||||
|
||||
DBState
|
||||
get_db_state(const char *data_directory)
|
||||
bool
|
||||
get_db_state(const char *data_directory, DBState *state)
|
||||
{
|
||||
ControlFileInfo *control_file_info = NULL;
|
||||
DBState state;
|
||||
bool control_file_processed;
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
control_file_processed = control_file_info->control_file_processed;
|
||||
|
||||
state = control_file_info->state;
|
||||
if (control_file_processed == true)
|
||||
*state = control_file_info->state;
|
||||
|
||||
pfree(control_file_info);
|
||||
|
||||
return state;
|
||||
return control_file_processed;
|
||||
}
|
||||
|
||||
|
||||
@@ -137,7 +139,7 @@ int
|
||||
get_data_checksum_version(const char *data_directory)
|
||||
{
|
||||
ControlFileInfo *control_file_info = NULL;
|
||||
int data_checksum_version = -1;
|
||||
int data_checksum_version = UNKNOWN_DATA_CHECKSUM_VERSION;
|
||||
|
||||
control_file_info = get_controlfile(data_directory);
|
||||
|
||||
|
||||
@@ -377,7 +377,7 @@ typedef struct ControlFileData12
|
||||
#endif
|
||||
|
||||
extern int get_pg_version(const char *data_directory, char *version_string);
|
||||
extern DBState get_db_state(const char *data_directory);
|
||||
extern bool get_db_state(const char *data_directory, DBState *state);
|
||||
extern const char *describe_db_state(DBState state);
|
||||
extern int get_data_checksum_version(const char *data_directory);
|
||||
extern uint64 get_system_identifier(const char *data_directory);
|
||||
|
||||
@@ -92,7 +92,11 @@
|
||||
report database connection error if the <option>--optformat</option> was provided.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
Improve handling of pg_control read errors.
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
@@ -134,13 +134,22 @@ do_node_status(void)
|
||||
|
||||
if (runtime_options.verbose == true)
|
||||
{
|
||||
uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
||||
uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
||||
|
||||
local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
||||
|
||||
key_value_list_set_format(&node_status,
|
||||
"System identifier",
|
||||
"%lu", local_system_identifier);
|
||||
if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
|
||||
{
|
||||
key_value_list_set(&node_status,
|
||||
"System identifier",
|
||||
"unknown");
|
||||
item_list_append_format(&warnings,
|
||||
_("unable to retrieve system identifier from pg_control"));
|
||||
}
|
||||
else
|
||||
{
|
||||
key_value_list_set_format(&node_status,
|
||||
"System identifier",
|
||||
"%lu", local_system_identifier);
|
||||
}
|
||||
}
|
||||
|
||||
key_value_list_set(&node_status,
|
||||
@@ -638,9 +647,17 @@ _do_node_status_is_shutdown_cleanly(void)
|
||||
break;
|
||||
}
|
||||
|
||||
/* check what pg_controldata says */
|
||||
/* check what pg_control says */
|
||||
|
||||
db_state = get_db_state(config_file_options.data_directory);
|
||||
if (get_db_state(config_file_options.data_directory, &db_state) == false)
|
||||
{
|
||||
/*
|
||||
* Unable to retrieve the database state from pg_control
|
||||
*/
|
||||
node_status = NODE_STATUS_UNKNOWN;
|
||||
log_verbose(LOG_DEBUG, "unable to determine db state");
|
||||
goto return_state;
|
||||
}
|
||||
|
||||
log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
|
||||
|
||||
@@ -659,21 +676,23 @@ _do_node_status_is_shutdown_cleanly(void)
|
||||
|
||||
checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
|
||||
|
||||
/* unable to read pg_control, don't know what's happening */
|
||||
if (checkPoint == InvalidXLogRecPtr)
|
||||
{
|
||||
/* unable to read pg_control, don't know what's happening */
|
||||
node_status = NODE_STATUS_UNKNOWN;
|
||||
}
|
||||
|
||||
/*
|
||||
* if still "UNKNOWN" at this point, then the node must be cleanly shut
|
||||
* down
|
||||
*/
|
||||
else if (node_status == NODE_STATUS_UNKNOWN)
|
||||
{
|
||||
/*
|
||||
* if still "UNKNOWN" at this point, then the node must be cleanly shut
|
||||
* down
|
||||
*/
|
||||
node_status = NODE_STATUS_DOWN;
|
||||
}
|
||||
|
||||
|
||||
return_state:
|
||||
|
||||
log_verbose(LOG_DEBUG, "node status determined as: %s",
|
||||
print_node_status(node_status));
|
||||
|
||||
@@ -2504,7 +2523,11 @@ do_node_rejoin(void)
|
||||
break;
|
||||
}
|
||||
|
||||
db_state = get_db_state(config_file_options.data_directory);
|
||||
if (get_db_state(config_file_options.data_directory, &db_state) == false)
|
||||
{
|
||||
log_error(_("unable to determine database state from pg_control"));
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
if (is_shutdown == false)
|
||||
{
|
||||
|
||||
@@ -3612,7 +3612,7 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
|
||||
{
|
||||
int data_checksum_version = get_data_checksum_version(data_directory);
|
||||
|
||||
if (data_checksum_version < 0)
|
||||
if (data_checksum_version == UNKNOWN_DATA_CHECKSUM_VERSION)
|
||||
{
|
||||
if (can_use == false)
|
||||
appendPQExpBuffer(reason, "; ");
|
||||
@@ -4081,10 +4081,31 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
||||
local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
||||
|
||||
/*
|
||||
* Check for thing that should never happen, but expect the unexpected anyway.
|
||||
* Check for things that should never happen, but expect the unexpected anyway.
|
||||
*/
|
||||
if (follow_target_identification.system_identifier != local_system_identifier)
|
||||
|
||||
if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
|
||||
{
|
||||
/*
|
||||
* We don't return immediately here so subsequent checks can be
|
||||
* made, but indicate the node will not be able to rejoin.
|
||||
*/
|
||||
success = false;
|
||||
if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_warning(_("unable to retrieve system identifier from pg_control"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_error(_("unable to retrieve system identifier from pg_control, aborting"));
|
||||
}
|
||||
}
|
||||
else if (follow_target_identification.system_identifier != local_system_identifier)
|
||||
{
|
||||
/*
|
||||
* It's never going to be possible to rejoin a node from another cluster,
|
||||
* so no need to bother with further checks.
|
||||
*/
|
||||
log_error(_("this node is not part of the %s target node's replication cluster"), action);
|
||||
log_detail(_("this node's system identifier is %lu, %s target node's system identifier is %lu"),
|
||||
local_system_identifier,
|
||||
@@ -4093,8 +4114,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
||||
PQfinish(follow_target_repl_conn);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (runtime_options.dry_run == true)
|
||||
else if (runtime_options.dry_run == true)
|
||||
{
|
||||
log_info(_("local and %s target system identifiers match"), action);
|
||||
log_detail(_("system identifier is %lu"), local_system_identifier);
|
||||
|
||||
Reference in New Issue
Block a user