mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 08:56:29 +00:00
Further improve handling of possible pg_control read errors
Builds on changes in commit 147f454, and ensures appropriate
action is taken if a value cannot be read from pg_control.
This commit is contained in:
1
HISTORY
1
HISTORY
@@ -14,6 +14,7 @@
|
|||||||
demotion candidate during "standby switchover" (Ian)
|
demotion candidate during "standby switchover" (Ian)
|
||||||
repmgr: fix issue with tablespace mapping when cloning from Barman;
|
repmgr: fix issue with tablespace mapping when cloning from Barman;
|
||||||
GitHub #650 (Ian)
|
GitHub #650 (Ian)
|
||||||
|
repmgr: improve handling of pg_control read errors (Ian)
|
||||||
repmgrd: add additional optional parameters to "failover_validation command"
|
repmgrd: add additional optional parameters to "failover_validation command"
|
||||||
(spaskalev; GitHub #651)
|
(spaskalev; GitHub #651)
|
||||||
repmgrd: ensure primary connection is reset if same as upstream;
|
repmgrd: ensure primary connection is reset if same as upstream;
|
||||||
|
|||||||
@@ -100,19 +100,21 @@ get_system_identifier(const char *data_directory)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
DBState
|
bool
|
||||||
get_db_state(const char *data_directory)
|
get_db_state(const char *data_directory, DBState *state)
|
||||||
{
|
{
|
||||||
ControlFileInfo *control_file_info = NULL;
|
ControlFileInfo *control_file_info = NULL;
|
||||||
DBState state;
|
bool control_file_processed;
|
||||||
|
|
||||||
control_file_info = get_controlfile(data_directory);
|
control_file_info = get_controlfile(data_directory);
|
||||||
|
control_file_processed = control_file_info->control_file_processed;
|
||||||
|
|
||||||
state = control_file_info->state;
|
if (control_file_processed == true)
|
||||||
|
*state = control_file_info->state;
|
||||||
|
|
||||||
pfree(control_file_info);
|
pfree(control_file_info);
|
||||||
|
|
||||||
return state;
|
return control_file_processed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -137,7 +139,7 @@ int
|
|||||||
get_data_checksum_version(const char *data_directory)
|
get_data_checksum_version(const char *data_directory)
|
||||||
{
|
{
|
||||||
ControlFileInfo *control_file_info = NULL;
|
ControlFileInfo *control_file_info = NULL;
|
||||||
int data_checksum_version = -1;
|
int data_checksum_version = UNKNOWN_DATA_CHECKSUM_VERSION;
|
||||||
|
|
||||||
control_file_info = get_controlfile(data_directory);
|
control_file_info = get_controlfile(data_directory);
|
||||||
|
|
||||||
|
|||||||
@@ -377,7 +377,7 @@ typedef struct ControlFileData12
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern int get_pg_version(const char *data_directory, char *version_string);
|
extern int get_pg_version(const char *data_directory, char *version_string);
|
||||||
extern DBState get_db_state(const char *data_directory);
|
extern bool get_db_state(const char *data_directory, DBState *state);
|
||||||
extern const char *describe_db_state(DBState state);
|
extern const char *describe_db_state(DBState state);
|
||||||
extern int get_data_checksum_version(const char *data_directory);
|
extern int get_data_checksum_version(const char *data_directory);
|
||||||
extern uint64 get_system_identifier(const char *data_directory);
|
extern uint64 get_system_identifier(const char *data_directory);
|
||||||
|
|||||||
@@ -92,7 +92,11 @@
|
|||||||
report database connection error if the <option>--optformat</option> was provided.
|
report database connection error if the <option>--optformat</option> was provided.
|
||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Improve handling of pg_control read errors.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
</itemizedlist>
|
</itemizedlist>
|
||||||
</para>
|
</para>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|||||||
@@ -134,13 +134,22 @@ do_node_status(void)
|
|||||||
|
|
||||||
if (runtime_options.verbose == true)
|
if (runtime_options.verbose == true)
|
||||||
{
|
{
|
||||||
uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
||||||
|
|
||||||
local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
|
||||||
|
{
|
||||||
key_value_list_set_format(&node_status,
|
key_value_list_set(&node_status,
|
||||||
"System identifier",
|
"System identifier",
|
||||||
"%lu", local_system_identifier);
|
"unknown");
|
||||||
|
item_list_append_format(&warnings,
|
||||||
|
_("unable to retrieve system identifier from pg_control"));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
key_value_list_set_format(&node_status,
|
||||||
|
"System identifier",
|
||||||
|
"%lu", local_system_identifier);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
key_value_list_set(&node_status,
|
key_value_list_set(&node_status,
|
||||||
@@ -638,9 +647,17 @@ _do_node_status_is_shutdown_cleanly(void)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* check what pg_controldata says */
|
/* check what pg_control says */
|
||||||
|
|
||||||
db_state = get_db_state(config_file_options.data_directory);
|
if (get_db_state(config_file_options.data_directory, &db_state) == false)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* Unable to retrieve the database state from pg_control
|
||||||
|
*/
|
||||||
|
node_status = NODE_STATUS_UNKNOWN;
|
||||||
|
log_verbose(LOG_DEBUG, "unable to determine db state");
|
||||||
|
goto return_state;
|
||||||
|
}
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
|
log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
|
||||||
|
|
||||||
@@ -659,21 +676,23 @@ _do_node_status_is_shutdown_cleanly(void)
|
|||||||
|
|
||||||
checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
|
checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
|
||||||
|
|
||||||
/* unable to read pg_control, don't know what's happening */
|
|
||||||
if (checkPoint == InvalidXLogRecPtr)
|
if (checkPoint == InvalidXLogRecPtr)
|
||||||
{
|
{
|
||||||
|
/* unable to read pg_control, don't know what's happening */
|
||||||
node_status = NODE_STATUS_UNKNOWN;
|
node_status = NODE_STATUS_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* if still "UNKNOWN" at this point, then the node must be cleanly shut
|
|
||||||
* down
|
|
||||||
*/
|
|
||||||
else if (node_status == NODE_STATUS_UNKNOWN)
|
else if (node_status == NODE_STATUS_UNKNOWN)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* if still "UNKNOWN" at this point, then the node must be cleanly shut
|
||||||
|
* down
|
||||||
|
*/
|
||||||
node_status = NODE_STATUS_DOWN;
|
node_status = NODE_STATUS_DOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return_state:
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "node status determined as: %s",
|
log_verbose(LOG_DEBUG, "node status determined as: %s",
|
||||||
print_node_status(node_status));
|
print_node_status(node_status));
|
||||||
|
|
||||||
@@ -2504,7 +2523,11 @@ do_node_rejoin(void)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
db_state = get_db_state(config_file_options.data_directory);
|
if (get_db_state(config_file_options.data_directory, &db_state) == false)
|
||||||
|
{
|
||||||
|
log_error(_("unable to determine database state from pg_control"));
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
if (is_shutdown == false)
|
if (is_shutdown == false)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -3612,7 +3612,7 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
|
|||||||
{
|
{
|
||||||
int data_checksum_version = get_data_checksum_version(data_directory);
|
int data_checksum_version = get_data_checksum_version(data_directory);
|
||||||
|
|
||||||
if (data_checksum_version < 0)
|
if (data_checksum_version == UNKNOWN_DATA_CHECKSUM_VERSION)
|
||||||
{
|
{
|
||||||
if (can_use == false)
|
if (can_use == false)
|
||||||
appendPQExpBuffer(reason, "; ");
|
appendPQExpBuffer(reason, "; ");
|
||||||
@@ -4081,10 +4081,31 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check for thing that should never happen, but expect the unexpected anyway.
|
* Check for things that should never happen, but expect the unexpected anyway.
|
||||||
*/
|
*/
|
||||||
if (follow_target_identification.system_identifier != local_system_identifier)
|
|
||||||
|
if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* We don't return immediately here so subsequent checks can be
|
||||||
|
* made, but indicate the node will not be able to rejoin.
|
||||||
|
*/
|
||||||
|
success = false;
|
||||||
|
if (runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
log_warning(_("unable to retrieve system identifier from pg_control"));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve system identifier from pg_control, aborting"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (follow_target_identification.system_identifier != local_system_identifier)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* It's never going to be possible to rejoin a node from another cluster,
|
||||||
|
* so no need to bother with further checks.
|
||||||
|
*/
|
||||||
log_error(_("this node is not part of the %s target node's replication cluster"), action);
|
log_error(_("this node is not part of the %s target node's replication cluster"), action);
|
||||||
log_detail(_("this node's system identifier is %lu, %s target node's system identifier is %lu"),
|
log_detail(_("this node's system identifier is %lu, %s target node's system identifier is %lu"),
|
||||||
local_system_identifier,
|
local_system_identifier,
|
||||||
@@ -4093,8 +4114,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
PQfinish(follow_target_repl_conn);
|
PQfinish(follow_target_repl_conn);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
else if (runtime_options.dry_run == true)
|
||||||
if (runtime_options.dry_run == true)
|
|
||||||
{
|
{
|
||||||
log_info(_("local and %s target system identifiers match"), action);
|
log_info(_("local and %s target system identifiers match"), action);
|
||||||
log_detail(_("system identifier is %lu"), local_system_identifier);
|
log_detail(_("system identifier is %lu"), local_system_identifier);
|
||||||
|
|||||||
1
repmgr.h
1
repmgr.h
@@ -84,6 +84,7 @@
|
|||||||
|
|
||||||
#define UNKNOWN_TIMELINE_ID -1
|
#define UNKNOWN_TIMELINE_ID -1
|
||||||
#define UNKNOWN_SYSTEM_IDENTIFIER 0
|
#define UNKNOWN_SYSTEM_IDENTIFIER 0
|
||||||
|
#define UNKNOWN_DATA_CHECKSUM_VERSION -1
|
||||||
#define UNKNOWN_PID -1
|
#define UNKNOWN_PID -1
|
||||||
#define UNKNOWN_REPLICATION_LAG -1
|
#define UNKNOWN_REPLICATION_LAG -1
|
||||||
#define UNKNOWN_VALUE -1
|
#define UNKNOWN_VALUE -1
|
||||||
|
|||||||
Reference in New Issue
Block a user