Further improve handling of possible pg_control read errors

Builds on changes in commit 147f454, and ensures appropriate
action is taken if a value cannot be read from pg_control.
This commit is contained in:
Ian Barwick
2020-09-28 13:54:18 +09:00
parent 3ffeffbd8b
commit 70061c51aa
7 changed files with 79 additions and 28 deletions

View File

@@ -14,6 +14,7 @@
demotion candidate during "standby switchover" (Ian)
repmgr: fix issue with tablespace mapping when cloning from Barman;
GitHub #650 (Ian)
repmgr: improve handling of pg_control read errors (Ian)
repmgrd: add additional optional parameters to "failover_validation command"
(spaskalev; GitHub #651)
repmgrd: ensure primary connection is reset if same as upstream;

View File

@@ -100,19 +100,21 @@ get_system_identifier(const char *data_directory)
}
DBState
get_db_state(const char *data_directory)
bool
get_db_state(const char *data_directory, DBState *state)
{
ControlFileInfo *control_file_info = NULL;
DBState state;
bool control_file_processed;
control_file_info = get_controlfile(data_directory);
control_file_processed = control_file_info->control_file_processed;
state = control_file_info->state;
if (control_file_processed == true)
*state = control_file_info->state;
pfree(control_file_info);
return state;
return control_file_processed;
}
@@ -137,7 +139,7 @@ int
get_data_checksum_version(const char *data_directory)
{
ControlFileInfo *control_file_info = NULL;
int data_checksum_version = -1;
int data_checksum_version = UNKNOWN_DATA_CHECKSUM_VERSION;
control_file_info = get_controlfile(data_directory);

View File

@@ -377,7 +377,7 @@ typedef struct ControlFileData12
#endif
extern int get_pg_version(const char *data_directory, char *version_string);
extern DBState get_db_state(const char *data_directory);
extern bool get_db_state(const char *data_directory, DBState *state);
extern const char *describe_db_state(DBState state);
extern int get_data_checksum_version(const char *data_directory);
extern uint64 get_system_identifier(const char *data_directory);

View File

@@ -92,7 +92,11 @@
report database connection error if the <option>--optformat</option> was provided.
</para>
</listitem>
<listitem>
<para>
Improve handling of pg_control read errors.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>

View File

@@ -134,14 +134,23 @@ do_node_status(void)
if (runtime_options.verbose == true)
{
uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
local_system_identifier = get_system_identifier(config_file_options.data_directory);
uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory);
if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
{
key_value_list_set(&node_status,
"System identifier",
"unknown");
item_list_append_format(&warnings,
_("unable to retrieve system identifier from pg_control"));
}
else
{
key_value_list_set_format(&node_status,
"System identifier",
"%lu", local_system_identifier);
}
}
key_value_list_set(&node_status,
"Role",
@@ -638,9 +647,17 @@ _do_node_status_is_shutdown_cleanly(void)
break;
}
/* check what pg_controldata says */
/* check what pg_control says */
db_state = get_db_state(config_file_options.data_directory);
if (get_db_state(config_file_options.data_directory, &db_state) == false)
{
/*
* Unable to retrieve the database state from pg_control
*/
node_status = NODE_STATUS_UNKNOWN;
log_verbose(LOG_DEBUG, "unable to determine db state");
goto return_state;
}
log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
@@ -659,21 +676,23 @@ _do_node_status_is_shutdown_cleanly(void)
checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
/* unable to read pg_control, don't know what's happening */
if (checkPoint == InvalidXLogRecPtr)
{
/* unable to read pg_control, don't know what's happening */
node_status = NODE_STATUS_UNKNOWN;
}
else if (node_status == NODE_STATUS_UNKNOWN)
{
/*
* if still "UNKNOWN" at this point, then the node must be cleanly shut
* down
*/
else if (node_status == NODE_STATUS_UNKNOWN)
{
node_status = NODE_STATUS_DOWN;
}
return_state:
log_verbose(LOG_DEBUG, "node status determined as: %s",
print_node_status(node_status));
@@ -2504,7 +2523,11 @@ do_node_rejoin(void)
break;
}
db_state = get_db_state(config_file_options.data_directory);
if (get_db_state(config_file_options.data_directory, &db_state) == false)
{
log_error(_("unable to determine database state from pg_control"));
exit(ERR_BAD_CONFIG);
}
if (is_shutdown == false)
{

View File

@@ -3612,7 +3612,7 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
{
int data_checksum_version = get_data_checksum_version(data_directory);
if (data_checksum_version < 0)
if (data_checksum_version == UNKNOWN_DATA_CHECKSUM_VERSION)
{
if (can_use == false)
appendPQExpBuffer(reason, "; ");
@@ -4081,10 +4081,31 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
local_system_identifier = get_system_identifier(config_file_options.data_directory);
/*
* Check for thing that should never happen, but expect the unexpected anyway.
* Check for things that should never happen, but expect the unexpected anyway.
*/
if (follow_target_identification.system_identifier != local_system_identifier)
if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
{
/*
* We don't return immediately here so subsequent checks can be
* made, but indicate the node will not be able to rejoin.
*/
success = false;
if (runtime_options.dry_run == true)
{
log_warning(_("unable to retrieve system identifier from pg_control"));
}
else
{
log_error(_("unable to retrieve system identifier from pg_control, aborting"));
}
}
else if (follow_target_identification.system_identifier != local_system_identifier)
{
/*
* It's never going to be possible to rejoin a node from another cluster,
* so no need to bother with further checks.
*/
log_error(_("this node is not part of the %s target node's replication cluster"), action);
log_detail(_("this node's system identifier is %lu, %s target node's system identifier is %lu"),
local_system_identifier,
@@ -4093,8 +4114,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
PQfinish(follow_target_repl_conn);
return false;
}
if (runtime_options.dry_run == true)
else if (runtime_options.dry_run == true)
{
log_info(_("local and %s target system identifiers match"), action);
log_detail(_("system identifier is %lu"), local_system_identifier);

View File

@@ -84,6 +84,7 @@
#define UNKNOWN_TIMELINE_ID -1
#define UNKNOWN_SYSTEM_IDENTIFIER 0
#define UNKNOWN_DATA_CHECKSUM_VERSION -1
#define UNKNOWN_PID -1
#define UNKNOWN_REPLICATION_LAG -1
#define UNKNOWN_VALUE -1