mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 17:06:29 +00:00
"node rejoin": verify status of rejoin target
This adapts the code previously added to "standby follow" to verify whether the rejoin target can actually be rejoined.
This commit is contained in:
2
HISTORY
2
HISTORY
@@ -5,6 +5,8 @@
|
|||||||
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
repmgr: add "node check --data-directory-config"; GitHub #523 (Ian)
|
||||||
repmgr: ensure "standby switchover" verifies repmgr can read the
|
repmgr: ensure "standby switchover" verifies repmgr can read the
|
||||||
data directory on the demotion candidate; GitHub #523 (Ian)
|
data directory on the demotion candidate; GitHub #523 (Ian)
|
||||||
|
repmgr: when executing "standby follow" and "node rejoin", check that
|
||||||
|
it will actually be possible to stream from the target node (Ian)
|
||||||
repmgr: "standby switchover": improve handling of connection URIs when
|
repmgr: "standby switchover": improve handling of connection URIs when
|
||||||
executing "node rejoin" on the demotion candidate; GitHub #525 (Ian)
|
executing "node rejoin" on the demotion candidate; GitHub #525 (Ian)
|
||||||
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
|
||||||
|
|||||||
@@ -26,6 +26,28 @@
|
|||||||
<para>
|
<para>
|
||||||
<itemizedlist>
|
<itemizedlist>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>:
|
||||||
|
option <option>--upstream-node-id</option> can now be used to specify another standby
|
||||||
|
to follow.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<link linkend="repmgr-standby-follow"><command>repmgr standby follow</command></link>:
|
||||||
|
verify that it is actually possible to follow another node.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>:
|
||||||
|
verify that it is actually possible to attach the node to the current primary.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
Add <option>--terse</option> to <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command> (GitHub #521).
|
Add <option>--terse</option> to <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command> (GitHub #521).
|
||||||
|
|||||||
@@ -2240,8 +2240,24 @@ do_node_rejoin(void)
|
|||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
// sanity-check that it will actually be possible to stream from the new upstream
|
/*
|
||||||
|
* sanity-check that it will actually be possible to stream from the new upstream
|
||||||
|
*/
|
||||||
|
{
|
||||||
|
bool can_follow;
|
||||||
|
|
||||||
|
can_follow = check_node_can_attach(get_timeline(config_file_options.data_directory),
|
||||||
|
get_min_recovery_location(config_file_options.data_directory),
|
||||||
|
upstream_conn,
|
||||||
|
&primary_node_record,
|
||||||
|
true);
|
||||||
|
|
||||||
|
if (can_follow == false)
|
||||||
|
{
|
||||||
|
PQfinish(upstream_conn);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
@@ -2489,8 +2489,10 @@ do_standby_follow(void)
|
|||||||
if (PQstatus(local_repl_conn) != CONNECTION_OK)
|
if (PQstatus(local_repl_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_error(_("unable to establish a replication connection to the local node"));
|
log_error(_("unable to establish a replication connection to the local node"));
|
||||||
|
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
PQfinish(follow_target_conn);
|
PQfinish(follow_target_conn);
|
||||||
|
|
||||||
exit(ERR_FOLLOW_FAIL);
|
exit(ERR_FOLLOW_FAIL);
|
||||||
}
|
}
|
||||||
else if (runtime_options.dry_run == true)
|
else if (runtime_options.dry_run == true)
|
||||||
@@ -2499,21 +2501,24 @@ do_standby_follow(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
success = identify_system(local_repl_conn, &local_identification);
|
success = identify_system(local_repl_conn, &local_identification);
|
||||||
|
PQfinish(local_repl_conn);
|
||||||
|
|
||||||
if (success == false)
|
if (success == false)
|
||||||
{
|
{
|
||||||
log_error(_("unable to query the local node's system identification"));
|
log_error(_("unable to query the local node's system identification"));
|
||||||
|
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
|
|
||||||
PQfinish(follow_target_conn);
|
PQfinish(follow_target_conn);
|
||||||
|
|
||||||
exit(ERR_FOLLOW_FAIL);
|
exit(ERR_FOLLOW_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
PQfinish(local_repl_conn);
|
|
||||||
|
|
||||||
can_follow = check_node_can_attach(local_identification.timeline,
|
can_follow = check_node_can_attach(local_identification.timeline,
|
||||||
local_xlogpos,
|
local_xlogpos,
|
||||||
follow_target_conn,
|
follow_target_conn,
|
||||||
&follow_target_node_record);
|
&follow_target_node_record,
|
||||||
|
false);
|
||||||
|
|
||||||
if (can_follow == false)
|
if (can_follow == false)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -253,6 +253,6 @@ extern void init_node_record(t_node_info *node_record);
|
|||||||
extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *reason);
|
||||||
extern void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
extern void drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||||
|
|
||||||
extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record);
|
extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin);
|
||||||
|
|
||||||
#endif /* _REPMGR_CLIENT_GLOBAL_H_ */
|
#endif /* _REPMGR_CLIENT_GLOBAL_H_ */
|
||||||
|
|||||||
@@ -3173,14 +3173,14 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
|
|||||||
* can actually be followed.
|
* can actually be followed.
|
||||||
*/
|
*/
|
||||||
bool
|
bool
|
||||||
check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record)
|
check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin)
|
||||||
{
|
{
|
||||||
uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
||||||
t_conninfo_param_list follow_target_repl_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
t_conninfo_param_list follow_target_repl_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
||||||
PGconn *follow_target_repl_conn = NULL;
|
PGconn *follow_target_repl_conn = NULL;
|
||||||
t_system_identification follow_target_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER;
|
t_system_identification follow_target_identification = T_SYSTEM_IDENTIFICATION_INITIALIZER;
|
||||||
TimeLineHistoryEntry *follow_target_history = NULL;
|
TimeLineHistoryEntry *follow_target_history = NULL;
|
||||||
bool success;
|
bool success = true;
|
||||||
|
|
||||||
/* check replication connection */
|
/* check replication connection */
|
||||||
initialize_conninfo_params(&follow_target_repl_conninfo, false);
|
initialize_conninfo_params(&follow_target_repl_conninfo, false);
|
||||||
@@ -3210,10 +3210,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* check system_identifiers match */
|
/* check system_identifiers match */
|
||||||
local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
if (identify_system(follow_target_repl_conn, &follow_target_identification) == false)
|
||||||
success = identify_system(follow_target_repl_conn, &follow_target_identification);
|
|
||||||
|
|
||||||
if (success == false)
|
|
||||||
{
|
{
|
||||||
log_error(_("unable to query the follow target node's system identification"));
|
log_error(_("unable to query the follow target node's system identification"));
|
||||||
|
|
||||||
@@ -3221,6 +3218,11 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check for things that should never happen, but expect the unexpected anyway.
|
||||||
|
*/
|
||||||
if (follow_target_identification.system_identifier != local_system_identifier)
|
if (follow_target_identification.system_identifier != local_system_identifier)
|
||||||
{
|
{
|
||||||
log_error(_("this node is not part of the follow target node's replication cluster"));
|
log_error(_("this node is not part of the follow target node's replication cluster"));
|
||||||
@@ -3230,13 +3232,13 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
PQfinish(follow_target_repl_conn);
|
PQfinish(follow_target_repl_conn);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else if (runtime_options.dry_run == true)
|
|
||||||
|
if (runtime_options.dry_run == true)
|
||||||
{
|
{
|
||||||
log_info(_("local and follow target system identifiers match"));
|
log_info(_("local and follow target system identifiers match"));
|
||||||
log_detail(_("system identifier is %lu"), local_system_identifier);
|
log_detail(_("system identifier is %lu"), local_system_identifier);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* check timelines */
|
/* check timelines */
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "local timeline: %i; follow target timeline: %i",
|
log_verbose(LOG_DEBUG, "local timeline: %i; follow target timeline: %i",
|
||||||
@@ -3275,12 +3277,21 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
log_error(_("this node is ahead of the follow target"));
|
const char *error_msg = _("this node is ahead of the follow target");
|
||||||
|
|
||||||
|
if (is_rejoin == true && runtime_options.force_rewind_used == true)
|
||||||
|
{
|
||||||
|
log_warning("%s", error_msg);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_error("%s", error_msg);
|
||||||
|
success = false;
|
||||||
|
}
|
||||||
|
|
||||||
log_detail(_("local node lsn is %X/%X, follow target lsn is %X/%X"),
|
log_detail(_("local node lsn is %X/%X, follow target lsn is %X/%X"),
|
||||||
format_lsn(local_xlogpos),
|
format_lsn(local_xlogpos),
|
||||||
format_lsn(follow_target_xlogpos));
|
format_lsn(follow_target_xlogpos));
|
||||||
PQfinish(follow_target_repl_conn);
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -3305,17 +3316,33 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
* not contain all changes which are being replayed to this standby.
|
* not contain all changes which are being replayed to this standby.
|
||||||
*/
|
*/
|
||||||
if (local_xlogpos > follow_target_history->end)
|
if (local_xlogpos > follow_target_history->end)
|
||||||
|
{
|
||||||
|
if (is_rejoin == true && runtime_options.force_rewind_used == true)
|
||||||
|
{
|
||||||
|
log_notice(_("pg_rewind execution required for this node to attach to follow target node %i"),
|
||||||
|
follow_target_node_record->node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
log_error(_("this node cannot attach to follow target node %i"),
|
log_error(_("this node cannot attach to follow target node %i"),
|
||||||
follow_target_node_record->node_id);
|
follow_target_node_record->node_id);
|
||||||
log_detail(_("follow target server's timeline %i forked off current database system timeline %i before current recovery point %X/%X\n"),
|
success = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
log_detail(_("follow target server's timeline %i forked off current database system timeline %i before current recovery point %X/%X"),
|
||||||
local_tli + 1,
|
local_tli + 1,
|
||||||
local_tli,
|
local_tli,
|
||||||
format_lsn(local_xlogpos));
|
format_lsn(local_xlogpos));
|
||||||
return false;
|
|
||||||
|
if (is_rejoin == true && runtime_options.force_rewind_used == false)
|
||||||
|
{
|
||||||
|
log_hint(_("use --force-rewind to execute pg_rewind"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (runtime_options.dry_run == true)
|
if (success == true && runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
if (is_rejoin == false || (is_rejoin == true && runtime_options.force_rewind_used == false))
|
||||||
{
|
{
|
||||||
log_info(_("local node %i can attach to target node %i"),
|
log_info(_("local node %i can attach to target node %i"),
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
@@ -3326,6 +3353,8 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
|
|||||||
format_lsn(follow_target_history->end));
|
format_lsn(follow_target_history->end));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return true;
|
|
||||||
|
PQfinish(follow_target_repl_conn);
|
||||||
|
return success;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user