mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgr: prevent a standby being cloned from a witness server
Previously repmgr would happily clone from whatever server it found at the provided source server address. We should ensure that a standby can only be cloned from a node which is part of the main replication cluster. This check fetches a list of nodes from the source server, connects to the first non-witness server it finds, and compares the system identifiers of the source node and the node it has connected to. If there is a mismatch, then the source server is clearly not part of the main replication cluster, and is most likely the witness server.
This commit is contained in:
1
HISTORY
1
HISTORY
@@ -8,6 +8,7 @@
|
||||
as reported by each individual node (Ian)
|
||||
repmgr: in "cluster show" and "daemon status", check if a node is attached
|
||||
to its advertised upstream node
|
||||
repmgr: prevent a standby being cloned from a witness server (Ian)
|
||||
repmgrd: monitor standbys attached to primary (Ian)
|
||||
general: documentation converted to DocBook XML format (Ian)
|
||||
|
||||
|
||||
39
dbutils.c
39
dbutils.c
@@ -1551,12 +1551,12 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
|
||||
}
|
||||
|
||||
|
||||
|
||||
bool
|
||||
identify_system(PGconn *repl_conn, t_system_identification *identification)
|
||||
{
|
||||
PGresult *res = NULL;
|
||||
|
||||
/* semicolon required here */
|
||||
res = PQexec(repl_conn, "IDENTIFY_SYSTEM;");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res))
|
||||
@@ -1576,6 +1576,43 @@ identify_system(PGconn *repl_conn, t_system_identification *identification)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Return the system identifier by querying pg_control_system().
|
||||
*
|
||||
* Note there is a similar function in controldata.c ("get_system_identifier()")
|
||||
* which reads the control file.
|
||||
*/
|
||||
uint64
|
||||
system_identifier(PGconn *conn)
|
||||
{
|
||||
uint64 system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
|
||||
PGresult *res = NULL;
|
||||
|
||||
/*
|
||||
* pg_control_system() was introduced in PostgreSQL 9.6
|
||||
*/
|
||||
if (PQserverVersion(conn) < 90600)
|
||||
{
|
||||
return UNKNOWN_SYSTEM_IDENTIFIER;
|
||||
}
|
||||
|
||||
res = PQexec(conn, "SELECT system_identifier FROM pg_catalog.pg_control_system()");
|
||||
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
log_db_error(conn, NULL, _("get_system_identifier(): unable to query pg_control_system()"));
|
||||
}
|
||||
else
|
||||
{
|
||||
system_identifier = atol(PQgetvalue(res, 0, 0));
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return system_identifier;
|
||||
}
|
||||
|
||||
|
||||
TimeLineHistoryEntry *
|
||||
get_timeline_history(PGconn *repl_conn, TimeLineID tli)
|
||||
{
|
||||
|
||||
@@ -440,8 +440,8 @@ RecoveryType get_recovery_type(PGconn *conn);
|
||||
int get_primary_node_id(PGconn *conn);
|
||||
int get_ready_archive_files(PGconn *conn, const char *data_directory);
|
||||
bool identify_system(PGconn *repl_conn, t_system_identification *identification);
|
||||
uint64 system_identifier(PGconn *conn);
|
||||
TimeLineHistoryEntry *get_timeline_history(PGconn *repl_conn, TimeLineID tli);
|
||||
bool get_child_nodes(PGconn *conn, int node_id, NodeInfoList *node_list);
|
||||
|
||||
/* repmgrd shared memory functions */
|
||||
bool repmgrd_set_local_node_id(PGconn *conn, int local_node_id);
|
||||
@@ -484,6 +484,7 @@ bool get_primary_node_record(PGconn *conn, t_node_info *node_info);
|
||||
bool get_all_node_records(PGconn *conn, NodeInfoList *node_list);
|
||||
void get_downstream_node_records(PGconn *conn, int node_id, NodeInfoList *nodes);
|
||||
void get_active_sibling_node_records(PGconn *conn, int node_id, int upstream_node_id, NodeInfoList *node_list);
|
||||
bool get_child_nodes(PGconn *conn, int node_id, NodeInfoList *node_list);
|
||||
void get_node_records_by_priority(PGconn *conn, NodeInfoList *node_list);
|
||||
bool get_all_node_records_with_upstream(PGconn *conn, NodeInfoList *node_list);
|
||||
bool get_downstream_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoList *node_list);
|
||||
|
||||
@@ -88,6 +88,14 @@
|
||||
warning if the node is not attached.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
<listitem>
|
||||
<para>
|
||||
<link linkend="repmgr-standby-clone"><command>repmgr standby clone</command></link>:
|
||||
prevent a standby from being cloned from a witness server.
|
||||
</para>
|
||||
</listitem>
|
||||
|
||||
</itemizedlist>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
@@ -4867,7 +4867,82 @@ check_source_server()
|
||||
|
||||
log_warning(_("repmgr extension not found on source node"));
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* If the source node is not a standby, retrieve its node records
|
||||
* and attempt to connect to one; we'll then compare
|
||||
* that node's system identifier to that of the source
|
||||
* connection, to ensure we're cloning from a node which is
|
||||
* part of the physical replication cluster. This is mainly
|
||||
* to prevent cloning a standby from a witness server.
|
||||
*
|
||||
* Note that it doesn't matter if the node from the node record
|
||||
* list is the same as the source node; also if the source node
|
||||
* does not have any node records, there's not a lot we can do.
|
||||
*
|
||||
* This check will only be carried out on PostgreSQL 9.6 and
|
||||
* later, as this is a precautionary check and we can retrieve the system
|
||||
* identifier with a normal connection.
|
||||
*/
|
||||
if (get_recovery_type(source_conn) == RECTYPE_PRIMARY && PQserverVersion(source_conn) >= 90600)
|
||||
{
|
||||
uint64 source_system_identifier = system_identifier(source_conn);
|
||||
|
||||
if (source_system_identifier != UNKNOWN_SYSTEM_IDENTIFIER)
|
||||
{
|
||||
NodeInfoList all_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||
NodeInfoListCell *cell = NULL;
|
||||
get_all_node_records(source_conn, &all_nodes);
|
||||
|
||||
log_debug("%i node records returned by source node", all_nodes.node_count);
|
||||
|
||||
/* loop through its nodes table */
|
||||
|
||||
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||
{
|
||||
|
||||
/* exclude the witness node, as its system identifier will be different, of course */
|
||||
if (cell->node_info->type == WITNESS)
|
||||
continue;
|
||||
|
||||
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||
if (PQstatus(cell->node_info->conn) == CONNECTION_OK)
|
||||
{
|
||||
uint64 test_system_identifier = system_identifier(cell->node_info->conn);
|
||||
PQfinish(cell->node_info->conn);
|
||||
|
||||
if (test_system_identifier != UNKNOWN_SYSTEM_IDENTIFIER)
|
||||
{
|
||||
if (source_system_identifier != test_system_identifier)
|
||||
{
|
||||
log_error(_("source node's system identifier does not match other nodes in the replication cluster"));
|
||||
log_detail(_("source node's system identifier is %lu, replication cluster member \"%s\"'s system identifier is %lu"),
|
||||
source_system_identifier,
|
||||
cell->node_info->node_name,
|
||||
test_system_identifier);
|
||||
log_hint(_("check that the source node is not a witness server"));
|
||||
PQfinish(source_conn);
|
||||
source_conn = NULL;
|
||||
|
||||
if (superuser_conn != NULL)
|
||||
PQfinish(superuser_conn);
|
||||
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
/* identifiers match - our work here is done */
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
PQfinish(cell->node_info->conn);
|
||||
}
|
||||
}
|
||||
clear_node_info_list(&all_nodes);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Fetch the source's data directory */
|
||||
get_superuser_connection(&source_conn, &superuser_conn, &privileged_conn);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user