repmgr: prevent a standby being cloned from a witness server

Previously repmgr would happily clone from whatever server
it found at the provided source server address. We should
ensure that a standby can only be cloned from a node which
is part of the main replication cluster.

This check fetches a list of nodes from the source server,
connects to the first non-witness server it finds, and
compares the system identifiers of the source node and the
node it has connected to. If there is a mismatch, then the
source server is clearly not part of the main replication
cluster, and is most likely the witness server.
This commit is contained in:
Ian Barwick
2019-05-22 16:29:41 +09:00
parent fa66e72c2f
commit c9e85996f5
5 changed files with 124 additions and 2 deletions

View File

@@ -8,6 +8,7 @@
as reported by each individual node (Ian)
repmgr: in "cluster show" and "daemon status", check if a node is attached
to its advertised upstream node
repmgr: prevent a standby being cloned from a witness server (Ian)
repmgrd: monitor standbys attached to primary (Ian)
general: documentation converted to DocBook XML format (Ian)

View File

@@ -1551,12 +1551,12 @@ get_ready_archive_files(PGconn *conn, const char *data_directory)
}
bool
identify_system(PGconn *repl_conn, t_system_identification *identification)
{
PGresult *res = NULL;
/* semicolon required here */
res = PQexec(repl_conn, "IDENTIFY_SYSTEM;");
if (PQresultStatus(res) != PGRES_TUPLES_OK || !PQntuples(res))
@@ -1576,6 +1576,43 @@ identify_system(PGconn *repl_conn, t_system_identification *identification)
}
/*
 * Return the system identifier by querying pg_control_system().
 *
 * Returns UNKNOWN_SYSTEM_IDENTIFIER if the server is older than
 * PostgreSQL 9.6, if the query fails, or if the query returns no
 * usable value.
 *
 * Note there is a similar function in controldata.c ("get_system_identifier()")
 * which reads the control file.
 */
uint64
system_identifier(PGconn *conn)
{
	uint64		sysid = UNKNOWN_SYSTEM_IDENTIFIER;
	PGresult   *res = NULL;

	/*
	 * pg_control_system() was introduced in PostgreSQL 9.6
	 */
	if (PQserverVersion(conn) < 90600)
	{
		return UNKNOWN_SYSTEM_IDENTIFIER;
	}

	res = PQexec(conn, "SELECT system_identifier FROM pg_catalog.pg_control_system()");

	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_db_error(conn, NULL, _("system_identifier(): unable to query pg_control_system()"));
	}
	else if (PQntuples(res) > 0 && !PQgetisnull(res, 0, 0))
	{
		/*
		 * The identifier is an unsigned 64-bit value: atol() would
		 * truncate it where "long" is 32 bits wide, and its behaviour is
		 * undefined for values above LONG_MAX, so parse it as unsigned
		 * long long instead.
		 */
		sysid = (uint64) strtoull(PQgetvalue(res, 0, 0), NULL, 10);
	}

	/* PQclear() is safe to call even if PQexec() returned NULL */
	PQclear(res);

	return sysid;
}
TimeLineHistoryEntry *
get_timeline_history(PGconn *repl_conn, TimeLineID tli)
{

View File

@@ -440,8 +440,8 @@ RecoveryType get_recovery_type(PGconn *conn);
int get_primary_node_id(PGconn *conn);
int get_ready_archive_files(PGconn *conn, const char *data_directory);
bool identify_system(PGconn *repl_conn, t_system_identification *identification);
uint64 system_identifier(PGconn *conn);
TimeLineHistoryEntry *get_timeline_history(PGconn *repl_conn, TimeLineID tli);
bool get_child_nodes(PGconn *conn, int node_id, NodeInfoList *node_list);
/* repmgrd shared memory functions */
bool repmgrd_set_local_node_id(PGconn *conn, int local_node_id);
@@ -484,6 +484,7 @@ bool get_primary_node_record(PGconn *conn, t_node_info *node_info);
bool get_all_node_records(PGconn *conn, NodeInfoList *node_list);
void get_downstream_node_records(PGconn *conn, int node_id, NodeInfoList *nodes);
void get_active_sibling_node_records(PGconn *conn, int node_id, int upstream_node_id, NodeInfoList *node_list);
bool get_child_nodes(PGconn *conn, int node_id, NodeInfoList *node_list);
void get_node_records_by_priority(PGconn *conn, NodeInfoList *node_list);
bool get_all_node_records_with_upstream(PGconn *conn, NodeInfoList *node_list);
/* NOTE(review): presumably populates node_list with the downstream nodes of this_node_id which lack a replication slot - confirm against the definition */
bool get_downstream_nodes_with_missing_slot(PGconn *conn, int this_node_id, NodeInfoList *node_list);

View File

@@ -88,6 +88,14 @@
warning if the node is not attached.
</para>
</listitem>
<listitem>
<para>
<link linkend="repmgr-standby-clone"><command>repmgr standby clone</command></link>:
prevent a standby from being cloned from a witness server.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>

View File

@@ -4867,7 +4867,82 @@ check_source_server()
log_warning(_("repmgr extension not found on source node"));
}
else
{
/*
* If upstream is not a standby, retrieve its node records
* and attempt to connect to one; we'll then compare
* that node's system identifier to that of the source
* connection, to ensure we're cloning from a node which is
* part of the physical replication cluster. This is mainly
* to prevent cloning a standby from a witness server.
*
* Note that it doesn't matter if the node from the node record
* list is the same as the source node; also if the source node
* does not have any node records, there's not a lot we can do.
*
* This check will only be carried out on PostgreSQL 9.6 and later:
* it is a precautionary check, and only from 9.6 onwards can the system
* identifier be retrieved over a normal (non-replication) connection.
*/
if (get_recovery_type(source_conn) == RECTYPE_PRIMARY && PQserverVersion(source_conn) >= 90600)
{
uint64 source_system_identifier = system_identifier(source_conn);
if (source_system_identifier != UNKNOWN_SYSTEM_IDENTIFIER)
{
NodeInfoList all_nodes = T_NODE_INFO_LIST_INITIALIZER;
NodeInfoListCell *cell = NULL;
get_all_node_records(source_conn, &all_nodes);
log_debug("%i node records returned by source node", all_nodes.node_count);
/* loop through its nodes table */
for (cell = all_nodes.head; cell; cell = cell->next)
{
/* exclude the witness node, as its system identifier will be different, of course */
if (cell->node_info->type == WITNESS)
continue;
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
if (PQstatus(cell->node_info->conn) == CONNECTION_OK)
{
uint64 test_system_identifier = system_identifier(cell->node_info->conn);
PQfinish(cell->node_info->conn);
if (test_system_identifier != UNKNOWN_SYSTEM_IDENTIFIER)
{
if (source_system_identifier != test_system_identifier)
{
log_error(_("source node's system identifier does not match other nodes in the replication cluster"));
log_detail(_("source node's system identifier is %lu, replication cluster member \"%s\"'s system identifier is %lu"),
source_system_identifier,
cell->node_info->node_name,
test_system_identifier);
log_hint(_("check that the source node is not a witness server"));
PQfinish(source_conn);
source_conn = NULL;
if (superuser_conn != NULL)
PQfinish(superuser_conn);
exit(ERR_BAD_CONFIG);
}
/* identifiers match - our work here is done */
break;
}
}
else
{
PQfinish(cell->node_info->conn);
}
}
clear_node_info_list(&all_nodes);
}
}
}
/* Fetch the source's data directory */
get_superuser_connection(&source_conn, &superuser_conn, &privileged_conn);