repmgrd: improve primary visibility consensus check

When counting the nodes which can still see the primary, exclude sibling
nodes which report they're following a different upstream node. This
shouldn't normally happen, but it is possible.
This commit is contained in:
Ian Barwick
2019-04-04 16:03:55 +09:00
parent 008bd00a59
commit cd6a55c7cb
4 changed files with 32 additions and 16 deletions

View File

@@ -4993,7 +4993,8 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
" END AS replication_lag_time, "
" last_wal_receive_lsn >= last_wal_replay_lsn AS receiving_streamed_wal, "
" wal_replay_paused, "
" upstream_last_seen "
" upstream_last_seen, "
" upstream_node_id "
" FROM ( "
" SELECT CURRENT_TIMESTAMP AS ts, "
" pg_catalog.pg_is_in_recovery() AS in_recovery, "
@@ -5033,10 +5034,12 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
" END AS wal_replay_paused, ");
}
/* Add information about upstream node from shared memory */
if (node_type == WITNESS)
{
appendPQExpBufferStr(&query,
" repmgr.get_upstream_last_seen() AS upstream_last_seen");
" repmgr.get_upstream_last_seen() AS upstream_last_seen, "
" repmgr.get_upstream_node_id() AS upstream_node_id ");
}
else
{
@@ -5044,7 +5047,12 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
" THEN -1 "
" ELSE repmgr.get_upstream_last_seen() "
" END AS upstream_last_seen ");
" END AS upstream_last_seen, ");
appendPQExpBufferStr(&query,
" CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
" THEN -1 "
" ELSE repmgr.get_upstream_node_id() "
" END AS upstream_node_id ");
}
appendPQExpBufferStr(&query,
@@ -5075,6 +5083,7 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 6));
replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 7));
replication_info->upstream_last_seen = atoi(PQgetvalue(res, 0, 8));
replication_info->upstream_node_id = atoi(PQgetvalue(res, 0, 9));
}
termPQExpBuffer(&query);

View File

@@ -310,6 +310,7 @@ typedef struct
bool receiving_streamed_wal;
bool wal_replay_paused;
int upstream_last_seen;
int upstream_node_id;
} ReplInfo;
typedef struct

View File

@@ -436,10 +436,6 @@ get_upstream_node_id(PG_FUNCTION_ARGS)
if (!shared_state)
PG_RETURN_NULL();
/* A primary node cannot have an upstream ID */
if (!RecoveryInProgress())
PG_RETURN_INT32(UNKNOWN_NODE_ID);
LWLockAcquire(shared_state->lock, LW_SHARED);
upstream_node_id = shared_state->upstream_node_id;
LWLockRelease(shared_state->lock);

View File

@@ -3633,15 +3633,25 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id)
if (sibling_replication_info.upstream_last_seen >= 0 && sibling_replication_info.upstream_last_seen < (config_file_options.monitor_interval_secs * 2))
{
nodes_with_primary_still_visible++;
log_notice(_("node %i last saw primary node %i second(s) ago, considering primary still visible"),
cell->node_info->node_id,
sibling_replication_info.upstream_last_seen);
appendPQExpBuffer(&nodes_with_primary_visible,
" - node \"%s\" (ID: %i): %i second(s) ago\n",
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_last_seen);
if (sibling_replication_info.upstream_node_id != upstream_node_info.node_id)
{
log_warning(_("assumed sibling node %i monitoring different upstream node %i"),
cell->node_info->node_id,
sibling_replication_info.upstream_node_id);
}
else
{
nodes_with_primary_still_visible++;
log_notice(_("node %i last saw primary node %i second(s) ago, considering primary still visible"),
cell->node_info->node_id,
sibling_replication_info.upstream_last_seen);
appendPQExpBuffer(&nodes_with_primary_visible,
" - node \"%s\" (ID: %i): %i second(s) ago\n",
cell->node_info->node_name,
cell->node_info->node_id,
sibling_replication_info.upstream_last_seen);
}
}
else
{