From e9a25c367a478abe1f2537fefa75b26a886a45f0 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Mon, 4 Jan 2016 13:31:50 +0900 Subject: [PATCH] Prevent invalid replication_lag values being written to the monitoring table A fix for this was introduced with commit ee9270fe8dc652fbd8c7c5a29f326ce9717c33f0 and removed in 4f1c67a1bf91d6a07748070b9a9385918fefb026. Refactor the original fix to simply omit attempting to write an invalid entry to the monitoring table. --- repmgrd.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/repmgrd.c b/repmgrd.c index 1784240c..5b3016d1 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -658,6 +658,7 @@ standby_monitor(void) char last_wal_standby_received[MAXLEN]; char last_wal_standby_applied[MAXLEN]; char last_wal_standby_applied_timestamp[MAXLEN]; + bool last_wal_standby_received_gte_replayed; char sqlquery[QUERY_STR_LEN]; XLogRecPtr lsn_master; @@ -824,7 +825,7 @@ standby_monitor(void) PQfinish(upstream_conn); - continue_monitoring_standby: + continue_monitoring_standby: /* Check if we still are a standby, we could have been promoted */ do { @@ -884,7 +885,6 @@ standby_monitor(void) if (!monitoring_history) return; - /* * If original master has gone away we'll need to get the new one * from the upstream node to write monitoring information @@ -946,7 +946,8 @@ standby_monitor(void) /* Get local xlog info */ sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), " - "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp()"); + "pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), " + "pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()"); res = PQexec(my_local_conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) @@ -961,9 +962,27 @@ standby_monitor(void) strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN); strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN); strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN); + last_wal_standby_received_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0) + ? true + : false; PQclear(res); + /* + * In the unusual event of a standby becoming disconnected from the primary, + * while this repmgrd remains connected to the primary, subtracting + * "lsn_standby_applied" from "lsn_standby_received" and coercing to + * (long long unsigned int) will result in a meaningless, very large + * value which will overflow a BIGINT column and spew error messages into the + * PostgreSQL log. In the absence of a better strategy, skip attempting + * to insert a monitoring record. + */ + if (last_wal_standby_received_gte_replayed == false) + { + log_verbose(LOG_WARNING, + "Invalid replication_lag value calculated - is this standby connected to its upstream?\n"); + return; + } /* Get master xlog info */ sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()");