Terminate repmgrd if standby is no longer connected to upstream

This commit is contained in:
Ian Barwick
2015-10-28 16:05:35 +09:00
committed by Ian Barwick
parent a70a44605f
commit d353fe2a9f
3 changed files with 58 additions and 15 deletions

View File

@@ -355,6 +355,7 @@ Following event types currently exist:
standby_promote
witness_create
repmgrd_start
repmgrd_monitor
repmgrd_failover_promote
repmgrd_failover_follow
@@ -585,20 +586,20 @@ and one view:
`repmgr` or `repmgrd` will return one of the following error codes on program
exit:
* SUCCESS (0) Program ran successfully.
* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid
* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error
* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed
* ERR_DB_CON (6) Error when trying to connect to a database
* ERR_DB_QUERY (7) Error while executing a database query
* ERR_PROMOTED (8) Exiting program because the node has been promoted to master
* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected
* ERR_STR_OVERFLOW (10) String overflow error
* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only)
* ERR_BAD_SSH (12) Error when connecting to remote host via SSH
* ERR_SYS_FAILURE (13) Error when forking (repmgrd only)
* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
* SUCCESS (0) Program ran successfully.
* ERR_BAD_CONFIG (1) Configuration file could not be parsed or was invalid
* ERR_BAD_RSYNC (2) An rsync call made by the program returned an error
* ERR_NO_RESTART (4) An attempt to restart a PostgreSQL instance failed
* ERR_DB_CON (6) Error when trying to connect to a database
* ERR_DB_QUERY (7) Error while executing a database query
* ERR_PROMOTED (8) Exiting program because the node has been promoted to master
* ERR_BAD_PASSWORD (9) Password used to connect to a database was rejected
* ERR_STR_OVERFLOW (10) String overflow error
* ERR_FAILOVER_FAIL (11) Error encountered during failover (repmgrd only)
* ERR_BAD_SSH (12) Error when connecting to remote host via SSH
* ERR_SYS_FAILURE (13) Error when forking (repmgrd only)
* ERR_BAD_BASEBACKUP (14) Error when executing pg_basebackup
* ERR_MONITORING_FAIL (16) Unrecoverable error encountered during monitoring (repmgrd only)
Support and Assistance
----------------------

View File

@@ -36,5 +36,6 @@
#define ERR_SYS_FAILURE 13
#define ERR_BAD_BASEBACKUP 14
#define ERR_INTERNAL 15
#define ERR_MONITORING_FAIL 16
#endif /* _ERRCODE_H_ */

View File

@@ -677,6 +677,7 @@ standby_monitor(void)
char last_wal_standby_received[MAXLEN];
char last_wal_standby_applied[MAXLEN];
char last_wal_standby_applied_timestamp[MAXLEN];
bool last_wal_standby_received_gte_applied;
char sqlquery[QUERY_STR_LEN];
XLogRecPtr lsn_master;
@@ -956,7 +957,8 @@ standby_monitor(void)
/* Get local xlog info */
sqlquery_snprintf(sqlquery,
"SELECT CURRENT_TIMESTAMP, pg_last_xlog_receive_location(), "
"pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp() ");
"pg_last_xlog_replay_location(), pg_last_xact_replay_timestamp(), "
"pg_last_xlog_receive_location() >= pg_last_xlog_replay_location()");
res = PQexec(my_local_conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -971,8 +973,47 @@ standby_monitor(void)
strncpy(last_wal_standby_received, PQgetvalue(res, 0, 1), MAXLEN);
strncpy(last_wal_standby_applied, PQgetvalue(res, 0, 2), MAXLEN);
strncpy(last_wal_standby_applied_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
last_wal_standby_received_gte_applied = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
? true
: false;
PQclear(res);
/*
* Check that last WAL received is greater or equal to last WAL applied
*
* This situation can occur when the standby is no longer connected to
* the upstream node; in this case repmgrd should terminate itself
* as the node may no longer be capable of being promoted or following
* a new upstream node
*
* XXX check if we should (optionally) adopt other strategies to handle
* this situation
*/
if(last_wal_standby_received_gte_applied == false)
{
PQExpBufferData errmsg;
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
/* XXX improve message */
_("This node is no longer connected to its upstream node - terminating"));
log_crit("%s\n", errmsg.data);
create_event_record(master_conn,
&local_options,
local_options.node,
"repmgrd_monitor",
false,
errmsg.data);
// XXX use better code
terminate(ERR_MONITORING_FAIL);
}
/* Get master xlog info */
sqlquery_snprintf(sqlquery, "SELECT pg_current_xlog_location()");