From 951c7dbd07f9bba2300be77b8af73342fa07bb88 Mon Sep 17 00:00:00 2001
From: Ian Barwick
Date: Fri, 14 Jul 2017 15:01:18 +0900
Subject: [PATCH] repmgrd: in BDR mode, have each repmgrd monitor each node

This covers both the case where an entire node (including repmgrd) goes
down, and the case where one PostgreSQL instance goes down but its
repmgrd is still running (in which case only one of the repmgrds will
handle the failover).
---
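Note for reviewers (this section is ignored by "git am"): the sketch below
illustrates how a monitoring process is expected to use the SQL interface
added by this patch to claim the failover-handler role. It is a minimal
standalone sketch, not repmgrd's actual monitoring loop; the conninfo
string and the node ID are illustrative placeholders.

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>
    #include <libpq-fe.h>

    int
    main(void)
    {
        /* hypothetical connection string for a surviving BDR node */
        PGconn     *conn = PQconnectdb("host=node2 dbname=bdrdb user=repmgr");
        PGresult   *res;
        bool        am_handler = false;

        if (PQstatus(conn) != CONNECTION_OK)
        {
            fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
            PQfinish(conn);
            return 1;
        }

        /* the first node ID to register itself wins; later callers get 'f' */
        res = PQexec(conn, "SELECT repmgr.am_bdr_failover_handler(1)");
        if (PQresultStatus(res) == PGRES_TUPLES_OK)
            am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0);
        PQclear(res);

        if (am_handler)
        {
            /* ... perform the failover steps here ... */

            /* release the handler slot once the failover is complete */
            PQclear(PQexec(conn, "SELECT repmgr.unset_bdr_failover_handler()"));
        }

        PQfinish(conn);
        return 0;
    }

Repeat calls with a different node ID return false until
repmgr.unset_bdr_failover_handler() is executed, which is what prevents two
repmgrd instances from both acting on the same failure.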
 dbutils.c               |  44 +++++++++++-
 dbutils.h               |  11 ++-
 repmgr--4.0.sql         |  12 ++++
 repmgr-action-standby.c |   2 -
 repmgr.c                |  46 ++++++++++++
 repmgrd-bdr.c           | 144 ++++++++++++++++++++++-----------------
 repmgrd-physical.c      |  21 +-----
 repmgrd.c               |  19 ++++++
 repmgrd.h               |   6 +-
 9 files changed, 216 insertions(+), 89 deletions(-)

diff --git a/dbutils.c b/dbutils.c
index 11ff7ac5..85947d8b 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -16,7 +16,7 @@
 
 /* mainly for use by repmgrd */
-int server_version_num = 0;
+int server_version_num = UNKNOWN_SERVER_VERSION_NUM;
 
 static PGconn *_establish_db_connection(const char *conninfo,
@@ -1247,6 +1247,7 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row)
 	node_info->is_ready = false;
 	node_info->is_visible = false;
 	node_info->last_wal_receive_lsn = InvalidXLogRecPtr;
+	node_info->monitoring_state = MS_NORMAL;
 }
 
 
@@ -3135,7 +3136,7 @@ void
 _populate_bdr_node_records(PGresult *res, BdrNodeInfoList *node_list)
 {
 	int i;
-	clear_node_info_list(node_list);
+	clear_node_info_list((NodeInfoList *)node_list);
 
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
@@ -3180,3 +3181,42 @@ _populate_bdr_node_record(PGresult *res, t_bdr_node_info *node_info, int row)
 }
 
 
+bool
+am_bdr_failover_handler(PGconn *conn, int node_id)
+{
+	PQExpBufferData query;
+	PGresult   *res;
+	bool		am_handler;
+
+	initPQExpBuffer(&query);
+
+	appendPQExpBuffer(
+		&query,
+		"SELECT repmgr.am_bdr_failover_handler(%i)",
+		node_id);
+
+	res = PQexec(conn, query.data);
+	termPQExpBuffer(&query);
+
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_error(_("unable to execute repmgr.am_bdr_failover_handler()"));
+		PQclear(res);
+		return false;
+	}
+
+	am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0);
+
+	PQclear(res);
+
+	return am_handler;
+}
+
+void
+unset_bdr_failover_handler(PGconn *conn)
+{
+	PGresult   *res;
+
+	res = PQexec(conn, "SELECT repmgr.unset_bdr_failover_handler()");
+	PQclear(res);
+}
diff --git a/dbutils.h b/dbutils.h
index 1b9ede4d..adae6848 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -9,6 +9,7 @@
 
 #include "access/xlogdefs.h"
 #include "pqexpbuffer.h"
+#include "portability/instr_time.h"
 
 #include "config.h"
 #include "strutil.h"
@@ -40,8 +41,10 @@ typedef enum {
 	RECORD_NOT_FOUND
 } RecordStatus;
 
-
-
+typedef enum {
+	MS_NORMAL = 0,
+	MS_DEGRADED = 1
+} MonitoringState;
 
 /*
  * Struct to store node information
@@ -62,6 +65,7 @@ typedef struct s_node_info
 	bool		is_ready;
 	bool		is_visible;
 	XLogRecPtr	last_wal_receive_lsn;
+	MonitoringState monitoring_state;
 	PGconn	   *conn;
 } t_node_info;
 
@@ -80,6 +84,7 @@ typedef struct s_node_info
 		false, \
 		false, \
 		InvalidXLogRecPtr, \
+		MS_NORMAL, \
 		NULL \
 	}
 
@@ -323,5 +328,7 @@ void add_extension_tables_to_bdr_replication_set(PGconn *conn);
 
 bool bdr_node_exists(PGconn *conn, const char *node_name);
 
+bool am_bdr_failover_handler(PGconn *conn, int node_id);
+void unset_bdr_failover_handler(PGconn *conn);
 
 #endif /* dbutils.h */
diff --git a/repmgr--4.0.sql b/repmgr--4.0.sql
index eabf04d1..37c1ac26 100644
--- a/repmgr--4.0.sql
+++ b/repmgr--4.0.sql
@@ -70,3 +70,15 @@ CREATE FUNCTION reset_voting_status()
   RETURNS VOID
   AS '$libdir/repmgr', 'reset_voting_status'
   LANGUAGE C STRICT;
+
+
+CREATE FUNCTION am_bdr_failover_handler(INT)
+  RETURNS BOOL
+  AS '$libdir/repmgr', 'am_bdr_failover_handler'
+  LANGUAGE C STRICT;
+
+
+CREATE FUNCTION unset_bdr_failover_handler()
+  RETURNS VOID
+  AS '$libdir/repmgr', 'unset_bdr_failover_handler'
+  LANGUAGE C STRICT;
diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c
index 66840464..f55df9ca 100644
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -37,8 +37,6 @@ typedef struct TablespaceDataList
 static PGconn *primary_conn = NULL;
 static PGconn *source_conn = NULL;
 
-static int server_version_num = UNKNOWN_SERVER_VERSION_NUM;
-
 static char local_data_directory[MAXPGPATH];
 static bool local_data_directory_provided = false;
diff --git a/repmgr.c b/repmgr.c
index 328c66db..93499563 100644
--- a/repmgr.c
+++ b/repmgr.c
@@ -45,11 +45,14 @@ typedef enum {
 typedef struct repmgrdSharedState
 {
 	LWLockId	lock;			/* protects search/modification */
+	/* streaming failover */
 	NodeState	node_state;
 	NodeVotingStatus voting_status;
 	int			current_electoral_term;
 	int			candidate_node_id;
 	bool		follow_new_primary;
+	/* BDR failover */
+	int			bdr_failover_handler;
 } repmgrdSharedState;
 
 static repmgrdSharedState *shared_state = NULL;
@@ -83,6 +86,13 @@ PG_FUNCTION_INFO_V1(get_new_primary);
 Datum		reset_voting_status(PG_FUNCTION_ARGS);
 PG_FUNCTION_INFO_V1(reset_voting_status);
 
+Datum		am_bdr_failover_handler(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(am_bdr_failover_handler);
+
+Datum		unset_bdr_failover_handler(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(unset_bdr_failover_handler);
+
+
 /*
  * Module load callback
 */
@@ -157,6 +167,7 @@ repmgr_shmem_startup(void)
 		shared_state->voting_status = VS_NO_VOTE;
 		shared_state->candidate_node_id = UNKNOWN_NODE_ID;
 		shared_state->follow_new_primary = false;
+		shared_state->bdr_failover_handler = UNKNOWN_NODE_ID;
 	}
 
 	LWLockRelease(AddinShmemInitLock);
@@ -330,3 +341,38 @@ reset_voting_status(PG_FUNCTION_ARGS)
 
 	PG_RETURN_VOID();
 }
+
+
+Datum
+am_bdr_failover_handler(PG_FUNCTION_ARGS)
+{
+	int			node_id = PG_GETARG_INT32(0);
+	bool		am_handler = false;
+
+	LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+
+	if (shared_state->bdr_failover_handler == UNKNOWN_NODE_ID)
+	{
+		shared_state->bdr_failover_handler = node_id;
+		am_handler = true;
+	}
+	else if (shared_state->bdr_failover_handler == node_id)
+	{
+		am_handler = true;
+	}
+
+	LWLockRelease(shared_state->lock);
+
+	PG_RETURN_BOOL(am_handler);
+}
+
+
+Datum
+unset_bdr_failover_handler(PG_FUNCTION_ARGS)
+{
+	LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+	shared_state->bdr_failover_handler = UNKNOWN_NODE_ID;
+	LWLockRelease(shared_state->lock);
+
+	PG_RETURN_VOID();
+}
diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c
index 9a6f0656..267ea142 100644
--- a/repmgrd-bdr.c
+++ b/repmgrd-bdr.c
@@ -14,7 +14,7 @@
 
 static volatile sig_atomic_t got_SIGHUP = false;
 
-static void do_bdr_failover(NodeInfoList *nodes);
+static void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);
 
 
 void
@@ -31,6 +31,8 @@ monitor_bdr(void)
 	t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
 	RecordStatus record_status;
 
+// t_node_info other_node_info = T_NODE_INFO_INITIALIZER;
+
 	/* sanity check local database */
 	log_info(_("connecting to local database '%s'"),
 			 config_file_options.conninfo);
@@ -61,7 +63,6 @@ monitor_bdr(void)
 		exit(ERR_BAD_CONFIG);
 	}
 
-
 	if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false)
 	{
 		log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
@@ -125,7 +126,7 @@ monitor_bdr(void)
 						NULL);
 
 	/*
-	 * retrieve list of nodes - we'll need these if the DB connection goes away,
+	 * retrieve list of all nodes - we'll need this if the local DB connection goes away
 	 */
 	get_all_node_records(local_conn, &nodes);
 
@@ -135,44 +136,56 @@ monitor_bdr(void)
 
 	while (true)
 	{
-
+		NodeInfoListCell *cell;
 		/* monitoring loop */
 		log_verbose(LOG_DEBUG, "bdr check loop...");
 
-		switch (monitoring_state)
+		for (cell = nodes.head; cell; cell = cell->next)
 		{
-			case MS_NORMAL:
+			if (cell->node_info->node_id == local_node_info.node_id)
 			{
-				if (is_server_available(local_node_info.conninfo) == false)
-				{
-					// XXX improve
-					log_warning("connection problem!");
-					do_bdr_failover(&nodes);
-				}
-				else
-				{
-					log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
-								config_file_options.monitor_interval_secs);
-					sleep(config_file_options.monitor_interval_secs);
-				}
+				log_debug("checking local node %i in %s state",
+						  local_node_info.node_id,
+						  print_monitoring_state(cell->node_info->monitoring_state));
 			}
-			case MS_DEGRADED:
+			else
 			{
-				/* degraded monitoring */
-				if (is_server_available(local_node_info.conninfo) == true)
+				log_debug("checking other node %i in %s state",
+						  cell->node_info->node_id,
+						  print_monitoring_state(cell->node_info->monitoring_state));
+			}
+
+
+			switch (cell->node_info->monitoring_state)
+			{
+				case MS_NORMAL:
 				{
-					log_notice(_("monitored node %i has recovered"), local_node_info.node_id);
-					// do_bdr_recovery()
+					if (is_server_available(cell->node_info->conninfo) == false)
+					{
+						log_warning(_("unable to connect to node %i"),
+									cell->node_info->node_id);
to node %i", cell->node_info->node_id); + do_bdr_failover(&nodes, cell->node_info); + } } - else + break; + case MS_DEGRADED: { - log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", - config_file_options.monitor_interval_secs); - sleep(config_file_options.monitor_interval_secs); + /* degraded monitoring */ + if (is_server_available(cell->node_info->conninfo) == true) + { + log_notice(_("monitored node %i has recovered"), cell->node_info->node_id); + // do_bdr_recovery() + } + } + break; } } + log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", + config_file_options.monitor_interval_secs); + sleep(config_file_options.monitor_interval_secs); + if (got_SIGHUP) { /* @@ -186,9 +199,6 @@ monitor_bdr(void) update_registration(local_conn); } - /* reload node list */ - get_all_node_records(local_conn, &nodes); - got_SIGHUP = false; } @@ -199,14 +209,13 @@ monitor_bdr(void) /* * do_bdr_failover() - * + *0 * Here we attempt to perform a BDR "failover". * * As there's no equivalent of a physical replication failover, * we'll do the following: * - * - attempt to find another node, to set our node record as inactive - * (there should be only one other node) + * - connect to active node * - generate an event log record on that node * - optionally execute `bdr_failover_command`, passing the conninfo string * of that node to the command; this can be used for e.g. reconfiguring @@ -215,29 +224,33 @@ monitor_bdr(void) */ void -do_bdr_failover(NodeInfoList *nodes) +do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) { PGconn *next_node_conn = NULL; NodeInfoListCell *cell; - bool failover_success = false; +// bool failover_success = false; PQExpBufferData event_details; RecordStatus record_status; t_event_info event_info = T_EVENT_INFO_INITIALIZER; - t_node_info target_node = T_NODE_INFO_INITIALIZER; + t_node_info target_node = T_NODE_INFO_INITIALIZER; initPQExpBuffer(&event_details); - /* get next active node */ + monitored_node->monitoring_state = MS_DEGRADED; + INSTR_TIME_SET_CURRENT(degraded_monitoring_start); + + /* get other node */ for (cell = nodes->head; cell; cell = cell->next) { log_debug("do_bdr_failover() %s", cell->node_info->node_name); /* don't attempt to connect to the current monitored node, as that's the one which has failed */ - if (cell->node_info->node_id == local_node_info.node_id) + if (cell->node_info->node_id == monitored_node->node_id) continue; /* XXX skip inactive node? */ + // reuse local conn if local node is up next_node_conn = establish_db_connection(cell->node_info->conninfo, false); if (PQstatus(next_node_conn) == CONNECTION_OK) @@ -251,6 +264,7 @@ do_bdr_failover(NodeInfoList *nodes) next_node_conn = NULL; } + /* shouldn't happen, and if it does, it means everything is down */ if (next_node_conn == NULL) { appendPQExpBuffer(&event_details, @@ -258,33 +272,42 @@ do_bdr_failover(NodeInfoList *nodes) log_error("%s", event_details.data); - // no other nodes found - // continue degraded monitoring until node is restored? 
+		/* no other nodes found - continue degraded monitoring */
+		return;
 	}
-	else
+
+
+	/* attempt to claim the failover-handler role for this node */
+	if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
 	{
-		log_info(_("connecting to target node %s"), target_node.node_name);
-
-		failover_success = true;
-
-		event_info.conninfo_str = target_node.conninfo;
-		event_info.node_name = target_node.node_name;
-
-		/* update our own record on the other node */
-		update_node_record_set_active(next_node_conn, local_node_info.node_id, false);
-
-		appendPQExpBuffer(&event_details,
-						  _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
-						  local_node_info.node_name,
-						  local_node_info.node_id,
-						  target_node.node_name,
-						  target_node.node_id);
+		/* another node's repmgrd has already claimed the failover */
+		log_debug("other node's repmgrd is handling failover");
+		PQfinish(next_node_conn);
+		return;
 	}
 
-	monitoring_state = MS_DEGRADED;
-	INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+	log_debug("this repmgrd is the failover handler"); // XXX check here that the node hasn't come back up
 
+	log_info(_("connecting to target node %s"), target_node.node_name);
+
+//	failover_success = true;
+
+	event_info.conninfo_str = target_node.conninfo;
+	event_info.node_name = target_node.node_name;
+
+	/* update the failed node's record on the other node */
+	update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
+
+	appendPQExpBuffer(&event_details,
+					  _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
+					  monitored_node->node_name,
+					  monitored_node->node_id,
+					  target_node.node_name,
+					  target_node.node_id);
+
+
+
 	/*
 	 * Create an event record
 	 */
 
 	create_event_record(next_node_conn,
 						&config_file_options,
 						config_file_options.node_id,
 						"bdr_failover",
-						failover_success,
+						true,
 						event_details.data,
 						&event_info);
 
 	termPQExpBuffer(&event_details);
 
+	unset_bdr_failover_handler(next_node_conn);
 
 	/* local monitoring mode - there's no new node to monitor */
 	return;
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index 4efa4d4b..b1370b15 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -47,7 +47,6 @@ static PGconn *primary_conn = NULL;
 static ElectionResult do_election(void);
 static const char *_print_voting_status(NodeVotingStatus voting_status);
 static const char *_print_election_result(ElectionResult result);
-static const char *_print_monitoring_state(MonitoringState monitoring_state);
 
 static FailoverState promote_self(void);
 static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
@@ -315,7 +314,7 @@ monitor_streaming_primary(void)
 		log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
 				 local_node_info.node_name,
 				 local_node_info.node_id,
-				 _print_monitoring_state(monitoring_state));
+				 print_monitoring_state(monitoring_state));
 
 		if (monitoring_state == MS_DEGRADED)
 		{
@@ -617,7 +616,7 @@ monitor_streaming_standby(void)
 				 local_node_info.node_id,
 				 upstream_node_info.node_name,
 				 upstream_node_info.node_id,
-				 _print_monitoring_state(monitoring_state));
+				 print_monitoring_state(monitoring_state));
 
 		if (monitoring_state == MS_DEGRADED)
 		{
@@ -1459,22 +1458,6 @@ _print_election_result(ElectionResult result)
 	return "UNKNOWN";
 }
 
-static const char *
-_print_monitoring_state(MonitoringState monitoring_state)
-{
-	switch(monitoring_state)
-	{
-		case MS_NORMAL:
-			return "normal";
-
-		case MS_DEGRADED:
-			return "degraded";
-	}
-
-	/* should never reach here */
-	return "UNKNOWN";
-}
-
 
 
 static ElectionResult
diff --git a/repmgrd.c b/repmgrd.c
index 00335f75..59d21e6f 100644
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -665,6 +665,23 @@ calculate_elapsed(instr_time start_time)
 }
 
 
+const char *
+print_monitoring_state(MonitoringState monitoring_state)
+{
+	switch(monitoring_state)
+	{
+		case MS_NORMAL:
+			return "normal";
+
+		case MS_DEGRADED:
+			return "degraded";
+	}
+
+	/* should never reach here */
+	return "UNKNOWN";
+}
+
+
 static void
 close_connections()
 {
@@ -694,3 +711,5 @@ terminate(int retval)
 
 	exit(retval);
 }
+
+
diff --git a/repmgrd.h b/repmgrd.h
index 56b6b3b8..2a6caea9 100644
--- a/repmgrd.h
+++ b/repmgrd.h
@@ -15,10 +15,6 @@ typedef enum {
 	NODE_STATUS_DOWN
 } NodeStatus;
 
-typedef enum {
-	MS_NORMAL = 0,
-	MS_DEGRADED = 1
-} MonitoringState;
 
 extern MonitoringState monitoring_state;
 extern instr_time degraded_monitoring_start;
@@ -31,6 +27,8 @@ extern bool startup_event_logged;
 
 PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
 int calculate_elapsed(instr_time start_time);
+const char *print_monitoring_state(MonitoringState monitoring_state);
+
 void update_registration(PGconn *conn);
 void terminate(int retval);
 
 #endif /* _REPMGRD_H_ */
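The shared-memory handler election implemented in repmgr.c above can be
modelled in isolation as follows. This is a standalone sketch for
illustration only, not repmgr code: a pthread mutex stands in for the
LWLock, and taking the lock exclusively is what makes the test-and-set
atomic, so exactly one node ID can hold the slot until it is reset.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define UNKNOWN_NODE_ID -1

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int bdr_failover_handler = UNKNOWN_NODE_ID;

    /* first node ID to claim the slot wins; repeat calls by the winner stay true */
    static bool
    claim_handler(int node_id)
    {
        bool am_handler = false;

        pthread_mutex_lock(&lock);
        if (bdr_failover_handler == UNKNOWN_NODE_ID)
        {
            bdr_failover_handler = node_id;
            am_handler = true;
        }
        else if (bdr_failover_handler == node_id)
            am_handler = true;
        pthread_mutex_unlock(&lock);

        return am_handler;
    }

    /* equivalent of repmgr.unset_bdr_failover_handler() */
    static void
    unset_handler(void)
    {
        pthread_mutex_lock(&lock);
        bdr_failover_handler = UNKNOWN_NODE_ID;
        pthread_mutex_unlock(&lock);
    }

    int
    main(void)
    {
        printf("node 1: %d\n", claim_handler(1));   /* 1 - wins the slot */
        printf("node 2: %d\n", claim_handler(2));   /* 0 - slot already taken */
        printf("node 1: %d\n", claim_handler(1));   /* 1 - idempotent for the winner */
        unset_handler();
        printf("node 2: %d\n", claim_handler(2));   /* 1 - slot free again */
        return 0;
    }

With a shared (rather than exclusive) lock, two backends could both observe
UNKNOWN_NODE_ID and both believe they had claimed the handler role, which
is exactly the duplicate-failover scenario the slot exists to prevent.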