repmgrd: in BDR mode, have each repmgrd monitor each node

This covers both the case where an entire node, including its repmgrd, goes down, and the case where a PostgreSQL instance goes down but its repmgrd is still up (in which case only one of the repmgrds will handle the failover).
2026-03-22 22:56:29 +00:00 · 2017-07-14 15:01:18 +09:00
parent e3b3fb65f0
commit 951c7dbd07
9 changed files with 211 additions and 89 deletions
--- a/repmgrd-bdr.c
+++ b/repmgrd-bdr.c
@@ -14,7 +14,7 @@

 static volatile sig_atomic_t got_SIGHUP = false;

-static void do_bdr_failover(NodeInfoList *nodes);
+static void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);


 void
@@ -31,6 +31,8 @@ monitor_bdr(void)
 	t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
 	RecordStatus  record_status;

+//	t_node_info other_node_info = T_NODE_INFO_INITIALIZER;
+
 	/* sanity check local database */
 	log_info(_("connecting to local database '%s'"),
 			 config_file_options.conninfo);
@@ -61,7 +63,6 @@ monitor_bdr(void)
 		exit(ERR_BAD_CONFIG);
 	}

-
 	if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false)
 	{
 		log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
@@ -125,7 +126,7 @@ monitor_bdr(void)
 						NULL);

 	/*
-	 * retrieve list of nodes - we'll need these if the DB connection goes away,
+	 * retrieve list of all nodes - we'll need these if the DB connection goes away
 	 */
 	get_all_node_records(local_conn, &nodes);

@@ -135,44 +136,56 @@ monitor_bdr(void)

 	while (true)
 	{
-
+		NodeInfoListCell *cell;
 		/* monitoring loop */
 		log_verbose(LOG_DEBUG, "bdr check loop...");

-		switch (monitoring_state)
+		for (cell = nodes.head; cell; cell = cell->next)
 		{
-			case MS_NORMAL:
+			if (cell->node_info->node_id == local_node_info.node_id)
 			{
-				if (is_server_available(local_node_info.conninfo) == false)
-				{
-					// XXX improve
-					log_warning("connection problem!");
-					do_bdr_failover(&nodes);
-				}
-				else
-				{
-					log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
-								config_file_options.monitor_interval_secs);
-					sleep(config_file_options.monitor_interval_secs);
-				}
+				log_debug("checking local node %i in %s state",
+						  local_node_info.node_id,
+						  print_monitoring_state(cell->node_info->monitoring_state));
 			}
-			case MS_DEGRADED:
+			else
 			{
-				/* degraded monitoring */
-				if (is_server_available(local_node_info.conninfo) == true)
+				log_debug("checking other node %i in %s state",
+						  cell->node_info->node_id,
+						  print_monitoring_state(cell->node_info->monitoring_state));
+			}
+
+
+			switch (cell->node_info->monitoring_state)
+			{
+				case MS_NORMAL:
 				{
-					log_notice(_("monitored node %i has recovered"), local_node_info.node_id);
-					// do_bdr_recovery()
+					if (is_server_available(cell->node_info->conninfo) == false)
+					{
+						// XXX improve
+						log_warning("connection problem! to node %i", cell->node_info->node_id);
+						do_bdr_failover(&nodes, cell->node_info);
+					}
 				}
-				else
+				break;
+				case MS_DEGRADED:
 				{
-					log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
-								config_file_options.monitor_interval_secs);
-					sleep(config_file_options.monitor_interval_secs);
+					/* degraded monitoring */
+					if (is_server_available(cell->node_info->conninfo) == true)
+					{
+						log_notice(_("monitored node %i has recovered"),  cell->node_info->node_id);
+						// do_bdr_recovery()
+					}
+
 				}
+				break;
 			}
 		}

+		log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
+					config_file_options.monitor_interval_secs);
+		sleep(config_file_options.monitor_interval_secs);
+
 		if (got_SIGHUP)
 		{
 			/*
@@ -186,9 +199,6 @@ monitor_bdr(void)
 				update_registration(local_conn);
 			}

-			/* reload node list */
-			get_all_node_records(local_conn, &nodes);
-
 			got_SIGHUP = false;
 		}

@@ -199,14 +209,13 @@ monitor_bdr(void)

 /*
 * do_bdr_failover()
- *
+ *
 * Here we attempt to perform a BDR "failover".
 *
 * As there's no equivalent of a physical replication failover,
 * we'll do the following:
 *
- *  - attempt to find another node, to set our node record as inactive
- *    (there should be only one other node)
+ *  - connect to active node
 *  - generate an event log record on that node
 *  - optionally execute `bdr_failover_command`, passing the conninfo string
 *    of that node to the command; this can be used for e.g. reconfiguring
@@ -215,29 +224,33 @@ monitor_bdr(void)
 */

 void
-do_bdr_failover(NodeInfoList *nodes)
+do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
 {
 	PGconn	   *next_node_conn = NULL;
 	NodeInfoListCell *cell;
-	bool	    failover_success = false;
+//	bool	    failover_success = false;
 	PQExpBufferData event_details;
 	RecordStatus  record_status;
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
-	t_node_info target_node = T_NODE_INFO_INITIALIZER;
+	t_node_info target_node  = T_NODE_INFO_INITIALIZER;

 	initPQExpBuffer(&event_details);

-	/* get next active node */
+	monitored_node->monitoring_state = MS_DEGRADED;
+	INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+
+	/* get other node */

 	for (cell = nodes->head; cell; cell = cell->next)
 	{
 		log_debug("do_bdr_failover() %s", cell->node_info->node_name);

 		/* don't attempt to connect to the current monitored node, as that's the one which has failed  */
-		if (cell->node_info->node_id == local_node_info.node_id)
+		if (cell->node_info->node_id == monitored_node->node_id)
 			continue;

 		/* XXX skip inactive node? */
+		// reuse local conn if local node is up
 		next_node_conn = establish_db_connection(cell->node_info->conninfo, false);

 		if (PQstatus(next_node_conn) == CONNECTION_OK)
@@ -251,6 +264,7 @@ do_bdr_failover(NodeInfoList *nodes)
 		next_node_conn = NULL;
 	}

+	/* shouldn't happen, and if it does, it means everything is down */
 	if (next_node_conn == NULL)
 	{
 		appendPQExpBuffer(&event_details,
@@ -258,33 +272,42 @@ do_bdr_failover(NodeInfoList *nodes)

 		log_error("%s", event_details.data);

-		// no other nodes found
-		// continue degraded monitoring until node is restored?
+		/* no other nodes found - continue degraded monitoring */
+		return;
 	}
-	else
+
+
+	// call: repmgr.am_bdr_failover_handler(node_id)
+	if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
 	{
-		log_info(_("connecting to target node %s"), target_node.node_name);
-
-		failover_success = true;
-
-		event_info.conninfo_str = target_node.conninfo;
-		event_info.node_name = target_node.node_name;
-
-		/* update our own record on the other node */
-		update_node_record_set_active(next_node_conn, local_node_info.node_id, false);
-
-		appendPQExpBuffer(&event_details,
-						  _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
-						  local_node_info.node_name,
-						  local_node_info.node_id,
-						  target_node.node_name,
-						  target_node.node_id);
+		log_debug("XXX am not failover handler");
+		PQfinish(next_node_conn);
+		log_debug("other node's repmgrd is handling failover");
+		return;
 	}

-	monitoring_state = MS_DEGRADED;
-	INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
+	log_debug("YYYam the failover handler");

 	// check here that the node hasn't come back up...
+	log_info(_("connecting to target node %s"), target_node.node_name);
+
+//	failover_success = true;
+
+	event_info.conninfo_str = target_node.conninfo;
+	event_info.node_name = target_node.node_name;
+
+	/* update the monitored (failed) node's record on the other node */
+	update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
+
+	appendPQExpBuffer(&event_details,
+					  _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
+					  monitored_node->node_name,
+					  monitored_node->node_id,
+					  target_node.node_name,
+					  target_node.node_id);
+
+
+

 	/*
 	 * Create an event record
@@ -301,12 +324,13 @@ do_bdr_failover(NodeInfoList *nodes)
 		&config_file_options,
 		config_file_options.node_id,
 		"bdr_failover",
-		failover_success,
+		true,
 		event_details.data,
 		&event_info);

 	termPQExpBuffer(&event_details);

+	unset_bdr_failover_handler(next_node_conn);

 	/* local monitoring mode - there's no new node to monitor */
 	return;