repmgrd: refactor primary failover code into separate function

2026-06-01 11:49:06 +00:00 · 2017-07-04 20:42:22 +09:00
parent f7f49ae85e
commit e1f4384f7e
1 changed files with 215 additions and 177 deletions
@@ -93,6 +93,10 @@ static void handle_sigint(SIGNAL_ARGS);
 #endif

 static PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
+
+static bool do_primary_failover(void);
+static bool do_upstream_standby_failover(void);
+
 static ElectionResult do_election(void);
 static const char *_print_voting_status(NodeVotingStatus voting_status);
 static const char *_print_election_result(ElectionResult result);
@@ -699,8 +703,94 @@ monitor_streaming_standby(void)
 					goto loop;
 				}

-				/* still down after reconnect attempt(s) - */
+				/* still down after reconnect attempt(s) */
 				if (upstream_node_status == NODE_STATUS_DOWN)
+				{
+					bool failover_done = false;
+
+					if (upstream_node_info.type == PRIMARY)
+					{
+						failover_done = do_primary_failover();
+					}
+					else if (upstream_node_info.type == STANDBY)
+					{
+						failover_done = do_upstream_standby_failover();
+					}
+
+					// it's possible it will make sense to return in
+					// all cases to restart monitoring
+					if (failover_done == true)
+						return;
+
+				}
+
+			}
+		}
+
+	loop:
+
+		/* emit "still alive" log message at regular intervals, if requested */
+		if (config_file_options.log_status_interval > 0)
+		{
+			double		log_status_interval_elapsed = 0;
+			instr_time	log_status_interval_current;
+
+			INSTR_TIME_SET_CURRENT(log_status_interval_current);
+			INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
+			log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
+
+			if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
+			{
+				log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
+						 local_node_info.node_name,
+						 local_node_info.node_id,
+						 upstream_node_info.node_name,
+						 upstream_node_info.node_id);
+
+				//log_debug(
+				INSTR_TIME_SET_CURRENT(log_status_interval_start);
+			}
+		}
+
+		/*
+		 * handle local node failure
+		 *
+		 * currently we'll just check the connection, and try to reconnect
+		 *
+		 * TODO: add timeout, after which we run in degraded state
+		 */
+		if (is_server_available(local_node_info.conninfo) == false)
+		{
+			log_warning(_("connection to local node %i lost"), local_node_info.node_id);
+
+			if (local_conn != NULL)
+			{
+				PQfinish(local_conn);
+				local_conn = NULL;
+			}
+		}
+
+		if (PQstatus(local_conn) != CONNECTION_OK)
+		{
+			log_info(_("attempting to reconnect"));
+			local_conn = establish_db_connection(config_file_options.conninfo, false);
+
+			if (PQstatus(local_conn) != CONNECTION_OK)
+			{
+				log_warning(_("reconnection failed"));
+			}
+			else
+			{
+				log_info(_("reconnected"));
+			}
+		}
+		sleep(1);
+	}
+}
+
+
+static bool
+do_primary_failover(void)
 {
 	/* attempt to initiate voting process */
 	ElectionResult election_result = do_election();
@@ -834,7 +924,7 @@ monitor_streaming_standby(void)
 			log_info(_("switching to primary monitoring mode"));

 			failover_state = FAILOVER_STATE_NONE;
-							return;
+			return true;

 		case FAILOVER_STATE_PRIMARY_REAPPEARED:
 			log_debug("failover state is PRIMARY_REAPPEARED");
@@ -851,11 +941,8 @@ monitor_streaming_standby(void)
 					   upstream_node_info.node_name, upstream_node_info.node_id);

 			failover_state = FAILOVER_STATE_NONE;
-							return;
+			return true;

-						case FAILOVER_STATE_PROMOTION_FAILED:
-							log_debug("failover state is PROMOTION FAILED");
-							break;

 		case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
 			log_info(_("resuming standby monitoring mode"));
@@ -863,7 +950,7 @@ monitor_streaming_standby(void)
 					   upstream_node_info.node_name, upstream_node_info.node_id);
 			failover_state = FAILOVER_STATE_NONE;

-							return;
+			return true;

 		case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
 			log_info(_("resuming standby monitoring mode"));
@@ -871,85 +958,36 @@ monitor_streaming_standby(void)
 					   upstream_node_info.node_name, upstream_node_info.node_id);
 			failover_state = FAILOVER_STATE_NONE;

-							return;
+			return true;

+		case FAILOVER_STATE_PROMOTION_FAILED:
+			log_debug("failover state is PROMOTION FAILED");
+			return false;
 		case FAILOVER_STATE_NO_NEW_PRIMARY:
 		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
 			/* pass control back down to start_monitoring() */
 			// -> should kick off new election
-							return;
+			return false;

 		case FAILOVER_STATE_LOCAL_NODE_FAILURE:
 		case FAILOVER_STATE_UNKNOWN:
 		case FAILOVER_STATE_NONE:
 			log_debug("failover state is %i", failover_state);
-							break;
-					}
+			return false;
 	}

-			}
+	// should never reach here
+	return false;
 }

-	loop:

-		/* emit "still alive" log message at regular intervals, if requested */
-		if (config_file_options.log_status_interval > 0)
+static bool
+do_upstream_standby_failover(void)
 {
-			double		log_status_interval_elapsed = 0;
-			instr_time	log_status_interval_current;
-
-			INSTR_TIME_SET_CURRENT(log_status_interval_current);
-			INSTR_TIME_SUBTRACT(log_status_interval_current, log_status_interval_start);
-			log_status_interval_elapsed = INSTR_TIME_GET_DOUBLE(log_status_interval_current);
-
-			if ((int) log_status_interval_elapsed >= config_file_options.log_status_interval)
-			{
-				log_info(_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
-						 local_node_info.node_name,
-						 local_node_info.node_id,
-						 upstream_node_info.node_name,
-						 upstream_node_info.node_id);
-
-				//log_debug(
-				INSTR_TIME_SET_CURRENT(log_status_interval_start);
-			}
+	// not implemented yet
+	return false;
 }

-		/*
-		 * handle local node failure
-		 *
-		 * currently we'll just check the connection, and try to reconnect
-		 *
-		 * TODO: add timeout, after which we run in degraded state
-		 */
-		if (is_server_available(local_node_info.conninfo) == false)
-		{
-			log_warning(_("connection to local node %i lost"), local_node_info.node_id);
-
-			if (local_conn != NULL)
-			{
-				PQfinish(local_conn);
-				local_conn = NULL;
-			}
-		}
-
-		if (PQstatus(local_conn) != CONNECTION_OK)
-		{
-			log_info(_("attempting to reconnect"));
-			local_conn = establish_db_connection(config_file_options.conninfo, false);
-
-			if (PQstatus(local_conn) != CONNECTION_OK)
-			{
-				log_warning(_("reconnection failed"));
-			}
-			else
-			{
-				log_info(_("reconnected"));
-			}
-		}
-		sleep(1);
-	}
-}

 static FailoverState
 promote_self(void)