Merge pull request #131 from martinmarques/fix-failed-standby

Fix failed standby
2026-06-01 03:39:05 +00:00 · 2015-12-29 13:24:08 -03:00
parent b15e8debe1 aca2b9547f
commit d6bf870316
1 changed files with 38 additions and 30 deletions
@@ -71,7 +71,7 @@ static void check_node_configuration(void);
 static void standby_monitor(void);
 static void witness_monitor(void);
 static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
-static bool set_local_node_failed(void);
+static bool set_local_node_status(void);

 static void update_shared_memory(char *last_wal_standby_applied);
 static void update_registration(void);
@@ -685,23 +685,17 @@ standby_monitor(void)
 	{
 		PQExpBufferData errmsg;

-		set_local_node_failed();
+		set_local_node_status();

 		initPQExpBuffer(&errmsg);

 		appendPQExpBuffer(&errmsg,
-						  _("failed to connect to local node, node marked as failed and terminating!"));
+						  _("failed to connect to local node, node marked as failed!"));

 		log_err("%s\n", errmsg.data);

-		create_event_record(master_conn,
-							&local_options,
-							local_options.node,
-							"repmgrd_shutdown",
-							false,
-							errmsg.data);
-
-		terminate(ERR_DB_CON);
+		//terminate(ERR_DB_CON);
+		goto continue_monitoring_standby;
 	}

 	upstream_conn = get_upstream_connection(my_local_conn,
@@ -830,6 +824,7 @@ standby_monitor(void)

 	PQfinish(upstream_conn);

+ continue_monitoring_standby:
 	/* Check if we still are a standby, we could have been promoted */
 	do
 	{
@@ -845,10 +840,13 @@ standby_monitor(void)
 				 * will require manual resolution as there's no way of determing
 				 * which master is the correct one.
 				 *
+				 * We should log a message so the user knows of the situation at hand.
+				 *
 				 * XXX check if the original master is still active and display a
 				 * warning
 				 */
-				log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
+				log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
+				log_err(_("Check your cluster and manually fix any anomaly.\n"));
 				terminate(1);
 				break;

@@ -858,17 +856,28 @@ standby_monitor(void)

 				if (!check_connection(&my_local_conn, "standby", NULL))
 				{
-					set_local_node_failed();
-					terminate(0);
+					set_local_node_status();
+					/* 
+					 * Let's continue checking, and if the postgres server on the
+					 * standby comes back up, we will activate it again
+					 */
 				}

 				break;
+		  
 		}
 	} while (ret == -1);

 	if (did_retry)
 	{
-		log_info(_("standby connection recovered!\n"));
+	        /*
+		 * There's a possible situation where the standby went down for some reason
+		 * (maintanence for example) and is now up and maybe connected once again to
+		 * the stream. If we set the local standby node as failed and it's now running
+		 * and receiving replication data, we should activate it again.
+		 */
+	        set_local_node_status();
+	        log_info(_("standby connection recovered!\n"));
 	}

 	/* Fast path for the case where no history is requested */
@@ -1769,7 +1778,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)


 /*
- * set_local_node_failed()
+ * set_local_node_status()
 *
 * If failure of the local node is detected, attempt to connect
 * to the current master server (as stored in the global variable
@@ -1777,16 +1786,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
 */

 static bool
-set_local_node_failed(void)
+set_local_node_status(void)
 {
-	PGresult   *res;
+        PGresult       *res;
 	char		sqlquery[QUERY_STR_LEN];
-	int			active_master_node_id = NODE_NOT_FOUND;
+	int		active_master_node_id = NODE_NOT_FOUND;
 	char		master_conninfo[MAXLEN];

 	if (!check_connection(&master_conn, "master", NULL))
 	{
-		log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
+		log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
 		return false;
 	}

@@ -1840,17 +1849,16 @@ set_local_node_failed(void)


 	/*
-	 * Attempt to set own record as inactive
+	 * Attempt to set the active record to the correct value.
+	 * First
 	 */
-	sqlquery_snprintf(sqlquery,
-					  "UPDATE %s.repl_nodes "
-					  "   SET active = FALSE "
-					  " WHERE id = %i ",
-					  get_repmgr_schema_quoted(master_conn),
-					  node_info.node_id);
-
-	res = PQexec(master_conn, sqlquery);
-	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	  
+	if (!update_node_record_status(master_conn,
+					    local_options.cluster_name,
+					    node_info.node_id,
+					    "standby",
+					    node_info.upstream_node_id,
+					    is_standby(my_local_conn)==1))
 	{
 		log_err(_("unable to set local node %i as inactive on master: %s\n"),
 				node_info.node_id,