Merge pull request #131 from martinmarques/fix-failed-standby

Fix failed standby
2026-06-01 11:49:06 +00:00 · 2015-12-29 13:24:08 -03:00
parent b15e8debe1 aca2b9547f
commit d6bf870316
1 changed files with 38 additions and 30 deletions
@@ -71,7 +71,7 @@ static void check_node_configuration(void);
 static void standby_monitor(void);
 static void witness_monitor(void);
 static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
-static bool set_local_node_failed(void);
+static bool set_local_node_status(void);
 static void update_shared_memory(char *last_wal_standby_applied);
 static void update_registration(void);
@@ -685,23 +685,17 @@ standby_monitor(void)
 	{
 		PQExpBufferData errmsg;
-		set_local_node_failed();
+		set_local_node_status();
 		initPQExpBuffer(&errmsg);
 		appendPQExpBuffer(&errmsg,
-						  _("failed to connect to local node, node marked as failed and terminating!"));
+						  _("failed to connect to local node, node marked as failed!"));
 		log_err("%s\n", errmsg.data);
-		create_event_record(master_conn,
+		//terminate(ERR_DB_CON);
-							&local_options,
+		goto continue_monitoring_standby;
 							local_options.node,
 							"repmgrd_shutdown",
 							false,
 							errmsg.data);
 		terminate(ERR_DB_CON);
 	}
 	upstream_conn = get_upstream_connection(my_local_conn,
@@ -830,6 +824,7 @@ standby_monitor(void)
 	PQfinish(upstream_conn);
 continue_monitoring_standby:
 	/* Check if we still are a standby, we could have been promoted */
 	do
 	{
@@ -845,10 +840,13 @@ standby_monitor(void)
 				 * will require manual resolution as there's no way of determing
 				 * which master is the correct one.
 				 *
 				 * We should log a message so the user knows of the situation at hand.
 				 *
 				 * XXX check if the original master is still active and display a
 				 * warning
 				 */
-				log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
+				log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
 				log_err(_("Check your cluster and manually fix any anomaly.\n"));
 				terminate(1);
 				break;
@@ -858,17 +856,28 @@ standby_monitor(void)
 				if (!check_connection(&my_local_conn, "standby", NULL))
 				{
-					set_local_node_failed();
+					set_local_node_status();
-					terminate(0);
+					/* 
 					 * Let's continue checking, and if the postgres server on the
 					 * standby comes back up, we will activate it again
 					 */
 				}
 				break;
 		}
 	} while (ret == -1);
 	if (did_retry)
 	{
-		log_info(_("standby connection recovered!\n"));
+	        /*
 		 * There's a possible situation where the standby went down for some reason
 		 * (maintanence for example) and is now up and maybe connected once again to
 		 * the stream. If we set the local standby node as failed and it's now running
 		 * and receiving replication data, we should activate it again.
 		 */
 	        set_local_node_status();
 	        log_info(_("standby connection recovered!\n"));
 	}
 	/* Fast path for the case where no history is requested */
@@ -1769,7 +1778,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
 /*
- * set_local_node_failed()
+ * set_local_node_status()
 *
 * If failure of the local node is detected, attempt to connect
 * to the current master server (as stored in the global variable
@@ -1777,16 +1786,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
 */
 static bool
-set_local_node_failed(void)
+set_local_node_status(void)
 {
-	PGresult   *res;
+        PGresult       *res;
 	char		sqlquery[QUERY_STR_LEN];
-	int			active_master_node_id = NODE_NOT_FOUND;
+	int		active_master_node_id = NODE_NOT_FOUND;
 	char		master_conninfo[MAXLEN];
 	if (!check_connection(&master_conn, "master", NULL))
 	{
-		log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
+		log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
 		return false;
 	}
@@ -1840,17 +1849,16 @@ set_local_node_failed(void)
 	/*
-	 * Attempt to set own record as inactive
+	 * Attempt to set the active record to the correct value.
 	 * First
 	 */
 	sqlquery_snprintf(sqlquery,
 					  "UPDATE %s.repl_nodes "
 					  "   SET active = FALSE "
 					  " WHERE id = %i ",
 					  get_repmgr_schema_quoted(master_conn),
 					  node_info.node_id);
-	res = PQexec(master_conn, sqlquery);
+	if (!update_node_record_status(master_conn,
-	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+					    local_options.cluster_name,
 					    node_info.node_id,
 					    "standby",
 					    node_info.upstream_node_id,
 					    is_standby(my_local_conn)==1))
 	{
 		log_err(_("unable to set local node %i as inactive on master: %s\n"),
 				node_info.node_id,