mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 00:46:29 +00:00
Merge pull request #131 from martinmarques/fix-failed-standby
Fix failed standby
This commit is contained in:
68
repmgrd.c
68
repmgrd.c
@@ -71,7 +71,7 @@ static void check_node_configuration(void);
|
|||||||
static void standby_monitor(void);
|
static void standby_monitor(void);
|
||||||
static void witness_monitor(void);
|
static void witness_monitor(void);
|
||||||
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
|
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
|
||||||
static bool set_local_node_failed(void);
|
static bool set_local_node_status(void);
|
||||||
|
|
||||||
static void update_shared_memory(char *last_wal_standby_applied);
|
static void update_shared_memory(char *last_wal_standby_applied);
|
||||||
static void update_registration(void);
|
static void update_registration(void);
|
||||||
@@ -685,23 +685,17 @@ standby_monitor(void)
|
|||||||
{
|
{
|
||||||
PQExpBufferData errmsg;
|
PQExpBufferData errmsg;
|
||||||
|
|
||||||
set_local_node_failed();
|
set_local_node_status();
|
||||||
|
|
||||||
initPQExpBuffer(&errmsg);
|
initPQExpBuffer(&errmsg);
|
||||||
|
|
||||||
appendPQExpBuffer(&errmsg,
|
appendPQExpBuffer(&errmsg,
|
||||||
_("failed to connect to local node, node marked as failed and terminating!"));
|
_("failed to connect to local node, node marked as failed!"));
|
||||||
|
|
||||||
log_err("%s\n", errmsg.data);
|
log_err("%s\n", errmsg.data);
|
||||||
|
|
||||||
create_event_record(master_conn,
|
//terminate(ERR_DB_CON);
|
||||||
&local_options,
|
goto continue_monitoring_standby;
|
||||||
local_options.node,
|
|
||||||
"repmgrd_shutdown",
|
|
||||||
false,
|
|
||||||
errmsg.data);
|
|
||||||
|
|
||||||
terminate(ERR_DB_CON);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
upstream_conn = get_upstream_connection(my_local_conn,
|
upstream_conn = get_upstream_connection(my_local_conn,
|
||||||
@@ -830,6 +824,7 @@ standby_monitor(void)
|
|||||||
|
|
||||||
PQfinish(upstream_conn);
|
PQfinish(upstream_conn);
|
||||||
|
|
||||||
|
continue_monitoring_standby:
|
||||||
/* Check if we still are a standby, we could have been promoted */
|
/* Check if we still are a standby, we could have been promoted */
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
@@ -845,10 +840,13 @@ standby_monitor(void)
|
|||||||
* will require manual resolution as there's no way of determining
|
* will require manual resolution as there's no way of determining
|
||||||
* which master is the correct one.
|
* which master is the correct one.
|
||||||
*
|
*
|
||||||
|
* We should log a message so the user knows of the situation at hand.
|
||||||
|
*
|
||||||
* XXX check if the original master is still active and display a
|
* XXX check if the original master is still active and display a
|
||||||
* warning
|
* warning
|
||||||
*/
|
*/
|
||||||
log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
|
log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
|
||||||
|
log_err(_("Check your cluster and manually fix any anomaly.\n"));
|
||||||
terminate(1);
|
terminate(1);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@@ -858,17 +856,28 @@ standby_monitor(void)
|
|||||||
|
|
||||||
if (!check_connection(&my_local_conn, "standby", NULL))
|
if (!check_connection(&my_local_conn, "standby", NULL))
|
||||||
{
|
{
|
||||||
set_local_node_failed();
|
set_local_node_status();
|
||||||
terminate(0);
|
/*
|
||||||
|
* Let's continue checking, and if the postgres server on the
|
||||||
|
* standby comes back up, we will activate it again
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
}
|
}
|
||||||
} while (ret == -1);
|
} while (ret == -1);
|
||||||
|
|
||||||
if (did_retry)
|
if (did_retry)
|
||||||
{
|
{
|
||||||
log_info(_("standby connection recovered!\n"));
|
/*
|
||||||
|
* There's a possible situation where the standby went down for some reason
|
||||||
|
* (maintenance for example) and is now up and maybe connected once again to
|
||||||
|
* the stream. If we set the local standby node as failed and it's now running
|
||||||
|
* and receiving replication data, we should activate it again.
|
||||||
|
*/
|
||||||
|
set_local_node_status();
|
||||||
|
log_info(_("standby connection recovered!\n"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Fast path for the case where no history is requested */
|
/* Fast path for the case where no history is requested */
|
||||||
@@ -1769,7 +1778,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* set_local_node_failed()
|
* set_local_node_status()
|
||||||
*
|
*
|
||||||
* If failure of the local node is detected, attempt to connect
|
* If failure of the local node is detected, attempt to connect
|
||||||
* to the current master server (as stored in the global variable
|
* to the current master server (as stored in the global variable
|
||||||
@@ -1777,16 +1786,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
set_local_node_failed(void)
|
set_local_node_status(void)
|
||||||
{
|
{
|
||||||
PGresult *res;
|
PGresult *res;
|
||||||
char sqlquery[QUERY_STR_LEN];
|
char sqlquery[QUERY_STR_LEN];
|
||||||
int active_master_node_id = NODE_NOT_FOUND;
|
int active_master_node_id = NODE_NOT_FOUND;
|
||||||
char master_conninfo[MAXLEN];
|
char master_conninfo[MAXLEN];
|
||||||
|
|
||||||
if (!check_connection(&master_conn, "master", NULL))
|
if (!check_connection(&master_conn, "master", NULL))
|
||||||
{
|
{
|
||||||
log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
|
log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1840,17 +1849,16 @@ set_local_node_failed(void)
|
|||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Attempt to set own record as inactive
|
* Attempt to set the active record to the correct value.
|
||||||
|
* First
|
||||||
*/
|
*/
|
||||||
sqlquery_snprintf(sqlquery,
|
|
||||||
"UPDATE %s.repl_nodes "
|
if (!update_node_record_status(master_conn,
|
||||||
" SET active = FALSE "
|
local_options.cluster_name,
|
||||||
" WHERE id = %i ",
|
node_info.node_id,
|
||||||
get_repmgr_schema_quoted(master_conn),
|
"standby",
|
||||||
node_info.node_id);
|
node_info.upstream_node_id,
|
||||||
|
is_standby(my_local_conn)==1))
|
||||||
res = PQexec(master_conn, sqlquery);
|
|
||||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
|
||||||
{
|
{
|
||||||
log_err(_("unable to set local node %i as inactive on master: %s\n"),
|
log_err(_("unable to set local node %i as inactive on master: %s\n"),
|
||||||
node_info.node_id,
|
node_info.node_id,
|
||||||
|
|||||||
Reference in New Issue
Block a user