Merge pull request #131 from martinmarques/fix-failed-standby

Fix failed standby
This commit is contained in:
Martín Marqués
2015-12-29 13:24:08 -03:00

View File

@@ -71,7 +71,7 @@ static void check_node_configuration(void);
static void standby_monitor(void);
static void witness_monitor(void);
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
static bool set_local_node_failed(void);
static bool set_local_node_status(void);
static void update_shared_memory(char *last_wal_standby_applied);
static void update_registration(void);
@@ -685,23 +685,17 @@ standby_monitor(void)
{
PQExpBufferData errmsg;
set_local_node_failed();
set_local_node_status();
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
_("failed to connect to local node, node marked as failed and terminating!"));
_("failed to connect to local node, node marked as failed!"));
log_err("%s\n", errmsg.data);
create_event_record(master_conn,
&local_options,
local_options.node,
"repmgrd_shutdown",
false,
errmsg.data);
terminate(ERR_DB_CON);
//terminate(ERR_DB_CON);
goto continue_monitoring_standby;
}
upstream_conn = get_upstream_connection(my_local_conn,
@@ -830,6 +824,7 @@ standby_monitor(void)
PQfinish(upstream_conn);
continue_monitoring_standby:
/* Check if we still are a standby, we could have been promoted */
do
{
@@ -845,10 +840,13 @@ standby_monitor(void)
* will require manual resolution as there's no way of determing
* which master is the correct one.
*
* We should log a message so the user knows of the situation at hand.
*
* XXX check if the original master is still active and display a
* warning
*/
log_err(_("It seems like we have been promoted, so exit from monitoring...\n"));
log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
log_err(_("Check your cluster and manually fix any anomaly.\n"));
terminate(1);
break;
@@ -858,17 +856,28 @@ standby_monitor(void)
if (!check_connection(&my_local_conn, "standby", NULL))
{
set_local_node_failed();
terminate(0);
set_local_node_status();
/*
* Let's continue checking, and if the postgres server on the
* standby comes back up, we will activate it again
*/
}
break;
}
} while (ret == -1);
if (did_retry)
{
log_info(_("standby connection recovered!\n"));
/*
* There's a possible situation where the standby went down for some reason
* (maintanence for example) and is now up and maybe connected once again to
* the stream. If we set the local standby node as failed and it's now running
* and receiving replication data, we should activate it again.
*/
set_local_node_status();
log_info(_("standby connection recovered!\n"));
}
/* Fast path for the case where no history is requested */
@@ -1769,7 +1778,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
/*
* set_local_node_failed()
* set_local_node_status()
*
* If failure of the local node is detected, attempt to connect
* to the current master server (as stored in the global variable
@@ -1777,16 +1786,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
*/
static bool
set_local_node_failed(void)
set_local_node_status(void)
{
PGresult *res;
PGresult *res;
char sqlquery[QUERY_STR_LEN];
int active_master_node_id = NODE_NOT_FOUND;
int active_master_node_id = NODE_NOT_FOUND;
char master_conninfo[MAXLEN];
if (!check_connection(&master_conn, "master", NULL))
{
log_err(_("set_local_node_failed(): Unable to connect to last known master node\n"));
log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
return false;
}
@@ -1840,17 +1849,16 @@ set_local_node_failed(void)
/*
* Attempt to set own record as inactive
* Attempt to set the active record to the correct value.
* First
*/
sqlquery_snprintf(sqlquery,
"UPDATE %s.repl_nodes "
" SET active = FALSE "
" WHERE id = %i ",
get_repmgr_schema_quoted(master_conn),
node_info.node_id);
res = PQexec(master_conn, sqlquery);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
if (!update_node_record_status(master_conn,
local_options.cluster_name,
node_info.node_id,
"standby",
node_info.upstream_node_id,
is_standby(my_local_conn)==1))
{
log_err(_("unable to set local node %i as inactive on master: %s\n"),
node_info.node_id,