Merge pull request #131 from martinmarques/fix-failed-standby

Fix failed standby
This commit is contained in:
Martín Marqués
2015-12-29 13:24:08 -03:00

View File

@@ -71,7 +71,7 @@ static void check_node_configuration(void);
static void standby_monitor(void); static void standby_monitor(void);
static void witness_monitor(void); static void witness_monitor(void);
static bool check_connection(PGconn **conn, const char *type, const char *conninfo); static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
static bool set_local_node_failed(void); static bool set_local_node_status(void);
static void update_shared_memory(char *last_wal_standby_applied); static void update_shared_memory(char *last_wal_standby_applied);
static void update_registration(void); static void update_registration(void);
@@ -685,23 +685,17 @@ standby_monitor(void)
{ {
PQExpBufferData errmsg; PQExpBufferData errmsg;
set_local_node_failed(); set_local_node_status();
initPQExpBuffer(&errmsg); initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg, appendPQExpBuffer(&errmsg,
_("failed to connect to local node, node marked as failed and terminating!")); _("failed to connect to local node, node marked as failed!"));
log_err("%s\n", errmsg.data); log_err("%s\n", errmsg.data);
create_event_record(master_conn, //terminate(ERR_DB_CON);
&local_options, goto continue_monitoring_standby;
local_options.node,
"repmgrd_shutdown",
false,
errmsg.data);
terminate(ERR_DB_CON);
} }
upstream_conn = get_upstream_connection(my_local_conn, upstream_conn = get_upstream_connection(my_local_conn,
@@ -830,6 +824,7 @@ standby_monitor(void)
PQfinish(upstream_conn); PQfinish(upstream_conn);
continue_monitoring_standby:
/* Check if we still are a standby, we could have been promoted */ /* Check if we still are a standby, we could have been promoted */
do do
{ {
@@ -845,10 +840,13 @@ standby_monitor(void)
* will require manual resolution as there's no way of determining * will require manual resolution as there's no way of determining
* which master is the correct one. * which master is the correct one.
* *
* We should log a message so the user knows of the situation at hand.
*
* XXX check if the original master is still active and display a * XXX check if the original master is still active and display a
* warning * warning
*/ */
log_err(_("It seems like we have been promoted, so exit from monitoring...\n")); log_err(_("It seems this server was promoted manually (not by repmgr) so you might by in the presence of a split-brain.\n"));
log_err(_("Check your cluster and manually fix any anomaly.\n"));
terminate(1); terminate(1);
break; break;
@@ -858,17 +856,28 @@ standby_monitor(void)
if (!check_connection(&my_local_conn, "standby", NULL)) if (!check_connection(&my_local_conn, "standby", NULL))
{ {
set_local_node_failed(); set_local_node_status();
terminate(0); /*
* Let's continue checking, and if the postgres server on the
* standby comes back up, we will activate it again
*/
} }
break; break;
} }
} while (ret == -1); } while (ret == -1);
if (did_retry) if (did_retry)
{ {
log_info(_("standby connection recovered!\n")); /*
* There's a possible situation where the standby went down for some reason
* (maintenance for example) and is now up and maybe connected once again to
* the stream. If we set the local standby node as failed and it's now running
* and receiving replication data, we should activate it again.
*/
set_local_node_status();
log_info(_("standby connection recovered!\n"));
} }
/* Fast path for the case where no history is requested */ /* Fast path for the case where no history is requested */
@@ -1769,7 +1778,7 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
/* /*
* set_local_node_failed() * set_local_node_status()
* *
* If failure of the local node is detected, attempt to connect * If failure of the local node is detected, attempt to connect
* to the current master server (as stored in the global variable * to the current master server (as stored in the global variable
@@ -1777,16 +1786,16 @@ check_connection(PGconn **conn, const char *type, const char *conninfo)
*/ */
static bool static bool
set_local_node_failed(void) set_local_node_status(void)
{ {
PGresult *res; PGresult *res;
char sqlquery[QUERY_STR_LEN]; char sqlquery[QUERY_STR_LEN];
int active_master_node_id = NODE_NOT_FOUND; int active_master_node_id = NODE_NOT_FOUND;
char master_conninfo[MAXLEN]; char master_conninfo[MAXLEN];
if (!check_connection(&master_conn, "master", NULL)) if (!check_connection(&master_conn, "master", NULL))
{ {
log_err(_("set_local_node_failed(): Unable to connect to last known master node\n")); log_err(_("set_local_node_status(): Unable to connect to last known master node\n"));
return false; return false;
} }
@@ -1840,17 +1849,16 @@ set_local_node_failed(void)
/* /*
* Attempt to set own record as inactive * Attempt to set the active record to the correct value.
* First
*/ */
sqlquery_snprintf(sqlquery,
"UPDATE %s.repl_nodes "
" SET active = FALSE "
" WHERE id = %i ",
get_repmgr_schema_quoted(master_conn),
node_info.node_id);
res = PQexec(master_conn, sqlquery); if (!update_node_record_status(master_conn,
if (PQresultStatus(res) != PGRES_COMMAND_OK) local_options.cluster_name,
node_info.node_id,
"standby",
node_info.upstream_node_id,
is_standby(my_local_conn)==1))
{ {
log_err(_("unable to set local node %i as inactive on master: %s\n"), log_err(_("unable to set local node %i as inactive on master: %s\n"),
node_info.node_id, node_info.node_id,