Rearrange the logic in do_failover() for further improvements.

Specially, make this a more coordinated process by making all
nodes waiting for the others before going to the next step.

This is one step further in following Andres Freund advices
but there is still a lot to do in order to complete that,
specially it could be needed to add more fields to repl_nodes
and to the shm area.
This commit is contained in:
Jaime Casanova
2013-08-22 14:50:29 -05:00
parent 3b66a31ac9
commit 1afaa3a26f

280
repmgrd.c
View File

@@ -52,8 +52,10 @@ const XLogRecPtr InvalidXLogRecPtr = {0, 0};
typedef struct nodeInfo typedef struct nodeInfo
{ {
int nodeId; int nodeId;
char conninfostr[MAXLEN];
XLogRecPtr xlog_location; XLogRecPtr xlog_location;
bool is_ready; bool is_ready;
bool is_visible;
bool is_witness; bool is_witness;
} nodeInfo; } nodeInfo;
@@ -570,23 +572,22 @@ StandbyMonitor(void)
static void static void
do_failover(void) do_failover(void)
{ {
PGresult *res1; PGresult *res;
PGresult *res2;
char sqlquery[8192]; char sqlquery[8192];
int total_nodes = 0; int total_nodes = 0;
int visible_nodes = 0; int visible_nodes = 0;
int ready_nodes = 0;
bool find_best = false; bool find_best = false;
bool witness = false;
int i; int i;
int r; int r;
int node; uint32 uxlogid;
char nodeConninfo[MAXLEN]; uint32 uxrecoff;
XLogRecPtr xlog_recptr;
unsigned int uxlogid;
unsigned int uxrecoff;
char last_wal_standby_applied[MAXLEN]; char last_wal_standby_applied[MAXLEN];
PGconn *nodeConn = NULL; PGconn *nodeConn = NULL;
@@ -596,108 +597,60 @@ do_failover(void)
* which seems to be large enough for most scenarios * which seems to be large enough for most scenarios
*/ */
nodeInfo nodes[50]; nodeInfo nodes[50];
/* initialize to keep compiler quiet */ /* initialize to keep compiler quiet */
nodeInfo best_candidate = {-1, InvalidXLogRecPtr, false, false}; nodeInfo best_candidate = {-1, "", InvalidXLogRecPtr, false, false, false};
/* first we get info about this node, and update shared memory */
sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
res1 = PQexec(myLocalConn, sqlquery);
if (PQresultStatus(res1) != PGRES_TUPLES_OK)
{
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
PQclear(res1);
sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
update_shared_memory(last_wal_standby_applied);
exit(ERR_DB_QUERY);
}
/* write last location in shared memory */
update_shared_memory(PQgetvalue(res1, 0, 0));
/*
* we sleep the monitor time + one second
* we bet it should be enough for other repmgrd to update their own data
*/
sleep(SLEEP_MONITOR + 1);
/* get a list of standby nodes, including myself */ /* get a list of standby nodes, including myself */
sprintf(sqlquery, "SELECT id, conninfo, witness " sprintf(sqlquery, "SELECT id, conninfo, witness "
" FROM %s.repl_nodes " " FROM %s.repl_nodes "
" WHERE id <> %d " " WHERE cluster = '%s' "
" AND cluster = '%s' "
" ORDER BY priority, id ", " ORDER BY priority, id ",
repmgr_schema, primary_options.node, local_options.cluster_name); repmgr_schema, local_options.cluster_name);
res1 = PQexec(myLocalConn, sqlquery); res = PQexec(myLocalConn, sqlquery);
if (PQresultStatus(res1) != PGRES_TUPLES_OK) if (PQresultStatus(res) != PGRES_TUPLES_OK)
{ {
log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn)); log_err(_("Can't get nodes' info: %s\n"), PQerrorMessage(myLocalConn));
PQclear(res1); PQclear(res);
PQfinish(myLocalConn); PQfinish(myLocalConn);
exit(ERR_DB_QUERY); exit(ERR_DB_QUERY);
} }
log_debug(_("%s: there are %d nodes registered"), progname, PQntuples(res1)); /*
/* ask for the locations */ * total nodes that are registered
for (i = 0; i < PQntuples(res1); i++) */
total_nodes = PQntuples(res);
log_debug(_("%s: there are %d nodes registered\n"), progname, total_nodes);
/* Build an array with the nodes and indicate which ones are visible and ready */
for (i = 0; i < total_nodes; i++)
{ {
node = atoi(PQgetvalue(res1, i, 0)); nodes[i].nodeId = atoi(PQgetvalue(res, i, 0));
strncpy(nodes[i].conninfostr, PQgetvalue(res, i, 1), MAXLEN);
nodes[i].is_witness = (strcmp(PQgetvalue(res, i, 2), "t") == 0) ? true : false;
/* Initialize on false so if we can't reach this node we know that later */ /* Initialize on false so if we can't reach this node we know that later */
nodes[i].is_visible = false;
nodes[i].is_ready = false; nodes[i].is_ready = false;
strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN); nodes[i].xlog_location.xlogid = 0;
witness = (strcmp(PQgetvalue(res1, i, 2), "t") == 0) ? true : false; nodes[i].xlog_location.xrecoff = 0;
log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s"), progname, node, nodeConninfo, (witness) ? "true" : "false"); log_debug(_("%s: node=%d conninfo=\"%s\" witness=%s\n"),
progname, nodes[i].nodeId, nodes[i].conninfostr, (nodes[i].is_witness) ? "true" : "false");
nodeConn = establishDBConnection(nodeConninfo, false); nodeConn = establishDBConnection(nodes[i].conninfostr, false);
/* if we can't see the node just skip it */ /* if we can't see the node just skip it */
if (PQstatus(nodeConn) != CONNECTION_OK) if (PQstatus(nodeConn) != CONNECTION_OK)
continue; continue;
/* the witness will always show 0/0 so avoid a useless query */
if (!witness)
{
sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
res2 = PQexec(nodeConn, sqlquery);
if (PQresultStatus(res2) != PGRES_TUPLES_OK)
{
log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
log_info(_("Connection details: %s\n"), nodeConninfo);
PQclear(res2);
PQfinish(nodeConn);
continue;
}
if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0));
PQclear(res2);
}
else
{
uxlogid = 0;
uxrecoff = 0;
}
visible_nodes++; visible_nodes++;
nodes[i].is_visible = true;
nodes[i].nodeId = node;
nodes[i].xlog_location.xlogid = uxlogid;
nodes[i].xlog_location.xrecoff = uxrecoff;
nodes[i].is_ready = true;
nodes[i].is_witness = witness;
PQfinish(nodeConn); PQfinish(nodeConn);
} }
PQclear(res1); PQclear(res);
/* Close the connection to this server */
PQfinish(myLocalConn);
/* log_debug(_("Total nodes counted: registered=%d, visible=%d\n"), total_nodes, visible_nodes);
* total nodes that are registered, include master which is a node but was
* not counted because it's not a standby
*/
total_nodes = i + 1;
/* /*
* am i on the group that should keep alive? * am i on the group that should keep alive?
@@ -711,16 +664,165 @@ do_failover(void)
exit(ERR_FAILOVER_FAIL); exit(ERR_FAILOVER_FAIL);
} }
/* Query all the nodes to determine which ones are ready */
for (i = 0; i < total_nodes; i++)
{
/* if the node is not visible, skip it */
if (!nodes[i].is_visible)
continue;
if (nodes[i].is_witness)
continue;
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
/* XXX
* This shouldn't happen, if this happens it means this is a major problem
* maybe network outages? anyway, is better for a human to react
*/
if (PQstatus(nodeConn) != CONNECTION_OK)
{
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
exit(ERR_FAILOVER_FAIL);
}
sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
res = PQexec(nodeConn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
log_info(_("Connection details: %s\n"), nodes[i].conninfostr);
PQclear(res);
PQfinish(nodeConn);
exit(ERR_FAILOVER_FAIL);
}
if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0));
log_debug("XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
nodes[i].nodeId, uxlogid, uxlogid, uxrecoff, uxrecoff);
/* If position is 0/0, error */
if (uxlogid == 0 && uxrecoff == 0)
{
PQclear(res);
PQfinish(nodeConn);
log_info(_("InvalidXLogRecPtr detected in a standby\n"));
exit(ERR_FAILOVER_FAIL);
}
nodes[i].xlog_location.xlogid = uxlogid;
nodes[i].xlog_location.xrecoff = uxrecoff;
PQclear(res);
PQfinish(nodeConn);
}
/* last we get info about this node, and update shared memory */
sprintf(sqlquery, "SELECT pg_last_xlog_receive_location()");
res = PQexec(myLocalConn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
PQfinish(myLocalConn);
PQclear(res);
sprintf(last_wal_standby_applied, "'%X/%X'", 0, 0);
update_shared_memory(last_wal_standby_applied);
exit(ERR_DB_QUERY);
}
/* write last location in shared memory */
update_shared_memory(PQgetvalue(res, 0, 0));
PQclear(res);
for (i = 0; i < total_nodes; i++)
{
while (!nodes[i].is_ready)
{
/*
* the witness will always be masked as ready if it's still
* not marked that way and avoid a useless query
*/
if (nodes[i].is_witness)
{
if (!nodes[i].is_ready)
{
nodes[i].is_ready = true;
ready_nodes++;
}
break;
}
/* if the node is not visible, skip it */
if (!nodes[i].is_visible)
break;
/* if the node is ready there is nothing to check, skip it too */
if (nodes[i].is_ready)
break;
nodeConn = establishDBConnection(nodes[i].conninfostr, false);
/* XXX
* This shouldn't happen, if this happens it means this is a major problem
* maybe network outages? anyway, is better for a human to react
*/
if (PQstatus(nodeConn) != CONNECTION_OK)
{
/* XXX */
log_info(_("At this point, it could be some race conditions that are acceptable, assume the node is restarting and starting failover procedure\n"));
break;
}
sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
res = PQexec(nodeConn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(nodeConn));
PQclear(res);
PQfinish(nodeConn);
exit(ERR_DB_QUERY);
}
if (sscanf(PQgetvalue(res, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res, 0, 0));
PQclear(res);
PQfinish(nodeConn);
/* If position is 0/0, keep checking */
if (uxlogid == 0 && uxrecoff == 0)
continue;
xlog_recptr.xlogid = uxlogid;
xlog_recptr.xrecoff = uxrecoff;
if (XLByteLT(nodes[i].xlog_location, xlog_recptr))
{
nodes[i].xlog_location.xlogid = uxlogid;
nodes[i].xlog_location.xrecoff = uxrecoff;
}
log_debug("Last XLog position of node %d: log id=%u (%X), offset=%u (%X)\n",
nodes[i].nodeId, nodes[i].xlog_location.xlogid, nodes[i].xlog_location.xlogid,
nodes[i].xlog_location.xrecoff, nodes[i].xlog_location.xrecoff);
ready_nodes++;
nodes[i].is_ready = true;
}
}
/* Close the connection to this server */
PQfinish(myLocalConn);
/* /*
* determine which one is the best candidate to promote to primary * determine which one is the best candidate to promote to primary
*/ */
for (i = 0; i < total_nodes - 1; i++) for (i = 0; i < total_nodes; i++)
{ {
/* witness is never a good candidate */ /* witness is never a good candidate */
if (nodes[i].is_witness) if (nodes[i].is_witness)
continue; continue;
if (!nodes[i].is_ready) if (!nodes[i].is_ready || !nodes[i].is_visible)
continue; continue;
if (!find_best) if (!find_best)
@@ -759,6 +861,9 @@ do_failover(void)
exit(ERR_FAILOVER_FAIL); exit(ERR_FAILOVER_FAIL);
} }
/* wait */
sleep(5);
if (verbose) if (verbose)
log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"), log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
progname); progname);
@@ -772,6 +877,9 @@ do_failover(void)
} }
else if (find_best) else if (find_best)
{ {
/* wait */
sleep(10);
if (verbose) if (verbose)
log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"), log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
progname, best_candidate.nodeId); progname, best_candidate.nodeId);