Don't terminate a standby's repmgrd if self-promotion fails due to master reappearing

Per GitHub #173
This commit is contained in:
Ian Barwick
2016-05-10 11:45:03 +09:00
parent 10e47441a2
commit d5e24689a4

View File

@@ -1473,6 +1473,8 @@ do_master_failover(void)
terminate(ERR_FAILOVER_FAIL);
}
log_debug("best candidate node id is %i\n", best_candidate.node_id);
/* if local node is the best candidate, promote it */
if (best_candidate.node_id == local_options.node)
{
@@ -1484,7 +1486,7 @@ do_master_failover(void)
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
log_debug(_("promote command is: \"%s\"\n"),
log_debug("promote command is: \"%s\"\n",
local_options.promote_command);
if (log_type == REPMGR_STDERR && *local_options.logfile)
@@ -1495,6 +1497,33 @@ do_master_failover(void)
r = system(local_options.promote_command);
if (r != 0)
{
int master_node_id;
/*
* Check whether the primary reappeared, which will have caused the
* promote command to fail
*/
my_local_conn = establish_db_connection(local_options.conninfo, false);
if (my_local_conn != NULL)
{
master_conn = get_master_connection(my_local_conn,
local_options.cluster_name,
&master_node_id, NULL);
if (master_conn != NULL && master_node_id == failed_master.node_id)
{
log_notice(_("Original master reappeared before this standby was promoted - no action taken\n"));
PQfinish(master_conn);
/* no failover occurred but we'll want to restart connections */
failover_done = true;
return;
}
PQfinish(my_local_conn);
}
log_err(_("promote command failed. You could check and try it manually.\n"));
terminate(ERR_DB_QUERY);