Make repmgrd survive a failover

To do this, it needs to reconnect to the new master.
This commit is contained in:
Jaime Casanova
2013-09-09 11:10:20 -05:00
parent 1afaa3a26f
commit d99024ba11

233
repmgrd.c
View File

@@ -81,6 +81,8 @@ bool verbose = false;
bool monitoring_history = false; bool monitoring_history = false;
char repmgr_schema[MAXLEN]; char repmgr_schema[MAXLEN];
bool failover_done = false;
/* /*
* should initialize with {0} to be ANSI compliant ? but this raises * should initialize with {0} to be ANSI compliant ? but this raises
* error with gcc -Wall * error with gcc -Wall
@@ -203,63 +205,37 @@ main(int argc, char **argv)
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
/* /*
* Set my server mode, establish a connection to primary * MAIN LOOP
* and start monitor * This loop cycles once per failover and at startup
*/ * Requisites:
if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node)) * - myLocalConn must already be set to an active connection
myLocalMode = WITNESS_MODE; * - no master connection
else if (is_standby(myLocalConn)) */
myLocalMode = STANDBY_MODE; do
else /* is the master */
myLocalMode = PRIMARY_MODE;
switch (myLocalMode)
{ {
case PRIMARY_MODE:
primary_options.node = local_options.node;
strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
primaryConn = myLocalConn;
checkClusterConfiguration(myLocalConn, primaryConn);
checkNodeConfiguration(local_options.conninfo);
if (reload_configuration(config_file, &local_options))
{
PQfinish(myLocalConn);
myLocalConn = establishDBConnection(local_options.conninfo, true);
primaryConn = myLocalConn;
update_registration();
}
log_info(_("%s Starting continuous primary connection check\n"), progname);
/* Check that primary is still alive, and standbies are sending info */
/* /*
* Every SLEEP_MONITOR seconds, do master checks * Set my server mode, establish a connection to primary
* XXX * and start monitor
* Check that standbies are sending info */
*/ if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
for (;;) myLocalMode = WITNESS_MODE;
{ else if (is_standby(myLocalConn))
if (CheckPrimaryConnection()) myLocalMode = STANDBY_MODE;
{ else /* is the master */
/* myLocalMode = PRIMARY_MODE;
CheckActiveStandbiesConnections();
CheckInactiveStandbies(); switch (myLocalMode)
*/ {
sleep(SLEEP_MONITOR); case PRIMARY_MODE:
} primary_options.node = local_options.node;
else strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
{ primaryConn = myLocalConn;
/* XXX
* May we do something more verbose ? checkClusterConfiguration(myLocalConn, primaryConn);
*/ checkNodeConfiguration(local_options.conninfo);
exit (1);
}
if (got_SIGHUP)
{
/* if we can reload, then could need to change myLocalConn */
if (reload_configuration(config_file, &local_options)) if (reload_configuration(config_file, &local_options))
{ {
PQfinish(myLocalConn); PQfinish(myLocalConn);
@@ -267,70 +243,112 @@ main(int argc, char **argv)
primaryConn = myLocalConn; primaryConn = myLocalConn;
update_registration(); update_registration();
} }
got_SIGHUP = false;
}
}
break;
case WITNESS_MODE:
case STANDBY_MODE:
/* I need the id of the primary as well as a connection to it */
log_info(_("%s Connecting to primary for cluster '%s'\n"),
progname, local_options.cluster_name);
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
local_options.cluster_name,
&primary_options.node, NULL);
if (primaryConn == NULL)
{
CloseConnections();
exit(ERR_BAD_CONFIG);
}
checkClusterConfiguration(myLocalConn, primaryConn); log_info(_("%s Starting continuous primary connection check\n"), progname);
checkNodeConfiguration(local_options.conninfo);
if (reload_configuration(config_file, &local_options)) /* Check that primary is still alive, and standbies are sending info */
{
PQfinish(myLocalConn);
myLocalConn = establishDBConnection(local_options.conninfo, true);
update_registration();
}
/* /*
* Every SLEEP_MONITOR seconds, do checks * Every SLEEP_MONITOR seconds, do master checks
*/ * XXX
if (myLocalMode == WITNESS_MODE) * Check that standbies are sending info
{ */
log_info(_("%s Starting continuous witness node monitoring\n"), progname); do
} {
else if (myLocalMode == STANDBY_MODE) if (CheckPrimaryConnection())
{ {
log_info(_("%s Starting continuous standby node monitoring\n"), progname); /*
} CheckActiveStandbiesConnections();
CheckInactiveStandbies();
*/
sleep(SLEEP_MONITOR);
}
else
{
/* XXX
* May we do something more verbose ?
*/
exit(1);
}
for (;;) if (got_SIGHUP)
{ {
if (myLocalMode == WITNESS_MODE) /* if we can reload, then could need to change myLocalConn */
WitnessMonitor(); if (reload_configuration(config_file, &local_options))
else if (myLocalMode == STANDBY_MODE) {
StandbyMonitor(); PQfinish(myLocalConn);
sleep(SLEEP_MONITOR); myLocalConn = establishDBConnection(local_options.conninfo, true);
primaryConn = myLocalConn;
update_registration();
}
got_SIGHUP = false;
}
} while (!failover_done);
break;
case WITNESS_MODE:
case STANDBY_MODE:
/* I need the id of the primary as well as a connection to it */
log_info(_("%s Connecting to primary for cluster '%s'\n"),
progname, local_options.cluster_name);
primaryConn = getMasterConnection(myLocalConn, repmgr_schema,
local_options.cluster_name,
&primary_options.node, NULL);
if (primaryConn == NULL)
{
CloseConnections();
exit(ERR_BAD_CONFIG);
}
checkClusterConfiguration(myLocalConn, primaryConn);
checkNodeConfiguration(local_options.conninfo);
if (got_SIGHUP)
{
/* if we can reload, then could need to change myLocalConn */
if (reload_configuration(config_file, &local_options)) if (reload_configuration(config_file, &local_options))
{ {
PQfinish(myLocalConn); PQfinish(myLocalConn);
myLocalConn = establishDBConnection(local_options.conninfo, true); myLocalConn = establishDBConnection(local_options.conninfo, true);
update_registration(); update_registration();
} }
got_SIGHUP = false;
} /*
* Every SLEEP_MONITOR seconds, do checks
*/
if (myLocalMode == WITNESS_MODE)
{
log_info(_("%s Starting continuous witness node monitoring\n"), progname);
}
else if (myLocalMode == STANDBY_MODE)
{
log_info(_("%s Starting continuous standby node monitoring\n"), progname);
}
do
{
if (myLocalMode == WITNESS_MODE)
WitnessMonitor();
else if (myLocalMode == STANDBY_MODE)
StandbyMonitor();
sleep(SLEEP_MONITOR);
if (got_SIGHUP)
{
/* if we can reload, then could need to change myLocalConn */
if (reload_configuration(config_file, &local_options))
{
PQfinish(myLocalConn);
myLocalConn = establishDBConnection(local_options.conninfo, true);
update_registration();
}
got_SIGHUP = false;
}
} while (!failover_done);
break;
default:
log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
} }
break;
default: failover_done = false;
log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
} } while (true);
/* Prevent a double-free */ /* Prevent a double-free */
if (primaryConn == myLocalConn) if (primaryConn == myLocalConn)
@@ -481,6 +499,7 @@ StandbyMonitor(void)
* a new primaryConn * a new primaryConn
*/ */
do_failover(); do_failover();
return;
} }
} }
@@ -901,6 +920,9 @@ do_failover(void)
exit(ERR_FAILOVER_FAIL); exit(ERR_FAILOVER_FAIL);
} }
/* to force it to re-calculate mode and master node */
failover_done = true;
/* and reconnect to the local database */ /* and reconnect to the local database */
myLocalConn = establishDBConnection(local_options.conninfo, true); myLocalConn = establishDBConnection(local_options.conninfo, true);
} }
@@ -1089,6 +1111,7 @@ static void
handle_sigint(SIGNAL_ARGS) handle_sigint(SIGNAL_ARGS)
{ {
CloseConnections(); CloseConnections();
logger_shutdown();
exit(1); exit(1);
} }