From d99024ba1123a3f44fce198d9285659fe9ee0c7d Mon Sep 17 00:00:00 2001 From: Jaime Casanova Date: Mon, 9 Sep 2013 11:10:20 -0500 Subject: [PATCH] Make repmgrd survive to the failover To do this it needs to reconnect to the new master --- repmgrd.c | 239 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 131 insertions(+), 108 deletions(-) diff --git a/repmgrd.c b/repmgrd.c index 1f60162a..b55e9342 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -81,6 +81,8 @@ bool verbose = false; bool monitoring_history = false; char repmgr_schema[MAXLEN]; +bool failover_done = false; + /* * should initialize with {0} to be ANSI complaint ? but this raises * error with gcc -Wall @@ -203,63 +205,37 @@ main(int argc, char **argv) exit(ERR_BAD_CONFIG); } + /* - * Set my server mode, establish a connection to primary - * and start monitor - */ - if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node)) - myLocalMode = WITNESS_MODE; - else if (is_standby(myLocalConn)) - myLocalMode = STANDBY_MODE; - else /* is the master */ - myLocalMode = PRIMARY_MODE; - - switch (myLocalMode) + * MAIN LOOP + * This loops cicles once per failover and at startup + * Requisites: + * - myLocalConn needs to be already setted with an active connection + * - no master connection + */ + do { - case PRIMARY_MODE: - primary_options.node = local_options.node; - strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN); - primaryConn = myLocalConn; - - checkClusterConfiguration(myLocalConn, primaryConn); - checkNodeConfiguration(local_options.conninfo); - - if (reload_configuration(config_file, &local_options)) - { - PQfinish(myLocalConn); - myLocalConn = establishDBConnection(local_options.conninfo, true); - primaryConn = myLocalConn; - update_registration(); - } - - log_info(_("%s Starting continuous primary connection check\n"), progname); - /* Check that primary is still alive, and standbies are sending info */ /* - * Every SLEEP_MONITOR seconds, do master checks - * XXX - * Check that standbies are sending info - */ - for (;;) + * Set my server mode, establish a connection to primary + * and start monitor + */ + if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node)) + myLocalMode = WITNESS_MODE; + else if (is_standby(myLocalConn)) + myLocalMode = STANDBY_MODE; + else /* is the master */ + myLocalMode = PRIMARY_MODE; + + switch (myLocalMode) { - if (CheckPrimaryConnection()) - { - /* - CheckActiveStandbiesConnections(); - CheckInactiveStandbies(); - */ - sleep(SLEEP_MONITOR); - } - else - { - /* XXX - * May we do something more verbose ? - */ - exit (1); - } - - if (got_SIGHUP) - { - /* if we can reload, then could need to change myLocalConn */ + case PRIMARY_MODE: + primary_options.node = local_options.node; + strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN); + primaryConn = myLocalConn; + + checkClusterConfiguration(myLocalConn, primaryConn); + checkNodeConfiguration(local_options.conninfo); + if (reload_configuration(config_file, &local_options)) { PQfinish(myLocalConn); @@ -267,70 +243,112 @@ main(int argc, char **argv) primaryConn = myLocalConn; update_registration(); } - got_SIGHUP = false; - } - } - break; - case WITNESS_MODE: - case STANDBY_MODE: - /* I need the id of the primary as well as a connection to it */ - log_info(_("%s Connecting to primary for cluster '%s'\n"), - progname, local_options.cluster_name); - primaryConn = getMasterConnection(myLocalConn, repmgr_schema, - local_options.cluster_name, - &primary_options.node, NULL); - if (primaryConn == NULL) - { - CloseConnections(); - exit(ERR_BAD_CONFIG); - } + + log_info(_("%s Starting continuous primary connection check\n"), progname); + + /* Check that primary is still alive, and standbies are sending info */ + + /* + * Every SLEEP_MONITOR seconds, do master checks + * XXX + * Check that standbies are sending info + */ + do + { + if (CheckPrimaryConnection()) + { + /* + CheckActiveStandbiesConnections(); + CheckInactiveStandbies(); + */ + sleep(SLEEP_MONITOR); + } + else + { + /* XXX + * May we do something more verbose ? + */ + exit(1); + } + + if (got_SIGHUP) + { + /* if we can reload, then could need to change myLocalConn */ + if (reload_configuration(config_file, &local_options)) + { + PQfinish(myLocalConn); + myLocalConn = establishDBConnection(local_options.conninfo, true); + primaryConn = myLocalConn; + update_registration(); + } + got_SIGHUP = false; + } + } while (!failover_done); + break; + case WITNESS_MODE: + case STANDBY_MODE: + /* I need the id of the primary as well as a connection to it */ + log_info(_("%s Connecting to primary for cluster '%s'\n"), + progname, local_options.cluster_name); + primaryConn = getMasterConnection(myLocalConn, repmgr_schema, + local_options.cluster_name, + &primary_options.node, NULL); + if (primaryConn == NULL) + { + CloseConnections(); + exit(ERR_BAD_CONFIG); + } - checkClusterConfiguration(myLocalConn, primaryConn); - checkNodeConfiguration(local_options.conninfo); + checkClusterConfiguration(myLocalConn, primaryConn); + checkNodeConfiguration(local_options.conninfo); - if (reload_configuration(config_file, &local_options)) - { - PQfinish(myLocalConn); - myLocalConn = establishDBConnection(local_options.conninfo, true); - update_registration(); - } - - /* - * Every SLEEP_MONITOR seconds, do checks - */ - if (myLocalMode == WITNESS_MODE) - { - log_info(_("%s Starting continuous witness node monitoring\n"), progname); - } - else if (myLocalMode == STANDBY_MODE) - { - log_info(_("%s Starting continuous standby node monitoring\n"), progname); - } - - for (;;) - { - if (myLocalMode == WITNESS_MODE) - WitnessMonitor(); - else if (myLocalMode == STANDBY_MODE) - StandbyMonitor(); - sleep(SLEEP_MONITOR); - - if (got_SIGHUP) - { - /* if we can reload, then could need to change myLocalConn */ if (reload_configuration(config_file, &local_options)) { PQfinish(myLocalConn); myLocalConn = establishDBConnection(local_options.conninfo, true); update_registration(); } - got_SIGHUP = false; - } + + /* + * Every SLEEP_MONITOR seconds, do checks + */ + if (myLocalMode == WITNESS_MODE) + { + log_info(_("%s Starting continuous witness node monitoring\n"), progname); + } + else if (myLocalMode == STANDBY_MODE) + { + log_info(_("%s Starting continuous standby node monitoring\n"), progname); + } + + do + { + if (myLocalMode == WITNESS_MODE) + WitnessMonitor(); + else if (myLocalMode == STANDBY_MODE) + StandbyMonitor(); + sleep(SLEEP_MONITOR); + + if (got_SIGHUP) + { + /* if we can reload, then could need to change myLocalConn */ + if (reload_configuration(config_file, &local_options)) + { + PQfinish(myLocalConn); + myLocalConn = establishDBConnection(local_options.conninfo, true); + update_registration(); + } + got_SIGHUP = false; + } + } while (!failover_done); + break; + default: + log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node); } - break; - default: - log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node); - } + + failover_done = false; + + } while (true); /* Prevent a double-free */ if (primaryConn == myLocalConn) @@ -481,6 +499,7 @@ StandbyMonitor(void) * a new primaryConn */ do_failover(); + return; } } @@ -901,6 +920,9 @@ do_failover(void) exit(ERR_FAILOVER_FAIL); } + /* to force it to re-calculate mode and master node */ + failover_done = true; + /* and reconnect to the local database */ myLocalConn = establishDBConnection(local_options.conninfo, true); } @@ -1089,6 +1111,7 @@ static void handle_sigint(SIGNAL_ARGS) { CloseConnections(); + logger_shutdown(); exit(1); }