From a666a49977092ce017d24d250570a57d6d837343 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Fri, 30 Jun 2017 16:04:47 +0900 Subject: [PATCH] Execute promote command --- repmgr.c | 9 ++--- repmgrd.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 108 insertions(+), 16 deletions(-) diff --git a/repmgr.c b/repmgr.c index 53eaee85..3e6f0f0d 100644 --- a/repmgr.c +++ b/repmgr.c @@ -167,8 +167,6 @@ request_vote(PG_FUNCTION_ARGS) LWLockAcquire(shared_state->lock, LW_SHARED); - // keep lock until end of function? - /* this node has initiated voting or already responded to another node */ if (current_electoral_term == shared_state->current_electoral_term && shared_state->voting_status != VS_NO_VOTE) @@ -188,19 +186,18 @@ request_vote(PG_FUNCTION_ARGS) initStringInfo(&query); appendStringInfo( &query, - "SELECT pg_catalog.pg_last_wal_receive_lsn()" - ); + "SELECT pg_catalog.pg_last_wal_receive_lsn()"); elog(INFO, "query: %s", query.data); ret = SPI_execute(query.data, true, 0); - // xxx handle errors + // XXX handle errors our_lsn = DatumGetLSN(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull)); - elog(INFO, "Our LSN is %X/%X", + elog(INFO, "Our LSN is %X/%X", (uint32) (our_lsn >> 32), (uint32) our_lsn); diff --git a/repmgrd.c b/repmgrd.c index 693833fe..6e5105df 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -43,6 +43,7 @@ static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER; static PGconn *upstream_conn = NULL; static PGconn *primary_conn = NULL; + static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER; /* Collate command line errors here for friendlier reporting */ @@ -76,6 +77,8 @@ static ElectionResult do_election(void); static const char *_print_voting_status(NodeVotingStatus voting_status); static const char *_print_election_result(ElectionResult result); +static void promote_self(void); + static void close_connections(); static void terminate(int retval); @@ -306,7 +309,7 @@ main(int argc, char 
**argv) if (record_status != RECORD_FOUND) { log_error(_("no metadata record found for this node - terminating")); - log_hint(_("Check that 'repmgr (primary|standby) register' was executed for this node")); + log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node")); PQfinish(local_conn); terminate(ERR_BAD_CONFIG); @@ -348,7 +351,6 @@ main(int argc, char **argv) } } - if (config_file_options.failover_mode == FAILOVER_AUTOMATIC) { /* @@ -378,6 +380,7 @@ main(int argc, char **argv) } } + if (daemonize == true) { daemonize_process(); @@ -491,7 +494,6 @@ monitor_streaming_primary(void) static void monitor_streaming_standby(void) { - NodeStatus upstream_node_status = NODE_STATUS_UP; // check result @@ -544,7 +546,7 @@ monitor_streaming_standby(void) /* still down after reconnect attempt(s) - */ if (upstream_node_status == NODE_STATUS_DOWN) { - // begin voting process + /* attempt to initiate voting process */ ElectionResult election_result = do_election(); @@ -552,7 +554,9 @@ monitor_streaming_standby(void) if (election_result == ELECTION_WON) { - log_info("I am the winner, will now promote self and inform other nodes"); + log_notice("I am the winner, will now promote self and inform other nodes"); + + promote_self(); } else if (election_result == ELECTION_LOST) { @@ -560,13 +564,10 @@ monitor_streaming_standby(void) } else { + // --> need timeout in case new primary doesn't come up, then rerun election + log_info("I am a follower and am waiting to be informed by the winner"); } - // if ELECTION_WON - // promote self, notify nodes - - // else if ELECTION_NOT_CANDIDATE, wait for new primary notification - // --> need timeout in case new primary doesn't come up, then rerun election } } @@ -577,6 +578,100 @@ monitor_streaming_standby(void) } }
+/*
+ * Execute the configured promote command, then either record the promotion
+ * as an event or, if the command failed, check whether the old primary is back.
+ */
+static void
+promote_self(void)
+{
+	char *promote_command;
+	int r;
+
+	/* Store details of the failed node here */
+	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
+	RecordStatus record_status;
+
+	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
+	if (record_status != RECORD_FOUND)
+	{
+		/* don't abandon the promotion: fall back to the ID used for the lookup */
+		log_error(_("unable to retrieve record for failed upstream node %i"), local_node_info.upstream_node_id);
+		failed_primary.node_id = local_node_info.upstream_node_id;
+	}
+
+	/* the presence of either of these commands has been established already */
+	if (config_file_options.service_promote_command[0] != '\0')
+		promote_command = config_file_options.service_promote_command;
+	else
+		promote_command = config_file_options.promote_command;
+
+	log_debug("promote command is:\n \"%s\"", promote_command);
+
+	/* flush pending log output before running the external command */
+	if (log_type == REPMGR_STDERR && *config_file_options.logfile)
+		fflush(stderr);
+
+	r = system(promote_command);
+
+	/* connection should stay up, but check just in case */
+	if (PQstatus(local_conn) != CONNECTION_OK)
+	{
+		/* release the dead connection (and clear the global) so its PGconn is not leaked */
+		PQfinish(local_conn);
+		local_conn = NULL;
+		local_conn = establish_db_connection(local_node_info.conninfo, true);
+
+		/* assume node failed */
+		if (PQstatus(local_conn) != CONNECTION_OK)
+		{
+			log_error(_("unable to reconnect to local node"));
+			/* XXX handle this */
+			return;
+		}
+	}
+
+	if (r != 0)
+	{
+		int primary_node_id;
+
+		primary_conn = get_primary_connection(local_conn, &primary_node_id, NULL);
+
+		if (primary_conn != NULL && primary_node_id == failed_primary.node_id)
+		{
+			log_notice(_("original primary reappeared before this standby was promoted - no action taken"));
+
+			PQfinish(primary_conn);
+			primary_conn = NULL;
+
+			/* XXX log an event here? We'll need to let the other nodes know too; */
+			/* no failover occurred but we'll want to restart connections */
+			return;
+		}
+
+		/* XXX handle this: check if somehow primary; otherwise go for new election? */
+		log_error(_("promote command failed"));
+	}
+	else
+	{
+		PQExpBufferData event_details;
+
+		/* update own internal node record (XXX result is not checked) */
+		record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
+
+		/* XXX we're assuming the promote command updated metadata */
+		initPQExpBuffer(&event_details);
+		appendPQExpBuffer(&event_details,
+						  _("node %i promoted to primary; old primary %i marked as failed"),
+						  local_node_info.node_id, failed_primary.node_id);
+
+		/* local_conn is now connected to the primary */
+		create_event_record(local_conn, &config_file_options, local_node_info.node_id,
+							"repmgrd_failover_promote", true, event_details.data);
+		termPQExpBuffer(&event_details);
+	}
+}
+
 static const char * _print_voting_status(NodeVotingStatus voting_status)