diff --git a/config.c b/config.c index e857cdea..e5d49e4b 100644 --- a/config.c +++ b/config.c @@ -268,9 +268,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * options->event_notifications.tail = NULL; /* barman settings */ + /* --------------- */ memset(options->barman_server, 0, sizeof(options->barman_server)); memset(options->barman_config, 0, sizeof(options->barman_config)); + /* undocumented test settings */ + /* -------------------------- */ + options->promote_delay = 0; + /* * If no configuration file available (user didn't specify and none found * in the default locations), return with default values @@ -455,6 +460,10 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * else if (strcmp(name, "barman_config") == 0) strncpy(options->barman_config, value, MAXLEN); + /* undocumented test settings */ + else if (strcmp(name, "promote_delay") == 0) + options->promote_delay = repmgr_atoi(value, name, error_list, 1); + /* Following parameters have been deprecated or renamed from 3.x - issue a warning */ else if (strcmp(name, "cluster") == 0) { diff --git a/config.h b/config.h index c836837e..2b84ee89 100644 --- a/config.h +++ b/config.h @@ -106,6 +106,9 @@ typedef struct char barman_host[MAXLEN]; char barman_server[MAXLEN]; char barman_config[MAXLEN]; + + /* undocumented test settings */ + int promote_delay; } t_configuration_options; /* @@ -131,7 +134,10 @@ typedef struct /* bdr settings */ \ BDR_MONITORING_LOCAL, \ /* barman settings */ \ - "", "", "" } + "", "", "", \ + /* undocumented test settings */ \ + 0 \ + } diff --git a/dbutils.c b/dbutils.c index 8bff59c7..39903bb9 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1714,6 +1714,8 @@ clear_node_info_list(NodeInfoList *nodes) NodeInfoListCell *cell; NodeInfoListCell *next_cell; + log_debug("clear_node_info_list() - closing open connections"); + /* close any open connections */ for (cell = nodes->head; cell; cell = cell->next) { @@ -1724,6 +1726,8 @@ clear_node_info_list(NodeInfoList *nodes) } } + log_debug("clear_node_info_list() - unlinking"); + cell = nodes->head; while (cell != NULL) @@ -1733,7 +1737,8 @@ clear_node_info_list(NodeInfoList *nodes) pfree(cell); cell = next_cell; } - + nodes->head = NULL; + nodes->tail = NULL; nodes->node_count = 0; } @@ -2493,6 +2498,32 @@ get_new_primary(PGconn *conn, int *primary_node_id) } +void +reset_voting_status(PGconn *conn) +{ + PQExpBufferData query; + PGresult *res; + + initPQExpBuffer(&query); + + appendPQExpBuffer(&query, + "SELECT repmgr.reset_voting_status()"); + + res = PQexec(conn, query.data); + termPQExpBuffer(&query); + + // COMMAND_OK? + if (!res || PQresultStatus(res) != PGRES_TUPLES_OK) + { + log_error(_("unable to execute repmgr..reset_voting_status():\n %s"), + PQerrorMessage(conn)); + } + + PQclear(res); + return; +} + + /* ============================ */ /* replication status functions */ /* ============================ */ diff --git a/dbutils.h b/dbutils.h index f0b80f0e..36f9e064 100644 --- a/dbutils.h +++ b/dbutils.h @@ -249,6 +249,7 @@ int set_voting_status_initiated(PGconn *conn); bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term); void notify_follow_primary(PGconn *conn, int primary_node_id); bool get_new_primary(PGconn *conn, int *primary_node_id); +void reset_voting_status(PGconn *conn); /* replication status functions */ diff --git a/repmgr--4.0.sql b/repmgr--4.0.sql index 23ee0cc4..d985227c 100644 --- a/repmgr--4.0.sql +++ b/repmgr--4.0.sql @@ -65,3 +65,8 @@ CREATE FUNCTION get_new_primary() RETURNS INT AS '$libdir/repmgr', 'get_new_primary' LANGUAGE C STRICT; + +CREATE FUNCTION reset_voting_status() + RETURNS VOID + AS '$libdir/repmgr', 'reset_voting_status' + LANGUAGE C STRICT; diff --git a/repmgr.c b/repmgr.c index bae7121c..328c66db 100644 --- a/repmgr.c +++ b/repmgr.c @@ -80,8 +80,8 @@ PG_FUNCTION_INFO_V1(notify_follow_primary); Datum get_new_primary(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(get_new_primary); -//Datum set_new_primary(PG_FUNCTION_ARGS); -//PG_FUNCTION_INFO_V1(set_new_primary); +Datum reset_voting_status(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(reset_voting_status); /* * Module load callback @@ -153,9 +153,9 @@ repmgr_shmem_startup(void) shared_state->lock = LWLockAssign(); #endif + shared_state->current_electoral_term = 0; shared_state->voting_status = VS_NO_VOTE; shared_state->candidate_node_id = UNKNOWN_NODE_ID; - shared_state->current_electoral_term = 0; shared_state->follow_new_primary = false; } @@ -315,3 +315,18 @@ get_new_primary(PG_FUNCTION_ARGS) PG_RETURN_INT32(new_primary_node_id); } + + +Datum +reset_voting_status(PG_FUNCTION_ARGS) +{ + LWLockAcquire(shared_state->lock, LW_SHARED); + + shared_state->voting_status = VS_NO_VOTE; + shared_state->candidate_node_id = UNKNOWN_NODE_ID; + shared_state->follow_new_primary = false; + + LWLockRelease(shared_state->lock); + + PG_RETURN_VOID(); +} diff --git a/repmgrd.c b/repmgrd.c index 2f305934..47351d8a 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -33,9 +33,11 @@ typedef enum { FAILOVER_STATE_PRIMARY_REAPPEARED, FAILOVER_STATE_LOCAL_NODE_FAILURE, FAILOVER_STATE_WAITING_NEW_PRIMARY, - FAILOVER_STATE_NO_NEW_PRIMARY, FAILOVER_STATE_FOLLOWED_NEW_PRIMARY, - FAILOVER_STATE_FOLLOW_FAIL + FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY, + FAILOVER_STATE_NO_NEW_PRIMARY, + FAILOVER_STATE_FOLLOW_FAIL, + FAILOVER_STATE_NODE_NOTIFICATION_ERROR } FailoverState; @@ -96,13 +98,15 @@ static const char *_print_voting_status(NodeVotingStatus voting_status); static const char *_print_election_result(ElectionResult result); static FailoverState promote_self(void); -static void notify_followers(NodeInfoList *standby_nodes); +static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id); static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes); static bool wait_primary_notification(int *new_primary_id); static FailoverState follow_new_primary(int new_primary_id); +static void reset_node_voting_status(void); + static void close_connections(); static void terminate(int retval); @@ -436,10 +440,10 @@ start_monitoring(void) local_node_info.node_name, local_node_info.node_id); - failover_state = FAILOVER_STATE_NONE; - while(true) { + reset_node_voting_status(); + switch (local_node_info.type) { case PRIMARY: @@ -471,17 +475,26 @@ monitor_streaming_primary(void) /* Log startup event */ if (startup_event_logged == false) { + PQExpBufferData event_details; + initPQExpBuffer(&event_details); + + appendPQExpBuffer(&event_details, + _("monitoring cluster primary \"%s\" (node ID: %i)"), + local_node_info.node_name, + local_node_info.node_id); + create_event_record(local_conn, &config_file_options, config_file_options.node_id, "repmgrd_start", true, - NULL); + event_details.data); + startup_event_logged = true; - log_notice(_("monitoring cluster primary \"%s\" (node ID: %i)"), - local_node_info.node_name, - local_node_info.node_id); + log_notice("%s", event_details.data); + + termPQExpBuffer(&event_details); } INSTR_TIME_SET_CURRENT(log_status_interval_start); @@ -502,13 +515,12 @@ monitor_streaming_primary(void) INSTR_TIME_SET_CURRENT(local_node_unreachable_start); - initPQExpBuffer(&event_details); appendPQExpBuffer(&event_details, _("unable to connect to local node")); - log_warning(event_details.data); + log_warning("%s", event_details.data); node_status = NODE_STATUS_UNKNOWN; @@ -540,7 +552,7 @@ monitor_streaming_primary(void) appendPQExpBuffer(&event_details, _("reconnected to local node after %i seconds"), (int)local_node_unreachable_elapsed); - log_notice(event_details.data); + log_notice("%s", event_details.data); create_event_record(local_conn, &config_file_options, @@ -603,19 +615,26 @@ monitor_streaming_standby(void) /* Log startup event */ if (startup_event_logged == false) { + PQExpBufferData event_details; + initPQExpBuffer(&event_details); + + appendPQExpBuffer(&event_details, + _("monitoring upstream node \"%s\" (node ID: %i)"), + upstream_node_info.node_name, + upstream_node_info.node_id); + create_event_record(upstream_conn, &config_file_options, config_file_options.node_id, "repmgrd_start", true, - NULL); + event_details.data); + startup_event_logged = true; - log_notice(_("repmgrd on node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"), - local_node_info.node_name, - local_node_info.node_id, - upstream_node_info.node_name, - upstream_node_info.node_id); + log_notice("%s", event_details.data); + + termPQExpBuffer(&event_details); } INSTR_TIME_SET_CURRENT(log_status_interval_start); @@ -647,6 +666,8 @@ monitor_streaming_standby(void) { /* attempt to initiate voting process */ ElectionResult election_result = do_election(); + + /* XXX add pre-event notification here */ failover_state = FAILOVER_STATE_UNKNOWN; log_debug("election result: %s", _print_election_result(election_result)); @@ -673,7 +694,10 @@ monitor_streaming_standby(void) best_candidate = poll_best_candidate(&standby_nodes); - /* this can occur in a tie-break situation, after we establish this node has priority*/ + /* + * this can occur in a tie-break situation, where this node establishes + * it is the best candidate + */ if (best_candidate->node_id == local_node_info.node_id) { log_notice("I am the best candidate, will now promote self and inform other nodes"); @@ -687,25 +711,36 @@ monitor_streaming_standby(void) log_info("node %i is the best candidate, waiting for it to confirm so I can follow it", best_candidate->node_id); - // notify candidate + /* notify the best candidate so it */ - // XXX check result candidate_conn = establish_db_connection(best_candidate->conninfo, false); - notify_follow_primary(candidate_conn, best_candidate->node_id); + if (PQstatus(candidate_conn) == CONNECTION_OK) + { + notify_follow_primary(candidate_conn, best_candidate->node_id); + /* we'll wait for the candidate to get back to us */ + failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY; + } + else + { + log_error(_("unable to connect to candidate node (ID: %i)"), best_candidate->node_id); + failover_state = FAILOVER_STATE_NODE_NOTIFICATION_ERROR; + } PQfinish(candidate_conn); - // we'll wait for the candidate to get back to us - failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY; } } else { - log_info("I am a follower and am waiting to be informed by the winner"); + log_info(_("follower node awaiting notification from the candidate node")); failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY; } + /* + * node has decided it is a follower, so will await notification + * from the candidate that it has promoted itself and can be followed + */ if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY) { int new_primary_id; @@ -715,10 +750,15 @@ monitor_streaming_standby(void) /* either follow or time out; either way resume monitoring */ if (wait_primary_notification(&new_primary_id) == true) { - // if new_primary_id is self, promote - if (new_primary_id == local_node_info.node_id) + /* if primary has reappeared, no action needed */ + if (new_primary_id == upstream_node_info.node_id) { - log_notice("looks like I'm the promotion candidate, promoting"); + failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY; + } + /* if new_primary_id is self, promote */ + else if (new_primary_id == local_node_info.node_id) + { + log_notice(_("this node is promotion candidate, promoting")); failover_state = promote_self(); @@ -744,9 +784,11 @@ monitor_streaming_standby(void) switch(failover_state) { case FAILOVER_STATE_PROMOTED: - /* inform former siblings that we are Number 1 */ + log_debug("failover state is PROMOTED"); + + /* notify former siblings that they should now follow this node */ + notify_followers(&standby_nodes, local_node_info.node_id); - notify_followers(&standby_nodes); /* we no longer care about our former siblings */ clear_node_info_list(&standby_nodes); @@ -755,19 +797,50 @@ monitor_streaming_standby(void) failover_state = FAILOVER_STATE_NONE; return; + + case FAILOVER_STATE_PRIMARY_REAPPEARED: + log_debug("failover state is PRIMARY_REAPPEARED"); + + /* notify siblings that they should resume following the original primary */ + notify_followers(&standby_nodes, upstream_node_info.node_id); + + /* we no longer care about our former siblings */ + clear_node_info_list(&standby_nodes); + + /* pass control back down to start_monitoring() */ + log_info(_("resuming standby monitoring mode")); + log_detail(_("original primary \"%s\" (node ID: %i) reappeared"), + upstream_node_info.node_name, upstream_node_info.node_id); + + failover_state = FAILOVER_STATE_NONE; + return; + case FAILOVER_STATE_PROMOTION_FAILED: - log_debug("failover state is FAILED"); + log_debug("failover state is PROMOTION FAILED"); break; + case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY: log_info(_("resuming standby monitoring mode")); + log_detail(_("following new primary \"%s\" (node id: %i)"), + upstream_node_info.node_name, upstream_node_info.node_id); failover_state = FAILOVER_STATE_NONE; - break; + + return; + + case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY: + log_info(_("resuming standby monitoring mode")); + log_detail(_("following original primary \"%s\" (node id: %i)"), + upstream_node_info.node_name, upstream_node_info.node_id); + failover_state = FAILOVER_STATE_NONE; + + return; + case FAILOVER_STATE_NO_NEW_PRIMARY: case FAILOVER_STATE_WAITING_NEW_PRIMARY: /* pass control back down to start_monitoring() */ // -> should kick off new election return; - case FAILOVER_STATE_PRIMARY_REAPPEARED: + case FAILOVER_STATE_LOCAL_NODE_FAILURE: case FAILOVER_STATE_UNKNOWN: case FAILOVER_STATE_NONE: @@ -799,6 +872,7 @@ monitor_streaming_standby(void) upstream_node_info.node_name, upstream_node_info.node_id); + //log_debug( INSTR_TIME_SET_CURRENT(log_status_interval_start); } } @@ -817,6 +891,18 @@ promote_self(void) t_node_info failed_primary = T_NODE_INFO_INITIALIZER; RecordStatus record_status; + /* + * optionally add a delay before promoting the standby; this is mainly + * useful for testing (e.g. for reappearance of the original primary) + * and is not documented. + */ + if (config_file_options.promote_delay > 0) + { + log_debug("sleeping %i seconds before promoting standby", + config_file_options.promote_delay); + sleep(config_file_options.promote_delay); + } + // XXX check success record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary); @@ -859,12 +945,26 @@ promote_self(void) if (PQstatus(primary_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id) { - log_notice(_("original primary reappeared before this standby was promoted - no action taken")); + log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"), + failed_primary.node_id); - /* XXX log an event here? */ + initPQExpBuffer(&event_details); + appendPQExpBuffer(&event_details, + _("original primary \"%s\" (node ID: %i) reappeared"), + failed_primary.node_name, + failed_primary.node_id); - PQfinish(primary_conn); - primary_conn = NULL; + create_event_record(primary_conn, + &config_file_options, + local_node_info.node_id, + "repmgrd_failover_abort", + true, + event_details.data); + + termPQExpBuffer(&event_details); + + //PQfinish(primary_conn); + //primary_conn = NULL; // XXX handle this! // -> we'll need to let the other nodes know too.... @@ -885,36 +985,46 @@ promote_self(void) /* update own internal node record */ record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info); - // XXX we're assuming the promote command updated metadata + /* + * XXX here we're assuming the promote command updated metadata + */ appendPQExpBuffer(&event_details, _("node %i promoted to primary; old primary %i marked as failed"), local_node_info.node_id, failed_primary.node_id); - /* my_local_conn is now the master */ + + /* local_conn is now the primary connection */ create_event_record(local_conn, &config_file_options, local_node_info.node_id, "repmgrd_failover_promote", true, event_details.data); + termPQExpBuffer(&event_details); return FAILOVER_STATE_PROMOTED; } +/* + * Notify follower nodes about which node to follow. Normally this + * will be the current node, however if the original primary reappeared + * before this node could be promoted, we'll inform the followers they + * should resume monitoring the original primary. + */ static void -notify_followers(NodeInfoList *standby_nodes) +notify_followers(NodeInfoList *standby_nodes, int follow_node_id) { NodeInfoListCell *cell; log_debug("notify_followers()"); for (cell = standby_nodes->head; cell; cell = cell->next) { - log_debug("intending to notify %i... ", cell->node_info->node_id); + log_debug("intending to notify node %i... ", cell->node_info->node_id); if (PQstatus(cell->node_info->conn) != CONNECTION_OK) { - log_debug("connection to %i lost... ", cell->node_info->node_id); + log_debug("reconnecting to node %i... ", cell->node_info->node_id); cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false); } @@ -925,8 +1035,10 @@ notify_followers(NodeInfoList *standby_nodes) continue; } - log_debug("notifying node %i to follow new primary", cell->node_info->node_id); - notify_follow_primary(cell->node_info->conn, local_node_info.node_id); + + log_debug("notifying node %i to follow node %i", + cell->node_info->node_id, follow_node_id); + notify_follow_primary(cell->node_info->conn, follow_node_id); } } @@ -1333,6 +1445,20 @@ do_election(void) } +static void +reset_node_voting_status(void) +{ + failover_state = FAILOVER_STATE_NONE; + + if (PQstatus(local_conn) != CONNECTION_OK) + { + log_error(_("reset_node_voting_status(): local_conn not set")); + return; + } + reset_voting_status(local_conn); +} + + static void daemonize_process(void) {