From 2f978847b1b4f5c154500dca253d04adcc1e3682 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 15 Nov 2017 10:54:39 +0900 Subject: [PATCH] repmgrd: handle witness server --- repmgrd-physical.c | 405 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 371 insertions(+), 34 deletions(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 63618034..936f34ce 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -72,14 +72,19 @@ static void check_connection(t_node_info *node_info, PGconn **conn); static bool wait_primary_notification(int *new_primary_id); static FailoverState follow_new_primary(int new_primary_id); +static FailoverState witness_follow_new_primary(int new_primary_id); static void reset_node_voting_status(void); void close_connections_physical(); static bool do_primary_failover(void); static bool do_upstream_standby_failover(void); +static bool do_witness_failover(void); static void update_monitoring_history(void); + +static const char * format_failover_state(FailoverState failover_state); + #endif @@ -643,6 +648,7 @@ monitor_streaming_standby(void) _("unable to connect to upstream node \"%s\" (node ID: %i)"), upstream_node_info.node_name, upstream_node_info.node_id); + /* */ if (upstream_node_info.type == STANDBY) { /* XXX possible pre-action event */ @@ -653,6 +659,16 @@ monitor_streaming_standby(void) true, event_details.data); } + else + { + /* primary connection lost - script notification only */ + create_event_record(NULL, + &config_file_options, + config_file_options.node_id, + "repmgrd_upstream_disconnect", + true, + event_details.data); + } log_warning("%s", event_details.data); termPQExpBuffer(&event_details); @@ -964,8 +980,7 @@ loop: log_warning("%s", event_details.data) - create_event_notification( - primary_conn, + create_event_notification(primary_conn, &config_file_options, local_node_info.node_id, "standby_recovery", @@ -1057,21 +1072,118 @@ monitor_streaming_witness(void) */ record_status = 
get_node_record(primary_conn, upstream_node_info.node_id, &upstream_node_info); + + /* Log startup event */ + if (startup_event_logged == false) + { + PQExpBufferData event_details; + + initPQExpBuffer(&event_details); + + appendPQExpBuffer(&event_details, + _("witness monitoring connection to primary node \"%s\" (node ID: %i)"), + upstream_node_info.node_name, + upstream_node_info.node_id); + + create_event_notification(primary_conn, + &config_file_options, + config_file_options.node_id, + "repmgrd_start", + true, + event_details.data); + + startup_event_logged = true; + + log_info("%s", event_details.data); + + termPQExpBuffer(&event_details); + } + monitoring_state = MS_NORMAL; INSTR_TIME_SET_CURRENT(log_status_interval_start); upstream_node_info.node_status = NODE_STATUS_UP; - // XXX startup event - while (true) { if (is_server_available(upstream_node_info.conninfo) == false) { + if (upstream_node_info.node_status == NODE_STATUS_UP) + { + instr_time upstream_node_unreachable_start; + INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start); + + initPQExpBuffer(&event_details); + + upstream_node_info.node_status = NODE_STATUS_UNKNOWN; + + appendPQExpBuffer(&event_details, + _("unable to connect to primary node \"%s\" (node ID: %i)"), + upstream_node_info.node_name, upstream_node_info.node_id); + + create_event_record(NULL, + &config_file_options, + config_file_options.node_id, + "repmgrd_upstream_disconnect", + true, + event_details.data); + + PQfinish(primary_conn); + primary_conn = try_reconnect(&upstream_node_info); + + /* Node has recovered - log and continue */ + if (upstream_node_info.node_status == NODE_STATUS_UP) + { + int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start); + + initPQExpBuffer(&event_details); + + appendPQExpBuffer(&event_details, + _("reconnected to upstream node after %i seconds"), + upstream_node_unreachable_elapsed); + log_notice("%s", event_details.data); + + 
create_event_notification(upstream_conn, + &config_file_options, + config_file_options.node_id, + "repmgrd_upstream_reconnect", + true, + event_details.data); + termPQExpBuffer(&event_details); + + goto loop; + } + + /* still down after reconnect attempt(s) */ + if (upstream_node_info.node_status == NODE_STATUS_DOWN) + { + bool failover_done = false; + + + failover_done = do_witness_failover(); + + /* + * XXX it's possible it will make sense to return in all + * cases to restart monitoring + */ + if (failover_done == true) + { + primary_node_id = get_primary_node_id(local_conn); + return; + } + } + } } + + if (monitoring_state == MS_DEGRADED) + { + // XXX + } loop: + // XXX refresh repmgr.nodes + /* emit "still alive" log message at regular intervals, if requested */ if (config_file_options.log_status_interval > 0) { @@ -1146,7 +1258,6 @@ do_primary_failover(void) failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY; } - /* * node has decided it is a follower, so will await notification from the * candidate that it has promoted itself and can be followed @@ -1204,8 +1315,7 @@ do_primary_failover(void) new_primary_conn = establish_db_connection(new_primary.conninfo, false); - create_event_notification( - new_primary_conn, + create_event_notification(new_primary_conn, &config_file_options, local_node_info.node_id, "standby_disconnect_manual", @@ -1233,11 +1343,12 @@ do_primary_failover(void) } } + log_verbose(LOG_DEBUG, "failover state is %s", + format_failover_state(failover_state)); + switch (failover_state) { case FAILOVER_STATE_PROMOTED: - log_debug("failover state is PROMOTED"); - /* notify former siblings that they should now follow this node */ notify_followers(&standby_nodes, local_node_info.node_id); @@ -1251,7 +1362,6 @@ do_primary_failover(void) return true; case FAILOVER_STATE_PRIMARY_REAPPEARED: - log_debug("failover state is PRIMARY_REAPPEARED"); /* * notify siblings that they should resume following the original @@ -1963,6 +2073,103 @@ 
follow_new_primary(int new_primary_id) } +static FailoverState +witness_follow_new_primary(int new_primary_id) +{ + PQExpBufferData event_details; + + t_node_info new_primary = T_NODE_INFO_INITIALIZER; + RecordStatus record_status = RECORD_NOT_FOUND; + bool new_primary_ok = false; + + record_status = get_node_record(local_conn, new_primary_id, &new_primary); + + if (record_status != RECORD_FOUND) + { + log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"), + new_primary_id); + return FAILOVER_STATE_FOLLOW_FAIL; + } + + /* TODO: check if new_primary_id == failed_primary.node_id? */ + + if (log_type == REPMGR_STDERR && *config_file_options.log_file) + { + fflush(stderr); + } + + upstream_conn = establish_db_connection(new_primary.conninfo, false); + + if (PQstatus(upstream_conn) == CONNECTION_OK) + { + RecoveryType primary_recovery_type = get_recovery_type(upstream_conn); + + if (primary_recovery_type == RECTYPE_PRIMARY) + { + new_primary_ok = true; + } + else + { + log_warning(_("node %i is not running as a primary, unable to follow it"), new_primary_id); + } + } + + if (new_primary_ok == false) + { + PQfinish(upstream_conn); /* libpq requires this even when the connection attempt itself failed */ + upstream_conn = NULL; /* don't leave the global handle dangling */ + return FAILOVER_STATE_FOLLOW_FAIL; + } + + /* set new upstream node ID on primary */ + update_node_record_set_upstream(upstream_conn, local_node_info.node_id, new_primary_id); + + witness_copy_node_records(upstream_conn, local_conn); + + /* + * refresh local copy of local and primary node records - we get these + * directly from the primary to ensure they're the current version + */ + + record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info); + + if (record_status != RECORD_FOUND) + { + log_error(_("unable to retrieve metadata record for node %i"), + new_primary_id); + return FAILOVER_STATE_FOLLOW_FAIL; + } + + record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info); + if (record_status != RECORD_FOUND) + { + log_error(_("unable to retrieve metadata record for 
node %i"), + local_node_info.node_id); + return FAILOVER_STATE_FOLLOW_FAIL; + } + + initPQExpBuffer(&event_details); + appendPQExpBuffer(&event_details, + _("witness node %i now following new primary node %i"), + local_node_info.node_id, + upstream_node_info.node_id); + + log_notice("%s", event_details.data); + + create_event_notification( + upstream_conn, + &config_file_options, + local_node_info.node_id, + "repmgrd_failover_follow", + true, + event_details.data); + + termPQExpBuffer(&event_details); + + return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY; +} + + static const char * _print_election_result(ElectionResult result) { @@ -2002,7 +2209,6 @@ do_election(void) t_node_info *candidate_node = NULL; - /* * Check if at least one server in the primary's location is visible; if * not we'll assume a network split between this node and the primary @@ -2025,11 +2231,6 @@ do_election(void) log_debug("do_election(): electoral term is %i", electoral_term); - /* get all active nodes attached to primary, excluding self */ - get_active_sibling_node_records(local_conn, - local_node_info.node_id, - upstream_node_info.node_id, - &standby_nodes); if (config_file_options.failover == FAILOVER_MANUAL) { @@ -2047,6 +2248,11 @@ do_election(void) return ELECTION_NOT_CANDIDATE; } + /* get all active nodes attached to upstream, excluding self */ + get_active_sibling_node_records(local_conn, + local_node_info.node_id, + upstream_node_info.node_id, + &standby_nodes); log_debug("do_election(): primary location is %s", upstream_node_info.location); @@ -2060,7 +2266,7 @@ do_election(void) */ set_voting_status_initiated(local_conn, electoral_term); - /* no other standbys - normally win by default */ + /* fast path if no other standbys (or witness) exists - normally win by default */ if (standby_nodes.node_count == 0) { if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0) @@ -2070,6 +2276,15 @@ do_election(void) } else { + /* + * If primary and standby have different 
locations set, the assumption + * is that no action should be taken as we can't tell whether there's + * been a network interruption or not. + * + * Normally a situation with primary and standby in different physical + * locations would be handled by leaving the location as "default" and + * setting up a witness server in the primary's location. + */ log_debug("no other nodes, but primary and standby locations differ"); monitoring_state = MS_DEGRADED; @@ -2089,7 +2304,6 @@ do_election(void) for (cell = standby_nodes.head; cell; cell = cell->next) { - /* assume the worst case */ cell->node_info->node_status = NODE_STATUS_UNKNOWN; @@ -2102,10 +2316,29 @@ do_election(void) cell->node_info->node_status = NODE_STATUS_UP; + visible_nodes++; + + /* + * see if the node is in the primary's location (but skip the check if + * we've seen a node there already) + */ + if (primary_location_seen == false) + { + if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0) + { + primary_location_seen = true; + } + } + + /* don't interrogate a witness server */ + if (cell->node_info->type == WITNESS) + { + log_debug("node %i is witness, not querying state", cell->node_info->node_id); + continue; + } /* XXX don't check 0-priority nodes */ - // get node's LSN - // if "higher" than current winner, current node is candidate + /* get node's LSN - if "higher" than current winner, current node is candidate */ cell->node_info->last_wal_receive_lsn = get_last_wal_receive_location(cell->node_info->conn); @@ -2113,7 +2346,7 @@ do_election(void) cell->node_info->node_id, format_lsn(cell->node_info->last_wal_receive_lsn)); - // compare LSN + /* compare LSN */ if (cell->node_info->last_wal_receive_lsn > candidate_node->last_wal_receive_lsn) { /* other node is ahead */ @@ -2123,7 +2356,7 @@ do_election(void) candidate_node = cell->node_info; } - // LSN same - tiebreak on priority, then node_id + /* LSN is same - tiebreak on priority, then node_id */ else 
if(cell->node_info->last_wal_receive_lsn == candidate_node->last_wal_receive_lsn) { log_verbose(LOG_DEBUG, "node %i has same LSN as current candidate %i", @@ -2157,19 +2390,7 @@ do_election(void) candidate_node->priority); } } - /* - * see if the node is in the primary's location (but skip the check if - * we've seen a node there already) - */ - if (primary_location_seen == false) - { - if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0) - { - primary_location_seen = true; - } - } - visible_nodes++; } if (primary_location_seen == false) @@ -2194,6 +2415,85 @@ do_election(void) return ELECTION_LOST; } +/* + * "failover" for the witness node; the witness has no part in the election + * other than being reachable, so just needs to await notification from the + * new primary; returns true when the witness is following a primary (new or original), false otherwise + */ +static bool +do_witness_failover(void) +{ + int new_primary_id = UNKNOWN_NODE_ID; + + /* TODO add pre-event notification here */ + failover_state = FAILOVER_STATE_UNKNOWN; + + if (wait_primary_notification(&new_primary_id) == true) + { + /* if primary has reappeared, no action needed */ + if (new_primary_id == upstream_node_info.node_id) + { + failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY; + } + else + { + failover_state = witness_follow_new_primary(new_primary_id); + } + } + else + { + failover_state = FAILOVER_STATE_NO_NEW_PRIMARY; + } + + + log_verbose(LOG_DEBUG, "failover state is %s", + format_failover_state(failover_state)); + + switch (failover_state) + { + case FAILOVER_STATE_PRIMARY_REAPPEARED: + /* pass control back down to start_monitoring() */ + log_info(_("resuming witness monitoring mode")); + log_detail(_("original primary \"%s\" (node ID: %i) reappeared"), + upstream_node_info.node_name, upstream_node_info.node_id); + + failover_state = FAILOVER_STATE_NONE; + return true; + + + case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY: + log_info(_("resuming witness monitoring mode")); + log_detail(_("following new primary \"%s\" (node id: %i)"), + 
upstream_node_info.node_name, upstream_node_info.node_id); + failover_state = FAILOVER_STATE_NONE; + + return true; + + case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY: + log_info(_("resuming witness monitoring mode")); + log_detail(_("following original primary \"%s\" (node id: %i)"), + upstream_node_info.node_name, upstream_node_info.node_id); + failover_state = FAILOVER_STATE_NONE; + + return true; + case FAILOVER_STATE_FOLLOW_FAIL: + + /* + * for whatever reason we were unable to follow the new primary - + * continue monitoring in degraded state + */ + monitoring_state = MS_DEGRADED; + INSTR_TIME_SET_CURRENT(degraded_monitoring_start); + + return false; + + default: + return false; + } + /* should never reach here */ + return false; +} + static void reset_node_voting_status(void) @@ -2241,6 +2541,43 @@ check_connection(t_node_info *node_info, PGconn **conn) } +static const char * +format_failover_state(FailoverState failover_state) +{ + switch(failover_state) + { + case FAILOVER_STATE_UNKNOWN: + return "UNKNOWN"; + case FAILOVER_STATE_NONE: + return "NONE"; + case FAILOVER_STATE_PROMOTED: + return "PROMOTED"; + case FAILOVER_STATE_PROMOTION_FAILED: + return "PROMOTION_FAILED"; + case FAILOVER_STATE_PRIMARY_REAPPEARED: + return "PRIMARY_REAPPEARED"; + case FAILOVER_STATE_LOCAL_NODE_FAILURE: + return "LOCAL_NODE_FAILURE"; + case FAILOVER_STATE_WAITING_NEW_PRIMARY: + return "WAITING_NEW_PRIMARY"; + case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER: + return "REQUIRES_MANUAL_FAILOVER"; + case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY: + return "FOLLOWED_NEW_PRIMARY"; + case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY: + return "FOLLOWING_ORIGINAL_PRIMARY"; + case FAILOVER_STATE_NO_NEW_PRIMARY: + return "NO_NEW_PRIMARY"; + case FAILOVER_STATE_FOLLOW_FAIL: + return "FOLLOW_FAIL"; + case FAILOVER_STATE_NODE_NOTIFICATION_ERROR: + return "NODE_NOTIFICATION_ERROR"; + } + + /* only reached for a FailoverState value that has no case above */ + return "UNKNOWN_FAILOVER_STATE"; +} + #endif /* #ifndef BDR_ONLY */ void