repmgrd: handle witness server

This commit is contained in:
Ian Barwick
2017-11-15 10:54:39 +09:00
parent 3014f72fda
commit 2f978847b1

View File

@@ -72,14 +72,19 @@ static void check_connection(t_node_info *node_info, PGconn **conn);
static bool wait_primary_notification(int *new_primary_id);
static FailoverState follow_new_primary(int new_primary_id);
static FailoverState witness_follow_new_primary(int new_primary_id);
static void reset_node_voting_status(void);
void close_connections_physical();
static bool do_primary_failover(void);
static bool do_upstream_standby_failover(void);
static bool do_witness_failover(void);
static void update_monitoring_history(void);
static const char * format_failover_state(FailoverState failover_state);
#endif
@@ -643,6 +648,7 @@ monitor_streaming_standby(void)
_("unable to connect to upstream node \"%s\" (node ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
/* */
if (upstream_node_info.type == STANDBY)
{
/* XXX possible pre-action event */
@@ -653,6 +659,16 @@ monitor_streaming_standby(void)
true,
event_details.data);
}
else
{
/* primary connection lost - script notification only */
create_event_record(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_disconnect",
true,
event_details.data);
}
log_warning("%s", event_details.data);
termPQExpBuffer(&event_details);
@@ -964,8 +980,7 @@ loop:
log_warning("%s", event_details.data)
create_event_notification(
primary_conn,
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_recovery",
@@ -1057,21 +1072,118 @@ monitor_streaming_witness(void)
*/
record_status = get_node_record(primary_conn, upstream_node_info.node_id, &upstream_node_info);
/* Log startup event */
if (startup_event_logged == false)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("witness monitoring connection to primary node \"%s\" (node ID: %i)"),
upstream_node_info.node_name,
upstream_node_info.node_id);
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_start",
true,
event_details.data);
startup_event_logged = true;
log_info("%s", event_details.data);
termPQExpBuffer(&event_details);
}
monitoring_state = MS_NORMAL;
INSTR_TIME_SET_CURRENT(log_status_interval_start);
upstream_node_info.node_status = NODE_STATUS_UP;
// XXX startup event
while (true)
{
if (is_server_available(upstream_node_info.conninfo) == false)
{
if (upstream_node_info.node_status == NODE_STATUS_UP)
{
instr_time upstream_node_unreachable_start;
INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
initPQExpBuffer(&event_details);
upstream_node_info.node_status = NODE_STATUS_UNKNOWN;
appendPQExpBuffer(&event_details,
_("unable to connect to primary node \"%s\" (node ID: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
create_event_record(NULL,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_disconnect",
true,
event_details.data);
PQfinish(primary_conn);
primary_conn = try_reconnect(&upstream_node_info);
/* Node has recovered - log and continue */
if (upstream_node_info.node_status == NODE_STATUS_UP)
{
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to upstream node after %i seconds"),
upstream_node_unreachable_elapsed);
log_notice("%s", event_details.data);
create_event_notification(upstream_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
goto loop;
}
/* still down after reconnect attempt(s) */
if (upstream_node_info.node_status == NODE_STATUS_DOWN)
{
bool failover_done = false;
failover_done = do_witness_failover();
/*
* XXX it's possible it will make sense to return in all
* cases to restart monitoring
*/
if (failover_done == true)
{
primary_node_id = get_primary_node_id(local_conn);
return;
}
}
}
}
if (monitoring_state == MS_DEGRADED)
{
// XXX
}
loop:
// XXX refresh repmgr.nodes
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
@@ -1146,7 +1258,6 @@ do_primary_failover(void)
failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
}
/*
* node has decided it is a follower, so will await notification from the
* candidate that it has promoted itself and can be followed
@@ -1204,8 +1315,7 @@ do_primary_failover(void)
new_primary_conn = establish_db_connection(new_primary.conninfo, false);
create_event_notification(
new_primary_conn,
create_event_notification(new_primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_disconnect_manual",
@@ -1233,11 +1343,12 @@ do_primary_failover(void)
}
}
log_verbose(LOG_DEBUG, "failover state is %s",
format_failover_state(failover_state));
switch (failover_state)
{
case FAILOVER_STATE_PROMOTED:
log_debug("failover state is PROMOTED");
/* notify former siblings that they should now follow this node */
notify_followers(&standby_nodes, local_node_info.node_id);
@@ -1251,7 +1362,6 @@ do_primary_failover(void)
return true;
case FAILOVER_STATE_PRIMARY_REAPPEARED:
log_debug("failover state is PRIMARY_REAPPEARED");
/*
* notify siblings that they should resume following the original
@@ -1963,6 +2073,103 @@ follow_new_primary(int new_primary_id)
}
static FailoverState
witness_follow_new_primary(int new_primary_id)
{
PQExpBufferData event_details;
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND;
bool new_primary_ok = false;
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
new_primary_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* TODO: check if new_primary_id == failed_primary.node_id? */
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
{
fflush(stderr);
}
upstream_conn = establish_db_connection(new_primary.conninfo, false);
if (PQstatus(upstream_conn) == CONNECTION_OK)
{
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
if (primary_recovery_type == RECTYPE_PRIMARY)
{
new_primary_ok = true;
}
else
{
new_primary_ok = false;
log_warning(_("new primary is not in recovery"));
PQfinish(upstream_conn);
}
}
if (new_primary_ok == false)
{
return FAILOVER_STATE_FOLLOW_FAIL;
}
/* set new upstream node ID on primary */
update_node_record_set_upstream(upstream_conn, local_node_info.node_id, new_primary_id);
witness_copy_node_records(upstream_conn, local_conn);
/*
* refresh local copy of local and primary node records - we get these
* directly from the primary to ensure they're the current version
*/
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record found for node %i"),
new_primary_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record found for node %i"),
local_node_info.node_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("witness node %i now following new primary node %i"),
local_node_info.node_id,
upstream_node_info.node_id);
log_notice("%s", event_details.data);
create_event_notification(
upstream_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
termPQExpBuffer(&event_details);
return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
}
static const char *
_print_election_result(ElectionResult result)
{
@@ -2002,7 +2209,6 @@ do_election(void)
t_node_info *candidate_node = NULL;
/*
* Check if at least one server in the primary's location is visible; if
* not we'll assume a network split between this node and the primary
@@ -2025,11 +2231,6 @@ do_election(void)
log_debug("do_election(): electoral term is %i", electoral_term);
/* get all active nodes attached to primary, excluding self */
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
upstream_node_info.node_id,
&standby_nodes);
if (config_file_options.failover == FAILOVER_MANUAL)
{
@@ -2047,6 +2248,11 @@ do_election(void)
return ELECTION_NOT_CANDIDATE;
}
/* get all active nodes attached to upstream, excluding self */
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
upstream_node_info.node_id,
&standby_nodes);
log_debug("do_election(): primary location is %s", upstream_node_info.location);
@@ -2060,7 +2266,7 @@ do_election(void)
*/
set_voting_status_initiated(local_conn, electoral_term);
/* no other standbys - normally win by default */
/* fast path if no other standbys (or witness) exists - normally win by default */
if (standby_nodes.node_count == 0)
{
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
@@ -2070,6 +2276,15 @@ do_election(void)
}
else
{
/*
* If primary and standby have different locations set, the assumption
* is that no action should be taken as we can't tell whether there's
* been a network interruption or not.
*
* Normally a situation with primary and standby in different physical
* locations would be handled by leaving the location as "default" and
* setting up a witness server in the primary's location.
*/
log_debug("no other nodes, but primary and standby locations differ");
monitoring_state = MS_DEGRADED;
@@ -2089,7 +2304,6 @@ do_election(void)
for (cell = standby_nodes.head; cell; cell = cell->next)
{
/* assume the worst case */
cell->node_info->node_status = NODE_STATUS_UNKNOWN;
@@ -2102,10 +2316,29 @@ do_election(void)
cell->node_info->node_status = NODE_STATUS_UP;
visible_nodes++;
/*
* see if the node is in the primary's location (but skip the check if
* we've seen a node there already)
*/
if (primary_location_seen == false)
{
if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
{
primary_location_seen = true;
}
}
/* don't interrogate a witness server */
if (cell->node_info->type == WITNESS)
{
log_debug("node %i is witness, not querying state", cell->node_info->node_id);
continue;
}
/* XXX don't check 0-priority nodes */
// get node's LSN
// if "higher" than current winner, current node is candidate
/* get node's LSN - if "higher" than current winner, current node is candidate */
cell->node_info->last_wal_receive_lsn = get_last_wal_receive_location(cell->node_info->conn);
@@ -2113,7 +2346,7 @@ do_election(void)
cell->node_info->node_id,
format_lsn(cell->node_info->last_wal_receive_lsn));
// compare LSN
/* compare LSN */
if (cell->node_info->last_wal_receive_lsn > candidate_node->last_wal_receive_lsn)
{
/* other node is ahead */
@@ -2123,7 +2356,7 @@ do_election(void)
candidate_node = cell->node_info;
}
// LSN same - tiebreak on priority, then node_id
/* LSN is same - tiebreak on priority, then node_id */
else if(cell->node_info->last_wal_receive_lsn == candidate_node->last_wal_receive_lsn)
{
log_verbose(LOG_DEBUG, "node %i has same LSN as current candidate %i",
@@ -2157,19 +2390,7 @@ do_election(void)
candidate_node->priority);
}
}
/*
* see if the node is in the primary's location (but skip the check if
* we've seen a node there already)
*/
if (primary_location_seen == false)
{
if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
{
primary_location_seen = true;
}
}
visible_nodes++;
}
if (primary_location_seen == false)
@@ -2194,6 +2415,85 @@ do_election(void)
return ELECTION_LOST;
}
/*
* "failover" for the witness node; the witness has no part in the election
* other than being reachable, so just needs to await notification from the
* new primary
*/
static
bool do_witness_failover(void)
{
int new_primary_id = UNKNOWN_NODE_ID;
/* TODO add pre-event notification here */
failover_state = FAILOVER_STATE_UNKNOWN;
if (wait_primary_notification(&new_primary_id) == true)
{
/* if primary has reappeared, no action needed */
if (new_primary_id == upstream_node_info.node_id)
{
failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
}
else
{
failover_state = witness_follow_new_primary(new_primary_id);
}
}
else
{
failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
}
log_verbose(LOG_DEBUG, "failover state is %s",
format_failover_state(failover_state));
switch (failover_state)
{
case FAILOVER_STATE_PRIMARY_REAPPEARED:
/* pass control back down to start_monitoring() */
log_info(_("resuming witness monitoring mode"));
log_detail(_("original primary \"%s\" (node ID: %i) reappeared"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
return true;
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
log_info(_("resuming standby monitoring mode"));
log_detail(_("following new primary \"%s\" (node id: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
return true;
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
log_info(_("resuming witness monitoring mode"));
log_detail(_("following original primary \"%s\" (node id: %i)"),
upstream_node_info.node_name, upstream_node_info.node_id);
failover_state = FAILOVER_STATE_NONE;
return true;
case FAILOVER_STATE_FOLLOW_FAIL:
/*
* for whatever reason we were unable to follow the new primary -
* continue monitoring in degraded state
*/
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return false;
default:
return false;
}
/* should never reach here */
return false;
}
static void
reset_node_voting_status(void)
@@ -2241,6 +2541,43 @@ check_connection(t_node_info *node_info, PGconn **conn)
}
static const char *
format_failover_state(FailoverState failover_state)
{
switch(failover_state)
{
case FAILOVER_STATE_UNKNOWN:
return "UNKNOWN";
case FAILOVER_STATE_NONE:
return "NONE";
case FAILOVER_STATE_PROMOTED:
return "PROMOTED";
case FAILOVER_STATE_PROMOTION_FAILED:
return "PROMOTION_FAILED";
case FAILOVER_STATE_PRIMARY_REAPPEARED:
return "PRIMARY_REAPPEARED";
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
return "LOCAL_NODE_FAILURE";
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
return "WAITING_NEW_PRIMARY";
case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
return "REQUIRES_MANUAL_FAILOVER";
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
return "FOLLOWED_NEW_PRIMARY";
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
return "FOLLOWING_ORIGINAL_PRIMARY";
case FAILOVER_STATE_NO_NEW_PRIMARY:
return "NO_NEW_PRIMARY";
case FAILOVER_STATE_FOLLOW_FAIL:
return "FOLLOW_FAIL";
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
return "ODE_NOTIFICATION_ERROR";
}
/* should never reach here */
return "UNKNOWN_FAILOVER_STATE";
}
#endif /* #ifndef BDR_ONLY */
void