mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
repmgrd: various fixed, mainly clearing status after a failover event
This commit is contained in:
9
config.c
9
config.c
@@ -268,9 +268,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->event_notifications.tail = NULL;
|
options->event_notifications.tail = NULL;
|
||||||
|
|
||||||
/* barman settings */
|
/* barman settings */
|
||||||
|
/* --------------- */
|
||||||
memset(options->barman_server, 0, sizeof(options->barman_server));
|
memset(options->barman_server, 0, sizeof(options->barman_server));
|
||||||
memset(options->barman_config, 0, sizeof(options->barman_config));
|
memset(options->barman_config, 0, sizeof(options->barman_config));
|
||||||
|
|
||||||
|
/* undocumented test settings */
|
||||||
|
/* -------------------------- */
|
||||||
|
options->promote_delay = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If no configuration file available (user didn't specify and none found
|
* If no configuration file available (user didn't specify and none found
|
||||||
* in the default locations), return with default values
|
* in the default locations), return with default values
|
||||||
@@ -455,6 +460,10 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
else if (strcmp(name, "barman_config") == 0)
|
else if (strcmp(name, "barman_config") == 0)
|
||||||
strncpy(options->barman_config, value, MAXLEN);
|
strncpy(options->barman_config, value, MAXLEN);
|
||||||
|
|
||||||
|
/* undocumented test settings */
|
||||||
|
else if (strcmp(name, "promote_delay") == 0)
|
||||||
|
options->promote_delay = repmgr_atoi(value, name, error_list, 1);
|
||||||
|
|
||||||
/* Following parameters have been deprecated or renamed from 3.x - issue a warning */
|
/* Following parameters have been deprecated or renamed from 3.x - issue a warning */
|
||||||
else if (strcmp(name, "cluster") == 0)
|
else if (strcmp(name, "cluster") == 0)
|
||||||
{
|
{
|
||||||
|
|||||||
8
config.h
8
config.h
@@ -106,6 +106,9 @@ typedef struct
|
|||||||
char barman_host[MAXLEN];
|
char barman_host[MAXLEN];
|
||||||
char barman_server[MAXLEN];
|
char barman_server[MAXLEN];
|
||||||
char barman_config[MAXLEN];
|
char barman_config[MAXLEN];
|
||||||
|
|
||||||
|
/* undocumented test settings */
|
||||||
|
int promote_delay;
|
||||||
} t_configuration_options;
|
} t_configuration_options;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -131,7 +134,10 @@ typedef struct
|
|||||||
/* bdr settings */ \
|
/* bdr settings */ \
|
||||||
BDR_MONITORING_LOCAL, \
|
BDR_MONITORING_LOCAL, \
|
||||||
/* barman settings */ \
|
/* barman settings */ \
|
||||||
"", "", "" }
|
"", "", "", \
|
||||||
|
/* undocumented test settings */ \
|
||||||
|
0 \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
33
dbutils.c
33
dbutils.c
@@ -1714,6 +1714,8 @@ clear_node_info_list(NodeInfoList *nodes)
|
|||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
NodeInfoListCell *next_cell;
|
NodeInfoListCell *next_cell;
|
||||||
|
|
||||||
|
log_debug("clear_node_info_list() - closing open connections");
|
||||||
|
|
||||||
/* close any open connections */
|
/* close any open connections */
|
||||||
for (cell = nodes->head; cell; cell = cell->next)
|
for (cell = nodes->head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
@@ -1724,6 +1726,8 @@ clear_node_info_list(NodeInfoList *nodes)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log_debug("clear_node_info_list() - unlinking");
|
||||||
|
|
||||||
cell = nodes->head;
|
cell = nodes->head;
|
||||||
|
|
||||||
while (cell != NULL)
|
while (cell != NULL)
|
||||||
@@ -1733,7 +1737,8 @@ clear_node_info_list(NodeInfoList *nodes)
|
|||||||
pfree(cell);
|
pfree(cell);
|
||||||
cell = next_cell;
|
cell = next_cell;
|
||||||
}
|
}
|
||||||
|
nodes->head = NULL;
|
||||||
|
nodes->tail = NULL;
|
||||||
nodes->node_count = 0;
|
nodes->node_count = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2493,6 +2498,32 @@ get_new_primary(PGconn *conn, int *primary_node_id)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
reset_voting_status(PGconn *conn)
|
||||||
|
{
|
||||||
|
PQExpBufferData query;
|
||||||
|
PGresult *res;
|
||||||
|
|
||||||
|
initPQExpBuffer(&query);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&query,
|
||||||
|
"SELECT repmgr.reset_voting_status()");
|
||||||
|
|
||||||
|
res = PQexec(conn, query.data);
|
||||||
|
termPQExpBuffer(&query);
|
||||||
|
|
||||||
|
// COMMAND_OK?
|
||||||
|
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_error(_("unable to execute repmgr..reset_voting_status():\n %s"),
|
||||||
|
PQerrorMessage(conn));
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* ============================ */
|
/* ============================ */
|
||||||
/* replication status functions */
|
/* replication status functions */
|
||||||
/* ============================ */
|
/* ============================ */
|
||||||
|
|||||||
@@ -249,6 +249,7 @@ int set_voting_status_initiated(PGconn *conn);
|
|||||||
bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
|
bool announce_candidature(PGconn *conn, t_node_info *this_node, t_node_info *other_node, int electoral_term);
|
||||||
void notify_follow_primary(PGconn *conn, int primary_node_id);
|
void notify_follow_primary(PGconn *conn, int primary_node_id);
|
||||||
bool get_new_primary(PGconn *conn, int *primary_node_id);
|
bool get_new_primary(PGconn *conn, int *primary_node_id);
|
||||||
|
void reset_voting_status(PGconn *conn);
|
||||||
|
|
||||||
/* replication status functions */
|
/* replication status functions */
|
||||||
|
|
||||||
|
|||||||
@@ -65,3 +65,8 @@ CREATE FUNCTION get_new_primary()
|
|||||||
RETURNS INT
|
RETURNS INT
|
||||||
AS '$libdir/repmgr', 'get_new_primary'
|
AS '$libdir/repmgr', 'get_new_primary'
|
||||||
LANGUAGE C STRICT;
|
LANGUAGE C STRICT;
|
||||||
|
|
||||||
|
CREATE FUNCTION reset_voting_status()
|
||||||
|
RETURNS VOID
|
||||||
|
AS '$libdir/repmgr', 'reset_voting_status'
|
||||||
|
LANGUAGE C STRICT;
|
||||||
|
|||||||
21
repmgr.c
21
repmgr.c
@@ -80,8 +80,8 @@ PG_FUNCTION_INFO_V1(notify_follow_primary);
|
|||||||
Datum get_new_primary(PG_FUNCTION_ARGS);
|
Datum get_new_primary(PG_FUNCTION_ARGS);
|
||||||
PG_FUNCTION_INFO_V1(get_new_primary);
|
PG_FUNCTION_INFO_V1(get_new_primary);
|
||||||
|
|
||||||
//Datum set_new_primary(PG_FUNCTION_ARGS);
|
Datum reset_voting_status(PG_FUNCTION_ARGS);
|
||||||
//PG_FUNCTION_INFO_V1(set_new_primary);
|
PG_FUNCTION_INFO_V1(reset_voting_status);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Module load callback
|
* Module load callback
|
||||||
@@ -153,9 +153,9 @@ repmgr_shmem_startup(void)
|
|||||||
shared_state->lock = LWLockAssign();
|
shared_state->lock = LWLockAssign();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
shared_state->current_electoral_term = 0;
|
||||||
shared_state->voting_status = VS_NO_VOTE;
|
shared_state->voting_status = VS_NO_VOTE;
|
||||||
shared_state->candidate_node_id = UNKNOWN_NODE_ID;
|
shared_state->candidate_node_id = UNKNOWN_NODE_ID;
|
||||||
shared_state->current_electoral_term = 0;
|
|
||||||
shared_state->follow_new_primary = false;
|
shared_state->follow_new_primary = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -315,3 +315,18 @@ get_new_primary(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
PG_RETURN_INT32(new_primary_node_id);
|
PG_RETURN_INT32(new_primary_node_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Datum
|
||||||
|
reset_voting_status(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
LWLockAcquire(shared_state->lock, LW_SHARED);
|
||||||
|
|
||||||
|
shared_state->voting_status = VS_NO_VOTE;
|
||||||
|
shared_state->candidate_node_id = UNKNOWN_NODE_ID;
|
||||||
|
shared_state->follow_new_primary = false;
|
||||||
|
|
||||||
|
LWLockRelease(shared_state->lock);
|
||||||
|
|
||||||
|
PG_RETURN_VOID();
|
||||||
|
}
|
||||||
|
|||||||
214
repmgrd.c
214
repmgrd.c
@@ -33,9 +33,11 @@ typedef enum {
|
|||||||
FAILOVER_STATE_PRIMARY_REAPPEARED,
|
FAILOVER_STATE_PRIMARY_REAPPEARED,
|
||||||
FAILOVER_STATE_LOCAL_NODE_FAILURE,
|
FAILOVER_STATE_LOCAL_NODE_FAILURE,
|
||||||
FAILOVER_STATE_WAITING_NEW_PRIMARY,
|
FAILOVER_STATE_WAITING_NEW_PRIMARY,
|
||||||
FAILOVER_STATE_NO_NEW_PRIMARY,
|
|
||||||
FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
|
FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
|
||||||
FAILOVER_STATE_FOLLOW_FAIL
|
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
|
||||||
|
FAILOVER_STATE_NO_NEW_PRIMARY,
|
||||||
|
FAILOVER_STATE_FOLLOW_FAIL,
|
||||||
|
FAILOVER_STATE_NODE_NOTIFICATION_ERROR
|
||||||
} FailoverState;
|
} FailoverState;
|
||||||
|
|
||||||
|
|
||||||
@@ -96,13 +98,15 @@ static const char *_print_voting_status(NodeVotingStatus voting_status);
|
|||||||
static const char *_print_election_result(ElectionResult result);
|
static const char *_print_election_result(ElectionResult result);
|
||||||
|
|
||||||
static FailoverState promote_self(void);
|
static FailoverState promote_self(void);
|
||||||
static void notify_followers(NodeInfoList *standby_nodes);
|
static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
|
||||||
|
|
||||||
static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);
|
static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);
|
||||||
|
|
||||||
static bool wait_primary_notification(int *new_primary_id);
|
static bool wait_primary_notification(int *new_primary_id);
|
||||||
static FailoverState follow_new_primary(int new_primary_id);
|
static FailoverState follow_new_primary(int new_primary_id);
|
||||||
|
|
||||||
|
static void reset_node_voting_status(void);
|
||||||
|
|
||||||
static void close_connections();
|
static void close_connections();
|
||||||
static void terminate(int retval);
|
static void terminate(int retval);
|
||||||
|
|
||||||
@@ -436,10 +440,10 @@ start_monitoring(void)
|
|||||||
local_node_info.node_name,
|
local_node_info.node_name,
|
||||||
local_node_info.node_id);
|
local_node_info.node_id);
|
||||||
|
|
||||||
failover_state = FAILOVER_STATE_NONE;
|
|
||||||
|
|
||||||
while(true)
|
while(true)
|
||||||
{
|
{
|
||||||
|
reset_node_voting_status();
|
||||||
|
|
||||||
switch (local_node_info.type)
|
switch (local_node_info.type)
|
||||||
{
|
{
|
||||||
case PRIMARY:
|
case PRIMARY:
|
||||||
@@ -471,17 +475,26 @@ monitor_streaming_primary(void)
|
|||||||
/* Log startup event */
|
/* Log startup event */
|
||||||
if (startup_event_logged == false)
|
if (startup_event_logged == false)
|
||||||
{
|
{
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("monitoring cluster primary \"%s\" (node ID: %i)"),
|
||||||
|
local_node_info.node_name,
|
||||||
|
local_node_info.node_id);
|
||||||
|
|
||||||
create_event_record(local_conn,
|
create_event_record(local_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"repmgrd_start",
|
"repmgrd_start",
|
||||||
true,
|
true,
|
||||||
NULL);
|
event_details.data);
|
||||||
|
|
||||||
startup_event_logged = true;
|
startup_event_logged = true;
|
||||||
|
|
||||||
log_notice(_("monitoring cluster primary \"%s\" (node ID: %i)"),
|
log_notice("%s", event_details.data);
|
||||||
local_node_info.node_name,
|
|
||||||
local_node_info.node_id);
|
termPQExpBuffer(&event_details);
|
||||||
}
|
}
|
||||||
|
|
||||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||||
@@ -502,13 +515,12 @@ monitor_streaming_primary(void)
|
|||||||
|
|
||||||
INSTR_TIME_SET_CURRENT(local_node_unreachable_start);
|
INSTR_TIME_SET_CURRENT(local_node_unreachable_start);
|
||||||
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("unable to connect to local node"));
|
_("unable to connect to local node"));
|
||||||
|
|
||||||
log_warning(event_details.data);
|
log_warning("%s", event_details.data);
|
||||||
|
|
||||||
node_status = NODE_STATUS_UNKNOWN;
|
node_status = NODE_STATUS_UNKNOWN;
|
||||||
|
|
||||||
@@ -540,7 +552,7 @@ monitor_streaming_primary(void)
|
|||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("reconnected to local node after %i seconds"),
|
_("reconnected to local node after %i seconds"),
|
||||||
(int)local_node_unreachable_elapsed);
|
(int)local_node_unreachable_elapsed);
|
||||||
log_notice(event_details.data);
|
log_notice("%s", event_details.data);
|
||||||
|
|
||||||
create_event_record(local_conn,
|
create_event_record(local_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
@@ -603,19 +615,26 @@ monitor_streaming_standby(void)
|
|||||||
/* Log startup event */
|
/* Log startup event */
|
||||||
if (startup_event_logged == false)
|
if (startup_event_logged == false)
|
||||||
{
|
{
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("monitoring upstream node \"%s\" (node ID: %i)"),
|
||||||
|
upstream_node_info.node_name,
|
||||||
|
upstream_node_info.node_id);
|
||||||
|
|
||||||
create_event_record(upstream_conn,
|
create_event_record(upstream_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"repmgrd_start",
|
"repmgrd_start",
|
||||||
true,
|
true,
|
||||||
NULL);
|
event_details.data);
|
||||||
|
|
||||||
startup_event_logged = true;
|
startup_event_logged = true;
|
||||||
|
|
||||||
log_notice(_("repmgrd on node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i)"),
|
log_notice("%s", event_details.data);
|
||||||
local_node_info.node_name,
|
|
||||||
local_node_info.node_id,
|
termPQExpBuffer(&event_details);
|
||||||
upstream_node_info.node_name,
|
|
||||||
upstream_node_info.node_id);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||||
@@ -647,6 +666,8 @@ monitor_streaming_standby(void)
|
|||||||
{
|
{
|
||||||
/* attempt to initiate voting process */
|
/* attempt to initiate voting process */
|
||||||
ElectionResult election_result = do_election();
|
ElectionResult election_result = do_election();
|
||||||
|
|
||||||
|
/* XXX add pre-event notification here */
|
||||||
failover_state = FAILOVER_STATE_UNKNOWN;
|
failover_state = FAILOVER_STATE_UNKNOWN;
|
||||||
|
|
||||||
log_debug("election result: %s", _print_election_result(election_result));
|
log_debug("election result: %s", _print_election_result(election_result));
|
||||||
@@ -673,7 +694,10 @@ monitor_streaming_standby(void)
|
|||||||
|
|
||||||
best_candidate = poll_best_candidate(&standby_nodes);
|
best_candidate = poll_best_candidate(&standby_nodes);
|
||||||
|
|
||||||
/* this can occur in a tie-break situation, after we establish this node has priority*/
|
/*
|
||||||
|
* this can occur in a tie-break situation, where this node establishes
|
||||||
|
* it is the best candidate
|
||||||
|
*/
|
||||||
if (best_candidate->node_id == local_node_info.node_id)
|
if (best_candidate->node_id == local_node_info.node_id)
|
||||||
{
|
{
|
||||||
log_notice("I am the best candidate, will now promote self and inform other nodes");
|
log_notice("I am the best candidate, will now promote self and inform other nodes");
|
||||||
@@ -687,25 +711,36 @@ monitor_streaming_standby(void)
|
|||||||
log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
|
log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
|
||||||
best_candidate->node_id);
|
best_candidate->node_id);
|
||||||
|
|
||||||
// notify candidate
|
/* notify the best candidate so it */
|
||||||
|
|
||||||
// XXX check result
|
|
||||||
candidate_conn = establish_db_connection(best_candidate->conninfo, false);
|
candidate_conn = establish_db_connection(best_candidate->conninfo, false);
|
||||||
|
|
||||||
notify_follow_primary(candidate_conn, best_candidate->node_id);
|
if (PQstatus(candidate_conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
notify_follow_primary(candidate_conn, best_candidate->node_id);
|
||||||
|
|
||||||
|
/* we'll wait for the candidate to get back to us */
|
||||||
|
failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_error(_("unable to connect to candidate node (ID: %i)"), best_candidate->node_id);
|
||||||
|
failover_state = FAILOVER_STATE_NODE_NOTIFICATION_ERROR;
|
||||||
|
}
|
||||||
PQfinish(candidate_conn);
|
PQfinish(candidate_conn);
|
||||||
// we'll wait for the candidate to get back to us
|
|
||||||
failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
log_info("I am a follower and am waiting to be informed by the winner");
|
log_info(_("follower node awaiting notification from the candidate node"));
|
||||||
failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
|
failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* node has decided it is a follower, so will await notification
|
||||||
|
* from the candidate that it has promoted itself and can be followed
|
||||||
|
*/
|
||||||
if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
|
if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
|
||||||
{
|
{
|
||||||
int new_primary_id;
|
int new_primary_id;
|
||||||
@@ -715,10 +750,15 @@ monitor_streaming_standby(void)
|
|||||||
/* either follow or time out; either way resume monitoring */
|
/* either follow or time out; either way resume monitoring */
|
||||||
if (wait_primary_notification(&new_primary_id) == true)
|
if (wait_primary_notification(&new_primary_id) == true)
|
||||||
{
|
{
|
||||||
// if new_primary_id is self, promote
|
/* if primary has reappeared, no action needed */
|
||||||
if (new_primary_id == local_node_info.node_id)
|
if (new_primary_id == upstream_node_info.node_id)
|
||||||
{
|
{
|
||||||
log_notice("looks like I'm the promotion candidate, promoting");
|
failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
|
||||||
|
}
|
||||||
|
/* if new_primary_id is self, promote */
|
||||||
|
else if (new_primary_id == local_node_info.node_id)
|
||||||
|
{
|
||||||
|
log_notice(_("this node is promotion candidate, promoting"));
|
||||||
|
|
||||||
failover_state = promote_self();
|
failover_state = promote_self();
|
||||||
|
|
||||||
@@ -744,9 +784,11 @@ monitor_streaming_standby(void)
|
|||||||
switch(failover_state)
|
switch(failover_state)
|
||||||
{
|
{
|
||||||
case FAILOVER_STATE_PROMOTED:
|
case FAILOVER_STATE_PROMOTED:
|
||||||
/* inform former siblings that we are Number 1 */
|
log_debug("failover state is PROMOTED");
|
||||||
|
|
||||||
|
/* notify former siblings that they should now follow this node */
|
||||||
|
notify_followers(&standby_nodes, local_node_info.node_id);
|
||||||
|
|
||||||
notify_followers(&standby_nodes);
|
|
||||||
/* we no longer care about our former siblings */
|
/* we no longer care about our former siblings */
|
||||||
clear_node_info_list(&standby_nodes);
|
clear_node_info_list(&standby_nodes);
|
||||||
|
|
||||||
@@ -755,19 +797,50 @@ monitor_streaming_standby(void)
|
|||||||
|
|
||||||
failover_state = FAILOVER_STATE_NONE;
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
||||||
|
log_debug("failover state is PRIMARY_REAPPEARED");
|
||||||
|
|
||||||
|
/* notify siblings that they should resume following the original primary */
|
||||||
|
notify_followers(&standby_nodes, upstream_node_info.node_id);
|
||||||
|
|
||||||
|
/* we no longer care about our former siblings */
|
||||||
|
clear_node_info_list(&standby_nodes);
|
||||||
|
|
||||||
|
/* pass control back down to start_monitoring() */
|
||||||
|
log_info(_("resuming standby monitoring mode"));
|
||||||
|
log_detail(_("original primary \"%s\" (node ID: %i) reappeared"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
|
||||||
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
|
return;
|
||||||
|
|
||||||
case FAILOVER_STATE_PROMOTION_FAILED:
|
case FAILOVER_STATE_PROMOTION_FAILED:
|
||||||
log_debug("failover state is FAILED");
|
log_debug("failover state is PROMOTION FAILED");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
|
case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
|
||||||
log_info(_("resuming standby monitoring mode"));
|
log_info(_("resuming standby monitoring mode"));
|
||||||
|
log_detail(_("following new primary \"%s\" (node id: %i)"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
failover_state = FAILOVER_STATE_NONE;
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
break;
|
|
||||||
|
return;
|
||||||
|
|
||||||
|
case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
|
||||||
|
log_info(_("resuming standby monitoring mode"));
|
||||||
|
log_detail(_("following original primary \"%s\" (node id: %i)"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
|
|
||||||
|
return;
|
||||||
|
|
||||||
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
||||||
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
||||||
/* pass control back down to start_monitoring() */
|
/* pass control back down to start_monitoring() */
|
||||||
// -> should kick off new election
|
// -> should kick off new election
|
||||||
return;
|
return;
|
||||||
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
|
||||||
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
||||||
case FAILOVER_STATE_UNKNOWN:
|
case FAILOVER_STATE_UNKNOWN:
|
||||||
case FAILOVER_STATE_NONE:
|
case FAILOVER_STATE_NONE:
|
||||||
@@ -799,6 +872,7 @@ monitor_streaming_standby(void)
|
|||||||
upstream_node_info.node_name,
|
upstream_node_info.node_name,
|
||||||
upstream_node_info.node_id);
|
upstream_node_info.node_id);
|
||||||
|
|
||||||
|
//log_debug(
|
||||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -817,6 +891,18 @@ promote_self(void)
|
|||||||
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
|
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
|
||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* optionally add a delay before promoting the standby; this is mainly
|
||||||
|
* useful for testing (e.g. for reappearance of the original primary)
|
||||||
|
* and is not documented.
|
||||||
|
*/
|
||||||
|
if (config_file_options.promote_delay > 0)
|
||||||
|
{
|
||||||
|
log_debug("sleeping %i seconds before promoting standby",
|
||||||
|
config_file_options.promote_delay);
|
||||||
|
sleep(config_file_options.promote_delay);
|
||||||
|
}
|
||||||
|
|
||||||
// XXX check success
|
// XXX check success
|
||||||
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
||||||
|
|
||||||
@@ -859,12 +945,26 @@ promote_self(void)
|
|||||||
|
|
||||||
if (PQstatus(primary_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
|
if (PQstatus(primary_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
|
||||||
{
|
{
|
||||||
log_notice(_("original primary reappeared before this standby was promoted - no action taken"));
|
log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
|
||||||
|
failed_primary.node_id);
|
||||||
|
|
||||||
/* XXX log an event here? */
|
initPQExpBuffer(&event_details);
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("original primary \"%s\" (node ID: %i) reappeared"),
|
||||||
|
failed_primary.node_name,
|
||||||
|
failed_primary.node_id);
|
||||||
|
|
||||||
PQfinish(primary_conn);
|
create_event_record(primary_conn,
|
||||||
primary_conn = NULL;
|
&config_file_options,
|
||||||
|
local_node_info.node_id,
|
||||||
|
"repmgrd_failover_abort",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
//PQfinish(primary_conn);
|
||||||
|
//primary_conn = NULL;
|
||||||
|
|
||||||
// XXX handle this!
|
// XXX handle this!
|
||||||
// -> we'll need to let the other nodes know too....
|
// -> we'll need to let the other nodes know too....
|
||||||
@@ -885,36 +985,46 @@ promote_self(void)
|
|||||||
/* update own internal node record */
|
/* update own internal node record */
|
||||||
record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
|
record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);
|
||||||
|
|
||||||
// XXX we're assuming the promote command updated metadata
|
/*
|
||||||
|
* XXX here we're assuming the promote command updated metadata
|
||||||
|
*/
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("node %i promoted to primary; old primary %i marked as failed"),
|
_("node %i promoted to primary; old primary %i marked as failed"),
|
||||||
local_node_info.node_id,
|
local_node_info.node_id,
|
||||||
failed_primary.node_id);
|
failed_primary.node_id);
|
||||||
/* my_local_conn is now the master */
|
|
||||||
|
/* local_conn is now the primary connection */
|
||||||
create_event_record(local_conn,
|
create_event_record(local_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
local_node_info.node_id,
|
local_node_info.node_id,
|
||||||
"repmgrd_failover_promote",
|
"repmgrd_failover_promote",
|
||||||
true,
|
true,
|
||||||
event_details.data);
|
event_details.data);
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
return FAILOVER_STATE_PROMOTED;
|
return FAILOVER_STATE_PROMOTED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Notify follower nodes about which node to follow. Normally this
|
||||||
|
* will be the current node, however if the original primary reappeared
|
||||||
|
* before this node could be promoted, we'll inform the followers they
|
||||||
|
* should resume monitoring the original primary.
|
||||||
|
*/
|
||||||
static void
|
static void
|
||||||
notify_followers(NodeInfoList *standby_nodes)
|
notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
|
||||||
{
|
{
|
||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
|
|
||||||
log_debug("notify_followers()");
|
log_debug("notify_followers()");
|
||||||
for (cell = standby_nodes->head; cell; cell = cell->next)
|
for (cell = standby_nodes->head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
log_debug("intending to notify %i... ", cell->node_info->node_id);
|
log_debug("intending to notify node %i... ", cell->node_info->node_id);
|
||||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_debug("connection to %i lost... ", cell->node_info->node_id);
|
log_debug("reconnecting to node %i... ", cell->node_info->node_id);
|
||||||
|
|
||||||
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
||||||
}
|
}
|
||||||
@@ -925,8 +1035,10 @@ notify_followers(NodeInfoList *standby_nodes)
|
|||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
log_debug("notifying node %i to follow new primary", cell->node_info->node_id);
|
|
||||||
notify_follow_primary(cell->node_info->conn, local_node_info.node_id);
|
log_debug("notifying node %i to follow node %i",
|
||||||
|
cell->node_info->node_id, follow_node_id);
|
||||||
|
notify_follow_primary(cell->node_info->conn, follow_node_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1333,6 +1445,20 @@ do_election(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
reset_node_voting_status(void)
|
||||||
|
{
|
||||||
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
|
|
||||||
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_error(_("reset_node_voting_status(): local_conn not set"));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
reset_voting_status(local_conn);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
daemonize_process(void)
|
daemonize_process(void)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user