mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 08:56:29 +00:00
Various fixes
Add more event logging, and improve comments.
This commit is contained in:
146
repmgrd.c
146
repmgrd.c
@@ -217,7 +217,9 @@ main(int argc, char **argv)
|
|||||||
{
|
{
|
||||||
PQExpBufferData invalid_log_level;
|
PQExpBufferData invalid_log_level;
|
||||||
initPQExpBuffer(&invalid_log_level);
|
initPQExpBuffer(&invalid_log_level);
|
||||||
appendPQExpBuffer(&invalid_log_level, _("invalid log level \"%s\" provided"), optarg);
|
appendPQExpBuffer(&invalid_log_level,
|
||||||
|
_("invalid log level \"%s\" provided"),
|
||||||
|
optarg);
|
||||||
item_list_append(&cli_errors, invalid_log_level.data);
|
item_list_append(&cli_errors, invalid_log_level.data);
|
||||||
termPQExpBuffer(&invalid_log_level);
|
termPQExpBuffer(&invalid_log_level);
|
||||||
}
|
}
|
||||||
@@ -488,25 +490,66 @@ monitor_streaming_primary(void)
|
|||||||
{
|
{
|
||||||
|
|
||||||
// cache node list here, refresh at `node_list_refresh_interval`
|
// cache node list here, refresh at `node_list_refresh_interval`
|
||||||
|
// also return reason for inavailability so we can log it
|
||||||
if (is_server_available(local_node_info.conninfo) == false)
|
if (is_server_available(local_node_info.conninfo) == false)
|
||||||
{
|
{
|
||||||
|
|
||||||
/* node is down, we were expecting it to be up */
|
/* node is down, we were expecting it to be up */
|
||||||
if (node_status == NODE_STATUS_UP)
|
if (node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
// log disconnect event
|
PQExpBufferData event_details;
|
||||||
log_warning(_("unable to connect to local node"));
|
instr_time local_node_unreachable_start;
|
||||||
|
|
||||||
|
INSTR_TIME_SET_CURRENT(local_node_unreachable_start);
|
||||||
|
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("unable to connect to local node"));
|
||||||
|
|
||||||
|
log_warning(event_details.data);
|
||||||
|
|
||||||
node_status = NODE_STATUS_UNKNOWN;
|
node_status = NODE_STATUS_UNKNOWN;
|
||||||
|
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
|
|
||||||
|
/* */
|
||||||
|
create_event_record(NULL,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"repmgrd_local_disconnect",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
local_conn = try_reconnect(local_node_info.conninfo, &node_status);
|
local_conn = try_reconnect(local_node_info.conninfo, &node_status);
|
||||||
|
|
||||||
if (node_status == NODE_STATUS_UP)
|
if (node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
// log reconnect event, details
|
double local_node_unreachable_elapsed = 0;
|
||||||
log_notice(_("reconnected to local node"));
|
instr_time local_node_unreachable_current;
|
||||||
|
|
||||||
|
INSTR_TIME_SET_CURRENT(local_node_unreachable_current);
|
||||||
|
INSTR_TIME_SUBTRACT(local_node_unreachable_current, local_node_unreachable_start);
|
||||||
|
local_node_unreachable_elapsed = INSTR_TIME_GET_DOUBLE(local_node_unreachable_current);
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("reconnected to local node after %i seconds"),
|
||||||
|
(int)local_node_unreachable_elapsed);
|
||||||
|
log_notice(event_details.data);
|
||||||
|
|
||||||
|
create_event_record(local_conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"repmgrd_local_reconnect",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
goto loop;
|
goto loop;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -966,6 +1009,7 @@ follow_new_primary(int new_primary_id)
|
|||||||
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
|
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
|
||||||
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
|
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
|
||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
|
bool new_primary_ok = false;
|
||||||
|
|
||||||
// XXX check success
|
// XXX check success
|
||||||
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
||||||
@@ -989,30 +1033,56 @@ follow_new_primary(int new_primary_id)
|
|||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
local_conn = NULL;
|
local_conn = NULL;
|
||||||
|
|
||||||
|
primary_conn = establish_db_connection(new_primary.conninfo, false);
|
||||||
|
|
||||||
|
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
RecoveryType primary_recovery_type = get_recovery_type(primary_conn);
|
||||||
|
if (primary_recovery_type == RECTYPE_PRIMARY)
|
||||||
|
{
|
||||||
|
new_primary_ok = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_warning(_("new primary is not in recovery"));
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (new_primary_ok == false)
|
||||||
|
{
|
||||||
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||||
|
}
|
||||||
|
// XXX check new primary is reachable and is not in recovery here
|
||||||
r = system(config_file_options.follow_command);
|
r = system(config_file_options.follow_command);
|
||||||
|
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
|
PGconn *old_primary_conn;
|
||||||
/*
|
/*
|
||||||
* If the follow action failed, check the previous primary has not
|
* The follow action could still fail due to the original primary reappearing
|
||||||
* reappeared - unlikely we won't reach here until informed by the
|
* before the candidate could promote itself ("repmgr standby follow" will
|
||||||
* new primary.
|
* refuse to promote another node if the primary is available). However
|
||||||
|
* the new primary will only instruct use to follow it after it's successfully
|
||||||
|
* promoted itself, so that very likely won't be the reason for the failure.
|
||||||
|
*
|
||||||
*
|
*
|
||||||
* TODO: check the new primary too - we could have a split-brain
|
* TODO: check the new primary too - we could have a split-brain
|
||||||
* situation where the old primary reappeared just after the new
|
* situation where the old primary reappeared just after the new
|
||||||
* one promoted itself.
|
* one promoted itself.
|
||||||
*/
|
*/
|
||||||
primary_conn = establish_db_connection(failed_primary.conninfo, false);
|
old_primary_conn = establish_db_connection(failed_primary.conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
if (PQstatus(old_primary_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
// XXX add event notifications
|
// XXX add event notifications
|
||||||
RecoveryType upstream_recovery_type = get_recovery_type(primary_conn);
|
RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
|
||||||
PQfinish(primary_conn);
|
PQfinish(old_primary_conn);
|
||||||
|
|
||||||
if (upstream_recovery_type == RECTYPE_PRIMARY)
|
if (upstream_recovery_type == RECTYPE_PRIMARY)
|
||||||
{
|
{
|
||||||
log_notice(_("original primary reappeared before this standby was promoted - no action taken"));
|
log_notice(_("original primary reappeared - no action taken"));
|
||||||
return FAILOVER_STATE_PRIMARY_REAPPEARED;
|
return FAILOVER_STATE_PRIMARY_REAPPEARED;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1020,21 +1090,35 @@ follow_new_primary(int new_primary_id)
|
|||||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// XXX check success
|
|
||||||
primary_conn = establish_db_connection(new_primary.conninfo, false);
|
|
||||||
|
|
||||||
// XXX check success
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* refresh local copy of local and primary node records - we get these
|
* refresh local copy of local and primary node records - we get these
|
||||||
* directly from the primary to ensure they're the current version
|
* directly from the primary to ensure they're the current version
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// XXX check success
|
||||||
|
|
||||||
record_status = get_node_record(primary_conn, new_primary_id, &upstream_node_info);
|
record_status = get_node_record(primary_conn, new_primary_id, &upstream_node_info);
|
||||||
record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
|
record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
|
||||||
|
|
||||||
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("node %i now following new upstream node %i"),
|
||||||
|
local_node_info.node_id,
|
||||||
|
upstream_node_info.node_id);
|
||||||
|
|
||||||
|
log_notice("%s\n", event_details.data);
|
||||||
|
|
||||||
|
create_event_record(primary_conn,
|
||||||
|
&config_file_options,
|
||||||
|
local_node_info.node_id,
|
||||||
|
"repmgrd_failover_follow",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
// XXX log event
|
|
||||||
return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
|
return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1081,8 +1165,7 @@ _print_election_result(ElectionResult result)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
// store lsndiffs, in the event we're not the best node,
|
|
||||||
// i.e. don't get all the votes, we pass the baton to the best node
|
|
||||||
static ElectionResult
|
static ElectionResult
|
||||||
do_election(void)
|
do_election(void)
|
||||||
{
|
{
|
||||||
@@ -1095,9 +1178,11 @@ do_election(void)
|
|||||||
int visible_nodes = 1;
|
int visible_nodes = 1;
|
||||||
|
|
||||||
|
|
||||||
// get voting status from shared memory
|
/*
|
||||||
// should be "VS_NO_VOTE" or "VS_VOTE_REQUEST_RECEIVED"
|
* get voting status from shared memory - should be one of "VS_NO_VOTE"
|
||||||
// if VS_NO_VOTE, initiate voting process
|
* or "VS_VOTE_REQUEST_RECEIVED". If VS_NO_VOTE, we declare ourselves as
|
||||||
|
* candidate and initiate the voting process.
|
||||||
|
*/
|
||||||
NodeVotingStatus voting_status;
|
NodeVotingStatus voting_status;
|
||||||
|
|
||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
@@ -1128,8 +1213,12 @@ do_election(void)
|
|||||||
return ELECTION_NOT_CANDIDATE;
|
return ELECTION_NOT_CANDIDATE;
|
||||||
}
|
}
|
||||||
|
|
||||||
// mark ourselves as candidate
|
/*
|
||||||
// -> so any further vote requests are rejected
|
* Here we mark ourselves as candidate, so any further vote requests
|
||||||
|
* are rejected. However it's possible another node has done the
|
||||||
|
* same thing, so when announcing ourselves as candidate to the other
|
||||||
|
* nodes, we'll check for that and withdraw our candidature.
|
||||||
|
*/
|
||||||
electoral_term = set_voting_status_initiated(local_conn);
|
electoral_term = set_voting_status_initiated(local_conn);
|
||||||
|
|
||||||
/* get all active nodes attached to primary, excluding self */
|
/* get all active nodes attached to primary, excluding self */
|
||||||
@@ -1168,8 +1257,13 @@ do_election(void)
|
|||||||
* itself, we withdraw
|
* itself, we withdraw
|
||||||
*
|
*
|
||||||
* XXX check for situations where more than one node could end up as candidate?
|
* XXX check for situations where more than one node could end up as candidate?
|
||||||
|
*
|
||||||
|
* XXX note it's possible some nodes accepted our candidature before we
|
||||||
|
* found out about the other candidate, check what happens in that situation
|
||||||
|
* -> other node will have info from all the nodes, even if not the vote,
|
||||||
|
* so it should be able to determine the best node anyway
|
||||||
*/
|
*/
|
||||||
// other node: if not candidate in this term, reset state (but don't bump term)
|
|
||||||
if (announce_candidature(cell->node_info->conn, &local_node_info, cell->node_info, electoral_term) == false)
|
if (announce_candidature(cell->node_info->conn, &local_node_info, cell->node_info, electoral_term) == false)
|
||||||
{
|
{
|
||||||
log_debug("node %i is candidate", cell->node_info->node_id);
|
log_debug("node %i is candidate", cell->node_info->node_id);
|
||||||
|
|||||||
Reference in New Issue
Block a user