mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 16:46:28 +00:00
repmgrd: detect role change from primary to standby
If repmgrd is monitoring a primary which is taken off-line, then later restored as a standby, detect this change and resume monitoring in standby node. Addresses GitHub #338.
This commit is contained in:
@@ -318,28 +318,105 @@ monitor_streaming_primary(void)
|
|||||||
{
|
{
|
||||||
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(local_conn) == CONNECTION_OK)
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_warning(_("node appears to be up but no connection could be made"));
|
||||||
|
PQfinish(local_conn);
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
local_node_info.node_status = NODE_STATUS_UP;
|
local_node_info.node_status = NODE_STATUS_UP;
|
||||||
monitoring_state = MS_NORMAL;
|
monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
/* check to see if the node has been restored as a standby */
|
||||||
_("reconnected to primary node after %i seconds, resuming monitoring"),
|
if (get_recovery_type(local_conn) == RECTYPE_STANDBY)
|
||||||
degraded_monitoring_elapsed);
|
{
|
||||||
|
PGconn *new_primary_conn;
|
||||||
|
|
||||||
create_event_notification(local_conn,
|
appendPQExpBuffer(&event_details,
|
||||||
&config_file_options,
|
_("reconnected to node after %i seconds, node is now a standby, switching to standby monitoring"),
|
||||||
config_file_options.node_id,
|
degraded_monitoring_elapsed);
|
||||||
"repmgrd_local_reconnect",
|
log_notice("%s", event_details.data);
|
||||||
true,
|
termPQExpBuffer(&event_details);
|
||||||
event_details.data);
|
|
||||||
|
|
||||||
log_notice("%s", event_details.data);
|
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
goto loop;
|
primary_node_id = UNKNOWN_NODE_ID;
|
||||||
|
|
||||||
|
new_primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
|
||||||
|
|
||||||
|
if (PQstatus(new_primary_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
PQfinish(new_primary_conn);
|
||||||
|
log_warning(_("unable to connect to new primary node %i"), primary_node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
RecordStatus record_status;
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
|
log_debug("primary node id is now %i", primary_node_id);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* poll for a while until record type is returned as "STANDBY" - it's possible
|
||||||
|
* that there's a gap between the server being restarted and the record
|
||||||
|
* being updated
|
||||||
|
*/
|
||||||
|
for (i = 0; i < 30; i++)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* try and refresh the local node record from the primary, as the updated
|
||||||
|
* local node record may not have been replicated yet
|
||||||
|
*/
|
||||||
|
|
||||||
|
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
|
||||||
|
|
||||||
|
if (record_status == RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_debug("type = %s", get_node_type_string(local_node_info.type));
|
||||||
|
|
||||||
|
if (local_node_info.type == STANDBY)
|
||||||
|
{
|
||||||
|
PQfinish(new_primary_conn);
|
||||||
|
|
||||||
|
/* XXX add event notification */
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
PQfinish(new_primary_conn);
|
||||||
|
|
||||||
|
if (record_status == RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_warning(_("repmgr node record is still %s"), get_node_type_string(local_node_info.type));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_error(_("no metadata record found for this node"));
|
||||||
|
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("reconnected to primary node after %i seconds, resuming monitoring"),
|
||||||
|
degraded_monitoring_elapsed);
|
||||||
|
|
||||||
|
create_event_notification(local_conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"repmgrd_local_reconnect",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
goto loop;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1909,7 +1986,7 @@ do_election(void)
|
|||||||
|
|
||||||
log_debug("our last receive lsn: %X/%X", format_lsn(local_node_info.last_wal_receive_lsn));
|
log_debug("our last receive lsn: %X/%X", format_lsn(local_node_info.last_wal_receive_lsn));
|
||||||
|
|
||||||
// pointer to "winning" node, by default self
|
/* pointer to "winning" node, initially self */
|
||||||
candidate_node = &local_node_info;
|
candidate_node = &local_node_info;
|
||||||
|
|
||||||
for (cell = standby_nodes.head; cell; cell = cell->next)
|
for (cell = standby_nodes.head; cell; cell = cell->next)
|
||||||
@@ -1927,7 +2004,7 @@ do_election(void)
|
|||||||
|
|
||||||
cell->node_info->node_status = NODE_STATUS_UP;
|
cell->node_info->node_status = NODE_STATUS_UP;
|
||||||
|
|
||||||
// XXX don't check 0-priority nodes
|
/* XXX don't check 0-priority nodes */
|
||||||
|
|
||||||
// get node's LSN
|
// get node's LSN
|
||||||
// if "higher" than current winner, current node is candidate
|
// if "higher" than current winner, current node is candidate
|
||||||
|
|||||||
Reference in New Issue
Block a user