repmgrd: detect role change from primary to standby

If repmgrd is monitoring a primary which is taken off-line, then later
restored as a standby, detect this change and resume monitoring
in standby node.

Addresses GitHub #338.
This commit is contained in:
Ian Barwick
2017-11-10 17:19:30 +09:00
parent aa089820ab
commit 9908a9c662

View File

@@ -318,28 +318,105 @@ monitor_streaming_primary(void)
{
local_conn = establish_db_connection(local_node_info.conninfo, false);
if (PQstatus(local_conn) == CONNECTION_OK)
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_warning(_("node appears to be up but no connection could be made"));
PQfinish(local_conn);
}
else
{
local_node_info.node_status = NODE_STATUS_UP;
monitoring_state = MS_NORMAL;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("reconnected to primary node after %i seconds, resuming monitoring"),
degraded_monitoring_elapsed);
/* check to see if the node has been restored as a standby */
if (get_recovery_type(local_conn) == RECTYPE_STANDBY)
{
PGconn *new_primary_conn;
create_event_notification(local_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_local_reconnect",
true,
event_details.data);
appendPQExpBuffer(&event_details,
_("reconnected to node after %i seconds, node is now a standby, switching to standby monitoring"),
degraded_monitoring_elapsed);
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
goto loop;
primary_node_id = UNKNOWN_NODE_ID;
new_primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
if (PQstatus(new_primary_conn) != CONNECTION_OK)
{
PQfinish(new_primary_conn);
log_warning(_("unable to connect to new primary node %i"), primary_node_id);
}
else
{
RecordStatus record_status;
int i = 0;
log_debug("primary node id is now %i", primary_node_id);
/*
* poll for a while until record type is returned as "STANDBY" - it's possible
* that there's a gap between the server being restarted and the record
* being updated
*/
for (i = 0; i < 30; i++)
{
/*
* try and refresh the local node record from the primary, as the updated
* local node record may not have been replicated yet
*/
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
if (record_status == RECORD_FOUND)
{
log_debug("type = %s", get_node_type_string(local_node_info.type));
if (local_node_info.type == STANDBY)
{
PQfinish(new_primary_conn);
/* XXX add event notification */
return;
}
}
sleep(1);
}
PQfinish(new_primary_conn);
if (record_status == RECORD_FOUND)
{
log_warning(_("repmgr node record is still %s"), get_node_type_string(local_node_info.type));
}
else
{
log_error(_("no metadata record found for this node"));
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
}
}
}
else
{
appendPQExpBuffer(&event_details,
_("reconnected to primary node after %i seconds, resuming monitoring"),
degraded_monitoring_elapsed);
create_event_notification(local_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_local_reconnect",
true,
event_details.data);
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
goto loop;
}
}
}
@@ -1909,7 +1986,7 @@ do_election(void)
log_debug("our last receive lsn: %X/%X", format_lsn(local_node_info.last_wal_receive_lsn));
// pointer to "winning" node, by default self
/* pointer to "winning" node, initially self */
candidate_node = &local_node_info;
for (cell = standby_nodes.head; cell; cell = cell->next)
@@ -1927,7 +2004,7 @@ do_election(void)
cell->node_info->node_status = NODE_STATUS_UP;
// XXX don't check 0-priority nodes
/* XXX don't check 0-priority nodes */
// get node's LSN
// if "higher" than current winner, current node is candidate