mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
repmgrd: improve failover handling
Make retry frequency/interval configurable as per streaming replication.
This commit is contained in:
@@ -31,6 +31,8 @@ monitor_bdr(void)
|
|||||||
NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
|
NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||||
t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
|
t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
|
||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
|
NodeInfoListCell *cell;
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
/* sanity check local database */
|
/* sanity check local database */
|
||||||
log_info(_("connecting to local database '%s'"),
|
log_info(_("connecting to local database '%s'"),
|
||||||
@@ -129,13 +131,18 @@ monitor_bdr(void)
|
|||||||
*/
|
*/
|
||||||
get_all_node_records(local_conn, &nodes);
|
get_all_node_records(local_conn, &nodes);
|
||||||
|
|
||||||
|
/* we're expecting all (both) nodes to be up */
|
||||||
|
for (cell = nodes.head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
cell->node_info->node_status = NODE_STATUS_UP;
|
||||||
|
}
|
||||||
|
|
||||||
log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id);
|
log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id);
|
||||||
|
|
||||||
log_info(_("starting continuous bdr node monitoring"));
|
log_info(_("starting continuous bdr node monitoring"));
|
||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
NodeInfoListCell *cell;
|
|
||||||
|
|
||||||
/* monitoring loop */
|
/* monitoring loop */
|
||||||
log_verbose(LOG_DEBUG, "bdr check loop...");
|
log_verbose(LOG_DEBUG, "bdr check loop...");
|
||||||
@@ -168,14 +175,53 @@ monitor_bdr(void)
|
|||||||
{
|
{
|
||||||
if (is_server_available(cell->node_info->conninfo) == false)
|
if (is_server_available(cell->node_info->conninfo) == false)
|
||||||
{
|
{
|
||||||
instr_time upstream_node_unreachable_start;
|
/* node is down, we were expecting it to be up */
|
||||||
|
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||||
|
{
|
||||||
|
instr_time node_unreachable_start;
|
||||||
|
INSTR_TIME_SET_CURRENT(node_unreachable_start);
|
||||||
|
|
||||||
INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
|
cell->node_info->node_status = NODE_STATUS_DOWN;
|
||||||
|
|
||||||
|
if (cell->node_info->conn != NULL)
|
||||||
|
{
|
||||||
|
PQfinish(cell->node_info->conn);
|
||||||
|
cell->node_info->conn = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
// XXX improve
|
cell->node_info->conn = try_reconnect(cell->node_info);
|
||||||
log_warning("connection problem! to node %i", cell->node_info->node_id);
|
|
||||||
do_bdr_failover(&nodes, cell->node_info);
|
/* Node has recovered - log and continue */
|
||||||
|
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||||
|
{
|
||||||
|
int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("reconnected to node %i after %i seconds"),
|
||||||
|
cell->node_info->node_id,
|
||||||
|
node_unreachable_elapsed);
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
|
||||||
|
create_event_notification(cell->node_info->conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"repmgrd_upstream_reconnect",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
goto loop;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* still down after reconnect attempt(s) */
|
||||||
|
if (cell->node_info->node_status == NODE_STATUS_DOWN)
|
||||||
|
{
|
||||||
|
do_bdr_failover(&nodes, cell->node_info);
|
||||||
|
goto loop;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -193,6 +239,8 @@ monitor_bdr(void)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
loop:
|
||||||
|
|
||||||
if (got_SIGHUP)
|
if (got_SIGHUP)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
|||||||
@@ -503,6 +503,7 @@ monitor_streaming_standby(void)
|
|||||||
PQfinish(upstream_conn);
|
PQfinish(upstream_conn);
|
||||||
upstream_conn = try_reconnect(&upstream_node_info);
|
upstream_conn = try_reconnect(&upstream_node_info);
|
||||||
|
|
||||||
|
/* Node has recovered - log and continue */
|
||||||
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
|
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
|
||||||
|
|||||||
Reference in New Issue
Block a user