mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 00:26:30 +00:00
repmgrd: more fixes for BDR node recovery
This commit is contained in:
@@ -3204,6 +3204,8 @@ get_bdr_node_replication_slot_status(PGconn *conn, const char *node_name)
|
|||||||
" WHERE node_name = '%s') ",
|
" WHERE node_name = '%s') ",
|
||||||
node_name);
|
node_name);
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "get_bdr_node_replication_slot_status():\n %s", query.data);
|
||||||
|
|
||||||
res = PQexec(conn, query.data);
|
res = PQexec(conn, query.data);
|
||||||
termPQExpBuffer(&query);
|
termPQExpBuffer(&query);
|
||||||
|
|
||||||
@@ -3248,7 +3250,11 @@ get_bdr_other_node_name(PGconn *conn, int node_id, char *node_name)
|
|||||||
{
|
{
|
||||||
strncpy(node_name, PQgetvalue(res, 0, 0), MAXLEN);
|
strncpy(node_name, PQgetvalue(res, 0, 0), MAXLEN);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_warning(_("get_bdr_other_node_name(): unable to execute query\n %s"),
|
||||||
|
PQerrorMessage(conn));
|
||||||
|
}
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
|||||||
@@ -207,7 +207,7 @@ monitor_bdr(void)
|
|||||||
create_event_notification(cell->node_info->conn,
|
create_event_notification(cell->node_info->conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"repmgrd_upstream_reconnect",
|
"repmgrd_bdr_reconnect",
|
||||||
true,
|
true,
|
||||||
event_details.data);
|
event_details.data);
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
@@ -296,6 +296,14 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
monitored_node->monitoring_state = MS_DEGRADED;
|
monitored_node->monitoring_state = MS_DEGRADED;
|
||||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||||
|
|
||||||
|
/* terminate local connection if this is the failed node */
|
||||||
|
if (monitored_node->node_id == local_node_info.node_id)
|
||||||
|
{
|
||||||
|
PQfinish(local_conn);
|
||||||
|
local_conn = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* get other node */
|
/* get other node */
|
||||||
|
|
||||||
for (cell = nodes->head; cell; cell = cell->next)
|
for (cell = nodes->head; cell; cell = cell->next)
|
||||||
@@ -346,7 +354,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
{
|
{
|
||||||
PQfinish(next_node_conn);
|
PQfinish(next_node_conn);
|
||||||
log_notice(_("record for node %i has already been set inactive"),
|
log_notice(_("record for node %i has already been set inactive"),
|
||||||
cell->node_info->node_id);
|
failed_node.node_id);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -409,6 +417,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
PQfinish(next_node_conn);
|
PQfinish(next_node_conn);
|
||||||
|
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -416,7 +425,6 @@ static void
|
|||||||
do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||||
{
|
{
|
||||||
PGconn *recovered_node_conn;
|
PGconn *recovered_node_conn;
|
||||||
PGconn *slot_check_conn;
|
|
||||||
|
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
@@ -426,6 +434,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
char node_name[MAXLEN] = "";
|
char node_name[MAXLEN] = "";
|
||||||
|
|
||||||
|
log_debug("handling recovery for monitored node %i", monitored_node->node_id);
|
||||||
|
|
||||||
recovered_node_conn = establish_db_connection(monitored_node->conninfo, false);
|
recovered_node_conn = establish_db_connection(monitored_node->conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(recovered_node_conn) != CONNECTION_OK)
|
if (PQstatus(recovered_node_conn) != CONNECTION_OK)
|
||||||
@@ -434,18 +444,17 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* determine which replication slot to look fore */
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
if (monitored_node->node_id == local_node_info.node_id)
|
|
||||||
{
|
{
|
||||||
slot_check_conn = recovered_node_conn;
|
log_debug("no local conn");
|
||||||
get_bdr_other_node_name(recovered_node_conn, local_node_info.node_id, node_name);
|
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
slot_check_conn = local_conn;
|
|
||||||
strncpy(node_name, monitored_node->node_name, MAXLEN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// double-check local conn
|
||||||
|
|
||||||
|
get_bdr_other_node_name(local_conn, local_node_info.node_id, node_name);
|
||||||
|
|
||||||
|
|
||||||
for (i = 0; i < config_file_options.bdr_recovery_timeout; i++)
|
for (i = 0; i < config_file_options.bdr_recovery_timeout; i++)
|
||||||
{
|
{
|
||||||
ReplSlotStatus slot_status;
|
ReplSlotStatus slot_status;
|
||||||
@@ -453,7 +462,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
log_debug("checking for state of replication slot for node \"%s\"", node_name);
|
log_debug("checking for state of replication slot for node \"%s\"", node_name);
|
||||||
|
|
||||||
slot_status = get_bdr_node_replication_slot_status(
|
slot_status = get_bdr_node_replication_slot_status(
|
||||||
slot_check_conn,
|
local_conn,
|
||||||
node_name);
|
node_name);
|
||||||
|
|
||||||
if (slot_status == SLOT_ACTIVE)
|
if (slot_status == SLOT_ACTIVE)
|
||||||
@@ -463,7 +472,6 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
}
|
}
|
||||||
|
|
||||||
sleep(1);
|
sleep(1);
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -483,6 +491,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||||
monitored_node->monitoring_state = MS_NORMAL;
|
monitored_node->monitoring_state = MS_NORMAL;
|
||||||
|
monitored_node->node_status = NODE_STATUS_UP;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
@@ -535,6 +544,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
update_node_record_set_active(local_conn, monitored_node->node_id, true);
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
PQfinish(recovered_node_conn);
|
PQfinish(recovered_node_conn);
|
||||||
|
|||||||
@@ -619,7 +619,8 @@ try_reconnect(t_node_info *node_info)
|
|||||||
|
|
||||||
for (i = 0; i < max_attempts; i++)
|
for (i = 0; i < max_attempts; i++)
|
||||||
{
|
{
|
||||||
log_info(_("checking state of node, %i of %i attempts"), i, max_attempts);
|
log_info(_("checking state of node %i, %i of %i attempts"),
|
||||||
|
node_info->node_id, i, max_attempts);
|
||||||
if (is_server_available(node_info->conninfo) == true)
|
if (is_server_available(node_info->conninfo) == true)
|
||||||
{
|
{
|
||||||
log_notice(_("node has recovered, reconnecting"));
|
log_notice(_("node has recovered, reconnecting"));
|
||||||
|
|||||||
Reference in New Issue
Block a user