mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 23:26:30 +00:00
repmgrd: more fixes to BDR recovery handling
This commit is contained in:
@@ -254,7 +254,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
/* BDR settings
|
/* BDR settings
|
||||||
* ------------ */
|
* ------------ */
|
||||||
options->bdr_local_monitoring_only = false;
|
options->bdr_local_monitoring_only = false;
|
||||||
options->bdr_active_node_recovery = false;
|
|
||||||
options->bdr_recovery_timeout = DEFAULT_BDR_RECOVERY_TIMEOUT;
|
options->bdr_recovery_timeout = DEFAULT_BDR_RECOVERY_TIMEOUT;
|
||||||
|
|
||||||
/* service settings
|
/* service settings
|
||||||
@@ -431,10 +430,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
/* BDR settings */
|
/* BDR settings */
|
||||||
else if (strcmp(name, "bdr_local_monitoring_only") == 0)
|
else if (strcmp(name, "bdr_local_monitoring_only") == 0)
|
||||||
options->bdr_local_monitoring_only = parse_bool(value, name, error_list);
|
options->bdr_local_monitoring_only = parse_bool(value, name, error_list);
|
||||||
else if (strcmp(name, "bdr_active_node_recovery") == 0)
|
|
||||||
options->bdr_active_node_recovery = parse_bool(value, name, error_list);
|
|
||||||
else if (strcmp(name, "bdr_recovery_timeout") == 0)
|
else if (strcmp(name, "bdr_recovery_timeout") == 0)
|
||||||
options->bdr_active_node_recovery = repmgr_atoi(value, name, error_list, 0);
|
options->bdr_recovery_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* service settings */
|
/* service settings */
|
||||||
else if (strcmp(name, "pg_ctl_options") == 0)
|
else if (strcmp(name, "pg_ctl_options") == 0)
|
||||||
|
|||||||
@@ -88,7 +88,6 @@ typedef struct
|
|||||||
|
|
||||||
/* BDR settings */
|
/* BDR settings */
|
||||||
bool bdr_local_monitoring_only;
|
bool bdr_local_monitoring_only;
|
||||||
bool bdr_active_node_recovery;
|
|
||||||
bool bdr_recovery_timeout;
|
bool bdr_recovery_timeout;
|
||||||
|
|
||||||
/* service settings */
|
/* service settings */
|
||||||
@@ -134,7 +133,7 @@ typedef struct
|
|||||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||||
/* BDR settings */ \
|
/* BDR settings */ \
|
||||||
false, false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||||
/* service settings */ \
|
/* service settings */ \
|
||||||
"", "", "", "", "", "", \
|
"", "", "", "", "", "", \
|
||||||
/* event notification settings */ \
|
/* event notification settings */ \
|
||||||
|
|||||||
@@ -241,7 +241,3 @@ ssh_options='' # Options to append to "ssh"
|
|||||||
#bdr_recovery_timeout # If a BDR node was offline and has become available
|
#bdr_recovery_timeout # If a BDR node was offline and has become available
|
||||||
# maximum length of time in seconds to wait for the
|
# maximum length of time in seconds to wait for the
|
||||||
# node to reconnect to the cluster
|
# node to reconnect to the cluster
|
||||||
#bdr_active_node_recovery=false # If a BDR node was offline and has recovered,
|
|
||||||
# provide connection details with the "bdr_recovery"
|
|
||||||
# event to enable automatic reconfiguration of the node
|
|
||||||
# to accept connections
|
|
||||||
|
|||||||
@@ -207,7 +207,7 @@ monitor_bdr(void)
|
|||||||
create_event_notification(cell->node_info->conn,
|
create_event_notification(cell->node_info->conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"repmgrd_bdr_reconnect",
|
"bdr_reconnect",
|
||||||
true,
|
true,
|
||||||
event_details.data);
|
event_details.data);
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
@@ -429,7 +429,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
int i;
|
int i;
|
||||||
bool node_recovered = false;
|
bool slot_reactivated = false;
|
||||||
int node_recovery_elapsed;
|
int node_recovery_elapsed;
|
||||||
|
|
||||||
char node_name[MAXLEN] = "";
|
char node_name[MAXLEN] = "";
|
||||||
@@ -446,11 +446,40 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_debug("no local conn");
|
log_debug("no local connection - attempting to reconnect ");
|
||||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// double-check local conn
|
/*
|
||||||
|
* still unable to connect - the local node is probably down, so we can't
|
||||||
|
* check for reconnection
|
||||||
|
*/
|
||||||
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
local_conn = NULL;
|
||||||
|
log_warning(_("unable to reconnect to local node"));
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||||
|
monitored_node->monitoring_state = MS_NORMAL;
|
||||||
|
monitored_node->node_status = NODE_STATUS_UP;
|
||||||
|
|
||||||
|
appendPQExpBuffer(
|
||||||
|
&event_details,
|
||||||
|
_("node \"%s\" (ID: %i) has become available after %i seconds"),
|
||||||
|
monitored_node->node_name,
|
||||||
|
monitored_node->node_id,
|
||||||
|
node_recovery_elapsed);
|
||||||
|
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
PQfinish(recovered_node_conn);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
get_bdr_other_node_name(local_conn, local_node_info.node_id, node_name);
|
get_bdr_other_node_name(local_conn, local_node_info.node_id, node_name);
|
||||||
|
|
||||||
@@ -467,15 +496,17 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
if (slot_status == SLOT_ACTIVE)
|
if (slot_status == SLOT_ACTIVE)
|
||||||
{
|
{
|
||||||
node_recovered = true;
|
slot_reactivated = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
sleep(1);
|
sleep(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* mark node as up */
|
||||||
|
monitored_node->node_status = NODE_STATUS_UP;
|
||||||
|
|
||||||
if (node_recovered == false)
|
if (slot_reactivated == false)
|
||||||
{
|
{
|
||||||
log_warning(_("no active replication slot for node \"%s\" found after %i seconds"),
|
log_warning(_("no active replication slot for node \"%s\" found after %i seconds"),
|
||||||
node_name,
|
node_name,
|
||||||
@@ -491,11 +522,10 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||||
monitored_node->monitoring_state = MS_NORMAL;
|
monitored_node->monitoring_state = MS_NORMAL;
|
||||||
monitored_node->node_status = NODE_STATUS_UP;
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("node \"%s\" (ID: %i) has recovered after %i seconds"),
|
_("node \"%s\" (ID: %i) has recovered after %i seconds"),
|
||||||
monitored_node->node_name,
|
monitored_node->node_name,
|
||||||
@@ -518,32 +548,20 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
/* generate the event on the currently active node only */
|
/* generate the event on the currently active node only */
|
||||||
if (monitored_node->node_id != local_node_info.node_id)
|
if (monitored_node->node_id != local_node_info.node_id)
|
||||||
{
|
{
|
||||||
if (config_file_options.bdr_active_node_recovery == true)
|
event_info.conninfo_str = monitored_node->conninfo;
|
||||||
{
|
event_info.node_name = monitored_node->node_name;
|
||||||
event_info.conninfo_str = monitored_node->conninfo;
|
|
||||||
event_info.node_name = monitored_node->node_name;
|
|
||||||
|
|
||||||
create_event_notification_extended(
|
create_event_notification_extended(
|
||||||
local_conn,
|
local_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"bdr_recovery",
|
"bdr_recovery",
|
||||||
true,
|
true,
|
||||||
event_details.data,
|
event_details.data,
|
||||||
&event_info);
|
&event_info);
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
create_event_record(
|
|
||||||
local_conn,
|
|
||||||
&config_file_options,
|
|
||||||
config_file_options.node_id,
|
|
||||||
"bdr_recovery",
|
|
||||||
true,
|
|
||||||
event_details.data);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
update_node_record_set_active(local_conn, monitored_node->node_id, true);
|
update_node_record_set_active(local_conn, monitored_node->node_id, true);
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|||||||
Reference in New Issue
Block a user