repmgrd: more fixes to BDR recovery handling

This commit is contained in:
Ian Barwick
2017-07-27 16:33:41 +09:00
parent b4a655d074
commit 4cf66c33db
4 changed files with 52 additions and 42 deletions

View File

@@ -254,7 +254,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
/* BDR settings
* ------------ */
options->bdr_local_monitoring_only = false;
options->bdr_active_node_recovery = false;
options->bdr_recovery_timeout = DEFAULT_BDR_RECOVERY_TIMEOUT;
/* service settings
@@ -431,10 +430,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
/* BDR settings */
else if (strcmp(name, "bdr_local_monitoring_only") == 0)
options->bdr_local_monitoring_only = parse_bool(value, name, error_list);
else if (strcmp(name, "bdr_active_node_recovery") == 0)
options->bdr_active_node_recovery = parse_bool(value, name, error_list);
else if (strcmp(name, "bdr_recovery_timeout") == 0)
options->bdr_active_node_recovery = repmgr_atoi(value, name, error_list, 0);
options->bdr_recovery_timeout = repmgr_atoi(value, name, error_list, 0);
/* service settings */
else if (strcmp(name, "pg_ctl_options") == 0)

View File

@@ -88,7 +88,6 @@ typedef struct
/* BDR settings */
bool bdr_local_monitoring_only;
bool bdr_active_node_recovery;
bool bdr_recovery_timeout;
/* service settings */
@@ -134,7 +133,7 @@ typedef struct
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
/* BDR settings */ \
false, false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
/* service settings */ \
"", "", "", "", "", "", \
/* event notification settings */ \

View File

@@ -241,7 +241,3 @@ ssh_options='' # Options to append to "ssh"
#bdr_recovery_timeout # If a BDR node was offline and has become available,
# the maximum length of time in seconds to wait for the
# node to reconnect to the cluster
#bdr_active_node_recovery=false # If a BDR node was offline and has recovered,
# provide connection details with the "bdr_recovery"
# event to enable automatic reconfiguration of the node
# to accept connections

View File

@@ -207,7 +207,7 @@ monitor_bdr(void)
create_event_notification(cell->node_info->conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_bdr_reconnect",
"bdr_reconnect",
true,
event_details.data);
termPQExpBuffer(&event_details);
@@ -429,7 +429,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
PQExpBufferData event_details;
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
int i;
bool node_recovered = false;
bool slot_reactivated = false;
int node_recovery_elapsed;
char node_name[MAXLEN] = "";
@@ -446,11 +446,40 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
if (PQstatus(local_conn) != CONNECTION_OK)
{
log_debug("no local conn");
local_conn = establish_db_connection(config_file_options.conninfo, true);
log_debug("no local connection - attempting to reconnect ");
local_conn = establish_db_connection(config_file_options.conninfo, false);
}
// double-check local conn
/*
* still unable to connect - the local node is probably down, so we can't
* check for reconnection
*/
if (PQstatus(local_conn) != CONNECTION_OK)
{
local_conn = NULL;
log_warning(_("unable to reconnect to local node"));
initPQExpBuffer(&event_details);
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
monitored_node->monitoring_state = MS_NORMAL;
monitored_node->node_status = NODE_STATUS_UP;
appendPQExpBuffer(
&event_details,
_("node \"%s\" (ID: %i) has become available after %i seconds"),
monitored_node->node_name,
monitored_node->node_id,
node_recovery_elapsed);
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
PQfinish(recovered_node_conn);
return;
}
get_bdr_other_node_name(local_conn, local_node_info.node_id, node_name);
@@ -467,15 +496,17 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
if (slot_status == SLOT_ACTIVE)
{
node_recovered = true;
slot_reactivated = true;
break;
}
sleep(1);
}
/* mark node as up */
monitored_node->node_status = NODE_STATUS_UP;
if (node_recovered == false)
if (slot_reactivated == false)
{
log_warning(_("no active replication slot for node \"%s\" found after %i seconds"),
node_name,
@@ -491,11 +522,10 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
monitored_node->monitoring_state = MS_NORMAL;
monitored_node->node_status = NODE_STATUS_UP;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("node \"%s\" (ID: %i) has recovered after %i seconds"),
monitored_node->node_name,
@@ -518,32 +548,20 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
/* generate the event on the currently active node only */
if (monitored_node->node_id != local_node_info.node_id)
{
if (config_file_options.bdr_active_node_recovery == true)
{
event_info.conninfo_str = monitored_node->conninfo;
event_info.node_name = monitored_node->node_name;
event_info.conninfo_str = monitored_node->conninfo;
event_info.node_name = monitored_node->node_name;
create_event_notification_extended(
local_conn,
&config_file_options,
config_file_options.node_id,
"bdr_recovery",
true,
event_details.data,
&event_info);
}
else
{
create_event_record(
local_conn,
&config_file_options,
config_file_options.node_id,
"bdr_recovery",
true,
event_details.data);
}
create_event_notification_extended(
local_conn,
&config_file_options,
config_file_options.node_id,
"bdr_recovery",
true,
event_details.data,
&event_info);
}
update_node_record_set_active(local_conn, monitored_node->node_id, true);
termPQExpBuffer(&event_details);