mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgrd: updates for BDR monitoring
This commit is contained in:
@@ -255,6 +255,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
* ------------ */
|
* ------------ */
|
||||||
options->bdr_local_monitoring_only = false;
|
options->bdr_local_monitoring_only = false;
|
||||||
options->bdr_active_node_recovery = false;
|
options->bdr_active_node_recovery = false;
|
||||||
|
options->bdr_recovery_timeout = DEFAULT_BDR_RECOVERY_TIMEOUT;
|
||||||
|
|
||||||
/* service settings
|
/* service settings
|
||||||
* ---------------- */
|
* ---------------- */
|
||||||
@@ -432,6 +433,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->bdr_local_monitoring_only = parse_bool(value, name, error_list);
|
options->bdr_local_monitoring_only = parse_bool(value, name, error_list);
|
||||||
else if (strcmp(name, "bdr_active_node_recovery") == 0)
|
else if (strcmp(name, "bdr_active_node_recovery") == 0)
|
||||||
options->bdr_active_node_recovery = parse_bool(value, name, error_list);
|
options->bdr_active_node_recovery = parse_bool(value, name, error_list);
|
||||||
|
else if (strcmp(name, "bdr_recovery_timeout") == 0)
|
||||||
|
/* store the parsed timeout in its own field, not in bdr_active_node_recovery */
options->bdr_recovery_timeout = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* service settings */
|
/* service settings */
|
||||||
else if (strcmp(name, "pg_ctl_options") == 0)
|
else if (strcmp(name, "pg_ctl_options") == 0)
|
||||||
|
|||||||
@@ -89,6 +89,7 @@ typedef struct
|
|||||||
/* BDR settings */
|
/* BDR settings */
|
||||||
bool bdr_local_monitoring_only;
|
bool bdr_local_monitoring_only;
|
||||||
bool bdr_active_node_recovery;
|
bool bdr_active_node_recovery;
|
||||||
|
/* number of seconds to wait for a recovered BDR node to rejoin the cluster */
int bdr_recovery_timeout;
|
||||||
|
|
||||||
/* service settings */
|
/* service settings */
|
||||||
char pg_ctl_options[MAXLEN];
|
char pg_ctl_options[MAXLEN];
|
||||||
@@ -133,7 +134,7 @@ typedef struct
|
|||||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||||
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
DEFAULT_PRIMARY_FOLLOW_TIMEOUT, \
|
||||||
/* BDR settings */ \
|
/* BDR settings */ \
|
||||||
false, false, \
|
false, false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||||
/* service settings */ \
|
/* service settings */ \
|
||||||
"", "", "", "", "", "", \
|
"", "", "", "", "", "", \
|
||||||
/* event notification settings */ \
|
/* event notification settings */ \
|
||||||
|
|||||||
@@ -236,4 +236,12 @@ ssh_options='' # Options to append to "ssh"
|
|||||||
# BDR monitoring options
|
# BDR monitoring options
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
|
|
||||||
#bdr_active_node_recovery=false #
|
#bdr_local_monitoring_only=false # Only monitor the local node; no checks will be
|
||||||
|
# performed on the other node
|
||||||
|
#bdr_recovery_timeout=30 # If a BDR node was offline and has become available,
|
||||||
|
# maximum length of time in seconds to wait for the
|
||||||
|
# node to reconnect to the cluster
|
||||||
|
#bdr_active_node_recovery=false # If a BDR node was offline and has recovered,
|
||||||
|
# provide connection details with the "bdr_recovery"
|
||||||
|
# event to enable automatic reconfiguration of the node
|
||||||
|
# to accept connections
|
||||||
|
|||||||
4
repmgr.h
4
repmgr.h
@@ -45,8 +45,8 @@
|
|||||||
#define DEFAULT_STATS_REPORTING_INTERVAL 2
|
#define DEFAULT_STATS_REPORTING_INTERVAL 2
|
||||||
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60
|
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60
|
||||||
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60
|
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60
|
||||||
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60
|
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60
|
||||||
|
#define DEFAULT_BDR_RECOVERY_TIMEOUT 30
|
||||||
#define FAILOVER_NODES_MAX_CHECK 50
|
#define FAILOVER_NODES_MAX_CHECK 50
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ monitor_bdr(void)
|
|||||||
{
|
{
|
||||||
log_error(_("unable to retrieve record for local node (ID: %i), terminating"),
|
log_error(_("unable to retrieve record for local node (ID: %i), terminating"),
|
||||||
local_node_info.node_id);
|
local_node_info.node_id);
|
||||||
log_hint(_("check that 'repmgr bdr register' was executed for this node"));
|
log_hint(_("check that \"repmgr bdr register\" was executed for this node"));
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
@@ -191,7 +191,7 @@ monitor_bdr(void)
|
|||||||
|
|
||||||
cell->node_info->conn = try_reconnect(cell->node_info);
|
cell->node_info->conn = try_reconnect(cell->node_info);
|
||||||
|
|
||||||
/* Node has recovered - log and continue */
|
/* node has recovered - log and continue */
|
||||||
if (cell->node_info->node_status == NODE_STATUS_UP)
|
if (cell->node_info->node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
|
int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
|
||||||
@@ -267,7 +267,7 @@ monitor_bdr(void)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* do_bdr_failover()
|
* do_bdr_failover()
|
||||||
*0
|
*
|
||||||
* Here we attempt to perform a BDR "failover".
|
* Here we attempt to perform a BDR "failover".
|
||||||
*
|
*
|
||||||
* As there's no equivalent of a physical replication failover,
|
* As there's no equivalent of a physical replication failover,
|
||||||
@@ -292,6 +292,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
|
t_node_info failed_node = T_NODE_INFO_INITIALIZER;
|
||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
|
|
||||||
|
/* if one of the two nodes is down, cluster will be in a degraded state */
|
||||||
monitored_node->monitoring_state = MS_DEGRADED;
|
monitored_node->monitoring_state = MS_DEGRADED;
|
||||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||||
|
|
||||||
@@ -305,8 +306,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
if (cell->node_info->node_id == monitored_node->node_id)
|
if (cell->node_info->node_id == monitored_node->node_id)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* XXX skip inactive node? */
|
/* TODO: reuse local conn if local node is up */
|
||||||
// reuse local conn if local node is up
|
|
||||||
next_node_conn = establish_db_connection(cell->node_info->conninfo, false);
|
next_node_conn = establish_db_connection(cell->node_info->conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(next_node_conn) == CONNECTION_OK)
|
if (PQstatus(next_node_conn) == CONNECTION_OK)
|
||||||
@@ -353,22 +353,28 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
|
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
|
||||||
{
|
{
|
||||||
PQfinish(next_node_conn);
|
PQfinish(next_node_conn);
|
||||||
log_debug("other node's repmgrd is handling failover");
|
log_notice(_("other node's repmgrd is handling failover"));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* check here that the node hasn't come back up */
|
||||||
|
if (is_server_available(monitored_node->conninfo) == true)
|
||||||
|
{
|
||||||
|
log_notice(_("node %i has reappeared, aborting failover"),
|
||||||
|
monitored_node->node_id);
|
||||||
|
monitored_node->monitoring_state = MS_NORMAL;
|
||||||
|
PQfinish(next_node_conn);
|
||||||
|
}
|
||||||
|
|
||||||
log_debug("this node is the failover handler");
|
log_debug("this node is the failover handler");
|
||||||
|
|
||||||
// check here that the node hasn't come back up...
|
|
||||||
|
|
||||||
log_info(_("connecting to target node %s"), target_node.node_name);
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
event_info.conninfo_str = target_node.conninfo;
|
event_info.conninfo_str = target_node.conninfo;
|
||||||
event_info.node_name = target_node.node_name;
|
event_info.node_name = target_node.node_name;
|
||||||
|
|
||||||
/* update our own record on the other node */
|
/* update node record on the active node */
|
||||||
update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
|
update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
@@ -401,6 +407,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
unset_bdr_failover_handler(next_node_conn);
|
unset_bdr_failover_handler(next_node_conn);
|
||||||
|
|
||||||
|
PQfinish(next_node_conn);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -413,6 +421,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
int i;
|
int i;
|
||||||
bool node_recovered = false;
|
bool node_recovered = false;
|
||||||
|
int node_recovery_elapsed;
|
||||||
|
|
||||||
recovered_node_conn = establish_db_connection(monitored_node->conninfo, false);
|
recovered_node_conn = establish_db_connection(monitored_node->conninfo, false);
|
||||||
|
|
||||||
@@ -429,8 +438,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// bdr_recovery_timeout
|
for (i = 0; i < config_file_options.bdr_recovery_timeout; i++)
|
||||||
for (i = 0; i < 30; i++)
|
|
||||||
{
|
{
|
||||||
RecordStatus record_status = get_bdr_node_record_by_name(
|
RecordStatus record_status = get_bdr_node_record_by_name(
|
||||||
recovered_node_conn,
|
recovered_node_conn,
|
||||||
@@ -439,6 +447,7 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
if (record_status == RECORD_FOUND && bdr_record.node_status == 'r')
|
if (record_status == RECORD_FOUND && bdr_record.node_status == 'r')
|
||||||
{
|
{
|
||||||
|
// check pg_stat_replication
|
||||||
node_recovered = true;
|
node_recovered = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -459,13 +468,15 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
// don't end up monitoring a parted node; if not attached,
|
// don't end up monitoring a parted node; if not attached,
|
||||||
// generate a failed bdr_recovery event
|
// generate a failed bdr_recovery event
|
||||||
|
|
||||||
|
|
||||||
// note elapsed
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("node '%s' (ID: %i) has recovered"),
|
_("node '%s' (ID: %i) has recovered after %i seconds"),
|
||||||
monitored_node->node_name,
|
monitored_node->node_name,
|
||||||
monitored_node->node_id);
|
monitored_node->node_id,
|
||||||
|
node_recovery_elapsed);
|
||||||
|
|
||||||
monitored_node->monitoring_state = MS_NORMAL;
|
monitored_node->monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user