mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 17:06:29 +00:00
repmgrd: improve logging of BDR monitoring
Also always log information about the event_notification command.
This commit is contained in:
@@ -241,7 +241,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
strncpy(options->location, DEFAULT_LOCATION, MAXLEN);
|
strncpy(options->location, DEFAULT_LOCATION, MAXLEN);
|
||||||
memset(options->promote_command, 0, sizeof(options->promote_command));
|
memset(options->promote_command, 0, sizeof(options->promote_command));
|
||||||
memset(options->follow_command, 0, sizeof(options->follow_command));
|
memset(options->follow_command, 0, sizeof(options->follow_command));
|
||||||
options->monitor_interval_secs = DEFAULT_STATS_REPORTING_INTERVAL;
|
options->monitor_interval_secs = DEFAULT_MONITORING_INTERVAL;
|
||||||
/* default to 6 reconnection attempts at intervals of 10 seconds */
|
/* default to 6 reconnection attempts at intervals of 10 seconds */
|
||||||
options->reconnect_attempts = DEFAULT_RECONNECTION_ATTEMPTS;
|
options->reconnect_attempts = DEFAULT_RECONNECTION_ATTEMPTS;
|
||||||
options->reconnect_interval = DEFAULT_RECONNECTION_INTERVAL;
|
options->reconnect_interval = DEFAULT_RECONNECTION_INTERVAL;
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ typedef struct
|
|||||||
false, "", "", "", "", { NULL, NULL }, \
|
false, "", "", "", "", { NULL, NULL }, \
|
||||||
/* repmgrd settings */ \
|
/* repmgrd settings */ \
|
||||||
FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \
|
FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \
|
||||||
DEFAULT_STATS_REPORTING_INTERVAL, \
|
DEFAULT_MONITORING_INTERVAL, \
|
||||||
DEFAULT_RECONNECTION_ATTEMPTS, \
|
DEFAULT_RECONNECTION_ATTEMPTS, \
|
||||||
DEFAULT_RECONNECTION_INTERVAL, \
|
DEFAULT_RECONNECTION_INTERVAL, \
|
||||||
false, -1, \
|
false, -1, \
|
||||||
|
|||||||
@@ -2368,8 +2368,10 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
|
|||||||
|
|
||||||
*dst_ptr = '\0';
|
*dst_ptr = '\0';
|
||||||
|
|
||||||
log_debug("_create_event(): executing\n%s", parsed_command);
|
log_info(_("executing notification command for event \"%s\""),
|
||||||
|
event);
|
||||||
|
|
||||||
|
log_detail(_("command is:\n %s"), parsed_command);
|
||||||
r = system(parsed_command);
|
r = system(parsed_command);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
{
|
{
|
||||||
|
|||||||
2
repmgr.h
2
repmgr.h
@@ -42,7 +42,7 @@
|
|||||||
#define DEFAULT_PRIORITY 100
|
#define DEFAULT_PRIORITY 100
|
||||||
#define DEFAULT_RECONNECTION_ATTEMPTS 6
|
#define DEFAULT_RECONNECTION_ATTEMPTS 6
|
||||||
#define DEFAULT_RECONNECTION_INTERVAL 10
|
#define DEFAULT_RECONNECTION_INTERVAL 10
|
||||||
#define DEFAULT_STATS_REPORTING_INTERVAL 2
|
#define DEFAULT_MONITORING_INTERVAL 2
|
||||||
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60
|
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60
|
||||||
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60
|
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60
|
||||||
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60
|
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ monitor_bdr(void)
|
|||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
|
instr_time log_status_interval_start;
|
||||||
|
|
||||||
/* sanity check local database */
|
/* sanity check local database */
|
||||||
log_info(_("connecting to local database '%s'"),
|
log_info(_("connecting to local database '%s'"),
|
||||||
@@ -139,13 +140,13 @@ monitor_bdr(void)
|
|||||||
|
|
||||||
log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id);
|
log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id);
|
||||||
|
|
||||||
log_info(_("starting continuous bdr node monitoring"));
|
log_info(_("starting continuous BDR node monitoring"));
|
||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
|
|
||||||
/* monitoring loop */
|
/* monitoring loop */
|
||||||
log_verbose(LOG_DEBUG, "bdr check loop...");
|
log_verbose(LOG_DEBUG, "BDR check loop...");
|
||||||
|
|
||||||
for (cell = nodes.head; cell; cell = cell->next)
|
for (cell = nodes.head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
@@ -189,6 +190,8 @@ monitor_bdr(void)
|
|||||||
cell->node_info->conn = NULL;
|
cell->node_info->conn = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log_warning(_("unable to connect to node %s (ID %i)"),
|
||||||
|
cell->node_info->node_name, cell->node_info->node_id);
|
||||||
cell->node_info->conn = try_reconnect(cell->node_info);
|
cell->node_info->conn = try_reconnect(cell->node_info);
|
||||||
|
|
||||||
/* node has recovered - log and continue */
|
/* node has recovered - log and continue */
|
||||||
@@ -240,6 +243,31 @@ monitor_bdr(void)
|
|||||||
|
|
||||||
loop:
|
loop:
|
||||||
|
|
||||||
|
/* emit "still alive" log message at regular intervals, if requested */
|
||||||
|
if (config_file_options.log_status_interval > 0)
|
||||||
|
{
|
||||||
|
int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
|
||||||
|
|
||||||
|
if (log_status_interval_elapsed >= config_file_options.log_status_interval)
|
||||||
|
{
|
||||||
|
log_info(_("monitoring BDR replication status on node \"%s\" (ID: %i)"),
|
||||||
|
local_node_info.node_name,
|
||||||
|
local_node_info.node_id);
|
||||||
|
|
||||||
|
for (cell = nodes.head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
if (cell->node_info->monitoring_state == MS_DEGRADED)
|
||||||
|
{
|
||||||
|
log_detail(
|
||||||
|
_("monitoring node \"%s\" (ID: %i) in degraded mode"),
|
||||||
|
cell->node_info->node_name,
|
||||||
|
cell->node_info->node_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (got_SIGHUP)
|
if (got_SIGHUP)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
@@ -259,7 +287,6 @@ monitor_bdr(void)
|
|||||||
log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
|
log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
|
||||||
config_file_options.monitor_interval_secs);
|
config_file_options.monitor_interval_secs);
|
||||||
sleep(config_file_options.monitor_interval_secs);
|
sleep(config_file_options.monitor_interval_secs);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
@@ -385,6 +412,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
/* update node record on the active node */
|
/* update node record on the active node */
|
||||||
update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
|
update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
|
||||||
|
|
||||||
|
log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id);
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
|
_("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
|
||||||
monitored_node->node_name,
|
monitored_node->node_name,
|
||||||
@@ -411,6 +440,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
event_details.data,
|
event_details.data,
|
||||||
&event_info);
|
&event_info);
|
||||||
|
|
||||||
|
log_info("%s", event_details.data);
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
unset_bdr_failover_handler(next_node_conn);
|
unset_bdr_failover_handler(next_node_conn);
|
||||||
|
|||||||
@@ -336,7 +336,7 @@ monitor_streaming_primary(void)
|
|||||||
|
|
||||||
if (monitoring_state == MS_DEGRADED)
|
if (monitoring_state == MS_DEGRADED)
|
||||||
{
|
{
|
||||||
log_detail(_("waiting primary to reappear"));
|
log_detail(_("waiting for primary to reappear"));
|
||||||
}
|
}
|
||||||
|
|
||||||
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
INSTR_TIME_SET_CURRENT(log_status_interval_start);
|
||||||
|
|||||||
@@ -647,7 +647,10 @@ try_reconnect(t_node_info *node_info)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
log_warning(_("unable to reconnect to node after %i attempts"), max_attempts);
|
log_warning(_("unable to reconnect to node %i after %i attempts"),
|
||||||
|
node_info->node_id,
|
||||||
|
max_attempts);
|
||||||
|
|
||||||
node_info->node_status = NODE_STATUS_DOWN;
|
node_info->node_status = NODE_STATUS_DOWN;
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|||||||
Reference in New Issue
Block a user