diff --git a/configfile.c b/configfile.c index 2132b979..c1b87abe 100644 --- a/configfile.c +++ b/configfile.c @@ -241,7 +241,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * strncpy(options->location, DEFAULT_LOCATION, MAXLEN); memset(options->promote_command, 0, sizeof(options->promote_command)); memset(options->follow_command, 0, sizeof(options->follow_command)); - options->monitor_interval_secs = DEFAULT_STATS_REPORTING_INTERVAL; + options->monitor_interval_secs = DEFAULT_MONITORING_INTERVAL; /* default to 6 reconnection attempts at intervals of 10 seconds */ options->reconnect_attempts = DEFAULT_RECONNECTION_ATTEMPTS; options->reconnect_interval = DEFAULT_RECONNECTION_INTERVAL; diff --git a/configfile.h b/configfile.h index d637001e..e66a7a4e 100644 --- a/configfile.h +++ b/configfile.h @@ -125,7 +125,7 @@ typedef struct false, "", "", "", "", { NULL, NULL }, \ /* repmgrd settings */ \ FAILOVER_MANUAL, DEFAULT_LOCATION, DEFAULT_PRIORITY, "", "", \ - DEFAULT_STATS_REPORTING_INTERVAL, \ + DEFAULT_MONITORING_INTERVAL, \ DEFAULT_RECONNECTION_ATTEMPTS, \ DEFAULT_RECONNECTION_INTERVAL, \ false, -1, \ diff --git a/dbutils.c b/dbutils.c index 66ffaa96..828a57a5 100644 --- a/dbutils.c +++ b/dbutils.c @@ -2368,8 +2368,10 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char *dst_ptr = '\0'; - log_debug("_create_event(): executing\n%s", parsed_command); + log_info(_("executing notification command for event \"%s\""), + event); + log_detail(_("command is:\n %s"), parsed_command); r = system(parsed_command); if (r != 0) { diff --git a/repmgr.h b/repmgr.h index f84c5fa3..ddb2f193 100644 --- a/repmgr.h +++ b/repmgr.h @@ -42,7 +42,7 @@ #define DEFAULT_PRIORITY 100 #define DEFAULT_RECONNECTION_ATTEMPTS 6 #define DEFAULT_RECONNECTION_INTERVAL 10 -#define DEFAULT_STATS_REPORTING_INTERVAL 2 +#define DEFAULT_MONITORING_INTERVAL 2 #define DEFAULT_ASYNC_QUERY_TIMEOUT 60 #define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 
#define DEFAULT_PRIMARY_FOLLOW_TIMEOUT 60 diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c index eb578f6b..6be261f5 100644 --- a/repmgrd-bdr.c +++ b/repmgrd-bdr.c @@ -33,6 +33,7 @@ monitor_bdr(void) RecordStatus record_status; NodeInfoListCell *cell; PQExpBufferData event_details; + instr_time log_status_interval_start; /* sanity check local database */ log_info(_("connecting to local database '%s'"), @@ -139,13 +140,16 @@ monitor_bdr(void) log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id); - log_info(_("starting continuous bdr node monitoring")); + log_info(_("starting continuous BDR node monitoring")); + + /* start the status-log timer here: the loop reads it before it first resets it */ + INSTR_TIME_SET_CURRENT(log_status_interval_start); while (true) { /* monitoring loop */ - log_verbose(LOG_DEBUG, "bdr check loop..."); + log_verbose(LOG_DEBUG, "BDR check loop..."); for (cell = nodes.head; cell; cell = cell->next) { @@ -189,6 +193,8 @@ monitor_bdr(void) cell->node_info->conn = NULL; } + log_warning(_("unable to connect to node %s (ID %i)"), + cell->node_info->node_name, cell->node_info->node_id); cell->node_info->conn = try_reconnect(cell->node_info); /* node has recovered - log and continue */ @@ -240,6 +246,31 @@ monitor_bdr(void) loop: + /* emit "still alive" log message at regular intervals, if requested */ + if (config_file_options.log_status_interval > 0) + { + int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start); + + if (log_status_interval_elapsed >= config_file_options.log_status_interval) + { + log_info(_("monitoring BDR replication status on node \"%s\" (ID: %i)"), + local_node_info.node_name, + local_node_info.node_id); + + for (cell = nodes.head; cell; cell = cell->next) + { + if (cell->node_info->monitoring_state == MS_DEGRADED) + { + log_detail( + _("monitoring node \"%s\" (ID: %i) in degraded mode"), + cell->node_info->node_name, + cell->node_info->node_id); + } + } + INSTR_TIME_SET_CURRENT(log_status_interval_start); + } + } + if (got_SIGHUP) { /* @@ -259,7 +290,6 @@ monitor_bdr(void) log_verbose(LOG_DEBUG, "sleeping %i seconds 
(\"monitor_interval_secs\")", config_file_options.monitor_interval_secs); sleep(config_file_options.monitor_interval_secs); - } return; @@ -385,6 +415,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) /* update node record on the active node */ update_node_record_set_active(next_node_conn, monitored_node->node_id, false); + log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id); + appendPQExpBuffer(&event_details, _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"), monitored_node->node_name, @@ -411,6 +443,8 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) event_details.data, &event_info); + log_info("%s", event_details.data); + termPQExpBuffer(&event_details); unset_bdr_failover_handler(next_node_conn); diff --git a/repmgrd-physical.c b/repmgrd-physical.c index df42726b..67f6fbcc 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -336,7 +336,7 @@ monitor_streaming_primary(void) if (monitoring_state == MS_DEGRADED) { - log_detail(_("waiting primary to reappear")); + log_detail(_("waiting for primary to reappear")); } INSTR_TIME_SET_CURRENT(log_status_interval_start); diff --git a/repmgrd.c b/repmgrd.c index 099321c5..d8e85755 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -647,7 +647,10 @@ try_reconnect(t_node_info *node_info) } - log_warning(_("unable to reconnect to node after %i attempts"), max_attempts); + log_warning(_("unable to reconnect to node %i after %i attempts"), + node_info->node_id, + max_attempts); + node_info->node_status = NODE_STATUS_DOWN; return NULL;