diff --git a/dbutils.h b/dbutils.h index 11ed36c3..4cf60086 100644 --- a/dbutils.h +++ b/dbutils.h @@ -46,6 +46,12 @@ typedef enum { MS_DEGRADED = 1 } MonitoringState; +typedef enum { + NODE_STATUS_UNKNOWN = -1, + NODE_STATUS_UP, + NODE_STATUS_DOWN +} NodeStatus; + /* * Struct to store node information */ @@ -65,27 +71,29 @@ typedef struct s_node_info bool is_ready; bool is_visible; XLogRecPtr last_wal_receive_lsn; + NodeStatus node_status; MonitoringState monitoring_state; PGconn *conn; } t_node_info; #define T_NODE_INFO_INITIALIZER { \ - NODE_NOT_FOUND, \ - NO_UPSTREAM_NODE, \ - UNKNOWN, \ - "", \ - "", \ - "", \ - DEFAULT_LOCATION, \ - DEFAULT_PRIORITY, \ - true, \ - "", \ - false, \ - false, \ - InvalidXLogRecPtr, \ - MS_NORMAL, \ - NULL \ + NODE_NOT_FOUND, \ + NO_UPSTREAM_NODE, \ + UNKNOWN, \ + "", \ + "", \ + "", \ + DEFAULT_LOCATION, \ + DEFAULT_PRIORITY, \ + true, \ + "", \ + false, \ + false, \ + InvalidXLogRecPtr, \ + NODE_STATUS_UNKNOWN, \ + MS_NORMAL, \ + NULL \ } diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c index e2e299d5..59f34eaf 100644 --- a/repmgrd-bdr.c +++ b/repmgrd-bdr.c @@ -168,6 +168,11 @@ monitor_bdr(void) { if (is_server_available(cell->node_info->conninfo) == false) { + instr_time upstream_node_unreachable_start; + + INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start); + + // XXX improve log_warning("connection problem! to node %i", cell->node_info->node_id); do_bdr_failover(&nodes, cell->node_info); @@ -235,7 +240,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) PGconn *next_node_conn = NULL; NodeInfoListCell *cell; PQExpBufferData event_details; - RecordStatus record_status; t_event_info event_info = T_EVENT_INFO_INITIALIZER; t_node_info target_node = T_NODE_INFO_INITIALIZER; @@ -258,10 +262,14 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) if (PQstatus(next_node_conn) == CONNECTION_OK) { - // XXX check if record returned - record_status = get_node_record(next_node_conn, cell->node_info->node_id, &target_node); + RecordStatus record_status = get_node_record(next_node_conn, + cell->node_info->node_id, + &target_node); - break; + if (record_status == RECORD_FOUND) + { + break; + } } next_node_conn = NULL; @@ -286,6 +294,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) log_debug("this node is the failover handler"); // check here that the node hasn't come back up... + log_info(_("connecting to target node %s"), target_node.node_name); initPQExpBuffer(&event_details); diff --git a/repmgrd-physical.c b/repmgrd-physical.c index c270624c..595cdca7 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -135,15 +135,18 @@ do_physical_node_check(void) - +/* + * repmgrd running on the primary server + */ void monitor_streaming_primary(void) { #ifndef BDR_ONLY - NodeStatus node_status = NODE_STATUS_UP; instr_time log_status_interval_start; PQExpBufferData event_details; + local_node_info.node_status = NODE_STATUS_UP; + reset_node_voting_status(); /* Log startup event */ @@ -181,7 +184,7 @@ monitor_streaming_primary(void) { /* node is down, we were expecting it to be up */ - if (node_status == NODE_STATUS_UP) + if (local_node_info.node_status == NODE_STATUS_UP) { PQExpBufferData event_details; instr_time local_node_unreachable_start; @@ -195,7 +198,7 @@ monitor_streaming_primary(void) log_warning("%s", event_details.data); - node_status = NODE_STATUS_UNKNOWN; + local_node_info.node_status = NODE_STATUS_UNKNOWN; PQfinish(local_conn); @@ -214,9 +217,9 @@ monitor_streaming_primary(void) termPQExpBuffer(&event_details); - local_conn = try_reconnect(local_node_info.conninfo, &node_status); + local_conn = try_reconnect(&local_node_info); - if (node_status == NODE_STATUS_UP) + if (local_node_info.node_status == NODE_STATUS_UP) { int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start); @@ -279,7 +282,7 @@ monitor_streaming_primary(void) if (PQstatus(local_conn) == CONNECTION_OK) { - node_status = NODE_STATUS_UP; + local_node_info.node_status = NODE_STATUS_UP; monitoring_state = MS_NORMAL; initPQExpBuffer(&event_details); @@ -343,10 +346,10 @@ monitor_streaming_standby(void) { #ifndef BDR_ONLY RecordStatus record_status; - NodeStatus upstream_node_status = NODE_STATUS_UP; instr_time log_status_interval_start; PQExpBufferData event_details; + upstream_node_info.node_status = NODE_STATUS_UP; reset_node_voting_status(); log_debug("monitor_streaming_standby()"); @@ -470,7 +473,7 @@ monitor_streaming_standby(void) { /* upstream node is down, we were expecting it to be up */ - if (upstream_node_status == NODE_STATUS_UP) + if (upstream_node_info.node_status == NODE_STATUS_UP) { instr_time upstream_node_unreachable_start; @@ -478,7 +481,7 @@ monitor_streaming_standby(void) initPQExpBuffer(&event_details); - upstream_node_status = NODE_STATUS_UNKNOWN; + upstream_node_info.node_status = NODE_STATUS_UNKNOWN; appendPQExpBuffer(&event_details, _("unable to connect to upstream node \"%s\" (node ID: %i)"), @@ -499,9 +502,9 @@ monitor_streaming_standby(void) termPQExpBuffer(&event_details); PQfinish(upstream_conn); - upstream_conn = try_reconnect(upstream_node_info.conninfo, &upstream_node_status); + upstream_conn = try_reconnect(&upstream_node_info); - if (upstream_node_status == NODE_STATUS_UP) + if (upstream_node_info.node_status == NODE_STATUS_UP) { int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start); @@ -524,7 +527,7 @@ monitor_streaming_standby(void) } /* still down after reconnect attempt(s) */ - if (upstream_node_status == NODE_STATUS_DOWN) + if (upstream_node_info.node_status == NODE_STATUS_DOWN) { bool failover_done = false; @@ -564,7 +567,7 @@ monitor_streaming_standby(void) // and upstream is now former primary // XXX scan other nodes to see if any has become primary - upstream_node_status = NODE_STATUS_UP; + upstream_node_info.node_status = NODE_STATUS_UP; monitoring_state = MS_NORMAL; if (upstream_node_info.type == PRIMARY) diff --git a/repmgrd.c b/repmgrd.c index c9ccd401..efbef52c 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -63,9 +63,6 @@ static void handle_sighup(SIGNAL_ARGS); static void handle_sigint(SIGNAL_ARGS); #endif - -PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status); - int calculate_elapsed(instr_time start_time); void update_registration(PGconn *conn); void terminate(int retval); @@ -612,7 +609,7 @@ show_help(void) PGconn * -try_reconnect(const char *conninfo, NodeStatus *node_status) +try_reconnect(t_node_info *node_info) { PGconn *conn; @@ -623,7 +620,7 @@ try_reconnect(const char *conninfo, NodeStatus *node_status) for (i = 0; i < max_attempts; i++) { log_info(_("checking state of node, %i of %i attempts"), i, max_attempts); - if (is_server_available(conninfo) == true) + if (is_server_available(node_info->conninfo) == true) { log_notice(_("node has recovered, reconnecting")); @@ -633,10 +630,10 @@ try_reconnect(const char *conninfo, NodeStatus *node_status) * - fall back to degraded monitoring? * - make that configurable */ - conn = establish_db_connection(conninfo, false); + conn = establish_db_connection(node_info->conninfo, false); if (PQstatus(conn) == CONNECTION_OK) { - *node_status = NODE_STATUS_UP; + node_info->node_status = NODE_STATUS_UP; return conn; } @@ -650,7 +647,8 @@ try_reconnect(const char *conninfo, NodeStatus *node_status) log_warning(_("unable to reconnect to node after %i attempts"), max_attempts); - *node_status = NODE_STATUS_DOWN; + node_info->node_status = NODE_STATUS_DOWN; + return NULL; } diff --git a/repmgrd.h b/repmgrd.h index 0ab93582..46e2c600 100644 --- a/repmgrd.h +++ b/repmgrd.h @@ -10,12 +10,6 @@ #include #include "portability/instr_time.h" -typedef enum { - NODE_STATUS_UNKNOWN = -1, - NODE_STATUS_UP, - NODE_STATUS_DOWN -} NodeStatus; - extern MonitoringState monitoring_state; extern instr_time degraded_monitoring_start; @@ -25,7 +19,7 @@ extern t_node_info local_node_info; extern PGconn *local_conn; extern bool startup_event_logged; -PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status); +PGconn *try_reconnect(t_node_info *node_info); int calculate_elapsed(instr_time start_time); const char *print_monitoring_state(MonitoringState monitoring_state);