Store node status in node record struct

This commit is contained in:
Ian Barwick
2017-07-17 13:50:17 +09:00
parent 46acf75286
commit 0dcd479322
5 changed files with 60 additions and 48 deletions

View File

@@ -46,6 +46,12 @@ typedef enum {
MS_DEGRADED = 1 MS_DEGRADED = 1
} MonitoringState; } MonitoringState;
/*
 * Reachability state tracked for a node.
 *
 * The numeric values are spelled out explicitly; they are the same values
 * the compiler would assign implicitly after the leading -1.
 */
typedef enum {
	/* State not determined; T_NODE_INFO_INITIALIZER uses this for node_status */
	NODE_STATUS_UNKNOWN = -1,
	/* Set by try_reconnect() when a connection was re-established */
	NODE_STATUS_UP = 0,
	/* Set by try_reconnect() when all reconnection attempts failed */
	NODE_STATUS_DOWN = 1
} NodeStatus;
/* /*
* Struct to store node information * Struct to store node information
*/ */
@@ -65,27 +71,29 @@ typedef struct s_node_info
bool is_ready; bool is_ready;
bool is_visible; bool is_visible;
XLogRecPtr last_wal_receive_lsn; XLogRecPtr last_wal_receive_lsn;
NodeStatus node_status;
MonitoringState monitoring_state; MonitoringState monitoring_state;
PGconn *conn; PGconn *conn;
} t_node_info; } t_node_info;
#define T_NODE_INFO_INITIALIZER { \ #define T_NODE_INFO_INITIALIZER { \
NODE_NOT_FOUND, \ NODE_NOT_FOUND, \
NO_UPSTREAM_NODE, \ NO_UPSTREAM_NODE, \
UNKNOWN, \ UNKNOWN, \
"", \ "", \
"", \ "", \
"", \ "", \
DEFAULT_LOCATION, \ DEFAULT_LOCATION, \
DEFAULT_PRIORITY, \ DEFAULT_PRIORITY, \
true, \ true, \
"", \ "", \
false, \ false, \
false, \ false, \
InvalidXLogRecPtr, \ InvalidXLogRecPtr, \
MS_NORMAL, \ NODE_STATUS_UNKNOWN, \
NULL \ MS_NORMAL, \
NULL \
} }

View File

@@ -168,6 +168,11 @@ monitor_bdr(void)
{ {
if (is_server_available(cell->node_info->conninfo) == false) if (is_server_available(cell->node_info->conninfo) == false)
{ {
instr_time upstream_node_unreachable_start;
INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
// XXX improve // XXX improve
log_warning("connection problem! to node %i", cell->node_info->node_id); log_warning("connection problem! to node %i", cell->node_info->node_id);
do_bdr_failover(&nodes, cell->node_info); do_bdr_failover(&nodes, cell->node_info);
@@ -235,7 +240,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
PGconn *next_node_conn = NULL; PGconn *next_node_conn = NULL;
NodeInfoListCell *cell; NodeInfoListCell *cell;
PQExpBufferData event_details; PQExpBufferData event_details;
RecordStatus record_status;
t_event_info event_info = T_EVENT_INFO_INITIALIZER; t_event_info event_info = T_EVENT_INFO_INITIALIZER;
t_node_info target_node = T_NODE_INFO_INITIALIZER; t_node_info target_node = T_NODE_INFO_INITIALIZER;
@@ -258,10 +262,14 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
if (PQstatus(next_node_conn) == CONNECTION_OK) if (PQstatus(next_node_conn) == CONNECTION_OK)
{ {
// XXX check if record returned RecordStatus record_status = get_node_record(next_node_conn,
record_status = get_node_record(next_node_conn, cell->node_info->node_id, &target_node); cell->node_info->node_id,
&target_node);
break; if (record_status == RECORD_FOUND)
{
break;
}
} }
next_node_conn = NULL; next_node_conn = NULL;
@@ -286,6 +294,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
log_debug("this node is the failover handler"); log_debug("this node is the failover handler");
// check here that the node hasn't come back up... // check here that the node hasn't come back up...
log_info(_("connecting to target node %s"), target_node.node_name); log_info(_("connecting to target node %s"), target_node.node_name);
initPQExpBuffer(&event_details); initPQExpBuffer(&event_details);

View File

@@ -135,15 +135,18 @@ do_physical_node_check(void)
/*
* repmgrd running on the primary server
*/
void void
monitor_streaming_primary(void) monitor_streaming_primary(void)
{ {
#ifndef BDR_ONLY #ifndef BDR_ONLY
NodeStatus node_status = NODE_STATUS_UP;
instr_time log_status_interval_start; instr_time log_status_interval_start;
PQExpBufferData event_details; PQExpBufferData event_details;
local_node_info.node_status = NODE_STATUS_UP;
reset_node_voting_status(); reset_node_voting_status();
/* Log startup event */ /* Log startup event */
@@ -181,7 +184,7 @@ monitor_streaming_primary(void)
{ {
/* node is down, we were expecting it to be up */ /* node is down, we were expecting it to be up */
if (node_status == NODE_STATUS_UP) if (local_node_info.node_status == NODE_STATUS_UP)
{ {
PQExpBufferData event_details; PQExpBufferData event_details;
instr_time local_node_unreachable_start; instr_time local_node_unreachable_start;
@@ -195,7 +198,7 @@ monitor_streaming_primary(void)
log_warning("%s", event_details.data); log_warning("%s", event_details.data);
node_status = NODE_STATUS_UNKNOWN; local_node_info.node_status = NODE_STATUS_UNKNOWN;
PQfinish(local_conn); PQfinish(local_conn);
@@ -214,9 +217,9 @@ monitor_streaming_primary(void)
termPQExpBuffer(&event_details); termPQExpBuffer(&event_details);
local_conn = try_reconnect(local_node_info.conninfo, &node_status); local_conn = try_reconnect(&local_node_info);
if (node_status == NODE_STATUS_UP) if (local_node_info.node_status == NODE_STATUS_UP)
{ {
int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start); int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
@@ -279,7 +282,7 @@ monitor_streaming_primary(void)
if (PQstatus(local_conn) == CONNECTION_OK) if (PQstatus(local_conn) == CONNECTION_OK)
{ {
node_status = NODE_STATUS_UP; local_node_info.node_status = NODE_STATUS_UP;
monitoring_state = MS_NORMAL; monitoring_state = MS_NORMAL;
initPQExpBuffer(&event_details); initPQExpBuffer(&event_details);
@@ -343,10 +346,10 @@ monitor_streaming_standby(void)
{ {
#ifndef BDR_ONLY #ifndef BDR_ONLY
RecordStatus record_status; RecordStatus record_status;
NodeStatus upstream_node_status = NODE_STATUS_UP;
instr_time log_status_interval_start; instr_time log_status_interval_start;
PQExpBufferData event_details; PQExpBufferData event_details;
upstream_node_info.node_status = NODE_STATUS_UP;
reset_node_voting_status(); reset_node_voting_status();
log_debug("monitor_streaming_standby()"); log_debug("monitor_streaming_standby()");
@@ -470,7 +473,7 @@ monitor_streaming_standby(void)
{ {
/* upstream node is down, we were expecting it to be up */ /* upstream node is down, we were expecting it to be up */
if (upstream_node_status == NODE_STATUS_UP) if (upstream_node_info.node_status == NODE_STATUS_UP)
{ {
instr_time upstream_node_unreachable_start; instr_time upstream_node_unreachable_start;
@@ -478,7 +481,7 @@ monitor_streaming_standby(void)
initPQExpBuffer(&event_details); initPQExpBuffer(&event_details);
upstream_node_status = NODE_STATUS_UNKNOWN; upstream_node_info.node_status = NODE_STATUS_UNKNOWN;
appendPQExpBuffer(&event_details, appendPQExpBuffer(&event_details,
_("unable to connect to upstream node \"%s\" (node ID: %i)"), _("unable to connect to upstream node \"%s\" (node ID: %i)"),
@@ -499,9 +502,9 @@ monitor_streaming_standby(void)
termPQExpBuffer(&event_details); termPQExpBuffer(&event_details);
PQfinish(upstream_conn); PQfinish(upstream_conn);
upstream_conn = try_reconnect(upstream_node_info.conninfo, &upstream_node_status); upstream_conn = try_reconnect(&upstream_node_info);
if (upstream_node_status == NODE_STATUS_UP) if (upstream_node_info.node_status == NODE_STATUS_UP)
{ {
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start); int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
@@ -524,7 +527,7 @@ monitor_streaming_standby(void)
} }
/* still down after reconnect attempt(s) */ /* still down after reconnect attempt(s) */
if (upstream_node_status == NODE_STATUS_DOWN) if (upstream_node_info.node_status == NODE_STATUS_DOWN)
{ {
bool failover_done = false; bool failover_done = false;
@@ -564,7 +567,7 @@ monitor_streaming_standby(void)
// and upstream is now former primary // and upstream is now former primary
// XXX scan other nodes to see if any has become primary // XXX scan other nodes to see if any has become primary
upstream_node_status = NODE_STATUS_UP; upstream_node_info.node_status = NODE_STATUS_UP;
monitoring_state = MS_NORMAL; monitoring_state = MS_NORMAL;
if (upstream_node_info.type == PRIMARY) if (upstream_node_info.type == PRIMARY)

View File

@@ -63,9 +63,6 @@ static void handle_sighup(SIGNAL_ARGS);
static void handle_sigint(SIGNAL_ARGS); static void handle_sigint(SIGNAL_ARGS);
#endif #endif
PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
int calculate_elapsed(instr_time start_time); int calculate_elapsed(instr_time start_time);
void update_registration(PGconn *conn); void update_registration(PGconn *conn);
void terminate(int retval); void terminate(int retval);
@@ -612,7 +609,7 @@ show_help(void)
PGconn * PGconn *
try_reconnect(const char *conninfo, NodeStatus *node_status) try_reconnect(t_node_info *node_info)
{ {
PGconn *conn; PGconn *conn;
@@ -623,7 +620,7 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
for (i = 0; i < max_attempts; i++) for (i = 0; i < max_attempts; i++)
{ {
log_info(_("checking state of node, %i of %i attempts"), i, max_attempts); log_info(_("checking state of node, %i of %i attempts"), i, max_attempts);
if (is_server_available(conninfo) == true) if (is_server_available(node_info->conninfo) == true)
{ {
log_notice(_("node has recovered, reconnecting")); log_notice(_("node has recovered, reconnecting"));
@@ -633,10 +630,10 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
* - fall back to degraded monitoring? * - fall back to degraded monitoring?
* - make that configurable * - make that configurable
*/ */
conn = establish_db_connection(conninfo, false); conn = establish_db_connection(node_info->conninfo, false);
if (PQstatus(conn) == CONNECTION_OK) if (PQstatus(conn) == CONNECTION_OK)
{ {
*node_status = NODE_STATUS_UP; node_info->node_status = NODE_STATUS_UP;
return conn; return conn;
} }
@@ -650,7 +647,8 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
log_warning(_("unable to reconnect to node after %i attempts"), max_attempts); log_warning(_("unable to reconnect to node after %i attempts"), max_attempts);
*node_status = NODE_STATUS_DOWN; node_info->node_status = NODE_STATUS_DOWN;
return NULL; return NULL;
} }

View File

@@ -10,12 +10,6 @@
#include <time.h> #include <time.h>
#include "portability/instr_time.h" #include "portability/instr_time.h"
/*
 * Reachability state of a monitored node.
 *
 * Enumerator values are written out explicitly and match what implicit
 * numbering after -1 would produce.
 */
typedef enum {
	/* State has not been determined */
	NODE_STATUS_UNKNOWN = -1,
	/* Reported by try_reconnect() after a successful reconnection */
	NODE_STATUS_UP = 0,
	/* Reported by try_reconnect() once every attempt has failed */
	NODE_STATUS_DOWN = 1
} NodeStatus;
extern MonitoringState monitoring_state; extern MonitoringState monitoring_state;
extern instr_time degraded_monitoring_start; extern instr_time degraded_monitoring_start;
@@ -25,7 +19,7 @@ extern t_node_info local_node_info;
extern PGconn *local_conn; extern PGconn *local_conn;
extern bool startup_event_logged; extern bool startup_event_logged;
PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status); PGconn *try_reconnect(t_node_info *node_info);
int calculate_elapsed(instr_time start_time); int calculate_elapsed(instr_time start_time);
const char *print_monitoring_state(MonitoringState monitoring_state); const char *print_monitoring_state(MonitoringState monitoring_state);