mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Store node status in node record struct
This commit is contained in:
38
dbutils.h
38
dbutils.h
@@ -46,6 +46,12 @@ typedef enum {
|
|||||||
MS_DEGRADED = 1
|
MS_DEGRADED = 1
|
||||||
} MonitoringState;
|
} MonitoringState;
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
NODE_STATUS_UNKNOWN = -1,
|
||||||
|
NODE_STATUS_UP,
|
||||||
|
NODE_STATUS_DOWN
|
||||||
|
} NodeStatus;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Struct to store node information
|
* Struct to store node information
|
||||||
*/
|
*/
|
||||||
@@ -65,27 +71,29 @@ typedef struct s_node_info
|
|||||||
bool is_ready;
|
bool is_ready;
|
||||||
bool is_visible;
|
bool is_visible;
|
||||||
XLogRecPtr last_wal_receive_lsn;
|
XLogRecPtr last_wal_receive_lsn;
|
||||||
|
NodeStatus node_status;
|
||||||
MonitoringState monitoring_state;
|
MonitoringState monitoring_state;
|
||||||
PGconn *conn;
|
PGconn *conn;
|
||||||
} t_node_info;
|
} t_node_info;
|
||||||
|
|
||||||
|
|
||||||
#define T_NODE_INFO_INITIALIZER { \
|
#define T_NODE_INFO_INITIALIZER { \
|
||||||
NODE_NOT_FOUND, \
|
NODE_NOT_FOUND, \
|
||||||
NO_UPSTREAM_NODE, \
|
NO_UPSTREAM_NODE, \
|
||||||
UNKNOWN, \
|
UNKNOWN, \
|
||||||
"", \
|
"", \
|
||||||
"", \
|
"", \
|
||||||
"", \
|
"", \
|
||||||
DEFAULT_LOCATION, \
|
DEFAULT_LOCATION, \
|
||||||
DEFAULT_PRIORITY, \
|
DEFAULT_PRIORITY, \
|
||||||
true, \
|
true, \
|
||||||
"", \
|
"", \
|
||||||
false, \
|
false, \
|
||||||
false, \
|
false, \
|
||||||
InvalidXLogRecPtr, \
|
InvalidXLogRecPtr, \
|
||||||
MS_NORMAL, \
|
NODE_STATUS_UNKNOWN, \
|
||||||
NULL \
|
MS_NORMAL, \
|
||||||
|
NULL \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -168,6 +168,11 @@ monitor_bdr(void)
|
|||||||
{
|
{
|
||||||
if (is_server_available(cell->node_info->conninfo) == false)
|
if (is_server_available(cell->node_info->conninfo) == false)
|
||||||
{
|
{
|
||||||
|
instr_time upstream_node_unreachable_start;
|
||||||
|
|
||||||
|
INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);
|
||||||
|
|
||||||
|
|
||||||
// XXX improve
|
// XXX improve
|
||||||
log_warning("connection problem! to node %i", cell->node_info->node_id);
|
log_warning("connection problem! to node %i", cell->node_info->node_id);
|
||||||
do_bdr_failover(&nodes, cell->node_info);
|
do_bdr_failover(&nodes, cell->node_info);
|
||||||
@@ -235,7 +240,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
PGconn *next_node_conn = NULL;
|
PGconn *next_node_conn = NULL;
|
||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
RecordStatus record_status;
|
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
||||||
|
|
||||||
@@ -258,10 +262,14 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
if (PQstatus(next_node_conn) == CONNECTION_OK)
|
if (PQstatus(next_node_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
// XXX check if record returned
|
RecordStatus record_status = get_node_record(next_node_conn,
|
||||||
record_status = get_node_record(next_node_conn, cell->node_info->node_id, &target_node);
|
cell->node_info->node_id,
|
||||||
|
&target_node);
|
||||||
|
|
||||||
break;
|
if (record_status == RECORD_FOUND)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
next_node_conn = NULL;
|
next_node_conn = NULL;
|
||||||
@@ -286,6 +294,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
log_debug("this node is the failover handler");
|
log_debug("this node is the failover handler");
|
||||||
|
|
||||||
// check here that the node hasn't come back up...
|
// check here that the node hasn't come back up...
|
||||||
|
|
||||||
log_info(_("connecting to target node %s"), target_node.node_name);
|
log_info(_("connecting to target node %s"), target_node.node_name);
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|||||||
@@ -135,15 +135,18 @@ do_physical_node_check(void)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* repmgrd running on the primary server
|
||||||
|
*/
|
||||||
void
|
void
|
||||||
monitor_streaming_primary(void)
|
monitor_streaming_primary(void)
|
||||||
{
|
{
|
||||||
#ifndef BDR_ONLY
|
#ifndef BDR_ONLY
|
||||||
NodeStatus node_status = NODE_STATUS_UP;
|
|
||||||
instr_time log_status_interval_start;
|
instr_time log_status_interval_start;
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
|
local_node_info.node_status = NODE_STATUS_UP;
|
||||||
|
|
||||||
reset_node_voting_status();
|
reset_node_voting_status();
|
||||||
|
|
||||||
/* Log startup event */
|
/* Log startup event */
|
||||||
@@ -181,7 +184,7 @@ monitor_streaming_primary(void)
|
|||||||
{
|
{
|
||||||
|
|
||||||
/* node is down, we were expecting it to be up */
|
/* node is down, we were expecting it to be up */
|
||||||
if (node_status == NODE_STATUS_UP)
|
if (local_node_info.node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
instr_time local_node_unreachable_start;
|
instr_time local_node_unreachable_start;
|
||||||
@@ -195,7 +198,7 @@ monitor_streaming_primary(void)
|
|||||||
|
|
||||||
log_warning("%s", event_details.data);
|
log_warning("%s", event_details.data);
|
||||||
|
|
||||||
node_status = NODE_STATUS_UNKNOWN;
|
local_node_info.node_status = NODE_STATUS_UNKNOWN;
|
||||||
|
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
|
|
||||||
@@ -214,9 +217,9 @@ monitor_streaming_primary(void)
|
|||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
local_conn = try_reconnect(local_node_info.conninfo, &node_status);
|
local_conn = try_reconnect(&local_node_info);
|
||||||
|
|
||||||
if (node_status == NODE_STATUS_UP)
|
if (local_node_info.node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
|
int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);
|
||||||
|
|
||||||
@@ -279,7 +282,7 @@ monitor_streaming_primary(void)
|
|||||||
|
|
||||||
if (PQstatus(local_conn) == CONNECTION_OK)
|
if (PQstatus(local_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
node_status = NODE_STATUS_UP;
|
local_node_info.node_status = NODE_STATUS_UP;
|
||||||
monitoring_state = MS_NORMAL;
|
monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
@@ -343,10 +346,10 @@ monitor_streaming_standby(void)
|
|||||||
{
|
{
|
||||||
#ifndef BDR_ONLY
|
#ifndef BDR_ONLY
|
||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
NodeStatus upstream_node_status = NODE_STATUS_UP;
|
|
||||||
instr_time log_status_interval_start;
|
instr_time log_status_interval_start;
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
|
upstream_node_info.node_status = NODE_STATUS_UP;
|
||||||
reset_node_voting_status();
|
reset_node_voting_status();
|
||||||
|
|
||||||
log_debug("monitor_streaming_standby()");
|
log_debug("monitor_streaming_standby()");
|
||||||
@@ -470,7 +473,7 @@ monitor_streaming_standby(void)
|
|||||||
{
|
{
|
||||||
|
|
||||||
/* upstream node is down, we were expecting it to be up */
|
/* upstream node is down, we were expecting it to be up */
|
||||||
if (upstream_node_status == NODE_STATUS_UP)
|
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
instr_time upstream_node_unreachable_start;
|
instr_time upstream_node_unreachable_start;
|
||||||
|
|
||||||
@@ -478,7 +481,7 @@ monitor_streaming_standby(void)
|
|||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
upstream_node_status = NODE_STATUS_UNKNOWN;
|
upstream_node_info.node_status = NODE_STATUS_UNKNOWN;
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("unable to connect to upstream node \"%s\" (node ID: %i)"),
|
_("unable to connect to upstream node \"%s\" (node ID: %i)"),
|
||||||
@@ -499,9 +502,9 @@ monitor_streaming_standby(void)
|
|||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
PQfinish(upstream_conn);
|
PQfinish(upstream_conn);
|
||||||
upstream_conn = try_reconnect(upstream_node_info.conninfo, &upstream_node_status);
|
upstream_conn = try_reconnect(&upstream_node_info);
|
||||||
|
|
||||||
if (upstream_node_status == NODE_STATUS_UP)
|
if (upstream_node_info.node_status == NODE_STATUS_UP)
|
||||||
{
|
{
|
||||||
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
|
int upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);
|
||||||
|
|
||||||
@@ -524,7 +527,7 @@ monitor_streaming_standby(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* still down after reconnect attempt(s) */
|
/* still down after reconnect attempt(s) */
|
||||||
if (upstream_node_status == NODE_STATUS_DOWN)
|
if (upstream_node_info.node_status == NODE_STATUS_DOWN)
|
||||||
{
|
{
|
||||||
bool failover_done = false;
|
bool failover_done = false;
|
||||||
|
|
||||||
@@ -564,7 +567,7 @@ monitor_streaming_standby(void)
|
|||||||
// and upstream is now former primary
|
// and upstream is now former primary
|
||||||
// XXX scan other nodes to see if any has become primary
|
// XXX scan other nodes to see if any has become primary
|
||||||
|
|
||||||
upstream_node_status = NODE_STATUS_UP;
|
upstream_node_info.node_status = NODE_STATUS_UP;
|
||||||
monitoring_state = MS_NORMAL;
|
monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
if (upstream_node_info.type == PRIMARY)
|
if (upstream_node_info.type == PRIMARY)
|
||||||
|
|||||||
14
repmgrd.c
14
repmgrd.c
@@ -63,9 +63,6 @@ static void handle_sighup(SIGNAL_ARGS);
|
|||||||
static void handle_sigint(SIGNAL_ARGS);
|
static void handle_sigint(SIGNAL_ARGS);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
|
|
||||||
|
|
||||||
int calculate_elapsed(instr_time start_time);
|
int calculate_elapsed(instr_time start_time);
|
||||||
void update_registration(PGconn *conn);
|
void update_registration(PGconn *conn);
|
||||||
void terminate(int retval);
|
void terminate(int retval);
|
||||||
@@ -612,7 +609,7 @@ show_help(void)
|
|||||||
|
|
||||||
|
|
||||||
PGconn *
|
PGconn *
|
||||||
try_reconnect(const char *conninfo, NodeStatus *node_status)
|
try_reconnect(t_node_info *node_info)
|
||||||
{
|
{
|
||||||
PGconn *conn;
|
PGconn *conn;
|
||||||
|
|
||||||
@@ -623,7 +620,7 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
|
|||||||
for (i = 0; i < max_attempts; i++)
|
for (i = 0; i < max_attempts; i++)
|
||||||
{
|
{
|
||||||
log_info(_("checking state of node, %i of %i attempts"), i, max_attempts);
|
log_info(_("checking state of node, %i of %i attempts"), i, max_attempts);
|
||||||
if (is_server_available(conninfo) == true)
|
if (is_server_available(node_info->conninfo) == true)
|
||||||
{
|
{
|
||||||
log_notice(_("node has recovered, reconnecting"));
|
log_notice(_("node has recovered, reconnecting"));
|
||||||
|
|
||||||
@@ -633,10 +630,10 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
|
|||||||
* - fall back to degraded monitoring?
|
* - fall back to degraded monitoring?
|
||||||
* - make that configurable
|
* - make that configurable
|
||||||
*/
|
*/
|
||||||
conn = establish_db_connection(conninfo, false);
|
conn = establish_db_connection(node_info->conninfo, false);
|
||||||
if (PQstatus(conn) == CONNECTION_OK)
|
if (PQstatus(conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
*node_status = NODE_STATUS_UP;
|
node_info->node_status = NODE_STATUS_UP;
|
||||||
return conn;
|
return conn;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -650,7 +647,8 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
|
|||||||
|
|
||||||
|
|
||||||
log_warning(_("unable to reconnect to node after %i attempts"), max_attempts);
|
log_warning(_("unable to reconnect to node after %i attempts"), max_attempts);
|
||||||
*node_status = NODE_STATUS_DOWN;
|
node_info->node_status = NODE_STATUS_DOWN;
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -10,12 +10,6 @@
|
|||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include "portability/instr_time.h"
|
#include "portability/instr_time.h"
|
||||||
|
|
||||||
typedef enum {
|
|
||||||
NODE_STATUS_UNKNOWN = -1,
|
|
||||||
NODE_STATUS_UP,
|
|
||||||
NODE_STATUS_DOWN
|
|
||||||
} NodeStatus;
|
|
||||||
|
|
||||||
|
|
||||||
extern MonitoringState monitoring_state;
|
extern MonitoringState monitoring_state;
|
||||||
extern instr_time degraded_monitoring_start;
|
extern instr_time degraded_monitoring_start;
|
||||||
@@ -25,7 +19,7 @@ extern t_node_info local_node_info;
|
|||||||
extern PGconn *local_conn;
|
extern PGconn *local_conn;
|
||||||
extern bool startup_event_logged;
|
extern bool startup_event_logged;
|
||||||
|
|
||||||
PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
|
PGconn *try_reconnect(t_node_info *node_info);
|
||||||
|
|
||||||
int calculate_elapsed(instr_time start_time);
|
int calculate_elapsed(instr_time start_time);
|
||||||
const char *print_monitoring_state(MonitoringState monitoring_state);
|
const char *print_monitoring_state(MonitoringState monitoring_state);
|
||||||
|
|||||||
Reference in New Issue
Block a user