mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
Handle node recovery
This commit is contained in:
11
config.c
11
config.c
@@ -248,6 +248,11 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->monitoring_history = false; /* new in 4.0, replaces --monitoring-history */
|
options->monitoring_history = false; /* new in 4.0, replaces --monitoring-history */
|
||||||
options->degraded_monitoring_timeout = -1;
|
options->degraded_monitoring_timeout = -1;
|
||||||
|
|
||||||
|
/* BDR settings
|
||||||
|
* ------------ */
|
||||||
|
options->bdr_local_monitoring_only = false;
|
||||||
|
options->bdr_active_node_recovery = false;
|
||||||
|
|
||||||
/* service settings
|
/* service settings
|
||||||
* ---------------- */
|
* ---------------- */
|
||||||
memset(options->pg_ctl_options, 0, sizeof(options->pg_ctl_options));
|
memset(options->pg_ctl_options, 0, sizeof(options->pg_ctl_options));
|
||||||
@@ -415,6 +420,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
else if (strcmp(name, "degraded_monitoring_timeout") == 0)
|
else if (strcmp(name, "degraded_monitoring_timeout") == 0)
|
||||||
options->degraded_monitoring_timeout = repmgr_atoi(value, name, error_list, 1);
|
options->degraded_monitoring_timeout = repmgr_atoi(value, name, error_list, 1);
|
||||||
|
|
||||||
|
/* BDR settings */
|
||||||
|
else if (strcmp(name, "bdr_local_monitoring_only") == 0)
|
||||||
|
options->bdr_local_monitoring_only = parse_bool(value, name, error_list);
|
||||||
|
else if (strcmp(name, "bdr_active_node_recovery") == 0)
|
||||||
|
options->bdr_active_node_recovery = parse_bool(value, name, error_list);
|
||||||
|
|
||||||
/* service settings */
|
/* service settings */
|
||||||
else if (strcmp(name, "pg_ctl_options") == 0)
|
else if (strcmp(name, "pg_ctl_options") == 0)
|
||||||
strncpy(options->pg_ctl_options, value, MAXLEN);
|
strncpy(options->pg_ctl_options, value, MAXLEN);
|
||||||
|
|||||||
6
config.h
6
config.h
@@ -85,6 +85,10 @@ typedef struct
|
|||||||
bool monitoring_history;
|
bool monitoring_history;
|
||||||
int degraded_monitoring_timeout;
|
int degraded_monitoring_timeout;
|
||||||
|
|
||||||
|
/* BDR settings */
|
||||||
|
bool bdr_local_monitoring_only;
|
||||||
|
bool bdr_active_node_recovery;
|
||||||
|
|
||||||
/* service settings */
|
/* service settings */
|
||||||
char pg_ctl_options[MAXLEN];
|
char pg_ctl_options[MAXLEN];
|
||||||
char service_stop_command[MAXLEN];
|
char service_stop_command[MAXLEN];
|
||||||
@@ -125,6 +129,8 @@ typedef struct
|
|||||||
DEFAULT_RECONNECTION_ATTEMPTS, \
|
DEFAULT_RECONNECTION_ATTEMPTS, \
|
||||||
DEFAULT_RECONNECTION_INTERVAL, \
|
DEFAULT_RECONNECTION_INTERVAL, \
|
||||||
300, false, -1, \
|
300, false, -1, \
|
||||||
|
/* BDR settings */ \
|
||||||
|
false, false, \
|
||||||
/* service settings */ \
|
/* service settings */ \
|
||||||
"", "", "", "", "", "", \
|
"", "", "", "", "", "", \
|
||||||
/* event notification settings */ \
|
/* event notification settings */ \
|
||||||
|
|||||||
127
repmgrd-bdr.c
127
repmgrd-bdr.c
@@ -15,6 +15,7 @@
|
|||||||
static volatile sig_atomic_t got_SIGHUP = false;
|
static volatile sig_atomic_t got_SIGHUP = false;
|
||||||
|
|
||||||
static void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);
|
static void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);
|
||||||
|
static void do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node);
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
@@ -126,7 +127,7 @@ monitor_bdr(void)
|
|||||||
NULL);
|
NULL);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* retrieve list of all nodes - we'll need these if the DB connection goes away,
|
* retrieve list of all nodes - we'll need these if the DB connection goes away
|
||||||
*/
|
*/
|
||||||
get_all_node_records(local_conn, &nodes);
|
get_all_node_records(local_conn, &nodes);
|
||||||
|
|
||||||
@@ -142,6 +143,12 @@ monitor_bdr(void)
|
|||||||
|
|
||||||
for (cell = nodes.head; cell; cell = cell->next)
|
for (cell = nodes.head; cell; cell = cell->next)
|
||||||
{
|
{
|
||||||
|
if (config_file_options.bdr_local_monitoring_only == true
|
||||||
|
&& cell->node_info->node_id != local_node_info.node_id)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (cell->node_info->node_id == local_node_info.node_id)
|
if (cell->node_info->node_id == local_node_info.node_id)
|
||||||
{
|
{
|
||||||
log_debug("checking local node %i in %s state",
|
log_debug("checking local node %i in %s state",
|
||||||
@@ -174,7 +181,7 @@ monitor_bdr(void)
|
|||||||
if (is_server_available(cell->node_info->conninfo) == true)
|
if (is_server_available(cell->node_info->conninfo) == true)
|
||||||
{
|
{
|
||||||
log_notice(_("monitored node %i has recovered"), cell->node_info->node_id);
|
log_notice(_("monitored node %i has recovered"), cell->node_info->node_id);
|
||||||
// do_bdr_recovery()
|
do_bdr_recovery(&nodes, cell->node_info);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -228,14 +235,11 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
{
|
{
|
||||||
PGconn *next_node_conn = NULL;
|
PGconn *next_node_conn = NULL;
|
||||||
NodeInfoListCell *cell;
|
NodeInfoListCell *cell;
|
||||||
// bool failover_success = false;
|
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
t_node_info target_node = T_NODE_INFO_INITIALIZER;
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
monitored_node->monitoring_state = MS_DEGRADED;
|
monitored_node->monitoring_state = MS_DEGRADED;
|
||||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||||
|
|
||||||
@@ -267,31 +271,25 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
/* shouldn't happen, and if it does, it means everything is down */
|
/* shouldn't happen, and if it does, it means everything is down */
|
||||||
if (next_node_conn == NULL)
|
if (next_node_conn == NULL)
|
||||||
{
|
{
|
||||||
appendPQExpBuffer(&event_details,
|
log_error(_("no other available node found"));
|
||||||
_("no other available node found"));
|
|
||||||
|
|
||||||
log_error("%s", event_details.data);
|
|
||||||
|
|
||||||
/* no other nodes found - continue degraded monitoring */
|
/* no other nodes found - continue degraded monitoring */
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// call: repmgr.am_bdr_failover_handler(node_id)
|
|
||||||
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
|
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
|
||||||
{
|
{
|
||||||
log_debug("XXX am not failover handler");
|
|
||||||
PQfinish(next_node_conn);
|
PQfinish(next_node_conn);
|
||||||
log_debug("other node's repmgrd is handling failover");
|
log_debug("other node's repmgrd is handling failover");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
log_debug("YYYam the failover handler");
|
log_debug("this node is the failover handler");
|
||||||
|
|
||||||
// check here that the node hasn't come back up...
|
// check here that the node hasn't come back up...
|
||||||
log_info(_("connecting to target node %s"), target_node.node_name);
|
log_info(_("connecting to target node %s"), target_node.node_name);
|
||||||
|
|
||||||
// failover_success = true;
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
event_info.conninfo_str = target_node.conninfo;
|
event_info.conninfo_str = target_node.conninfo;
|
||||||
event_info.node_name = target_node.node_name;
|
event_info.node_name = target_node.node_name;
|
||||||
@@ -322,7 +320,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
create_event_notification_extended(
|
create_event_notification_extended(
|
||||||
next_node_conn,
|
next_node_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
monitored_node->node_id,
|
||||||
"bdr_failover",
|
"bdr_failover",
|
||||||
true,
|
true,
|
||||||
event_details.data,
|
event_details.data,
|
||||||
@@ -332,6 +330,103 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
|
|||||||
|
|
||||||
unset_bdr_failover_handler(next_node_conn);
|
unset_bdr_failover_handler(next_node_conn);
|
||||||
|
|
||||||
/* local monitoring mode - there's no new node to monitor */
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||||
|
{
|
||||||
|
PGconn *recovered_node_conn;
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
t_bdr_node_info bdr_record;
|
||||||
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
|
int i;
|
||||||
|
bool node_recovered = false;
|
||||||
|
|
||||||
|
recovered_node_conn = establish_db_connection(monitored_node->conninfo, false);
|
||||||
|
|
||||||
|
if (PQstatus(recovered_node_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
PQfinish(recovered_node_conn);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (am_bdr_failover_handler(recovered_node_conn, local_node_info.node_id) == false)
|
||||||
|
{
|
||||||
|
PQfinish(recovered_node_conn);
|
||||||
|
log_debug("other node's repmgrd is handling recovery");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// bdr_recovery_timeout
|
||||||
|
for (i = 0; i < 30; i++)
|
||||||
|
{
|
||||||
|
RecordStatus record_status = get_bdr_node_record_by_name(
|
||||||
|
recovered_node_conn,
|
||||||
|
monitored_node->node_name,
|
||||||
|
&bdr_record);
|
||||||
|
|
||||||
|
if (record_status == RECORD_FOUND && bdr_record.node_status == 'r')
|
||||||
|
{
|
||||||
|
node_recovered = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(1);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node_recovered == false)
|
||||||
|
{
|
||||||
|
log_warning(_("node did not come up"));
|
||||||
|
PQfinish(recovered_node_conn);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// XXX check other node is attached to this one so we
|
||||||
|
// don't end up monitoring a parted node
|
||||||
|
|
||||||
|
|
||||||
|
// note elapsed
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("node '%s' (ID: %i) has recovered"),
|
||||||
|
monitored_node->node_name,
|
||||||
|
monitored_node->node_id);
|
||||||
|
|
||||||
|
monitored_node->monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
|
if (config_file_options.bdr_active_node_recovery == true)
|
||||||
|
{
|
||||||
|
event_info.conninfo_str = monitored_node->conninfo;
|
||||||
|
event_info.node_name = monitored_node->node_name;
|
||||||
|
|
||||||
|
create_event_notification_extended(
|
||||||
|
recovered_node_conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"bdr_recovery",
|
||||||
|
true,
|
||||||
|
event_details.data,
|
||||||
|
&event_info);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
create_event_record(
|
||||||
|
recovered_node_conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"bdr_recovery",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
unset_bdr_failover_handler(recovered_node_conn);
|
||||||
|
|
||||||
|
PQfinish(recovered_node_conn);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user