mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-28 09:26:29 +00:00
repmgrd: various fixes for "manual" failover mode
This commit is contained in:
14
configfile.c
14
configfile.c
@@ -236,7 +236,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
|
||||
/* repmgrd settings
|
||||
* ---------------- */
|
||||
options->failover_mode = FAILOVER_MANUAL;
|
||||
options->failover = FAILOVER_MANUAL;
|
||||
options->priority = DEFAULT_PRIORITY;
|
||||
memset(options->location, 0, sizeof(options->location));
|
||||
strncpy(options->location, DEFAULT_LOCATION, MAXLEN);
|
||||
@@ -400,15 +400,15 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->replication_lag_critical = repmgr_atoi(value, name, error_list, 1);
|
||||
|
||||
/* repmgrd settings */
|
||||
else if (strcmp(name, "failover_mode") == 0)
|
||||
else if (strcmp(name, "failover") == 0)
|
||||
{
|
||||
if (strcmp(value, "manual") == 0)
|
||||
{
|
||||
options->failover_mode = FAILOVER_MANUAL;
|
||||
options->failover = FAILOVER_MANUAL;
|
||||
}
|
||||
else if (strcmp(value, "automatic") == 0)
|
||||
{
|
||||
options->failover_mode = FAILOVER_AUTOMATIC;
|
||||
options->failover = FAILOVER_AUTOMATIC;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -486,12 +486,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
_("parameter \"cluster\" is deprecated and will be ignored"));
|
||||
known_parameter = false;
|
||||
}
|
||||
else if (strcmp(name, "failover") == 0)
|
||||
{
|
||||
item_list_append(warning_list,
|
||||
_("parameter \"failover\" has been renamed to \"failover_mode\""));
|
||||
known_parameter = false;
|
||||
}
|
||||
else if (strcmp(name, "node") == 0)
|
||||
{
|
||||
item_list_append(warning_list,
|
||||
|
||||
@@ -83,7 +83,7 @@ typedef struct
|
||||
int replication_lag_critical;
|
||||
|
||||
/* repmgrd settings */
|
||||
failover_mode_opt failover_mode;
|
||||
failover_mode_opt failover;
|
||||
char location[MAXLEN];
|
||||
int priority;
|
||||
char promote_command[MAXLEN];
|
||||
|
||||
@@ -54,9 +54,6 @@ while(<$fh>) {
|
||||
if ($param eq 'node') {
|
||||
push @outp, qq|node_id=${value}|;
|
||||
}
|
||||
elsif ($param eq 'failover') {
|
||||
push @outp, qq|failover_mode=${value}|;
|
||||
}
|
||||
elsif ($param eq 'loglevel') {
|
||||
push @outp, qq|log_level=${value}|;
|
||||
}
|
||||
|
||||
@@ -37,7 +37,6 @@ Following parameters have been added:
|
||||
|
||||
Following parameters have been renamed:
|
||||
|
||||
- `failover` → `failover_mode`
|
||||
- `node` → `node_id`
|
||||
- `loglevel` → `log_level`
|
||||
- `logfacility` → `log_facility`
|
||||
|
||||
@@ -196,7 +196,7 @@ ssh_options='-q' # Options to append to "ssh"
|
||||
# These settings are only applied when repmgrd is running. Values shown
|
||||
# are defaults.
|
||||
|
||||
#failover_mode=manual # one of 'automatic', 'manual'.
|
||||
#failover=manual # one of 'automatic', 'manual'.
|
||||
# determines what action to take in the event of upstream failure
|
||||
#
|
||||
# 'automatic': repmgrd will automatically attempt to promote the
|
||||
|
||||
@@ -19,6 +19,7 @@ typedef enum {
|
||||
FAILOVER_STATE_PRIMARY_REAPPEARED,
|
||||
FAILOVER_STATE_LOCAL_NODE_FAILURE,
|
||||
FAILOVER_STATE_WAITING_NEW_PRIMARY,
|
||||
FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER,
|
||||
FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
|
||||
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
|
||||
FAILOVER_STATE_NO_NEW_PRIMARY,
|
||||
@@ -68,18 +69,20 @@ static bool do_upstream_standby_failover(void);
|
||||
#endif
|
||||
|
||||
|
||||
/* perform some sanity checks on the node's configuration */
|
||||
|
||||
void
|
||||
do_physical_node_check(void)
|
||||
{
|
||||
#ifndef BDR_ONLY
|
||||
/*
|
||||
* Check if node record is active - if not, and `failover_mode=automatic`, the node
|
||||
* Check if node record is active - if not, and `failover=automatic`, the node
|
||||
* won't be considered as a promotion candidate; this often happens when
|
||||
* a failed primary is recloned and the node was not re-registered, giving
|
||||
* the impression failover capability is there when it's not. In this case
|
||||
* abort with an error and a hint about registering.
|
||||
*
|
||||
* If `failover_mode=manual`, repmgrd can continue to passively monitor the node, but
|
||||
* If `failover=manual`, repmgrd can continue to passively monitor the node, but
|
||||
* we should nevertheless issue a warning and the same hint.
|
||||
*/
|
||||
|
||||
@@ -87,9 +90,9 @@ do_physical_node_check(void)
|
||||
{
|
||||
char *hint = "Check that 'repmgr (primary|standby) register' was executed for this node";
|
||||
|
||||
switch (config_file_options.failover_mode)
|
||||
switch (config_file_options.failover)
|
||||
{
|
||||
/* "failover_mode" is an enum, all values should be covered here */
|
||||
/* "failover" is an enum, all values should be covered here */
|
||||
|
||||
case FAILOVER_AUTOMATIC:
|
||||
log_error(_("this node is marked as inactive and cannot be used as a failover target"));
|
||||
@@ -104,7 +107,7 @@ do_physical_node_check(void)
|
||||
}
|
||||
}
|
||||
|
||||
if (config_file_options.failover_mode == FAILOVER_AUTOMATIC)
|
||||
if (config_file_options.failover == FAILOVER_AUTOMATIC)
|
||||
{
|
||||
/*
|
||||
* check that promote/follow commands are defined, otherwise repmgrd
|
||||
@@ -664,7 +667,9 @@ monitor_streaming_standby(void)
|
||||
}
|
||||
}
|
||||
|
||||
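/* TODO: "get all" - presumably all sibling node records should be fetched here, not only the active ones; confirm intent */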
|
||||
|
||||
if (config_file_options.failover == FAILOVER_AUTOMATIC)
|
||||
{
|
||||
get_active_sibling_node_records(local_conn,
|
||||
local_node_info.node_id,
|
||||
local_node_info.upstream_node_id,
|
||||
@@ -700,11 +705,12 @@ monitor_streaming_standby(void)
|
||||
cell->node_info->conn = NULL;
|
||||
}
|
||||
|
||||
if (follow_node_id != UNKNOWN_NODE_ID && config_file_options.failover_mode == FAILOVER_AUTOMATIC)
|
||||
if (follow_node_id != UNKNOWN_NODE_ID)
|
||||
{
|
||||
follow_new_primary(follow_node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@@ -730,7 +736,7 @@ monitor_streaming_standby(void)
|
||||
upstream_node_info.node_id,
|
||||
print_monitoring_state(monitoring_state));
|
||||
|
||||
if (config_file_options.failover_mode == FAILOVER_MANUAL)
|
||||
if (config_file_options.failover == FAILOVER_MANUAL)
|
||||
{
|
||||
appendPQExpBuffer(
|
||||
&monitoring_summary,
|
||||
@@ -739,7 +745,7 @@ monitor_streaming_standby(void)
|
||||
|
||||
log_info("%s", monitoring_summary.data);
|
||||
termPQExpBuffer(&monitoring_summary);
|
||||
if (monitoring_state == MS_DEGRADED)
|
||||
if (monitoring_state == MS_DEGRADED && config_file_options.failover == FAILOVER_AUTOMATIC)
|
||||
{
|
||||
log_detail(_("waiting for upstream or another primary to reappear"));
|
||||
}
|
||||
@@ -833,11 +839,10 @@ do_primary_failover(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Node is not a candidate but no other nodes are available
|
||||
*/
|
||||
|
||||
if (standby_nodes.node_count == 0)
|
||||
{
|
||||
/* Node is not a candidate but no other nodes are available */
|
||||
log_notice(_("no other nodes are available as promotion candidate"));
|
||||
log_hint(_("use \"repmgr standby promote\" to manually promote this node"));
|
||||
|
||||
@@ -885,6 +890,45 @@ do_primary_failover(void)
|
||||
&standby_nodes);
|
||||
|
||||
}
|
||||
else if (config_file_options.failover == FAILOVER_MANUAL)
|
||||
{
|
||||
/* automatic failover disabled */
|
||||
|
||||
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
PGconn *new_primary_conn;
|
||||
|
||||
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
|
||||
new_primary_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
initPQExpBuffer(&event_details);
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node %i is in manual failover mode and is now disconnected from streaming replication"),
|
||||
local_node_info.node_id);
|
||||
|
||||
new_primary_conn = establish_db_connection(new_primary.conninfo, false);
|
||||
|
||||
create_event_notification(
|
||||
new_primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"standby_disconnect_manual",
|
||||
/* here "true" indicates the action has occurred as expected */
|
||||
true,
|
||||
event_details.data);
|
||||
PQfinish(new_primary_conn);
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
}
|
||||
failover_state = FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER;
|
||||
}
|
||||
else
|
||||
{
|
||||
failover_state = follow_new_primary(new_primary_id);
|
||||
@@ -961,6 +1005,13 @@ do_primary_failover(void)
|
||||
|
||||
return false;
|
||||
|
||||
case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
|
||||
log_info(_("automatic failover disabled for this node, manual intervention required"));
|
||||
|
||||
monitoring_state = MS_DEGRADED;
|
||||
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||
return false;
|
||||
|
||||
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
||||
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
||||
/* pass control back down to start_monitoring() */
|
||||
@@ -1049,7 +1100,8 @@ do_upstream_standby_failover(void)
|
||||
* table but we should be able to generate an external notification
|
||||
* if required.
|
||||
*/
|
||||
create_event_notification(primary_conn,
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1073,7 +1125,8 @@ do_upstream_standby_failover(void)
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
|
||||
create_event_notification(NULL,
|
||||
create_event_notification(
|
||||
NULL,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1104,7 +1157,8 @@ do_upstream_standby_failover(void)
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
create_event_notification(
|
||||
primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1375,14 +1429,14 @@ follow_new_primary(int new_primary_id)
|
||||
/* Store details of the failed node here */
|
||||
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status;
|
||||
RecordStatus record_status = RECORD_NOT_FOUND;
|
||||
bool new_primary_ok = false;
|
||||
|
||||
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
||||
|
||||
if (record_status != RECORD_FOUND)
|
||||
{
|
||||
log_error(_("unable to retrieve metadata record for upstream node (ID: %i)"),
|
||||
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
|
||||
new_primary_id);
|
||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||
}
|
||||
@@ -1504,7 +1558,8 @@ follow_new_primary(int new_primary_id)
|
||||
|
||||
log_notice("%s\n", event_details.data);
|
||||
|
||||
create_event_notification(upstream_conn,
|
||||
create_event_notification(
|
||||
upstream_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1610,9 +1665,9 @@ do_election(void)
|
||||
upstream_node_info.node_id,
|
||||
&standby_nodes);
|
||||
|
||||
if (config_file_options.failover_mode == FAILOVER_MANUAL)
|
||||
if (config_file_options.failover == FAILOVER_MANUAL)
|
||||
{
|
||||
log_notice(_("this node is not configured for automatic failure so will not be considered as promotion candidate"));
|
||||
log_notice(_("this node is not configured for automatic failover so will not be considered as promotion candidate"));
|
||||
|
||||
return ELECTION_NOT_CANDIDATE;
|
||||
}
|
||||
@@ -1746,9 +1801,7 @@ do_election(void)
|
||||
/* get our lsn */
|
||||
local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);
|
||||
|
||||
log_debug("last receive lsn = %X/%X",
|
||||
(uint32) (local_node_info.last_wal_receive_lsn >> 32),
|
||||
(uint32) local_node_info.last_wal_receive_lsn);
|
||||
log_debug("last receive lsn = %X/%X", format_lsn(local_node_info.last_wal_receive_lsn));
|
||||
|
||||
/* request vote from each node */
|
||||
|
||||
|
||||
Reference in New Issue
Block a user