repmgrd: various fixes for "manual" failover mode

This commit is contained in:
Ian Barwick
2017-08-23 10:56:55 +09:00
parent ff07763242
commit 6259463007
6 changed files with 129 additions and 86 deletions

View File

@@ -236,7 +236,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
/* repmgrd settings
* ---------------- */
options->failover_mode = FAILOVER_MANUAL;
options->failover = FAILOVER_MANUAL;
options->priority = DEFAULT_PRIORITY;
memset(options->location, 0, sizeof(options->location));
strncpy(options->location, DEFAULT_LOCATION, MAXLEN);
@@ -400,15 +400,15 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->replication_lag_critical = repmgr_atoi(value, name, error_list, 1);
/* repmgrd settings */
else if (strcmp(name, "failover_mode") == 0)
else if (strcmp(name, "failover") == 0)
{
if (strcmp(value, "manual") == 0)
{
options->failover_mode = FAILOVER_MANUAL;
options->failover = FAILOVER_MANUAL;
}
else if (strcmp(value, "automatic") == 0)
{
options->failover_mode = FAILOVER_AUTOMATIC;
options->failover = FAILOVER_AUTOMATIC;
}
else
{
@@ -486,12 +486,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
_("parameter \"cluster\" is deprecated and will be ignored"));
known_parameter = false;
}
else if (strcmp(name, "failover") == 0)
{
item_list_append(warning_list,
_("parameter \"failover\" has been renamed to \"failover_mode\""));
known_parameter = false;
}
else if (strcmp(name, "node") == 0)
{
item_list_append(warning_list,

View File

@@ -83,7 +83,7 @@ typedef struct
int replication_lag_critical;
/* repmgrd settings */
failover_mode_opt failover_mode;
failover_mode_opt failover;
char location[MAXLEN];
int priority;
char promote_command[MAXLEN];

View File

@@ -54,9 +54,6 @@ while(<$fh>) {
if ($param eq 'node') {
push @outp, qq|node_id=${value}|;
}
elsif ($param eq 'failover') {
push @outp, qq|failover_mode=${value}|;
}
elsif ($param eq 'loglevel') {
push @outp, qq|log_level=${value}|;
}

View File

@@ -37,7 +37,6 @@ Following parameters have been added:
Following parameters have been renamed:
- `failover` → `failover_mode`
- `node` → `node_id`
- `loglevel` → `log_level`
- `logfacility` → `log_facility`

View File

@@ -196,7 +196,7 @@ ssh_options='-q' # Options to append to "ssh"
# These settings are only applied when repmgrd is running. Values shown
# are defaults.
#failover_mode=manual # one of 'automatic', 'manual'.
#failover=manual # one of 'automatic', 'manual'.
# determines what action to take in the event of upstream failure
#
# 'automatic': repmgrd will automatically attempt to promote the

View File

@@ -19,8 +19,9 @@ typedef enum {
FAILOVER_STATE_PRIMARY_REAPPEARED,
FAILOVER_STATE_LOCAL_NODE_FAILURE,
FAILOVER_STATE_WAITING_NEW_PRIMARY,
FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER,
FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
FAILOVER_STATE_NO_NEW_PRIMARY,
FAILOVER_STATE_FOLLOW_FAIL,
FAILOVER_STATE_NODE_NOTIFICATION_ERROR
@@ -68,18 +69,20 @@ static bool do_upstream_standby_failover(void);
#endif
/* perform some sanity checks on the node's configuration */
void
do_physical_node_check(void)
{
#ifndef BDR_ONLY
/*
* Check if node record is active - if not, and `failover_mode=automatic`, the node
* Check if node record is active - if not, and `failover=automatic`, the node
* won't be considered as a promotion candidate; this often happens when
* a failed primary is recloned and the node was not re-registered, giving
* the impression failover capability is there when it's not. In this case
* abort with an error and a hint about registering.
*
* If `failover_mode=manual`, repmgrd can continue to passively monitor the node, but
* If `failover=manual`, repmgrd can continue to passively monitor the node, but
* we should nevertheless issue a warning and the same hint.
*/
@@ -87,9 +90,9 @@ do_physical_node_check(void)
{
char *hint = "Check that 'repmgr (primary|standby) register' was executed for this node";
switch (config_file_options.failover_mode)
switch (config_file_options.failover)
{
/* "failover_mode" is an enum, all values should be covered here */
/* "failover" is an enum, all values should be covered here */
case FAILOVER_AUTOMATIC:
log_error(_("this node is marked as inactive and cannot be used as a failover target"));
@@ -104,7 +107,7 @@ do_physical_node_check(void)
}
}
if (config_file_options.failover_mode == FAILOVER_AUTOMATIC)
if (config_file_options.failover == FAILOVER_AUTOMATIC)
{
/*
* check that promote/follow commands are defined, otherwise repmgrd
@@ -664,45 +667,48 @@ monitor_streaming_standby(void)
}
}
// get all!
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
local_node_info.upstream_node_id,
&standby_nodes);
if (standby_nodes.node_count > 0)
if (config_file_options.failover == FAILOVER_AUTOMATIC)
{
log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
for (cell = standby_nodes.head; cell; cell = cell->next)
get_active_sibling_node_records(local_conn,
local_node_info.node_id,
local_node_info.upstream_node_id,
&standby_nodes);
if (standby_nodes.node_count > 0)
{
/* skip local node check, we did that above */
if (cell->node_info->node_id == local_node_info.node_id)
log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
for (cell = standby_nodes.head; cell; cell = cell->next)
{
continue;
}
/* skip local node check, we did that above */
if (cell->node_info->node_id == local_node_info.node_id)
{
continue;
}
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
log_debug("unable to connect to %i ... ", cell->node_info->node_id);
continue;
}
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
{
log_debug("unable to connect to %i ... ", cell->node_info->node_id);
continue;
}
if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
{
follow_node_id = cell->node_info->node_id;
if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
{
follow_node_id = cell->node_info->node_id;
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
break;
}
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
break;
}
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
}
if (follow_node_id != UNKNOWN_NODE_ID && config_file_options.failover_mode == FAILOVER_AUTOMATIC)
{
follow_new_primary(follow_node_id);
if (follow_node_id != UNKNOWN_NODE_ID)
{
follow_new_primary(follow_node_id);
}
}
}
@@ -730,7 +736,7 @@ monitor_streaming_standby(void)
upstream_node_info.node_id,
print_monitoring_state(monitoring_state));
if (config_file_options.failover_mode == FAILOVER_MANUAL)
if (config_file_options.failover == FAILOVER_MANUAL)
{
appendPQExpBuffer(
&monitoring_summary,
@@ -739,7 +745,7 @@ monitor_streaming_standby(void)
log_info("%s", monitoring_summary.data);
termPQExpBuffer(&monitoring_summary);
if (monitoring_state == MS_DEGRADED)
if (monitoring_state == MS_DEGRADED && config_file_options.failover == FAILOVER_AUTOMATIC)
{
log_detail(_("waiting for upstream or another primary to reappear"));
}
@@ -833,11 +839,10 @@ do_primary_failover(void)
}
else
{
/*
* Node is not a candidate but no other nodes are available
*/
if (standby_nodes.node_count == 0)
{
/* Node is not a candidate but no other nodes are available */
log_notice(_("no other nodes are available as promotion candidate"));
log_hint(_("use \"repmgr standby promote\" to manually promote this node"));
@@ -885,6 +890,45 @@ do_primary_failover(void)
&standby_nodes);
}
else if (config_file_options.failover == FAILOVER_MANUAL)
{
/* automatic failover disabled */
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND;
PGconn *new_primary_conn;
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
new_primary_id);
}
else
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("node %i is in manual failover mode and is now disconnected from streaming replication"),
local_node_info.node_id);
new_primary_conn = establish_db_connection(new_primary.conninfo, false);
create_event_notification(
new_primary_conn,
&config_file_options,
local_node_info.node_id,
"standby_disconnect_manual",
/* here "true" indicates the action has occurred as expected */
true,
event_details.data);
PQfinish(new_primary_conn);
termPQExpBuffer(&event_details);
}
failover_state = FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER;
}
else
{
failover_state = follow_new_primary(new_primary_id);
@@ -961,6 +1005,13 @@ do_primary_failover(void)
return false;
case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
log_info(_("automatic failover disabled for this node, manual intervention required"));
monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
return false;
case FAILOVER_STATE_NO_NEW_PRIMARY:
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
/* pass control back down to start_monitoring() */
@@ -1049,12 +1100,13 @@ do_upstream_standby_failover(void)
* table but we should be able to generate an external notification
* if required.
*/
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
create_event_notification(
primary_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
termPQExpBuffer(&event_details);
}
@@ -1073,12 +1125,13 @@ do_upstream_standby_failover(void)
log_error("%s", event_details.data);
create_event_notification(NULL,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
create_event_notification(
NULL,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
termPQExpBuffer(&event_details);
@@ -1104,12 +1157,13 @@ do_upstream_standby_failover(void)
log_notice("%s", event_details.data);
create_event_notification(primary_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
create_event_notification(
primary_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
termPQExpBuffer(&event_details);
@@ -1375,14 +1429,14 @@ follow_new_primary(int new_primary_id)
/* Store details of the failed node here */
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
RecordStatus record_status;
RecordStatus record_status = RECORD_NOT_FOUND;
bool new_primary_ok = false;
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
if (record_status != RECORD_FOUND)
{
log_error(_("unable to retrieve metadata record for upstream node (ID: %i)"),
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
new_primary_id);
return FAILOVER_STATE_FOLLOW_FAIL;
}
@@ -1504,12 +1558,13 @@ follow_new_primary(int new_primary_id)
log_notice("%s\n", event_details.data);
create_event_notification(upstream_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
create_event_notification(
upstream_conn,
&config_file_options,
local_node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
termPQExpBuffer(&event_details);
@@ -1610,9 +1665,9 @@ do_election(void)
upstream_node_info.node_id,
&standby_nodes);
if (config_file_options.failover_mode == FAILOVER_MANUAL)
if (config_file_options.failover == FAILOVER_MANUAL)
{
log_notice(_("this node is not configured for automatic failure so will not be considered as promotion candidate"));
log_notice(_("this node is not configured for automatic failover so will not be considered as promotion candidate"));
return ELECTION_NOT_CANDIDATE;
}
@@ -1746,9 +1801,7 @@ do_election(void)
/* get our lsn */
local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);
log_debug("last receive lsn = %X/%X",
(uint32) (local_node_info.last_wal_receive_lsn >> 32),
(uint32) local_node_info.last_wal_receive_lsn);
log_debug("last receive lsn = %X/%X", format_lsn(local_node_info.last_wal_receive_lsn));
/* request vote from each node */