mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgrd: enable election rerun
If "failover_validation_command" is set, and the command returns an error, rerun the election. There is a pause between reruns to avoid "churn"; the length of this pause is controlled by the configuration parameter "election_rerun_interval".
This commit is contained in:
@@ -363,6 +363,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->connection_check_type = CHECK_PING;
|
||||
options->primary_visibility_consensus = false;
|
||||
memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
|
||||
options->election_rerun_interval = DEFAULT_ELECTION_RERUN_INTERVAL;
|
||||
|
||||
/*-------------
|
||||
* witness settings
|
||||
@@ -647,6 +648,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
||||
options->primary_visibility_consensus = parse_bool(value, name, error_list);
|
||||
else if (strcmp(name, "failover_validation_command") == 0)
|
||||
strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
|
||||
else if (strcmp(name, "election_rerun_interval") == 0)
|
||||
options->election_rerun_interval = repmgr_atoi(value, name, error_list, 0);
|
||||
|
||||
/* witness settings */
|
||||
else if (strcmp(name, "witness_sync_interval") == 0)
|
||||
|
||||
@@ -146,6 +146,7 @@ typedef struct
|
||||
ConnectionCheckType connection_check_type;
|
||||
bool primary_visibility_consensus;
|
||||
char failover_validation_command[MAXPGPATH];
|
||||
int election_rerun_interval;
|
||||
|
||||
/* BDR settings */
|
||||
bool bdr_local_monitoring_only;
|
||||
@@ -217,7 +218,8 @@ typedef struct
|
||||
false, -1, \
|
||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, CHECK_PING, true, "", \
|
||||
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, \
|
||||
CHECK_PING, true, "", DEFAULT_ELECTION_RERUN_INTERVAL, \
|
||||
/* BDR settings */ \
|
||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||
/* service settings */ \
|
||||
|
||||
@@ -261,7 +261,7 @@
|
||||
</note>
|
||||
<para>
|
||||
One or both of the following parameter placeholders
|
||||
should be provided, which will be replaced by repmgrd with the appropriate
|
||||
should be provided, which will be replaced by repmgrd with the appropriate
|
||||
value:
|
||||
<itemizedlist spacing="compact" mark="bullet">
|
||||
<listitem>
|
||||
@@ -275,6 +275,19 @@
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<indexterm>
|
||||
<primary>election_rerun_interval</primary>
|
||||
</indexterm>
|
||||
<term><option>election_rerun_interval</option></term>
|
||||
<listitem>
|
||||
<para>
|
||||
If <option>failover_validation_command</option> is set, and the command returns
|
||||
an error, pause for the specified number of seconds (default: 15) before rerunning the election.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
||||
<para>
|
||||
|
||||
@@ -332,7 +332,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
||||
#failover_validation_command= # Script to execute for an external mechanism to validate the failover
|
||||
# decision made by repmgrd. One or both of the following parameter placeholders
|
||||
# should be provided, which will be replaced by repmgrd with the appropriate
|
||||
# value: %n (node_id), %a (node_name)
|
||||
# value: %n (node_id), %a (node_name). *Must* be the same on all nodes.
|
||||
#election_rerun_interval=15 # if "failover_validation_command" is set, and the command returns
|
||||
# an error, pause for the specified number of seconds before rerunning the election.
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# service control commands
|
||||
|
||||
2
repmgr.h
2
repmgr.h
@@ -60,6 +60,7 @@
|
||||
#define NO_UPSTREAM_NODE -1
|
||||
#define UNKNOWN_NODE_ID -1
|
||||
#define MIN_NODE_ID 1
|
||||
#define ELECTION_RERUN_NOTIFICATION -2
|
||||
#define VOTING_TERM_NOT_SET -1
|
||||
#define ARCHIVE_STATUS_DIR_ERROR -1
|
||||
|
||||
@@ -92,6 +93,7 @@
|
||||
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
||||
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
|
||||
#define DEFAULT_ELECTION_RERUN_INTERVAL 15 /* seconds */
|
||||
|
||||
#define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */
|
||||
|
||||
|
||||
@@ -37,7 +37,8 @@ typedef enum
|
||||
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
|
||||
FAILOVER_STATE_NO_NEW_PRIMARY,
|
||||
FAILOVER_STATE_FOLLOW_FAIL,
|
||||
FAILOVER_STATE_NODE_NOTIFICATION_ERROR
|
||||
FAILOVER_STATE_NODE_NOTIFICATION_ERROR,
|
||||
FAILOVER_STATE_ELECTION_RERUN
|
||||
} FailoverState;
|
||||
|
||||
|
||||
@@ -46,7 +47,8 @@ typedef enum
|
||||
ELECTION_NOT_CANDIDATE = -1,
|
||||
ELECTION_WON,
|
||||
ELECTION_LOST,
|
||||
ELECTION_CANCELLED
|
||||
ELECTION_CANCELLED,
|
||||
ELECTION_RERUN
|
||||
} ElectionResult;
|
||||
|
||||
|
||||
@@ -2086,6 +2088,14 @@ do_primary_failover(void)
|
||||
log_notice(_("election cancelled"));
|
||||
return false;
|
||||
}
|
||||
else if (election_result == ELECTION_RERUN)
|
||||
{
|
||||
log_notice(_("election rerun"));
|
||||
/* notify siblings that they should rerun the election too */
|
||||
notify_followers(&sibling_nodes, ELECTION_RERUN_NOTIFICATION);
|
||||
|
||||
failover_state = FAILOVER_STATE_ELECTION_RERUN;
|
||||
}
|
||||
else if (election_result == ELECTION_WON)
|
||||
{
|
||||
if (sibling_nodes.node_count > 0)
|
||||
@@ -2148,6 +2158,12 @@ do_primary_failover(void)
|
||||
&sibling_nodes);
|
||||
|
||||
}
|
||||
/* election rerun */
|
||||
else if (new_primary_id == ELECTION_RERUN_NOTIFICATION)
|
||||
{
|
||||
log_notice(_("election rerun"));
|
||||
failover_state = FAILOVER_STATE_ELECTION_RERUN;
|
||||
}
|
||||
else if (config_file_options.failover == FAILOVER_MANUAL)
|
||||
{
|
||||
/* automatic failover disabled */
|
||||
@@ -2218,6 +2234,24 @@ do_primary_failover(void)
|
||||
failover_state = FAILOVER_STATE_NONE;
|
||||
return true;
|
||||
|
||||
|
||||
case FAILOVER_STATE_ELECTION_RERUN:
|
||||
|
||||
/* we no longer care about our former siblings */
|
||||
clear_node_info_list(&sibling_nodes);
|
||||
|
||||
log_notice(_("rerunning election after %i seconds (\"election_rerun_interval\")"),
|
||||
config_file_options.election_rerun_interval);
|
||||
sleep(config_file_options.election_rerun_interval);
|
||||
|
||||
/*
|
||||
* mark the upstream node as "up" so another election is triggered
|
||||
* after we fall back to monitoring
|
||||
*/
|
||||
upstream_node_info.node_status = NODE_STATUS_UP;
|
||||
failover_state = FAILOVER_STATE_NONE;
|
||||
return false;
|
||||
|
||||
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
||||
|
||||
/*
|
||||
@@ -2288,6 +2322,7 @@ do_primary_failover(void)
|
||||
case FAILOVER_STATE_UNKNOWN:
|
||||
case FAILOVER_STATE_NONE:
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
/* should never reach here */
|
||||
@@ -3160,6 +3195,9 @@ _print_election_result(ElectionResult result)
|
||||
|
||||
case ELECTION_CANCELLED:
|
||||
return "CANCELLED";
|
||||
|
||||
case ELECTION_RERUN:
|
||||
return "RERUN";
|
||||
}
|
||||
|
||||
/* should never reach here */
|
||||
@@ -3767,6 +3805,8 @@ format_failover_state(FailoverState failover_state)
|
||||
return "FOLLOW_FAIL";
|
||||
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
|
||||
return "NODE_NOTIFICATION_ERROR";
|
||||
case FAILOVER_STATE_ELECTION_RERUN:
|
||||
return "ELECTION_RERUN";
|
||||
}
|
||||
|
||||
/* should never reach here */
|
||||
@@ -3844,7 +3884,7 @@ execute_failover_validation_command(t_node_info *node_info)
|
||||
{
|
||||
/* create event here? */
|
||||
log_notice(_("failover validation command returned a non-zero value (%i)"), return_value);
|
||||
return ELECTION_LOST;
|
||||
return ELECTION_RERUN;
|
||||
}
|
||||
|
||||
log_notice(_("failover validation command returned zero"));
|
||||
|
||||
Reference in New Issue
Block a user