repmgrd: enable election rerun

If "failover_validation_command" is set, and the command returns an error,
rerun the election.

There is a pause between reruns to avoid "churn"; the length of this pause
is controlled by the configuration parameter "election_rerun_interval".
This commit is contained in:
Ian Barwick
2019-03-12 14:03:59 +09:00
committed by Ian Barwick
parent 99923f5ffc
commit fc397f25f6
6 changed files with 68 additions and 6 deletions

View File

@@ -363,6 +363,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->connection_check_type = CHECK_PING; options->connection_check_type = CHECK_PING;
options->primary_visibility_consensus = false; options->primary_visibility_consensus = false;
memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command)); memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
options->election_rerun_interval = DEFAULT_ELECTION_RERUN_INTERVAL;
/*------------- /*-------------
* witness settings * witness settings
@@ -647,6 +648,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
options->primary_visibility_consensus = parse_bool(value, name, error_list); options->primary_visibility_consensus = parse_bool(value, name, error_list);
else if (strcmp(name, "failover_validation_command") == 0) else if (strcmp(name, "failover_validation_command") == 0)
strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command)); strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
else if (strcmp(name, "election_rerun_interval") == 0)
options->election_rerun_interval = repmgr_atoi(value, name, error_list, 0);
/* witness settings */ /* witness settings */
else if (strcmp(name, "witness_sync_interval") == 0) else if (strcmp(name, "witness_sync_interval") == 0)

View File

@@ -146,6 +146,7 @@ typedef struct
ConnectionCheckType connection_check_type; ConnectionCheckType connection_check_type;
bool primary_visibility_consensus; bool primary_visibility_consensus;
char failover_validation_command[MAXPGPATH]; char failover_validation_command[MAXPGPATH];
int election_rerun_interval;
/* BDR settings */ /* BDR settings */
bool bdr_local_monitoring_only; bool bdr_local_monitoring_only;
@@ -217,7 +218,8 @@ typedef struct
false, -1, \ false, -1, \
DEFAULT_ASYNC_QUERY_TIMEOUT, \ DEFAULT_ASYNC_QUERY_TIMEOUT, \
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \ DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, CHECK_PING, true, "", \ -1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, \
CHECK_PING, true, "", DEFAULT_ELECTION_RERUN_INTERVAL, \
/* BDR settings */ \ /* BDR settings */ \
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \ false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
/* service settings */ \ /* service settings */ \

View File

@@ -261,7 +261,7 @@
</note> </note>
<para> <para>
One or both of the following parameter placeholders One or both of the following parameter placeholders
should be provided, which will be replaced by repmgrd with the appropriate should be provided, which will be replaced by repmgrd with the appropriate
value: value:
<itemizedlist spacing="compact" mark="bullet"> <itemizedlist spacing="compact" mark="bullet">
<listitem> <listitem>
@@ -275,6 +275,19 @@
</listitem> </listitem>
</varlistentry> </varlistentry>
<varlistentry>
<indexterm>
<primary>election_rerun_interval</primary>
</indexterm>
<term><option>election_rerun_interval</option></term>
<listitem>
<para>
If <option>failover_validation_command</option> is set, and the command returns
an error, pause for the specified number of seconds (default: 15) before rerunning the election.
</para>
</listitem>
</varlistentry>
</variablelist> </variablelist>
<para> <para>

View File

@@ -332,7 +332,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
#failover_validation_command= # Script to execute for an external mechanism to validate the failover #failover_validation_command= # Script to execute for an external mechanism to validate the failover
# decision made by repmgrd. One or both of the following parameter placeholders # decision made by repmgrd. One or both of the following parameter placeholders
# should be provided, which will be replaced by repmgrd with the appropriate # should be provided, which will be replaced by repmgrd with the appropriate
# value: %n (node_id), %a (node_name) # value: %n (node_id), %a (node_name). *Must* be the same on all nodes.
#election_rerun_interval=15 # if "failover_validation_command" is set, and the command returns
# an error, pause for the specified number of seconds before rerunning the election.
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# service control commands # service control commands

View File

@@ -60,6 +60,7 @@
#define NO_UPSTREAM_NODE -1 #define NO_UPSTREAM_NODE -1
#define UNKNOWN_NODE_ID -1 #define UNKNOWN_NODE_ID -1
#define MIN_NODE_ID 1 #define MIN_NODE_ID 1
#define ELECTION_RERUN_NOTIFICATION -2
#define VOTING_TERM_NOT_SET -1 #define VOTING_TERM_NOT_SET -1
#define ARCHIVE_STATUS_DIR_ERROR -1 #define ARCHIVE_STATUS_DIR_ERROR -1
@@ -92,6 +93,7 @@
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */ #define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */ #define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */ #define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
#define DEFAULT_ELECTION_RERUN_INTERVAL 15 /* seconds */
#define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */ #define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */

View File

@@ -37,7 +37,8 @@ typedef enum
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY, FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
FAILOVER_STATE_NO_NEW_PRIMARY, FAILOVER_STATE_NO_NEW_PRIMARY,
FAILOVER_STATE_FOLLOW_FAIL, FAILOVER_STATE_FOLLOW_FAIL,
FAILOVER_STATE_NODE_NOTIFICATION_ERROR FAILOVER_STATE_NODE_NOTIFICATION_ERROR,
FAILOVER_STATE_ELECTION_RERUN
} FailoverState; } FailoverState;
@@ -46,7 +47,8 @@ typedef enum
ELECTION_NOT_CANDIDATE = -1, ELECTION_NOT_CANDIDATE = -1,
ELECTION_WON, ELECTION_WON,
ELECTION_LOST, ELECTION_LOST,
ELECTION_CANCELLED ELECTION_CANCELLED,
ELECTION_RERUN
} ElectionResult; } ElectionResult;
@@ -2086,6 +2088,14 @@ do_primary_failover(void)
log_notice(_("election cancelled")); log_notice(_("election cancelled"));
return false; return false;
} }
else if (election_result == ELECTION_RERUN)
{
log_notice(_("election rerun"));
/* notify siblings that they should rerun the election too */
notify_followers(&sibling_nodes, ELECTION_RERUN_NOTIFICATION);
failover_state = FAILOVER_STATE_ELECTION_RERUN;
}
else if (election_result == ELECTION_WON) else if (election_result == ELECTION_WON)
{ {
if (sibling_nodes.node_count > 0) if (sibling_nodes.node_count > 0)
@@ -2148,6 +2158,12 @@ do_primary_failover(void)
&sibling_nodes); &sibling_nodes);
} }
/* election rerun */
else if (new_primary_id == ELECTION_RERUN_NOTIFICATION)
{
log_notice(_("election rerun"));
failover_state = FAILOVER_STATE_ELECTION_RERUN;
}
else if (config_file_options.failover == FAILOVER_MANUAL) else if (config_file_options.failover == FAILOVER_MANUAL)
{ {
/* automatic failover disabled */ /* automatic failover disabled */
@@ -2218,6 +2234,24 @@ do_primary_failover(void)
failover_state = FAILOVER_STATE_NONE; failover_state = FAILOVER_STATE_NONE;
return true; return true;
case FAILOVER_STATE_ELECTION_RERUN:
/* we no longer care about our former siblings */
clear_node_info_list(&sibling_nodes);
log_notice(_("rerunning election after %i seconds (\"election_rerun_interval\")"),
config_file_options.election_rerun_interval);
sleep(config_file_options.election_rerun_interval);
/*
* mark the upstream node as "up" so another election is triggered
* after we fall back to monitoring
*/
upstream_node_info.node_status = NODE_STATUS_UP;
failover_state = FAILOVER_STATE_NONE;
return false;
case FAILOVER_STATE_PRIMARY_REAPPEARED: case FAILOVER_STATE_PRIMARY_REAPPEARED:
/* /*
@@ -2288,6 +2322,7 @@ do_primary_failover(void)
case FAILOVER_STATE_UNKNOWN: case FAILOVER_STATE_UNKNOWN:
case FAILOVER_STATE_NONE: case FAILOVER_STATE_NONE:
return false; return false;
} }
/* should never reach here */ /* should never reach here */
@@ -3160,6 +3195,9 @@ _print_election_result(ElectionResult result)
case ELECTION_CANCELLED: case ELECTION_CANCELLED:
return "CANCELLED"; return "CANCELLED";
case ELECTION_RERUN:
return "RERUN";
} }
/* should never reach here */ /* should never reach here */
@@ -3767,6 +3805,8 @@ format_failover_state(FailoverState failover_state)
return "FOLLOW_FAIL"; return "FOLLOW_FAIL";
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR: case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
return "NODE_NOTIFICATION_ERROR"; return "NODE_NOTIFICATION_ERROR";
case FAILOVER_STATE_ELECTION_RERUN:
return "ELECTION_RERUN";
} }
/* should never reach here */ /* should never reach here */
@@ -3844,7 +3884,7 @@ execute_failover_validation_command(t_node_info *node_info)
{ {
/* create event here? */ /* create event here? */
log_notice(_("failover validation command returned a non-zero value (%i)"), return_value); log_notice(_("failover validation command returned a non-zero value (%i)"), return_value);
return ELECTION_LOST; return ELECTION_RERUN;
} }
log_notice(_("failover validation command returned zero")); log_notice(_("failover validation command returned zero"));