mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgrd: enable election rerun
If "failover_validation_command" is set, and the command returns an error, rerun the election. There is a pause between reruns to avoid "churn"; the length of this pause is controlled by the configuration parameter "election_rerun_interval".
This commit is contained in:
@@ -363,6 +363,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->connection_check_type = CHECK_PING;
|
options->connection_check_type = CHECK_PING;
|
||||||
options->primary_visibility_consensus = false;
|
options->primary_visibility_consensus = false;
|
||||||
memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
|
memset(options->failover_validation_command, 0, sizeof(options->failover_validation_command));
|
||||||
|
options->election_rerun_interval = DEFAULT_ELECTION_RERUN_INTERVAL;
|
||||||
|
|
||||||
/*-------------
|
/*-------------
|
||||||
* witness settings
|
* witness settings
|
||||||
@@ -647,6 +648,8 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
|
|||||||
options->primary_visibility_consensus = parse_bool(value, name, error_list);
|
options->primary_visibility_consensus = parse_bool(value, name, error_list);
|
||||||
else if (strcmp(name, "failover_validation_command") == 0)
|
else if (strcmp(name, "failover_validation_command") == 0)
|
||||||
strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
|
strncpy(options->failover_validation_command, value, sizeof(options->failover_validation_command));
|
||||||
|
else if (strcmp(name, "election_rerun_interval") == 0)
|
||||||
|
options->election_rerun_interval = repmgr_atoi(value, name, error_list, 0);
|
||||||
|
|
||||||
/* witness settings */
|
/* witness settings */
|
||||||
else if (strcmp(name, "witness_sync_interval") == 0)
|
else if (strcmp(name, "witness_sync_interval") == 0)
|
||||||
|
|||||||
@@ -146,6 +146,7 @@ typedef struct
|
|||||||
ConnectionCheckType connection_check_type;
|
ConnectionCheckType connection_check_type;
|
||||||
bool primary_visibility_consensus;
|
bool primary_visibility_consensus;
|
||||||
char failover_validation_command[MAXPGPATH];
|
char failover_validation_command[MAXPGPATH];
|
||||||
|
int election_rerun_interval;
|
||||||
|
|
||||||
/* BDR settings */
|
/* BDR settings */
|
||||||
bool bdr_local_monitoring_only;
|
bool bdr_local_monitoring_only;
|
||||||
@@ -217,7 +218,8 @@ typedef struct
|
|||||||
false, -1, \
|
false, -1, \
|
||||||
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
DEFAULT_ASYNC_QUERY_TIMEOUT, \
|
||||||
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT, \
|
||||||
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, CHECK_PING, true, "", \
|
-1, "", false, DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT, \
|
||||||
|
CHECK_PING, true, "", DEFAULT_ELECTION_RERUN_INTERVAL, \
|
||||||
/* BDR settings */ \
|
/* BDR settings */ \
|
||||||
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
false, DEFAULT_BDR_RECOVERY_TIMEOUT, \
|
||||||
/* service settings */ \
|
/* service settings */ \
|
||||||
|
|||||||
@@ -261,7 +261,7 @@
|
|||||||
</note>
|
</note>
|
||||||
<para>
|
<para>
|
||||||
One or both of the following parameter placeholders
|
One or both of the following parameter placeholders
|
||||||
should be provided, which will be replaced by repmgrd with the appropriate
|
should be provided, which will be replaced by repmgrd with the appropriate
|
||||||
value:
|
value:
|
||||||
<itemizedlist spacing="compact" mark="bullet">
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
<listitem>
|
<listitem>
|
||||||
@@ -275,6 +275,19 @@
|
|||||||
</listitem>
|
</listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
<varlistentry>
|
||||||
|
<indexterm>
|
||||||
|
<primary>election_rerun_interval</primary>
|
||||||
|
</indexterm>
|
||||||
|
<term><option>election_rerun_interval</option></term>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
If <option>failover_validation_command</option> is set, and the command returns
|
||||||
|
an error, pause for the specified number of seconds (default: 15) before rerunning the election.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
</variablelist>
|
</variablelist>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
|
|||||||
@@ -332,7 +332,9 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
|
|||||||
#failover_validation_command= # Script to execute for an external mechanism to validate the failover
|
#failover_validation_command= # Script to execute for an external mechanism to validate the failover
|
||||||
# decision made by repmgrd. One or both of the following parameter placeholders
|
# decision made by repmgrd. One or both of the following parameter placeholders
|
||||||
# should be provided, which will be replaced by repmgrd with the appropriate
|
# should be provided, which will be replaced by repmgrd with the appropriate
|
||||||
# value: %n (node_id), %a (node_name)
|
# value: %n (node_id), %a (node_name). *Must* be the same on all nodes.
|
||||||
|
#election_rerun_interval=15 # if "failover_validation_command" is set, and the command returns
|
||||||
|
# an error, pause for the specified number of seconds before rerunning the election.
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# service control commands
|
# service control commands
|
||||||
|
|||||||
2
repmgr.h
2
repmgr.h
@@ -60,6 +60,7 @@
|
|||||||
#define NO_UPSTREAM_NODE -1
|
#define NO_UPSTREAM_NODE -1
|
||||||
#define UNKNOWN_NODE_ID -1
|
#define UNKNOWN_NODE_ID -1
|
||||||
#define MIN_NODE_ID 1
|
#define MIN_NODE_ID 1
|
||||||
|
#define ELECTION_RERUN_NOTIFICATION -2
|
||||||
#define VOTING_TERM_NOT_SET -1
|
#define VOTING_TERM_NOT_SET -1
|
||||||
#define ARCHIVE_STATUS_DIR_ERROR -1
|
#define ARCHIVE_STATUS_DIR_ERROR -1
|
||||||
|
|
||||||
@@ -92,6 +93,7 @@
|
|||||||
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
#define DEFAULT_NODE_REJOIN_TIMEOUT 60 /* seconds */
|
||||||
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
#define DEFAULT_WAL_RECEIVE_CHECK_TIMEOUT 30 /* seconds */
|
||||||
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
|
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
|
||||||
|
#define DEFAULT_ELECTION_RERUN_INTERVAL 15 /* seconds */
|
||||||
|
|
||||||
#define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */
|
#define WALRECEIVER_DISABLE_TIMEOUT_VALUE 86400000 /* milliseconds */
|
||||||
|
|
||||||
|
|||||||
@@ -37,7 +37,8 @@ typedef enum
|
|||||||
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
|
FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,
|
||||||
FAILOVER_STATE_NO_NEW_PRIMARY,
|
FAILOVER_STATE_NO_NEW_PRIMARY,
|
||||||
FAILOVER_STATE_FOLLOW_FAIL,
|
FAILOVER_STATE_FOLLOW_FAIL,
|
||||||
FAILOVER_STATE_NODE_NOTIFICATION_ERROR
|
FAILOVER_STATE_NODE_NOTIFICATION_ERROR,
|
||||||
|
FAILOVER_STATE_ELECTION_RERUN
|
||||||
} FailoverState;
|
} FailoverState;
|
||||||
|
|
||||||
|
|
||||||
@@ -46,7 +47,8 @@ typedef enum
|
|||||||
ELECTION_NOT_CANDIDATE = -1,
|
ELECTION_NOT_CANDIDATE = -1,
|
||||||
ELECTION_WON,
|
ELECTION_WON,
|
||||||
ELECTION_LOST,
|
ELECTION_LOST,
|
||||||
ELECTION_CANCELLED
|
ELECTION_CANCELLED,
|
||||||
|
ELECTION_RERUN
|
||||||
} ElectionResult;
|
} ElectionResult;
|
||||||
|
|
||||||
|
|
||||||
@@ -2086,6 +2088,14 @@ do_primary_failover(void)
|
|||||||
log_notice(_("election cancelled"));
|
log_notice(_("election cancelled"));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
else if (election_result == ELECTION_RERUN)
|
||||||
|
{
|
||||||
|
log_notice(_("election rerun"));
|
||||||
|
/* notify siblings that they should rerun the election too */
|
||||||
|
notify_followers(&sibling_nodes, ELECTION_RERUN_NOTIFICATION);
|
||||||
|
|
||||||
|
failover_state = FAILOVER_STATE_ELECTION_RERUN;
|
||||||
|
}
|
||||||
else if (election_result == ELECTION_WON)
|
else if (election_result == ELECTION_WON)
|
||||||
{
|
{
|
||||||
if (sibling_nodes.node_count > 0)
|
if (sibling_nodes.node_count > 0)
|
||||||
@@ -2148,6 +2158,12 @@ do_primary_failover(void)
|
|||||||
&sibling_nodes);
|
&sibling_nodes);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
/* election rerun */
|
||||||
|
else if (new_primary_id == ELECTION_RERUN_NOTIFICATION)
|
||||||
|
{
|
||||||
|
log_notice(_("election rerun"));
|
||||||
|
failover_state = FAILOVER_STATE_ELECTION_RERUN;
|
||||||
|
}
|
||||||
else if (config_file_options.failover == FAILOVER_MANUAL)
|
else if (config_file_options.failover == FAILOVER_MANUAL)
|
||||||
{
|
{
|
||||||
/* automatic failover disabled */
|
/* automatic failover disabled */
|
||||||
@@ -2218,6 +2234,24 @@ do_primary_failover(void)
|
|||||||
failover_state = FAILOVER_STATE_NONE;
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
|
||||||
|
case FAILOVER_STATE_ELECTION_RERUN:
|
||||||
|
|
||||||
|
/* we no longer care about our former siblings */
|
||||||
|
clear_node_info_list(&sibling_nodes);
|
||||||
|
|
||||||
|
log_notice(_("rerunning election after %i seconds (\"election_rerun_interval\")"),
|
||||||
|
config_file_options.election_rerun_interval);
|
||||||
|
sleep(config_file_options.election_rerun_interval);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* mark the upstream node as "up" so another election is triggered
|
||||||
|
* after we fall back to monitoring
|
||||||
|
*/
|
||||||
|
upstream_node_info.node_status = NODE_STATUS_UP;
|
||||||
|
failover_state = FAILOVER_STATE_NONE;
|
||||||
|
return false;
|
||||||
|
|
||||||
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
case FAILOVER_STATE_PRIMARY_REAPPEARED:
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -2288,6 +2322,7 @@ do_primary_failover(void)
|
|||||||
case FAILOVER_STATE_UNKNOWN:
|
case FAILOVER_STATE_UNKNOWN:
|
||||||
case FAILOVER_STATE_NONE:
|
case FAILOVER_STATE_NONE:
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* should never reach here */
|
/* should never reach here */
|
||||||
@@ -3160,6 +3195,9 @@ _print_election_result(ElectionResult result)
|
|||||||
|
|
||||||
case ELECTION_CANCELLED:
|
case ELECTION_CANCELLED:
|
||||||
return "CANCELLED";
|
return "CANCELLED";
|
||||||
|
|
||||||
|
case ELECTION_RERUN:
|
||||||
|
return "RERUN";
|
||||||
}
|
}
|
||||||
|
|
||||||
/* should never reach here */
|
/* should never reach here */
|
||||||
@@ -3767,6 +3805,8 @@ format_failover_state(FailoverState failover_state)
|
|||||||
return "FOLLOW_FAIL";
|
return "FOLLOW_FAIL";
|
||||||
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
|
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
|
||||||
return "NODE_NOTIFICATION_ERROR";
|
return "NODE_NOTIFICATION_ERROR";
|
||||||
|
case FAILOVER_STATE_ELECTION_RERUN:
|
||||||
|
return "ELECTION_RERUN";
|
||||||
}
|
}
|
||||||
|
|
||||||
/* should never reach here */
|
/* should never reach here */
|
||||||
@@ -3844,7 +3884,7 @@ execute_failover_validation_command(t_node_info *node_info)
|
|||||||
{
|
{
|
||||||
/* create event here? */
|
/* create event here? */
|
||||||
log_notice(_("failover validation command returned a non-zero value (%i)"), return_value);
|
log_notice(_("failover validation command returned a non-zero value (%i)"), return_value);
|
||||||
return ELECTION_LOST;
|
return ELECTION_RERUN;
|
||||||
}
|
}
|
||||||
|
|
||||||
log_notice(_("failover validation command returned zero"));
|
log_notice(_("failover validation command returned zero"));
|
||||||
|
|||||||
Reference in New Issue
Block a user