mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 16:46:28 +00:00
Improve handling of failover events when failover is set to manual
- prevent repmgrd from repeatedly executing the failover code
- add event notification 'standby_disconnect_manual'
- update documentation

This addresses GitHub #221.
This commit is contained in:
7
HISTORY
7
HISTORY
@@ -4,8 +4,11 @@
|
|||||||
|
|
||||||
3.1.5 2016-08-
|
3.1.5 2016-08-
|
||||||
repmgrd: in a failover situation, prevent endless looping when
|
repmgrd: in a failover situation, prevent endless looping when
|
||||||
attempting to establish the status of a node with
|
attempting to establish the status of a node with
|
||||||
`failover=manual` (Ian)
|
`failover=manual` (Ian)
|
||||||
|
repmgrd: improve handling of failover events on standbys with
|
||||||
|
`failover=manual`, and create a new event notification
|
||||||
|
for this, `standby_disconnect_manual` (Ian)
|
||||||
|
|
||||||
3.1.4 2016-07-12
|
3.1.4 2016-07-12
|
||||||
repmgr: new configuration option for setting "restore_command"
|
repmgr: new configuration option for setting "restore_command"
|
||||||
|
|||||||
12
README.md
12
README.md
@@ -1122,9 +1122,16 @@ table , it's advisable to regularly purge historical data with
|
|||||||
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
|
`repmgr cluster cleanup`; use the `-k/--keep-history` to specify how
|
||||||
many days' worth of data should be retained.
|
many days' worth of data should be retained.
|
||||||
|
|
||||||
|
It's possible to use `repmgrd` to provide monitoring only for some or all
|
||||||
|
nodes by setting `failover = manual` in the node's `repmgr.conf`. In the
|
||||||
|
event of the node's upstream failing, no failover action will be taken
|
||||||
|
and the node will require manual intervention to be reattached to replication.
|
||||||
|
If this occurs, event notification `standby_disconnect_manual` will be
|
||||||
|
created.
|
||||||
|
|
||||||
Note that when a standby node is not streaming directly from its upstream
|
Note that when a standby node is not streaming directly from its upstream
|
||||||
node, i.e. recovering WAL from an archive, `apply_lag` will always
|
node, e.g. recovering WAL from an archive, `apply_lag` will always appear as
|
||||||
appear as `0 bytes`.
|
`0 bytes`.
|
||||||
|
|
||||||
|
|
||||||
Using a witness server with repmgrd
|
Using a witness server with repmgrd
|
||||||
@@ -1221,6 +1228,7 @@ The following event types are available:
|
|||||||
* `standby_promote`
|
* `standby_promote`
|
||||||
* `standby_follow`
|
* `standby_follow`
|
||||||
* `standby_switchover`
|
* `standby_switchover`
|
||||||
|
* `standby_disconnect_manual`
|
||||||
* `witness_create`
|
* `witness_create`
|
||||||
* `witness_create`
|
* `witness_create`
|
||||||
* `repmgrd_start`
|
* `repmgrd_start`
|
||||||
|
|||||||
@@ -144,10 +144,18 @@
|
|||||||
#reconnect_interval=10
|
#reconnect_interval=10
|
||||||
|
|
||||||
# Autofailover options
|
# Autofailover options
|
||||||
#failover=manual # one of 'automatic', 'manual'
|
#failover=manual # one of 'automatic', 'manual' (default: manual)
|
||||||
# (default: manual)
|
# defines the action to take in the event of upstream failure
|
||||||
#priority=100 # a value of zero or less prevents the node being promoted to primary
|
#
|
||||||
|
# 'automatic': repmgrd will automatically attempt to promote the
|
||||||
|
# node or follow the new upstream node
|
||||||
|
# 'manual': repmgrd will take no action and the node will require
|
||||||
|
# manual attention to reattach it to replication
|
||||||
|
|
||||||
|
#priority=100 # indicate a preferred priority for promoting nodes
|
||||||
|
# a value of zero or less prevents the node being promoted to primary
|
||||||
# (default: 100)
|
# (default: 100)
|
||||||
|
|
||||||
#promote_command='repmgr standby promote -f /path/to/repmgr.conf'
|
#promote_command='repmgr standby promote -f /path/to/repmgr.conf'
|
||||||
#follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'
|
#follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'
|
||||||
|
|
||||||
|
|||||||
91
repmgrd.c
91
repmgrd.c
@@ -62,6 +62,13 @@ t_node_info node_info;
|
|||||||
|
|
||||||
bool failover_done = false;
|
bool failover_done = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* when `failover=manual`, and the upstream server has gone away,
|
||||||
|
* this flag is set to indicate we should connect to whatever the
|
||||||
|
* current master is to update monitoring information
|
||||||
|
*/
|
||||||
|
bool manual_mode_upstream_disconnected = false;
|
||||||
|
|
||||||
char *pid_file = NULL;
|
char *pid_file = NULL;
|
||||||
|
|
||||||
static void help(void);
|
static void help(void);
|
||||||
@@ -449,6 +456,7 @@ main(int argc, char **argv)
|
|||||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||||
update_registration();
|
update_registration();
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Log startup event */
|
/* Log startup event */
|
||||||
if (startup_event_logged == false)
|
if (startup_event_logged == false)
|
||||||
{
|
{
|
||||||
@@ -736,6 +744,8 @@ standby_monitor(void)
|
|||||||
const char *upstream_node_type = NULL;
|
const char *upstream_node_type = NULL;
|
||||||
|
|
||||||
bool receiving_streamed_wal = true;
|
bool receiving_streamed_wal = true;
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Verify that the local node is still available - if not there's
|
* Verify that the local node is still available - if not there's
|
||||||
* no point in doing much else anyway
|
* no point in doing much else anyway
|
||||||
@@ -757,15 +767,32 @@ standby_monitor(void)
|
|||||||
goto continue_monitoring_standby;
|
goto continue_monitoring_standby;
|
||||||
}
|
}
|
||||||
|
|
||||||
upstream_conn = get_upstream_connection(my_local_conn,
|
/*
|
||||||
local_options.cluster_name,
|
* Standby has `failover` set to manual and is disconnected from
|
||||||
local_options.node,
|
* replication following a prior upstream node failure - we'll
|
||||||
&upstream_node_id,
|
* find the master to be able to write monitoring information, if
|
||||||
upstream_conninfo);
|
* required
|
||||||
|
*/
|
||||||
|
if (manual_mode_upstream_disconnected == true)
|
||||||
|
{
|
||||||
|
upstream_conn = get_master_connection(my_local_conn,
|
||||||
|
local_options.cluster_name,
|
||||||
|
&upstream_node_id,
|
||||||
|
upstream_conninfo);
|
||||||
|
upstream_node_type = "master";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
upstream_conn = get_upstream_connection(my_local_conn,
|
||||||
|
local_options.cluster_name,
|
||||||
|
local_options.node,
|
||||||
|
&upstream_node_id,
|
||||||
|
upstream_conninfo);
|
||||||
|
|
||||||
upstream_node_type = (upstream_node_id == master_options.node)
|
upstream_node_type = (upstream_node_id == master_options.node)
|
||||||
? "master"
|
? "master"
|
||||||
: "upstream";
|
: "upstream";
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check that the upstream node is still available
|
* Check that the upstream node is still available
|
||||||
@@ -780,16 +807,27 @@ standby_monitor(void)
|
|||||||
|
|
||||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
|
int previous_master_node_id = master_options.node;
|
||||||
|
|
||||||
PQfinish(upstream_conn);
|
PQfinish(upstream_conn);
|
||||||
upstream_conn = NULL;
|
upstream_conn = NULL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When `failover=manual`, no actual failover will be performed, instead
|
||||||
|
* the following happens:
|
||||||
|
* - find the new master
|
||||||
|
* - create an event notification `standby_disconnect_manual`
|
||||||
|
* - set a flag to indicate we're disconnected from replication
|
||||||
|
*/
|
||||||
if (local_options.failover == MANUAL_FAILOVER)
|
if (local_options.failover == MANUAL_FAILOVER)
|
||||||
{
|
{
|
||||||
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), upstream_node_type);
|
log_err(_("Unable to reconnect to %s. Now checking if another node has been promoted.\n"), upstream_node_type);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Set the location string in shared memory to indicate to other
|
* Set the location string in shared memory to indicate to other
|
||||||
* repmgrd instances that we're *not* a promotion candidate
|
* repmgrd instances that we're *not* a promotion candidate and
|
||||||
|
* that other repmgrd instances should not expect location updates
|
||||||
|
* from us
|
||||||
*/
|
*/
|
||||||
|
|
||||||
update_shared_memory(PASSIVE_NODE);
|
update_shared_memory(PASSIVE_NODE);
|
||||||
@@ -798,13 +836,14 @@ standby_monitor(void)
|
|||||||
{
|
{
|
||||||
master_conn = get_master_connection(my_local_conn,
|
master_conn = get_master_connection(my_local_conn,
|
||||||
local_options.cluster_name, &master_options.node, NULL);
|
local_options.cluster_name, &master_options.node, NULL);
|
||||||
|
|
||||||
if (PQstatus(master_conn) == CONNECTION_OK)
|
if (PQstatus(master_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Connected, we can continue the process so break the
|
* Connected, we can continue the process so break the
|
||||||
* loop
|
* loop
|
||||||
*/
|
*/
|
||||||
log_err(_("connected to node %d, continuing monitoring.\n"),
|
log_notice(_("connected to node %d, continuing monitoring.\n"),
|
||||||
master_options.node);
|
master_options.node);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -845,7 +884,34 @@ standby_monitor(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* connected to a master - is it the same as the former upstream?
|
||||||
|
* if not:
|
||||||
|
* - create event `standby_disconnect_manual`
|
||||||
|
* - set global `manual_mode_upstream_disconnected`
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
if (previous_master_node_id != master_options.node)
|
||||||
|
{
|
||||||
|
PQExpBufferData errmsg;
|
||||||
|
initPQExpBuffer(&errmsg);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&errmsg,
|
||||||
|
_("node %i is in manual failover mode and is now disconnected from replication"),
|
||||||
|
local_options.node);
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "old master: %i; current: %i\n", previous_master_node_id, master_options.node);
|
||||||
|
|
||||||
|
manual_mode_upstream_disconnected = true;
|
||||||
|
|
||||||
|
create_event_record(master_conn,
|
||||||
|
&local_options,
|
||||||
|
local_options.node,
|
||||||
|
"standby_disconnect_manual",
|
||||||
|
/* here "true" indicates the action has occurred as expected */
|
||||||
|
true,
|
||||||
|
errmsg.data);
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (local_options.failover == AUTOMATIC_FAILOVER)
|
else if (local_options.failover == AUTOMATIC_FAILOVER)
|
||||||
{
|
{
|
||||||
@@ -946,8 +1012,8 @@ standby_monitor(void)
|
|||||||
* the stream. If we set the local standby node as failed and it's now running
|
* the stream. If we set the local standby node as failed and it's now running
|
||||||
* and receiving replication data, we should activate it again.
|
* and receiving replication data, we should activate it again.
|
||||||
*/
|
*/
|
||||||
set_local_node_status();
|
set_local_node_status();
|
||||||
log_info(_("standby connection recovered!\n"));
|
log_info(_("standby connection recovered!\n"));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Fast path for the case where no history is requested */
|
/* Fast path for the case where no history is requested */
|
||||||
@@ -959,6 +1025,7 @@ standby_monitor(void)
|
|||||||
* from the upstream node to write monitoring information
|
* from the upstream node to write monitoring information
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* XXX not used? */
|
||||||
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);
|
||||||
|
|
||||||
sprintf(sqlquery,
|
sprintf(sqlquery,
|
||||||
|
|||||||
Reference in New Issue
Block a user