mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Skip largely pointless master reconnection attempt.
Experimental - see notes in code.
This commit is contained in:
51
repmgrd.c
51
repmgrd.c
@@ -799,14 +799,13 @@ standby_monitor(void)
|
||||
/*
|
||||
* Check that the upstream node is still available
|
||||
* If not, initiate failover process
|
||||
*/
|
||||
|
||||
check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
|
||||
/*
|
||||
*
|
||||
* This takes up to local_options.reconnect_attempts *
|
||||
* local_options.reconnect_interval seconds
|
||||
*/
|
||||
|
||||
check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);
|
||||
|
||||
if (PQstatus(upstream_conn) != CONNECTION_OK)
|
||||
{
|
||||
int previous_master_node_id = master_options.node;
|
||||
@@ -1330,21 +1329,43 @@ do_master_failover(void)
|
||||
nodes[i].is_visible = false;
|
||||
nodes[i].is_ready = false;
|
||||
|
||||
/* Copy details of the failed master node */
|
||||
/* XXX only node_id is actually used later */
|
||||
if (nodes[i].type == MASTER)
|
||||
{
|
||||
failed_master.node_id = nodes[i].node_id;
|
||||
failed_master.xlog_location = nodes[i].xlog_location;
|
||||
failed_master.is_ready = nodes[i].is_ready;
|
||||
}
|
||||
|
||||
log_debug(_("node=%i conninfo=\"%s\" type=%s\n"),
|
||||
nodes[i].node_id,
|
||||
nodes[i].conninfo_str,
|
||||
node_type);
|
||||
|
||||
/* XXX do we need to try and connect to the master here? */
|
||||
/* Copy details of the failed master node */
|
||||
if (nodes[i].type == MASTER)
|
||||
{
|
||||
/* XXX only node_id is currently used */
|
||||
failed_master.node_id = nodes[i].node_id;
|
||||
|
||||
/*
|
||||
* XXX experimental
|
||||
*
|
||||
* Currently an attempt is made to connect to the master,
|
||||
* which is very likely to be a waste of time at this point, as we'll
|
||||
* have spent the last however many seconds trying to do just that
|
||||
* in check_connection() before deciding it's gone away.
|
||||
*
|
||||
* If the master did come back at this point, the voting algorithm should decide
|
||||
* it's the "best candidate" anyway and no standby will promote itself or
|
||||
* attempt to follow* another server.
|
||||
*
|
||||
* If we don't try and connect to the master here (and the code generally
|
||||
* assumes it's failed anyway) but it does come back any time from here
|
||||
* onwards, promotion will fail and the promotion candidate will
|
||||
* notice the reappearance.
|
||||
*
|
||||
* TLDR version: by skipping the master connection attempt (and the chances
|
||||
* the master would reappear between the last attempt in check_connection()
|
||||
* and now are minimal) we can remove useless cycles during the failover process;
|
||||
* if the master does reappear it will be caught before later anyway.
|
||||
*/
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
node_conn = establish_db_connection(nodes[i].conninfo_str, false);
|
||||
|
||||
/* if we can't see the node just skip it */
|
||||
@@ -1706,6 +1727,8 @@ do_master_failover(void)
|
||||
{
|
||||
log_notice(_("Original master reappeared before this standby was promoted - no action taken\n"));
|
||||
|
||||
/* XXX log an event here? */
|
||||
|
||||
PQfinish(master_conn);
|
||||
master_conn = NULL;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user