mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
1938 lines
52 KiB
C
1938 lines
52 KiB
C
/*
|
|
* repmgrd-physical.c - physical replication functionality for repmgrd
|
|
*
|
|
* Copyright (c) 2ndQuadrant, 2010-2017
|
|
*/
|
|
|
|
#include <signal.h>
|
|
|
|
#include "repmgr.h"
|
|
#include "repmgrd.h"
|
|
#include "repmgrd-physical.h"
|
|
|
|
|
|
/*
 * Failover state machine states, tracked in the file-scope variable
 * "failover_state" and driven by do_primary_failover().
 */
typedef enum {
	FAILOVER_STATE_UNKNOWN = -1,	/* initial / indeterminate state */
	FAILOVER_STATE_NONE,			/* no failover in progress */
	FAILOVER_STATE_PROMOTED,		/* this node promoted itself to primary */
	FAILOVER_STATE_PROMOTION_FAILED,	/* promotion was attempted but failed */
	FAILOVER_STATE_PRIMARY_REAPPEARED,	/* original primary came back before promotion */
	FAILOVER_STATE_LOCAL_NODE_FAILURE,	/* local node itself failed during failover */
	FAILOVER_STATE_WAITING_NEW_PRIMARY,	/* follower awaiting notification from the candidate */
	FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER,	/* failover=manual: operator must intervene */
	FAILOVER_STATE_FOLLOWED_NEW_PRIMARY,	/* successfully repointed at the new primary */
	FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY,	/* resumed following the original primary */
	FAILOVER_STATE_NO_NEW_PRIMARY,	/* no new primary emerged (e.g. notification timeout) */
	FAILOVER_STATE_FOLLOW_FAIL,		/* attempt to follow the new primary failed */
	FAILOVER_STATE_NODE_NOTIFICATION_ERROR	/* unable to notify the candidate node */
} FailoverState;
|
|
|
|
|
|
/*
 * Result of a promotion-candidate election, as returned by do_election().
 */
typedef enum {
	ELECTION_NOT_CANDIDATE = -1,	/* this node did not stand as a candidate */
	ELECTION_WON,					/* this node received all votes */
	ELECTION_LOST,					/* this node was a candidate but did not win outright */
	ELECTION_CANCELLED				/* election aborted (e.g. primary reappeared) */
} ElectionResult;
|
|
|
|
|
|
static PGconn *upstream_conn = NULL;
|
|
static PGconn *primary_conn = NULL;
|
|
|
|
#ifndef BDR_ONLY
|
|
static FailoverState failover_state = FAILOVER_STATE_UNKNOWN;
|
|
|
|
static t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
|
|
static NodeInfoList standby_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
|
|
|
|
|
static ElectionResult do_election(void);
|
|
static const char *_print_voting_status(NodeVotingStatus voting_status);
|
|
static const char *_print_election_result(ElectionResult result);
|
|
|
|
static FailoverState promote_self(void);
|
|
static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
|
|
|
|
static t_node_info *poll_best_candidate(NodeInfoList *standby_nodes);
|
|
|
|
static void check_connection(t_node_info *node_info, PGconn *conn);
|
|
|
|
static bool wait_primary_notification(int *new_primary_id);
|
|
static FailoverState follow_new_primary(int new_primary_id);
|
|
|
|
static void reset_node_voting_status(void);
|
|
void close_connections_physical();
|
|
|
|
static bool do_primary_failover(void);
|
|
static bool do_upstream_standby_failover(void);
|
|
|
|
#endif
|
|
|
|
|
|
/* perform some sanity checks on the node's configuration */
|
|
|
|
void
|
|
do_physical_node_check(void)
|
|
{
|
|
#ifndef BDR_ONLY
|
|
/*
|
|
* Check if node record is active - if not, and `failover=automatic`, the node
|
|
* won't be considered as a promotion candidate; this often happens when
|
|
* a failed primary is recloned and the node was not re-registered, giving
|
|
* the impression failover capability is there when it's not. In this case
|
|
* abort with an error and a hint about registering.
|
|
*
|
|
* If `failover=manual`, repmgrd can continue to passively monitor the node, but
|
|
* we should nevertheless issue a warning and the same hint.
|
|
*/
|
|
|
|
if (local_node_info.active == false)
|
|
{
|
|
char *hint = "Check that 'repmgr (primary|standby) register' was executed for this node";
|
|
|
|
switch (config_file_options.failover)
|
|
{
|
|
/* "failover" is an enum, all values should be covered here */
|
|
|
|
case FAILOVER_AUTOMATIC:
|
|
log_error(_("this node is marked as inactive and cannot be used as a failover target"));
|
|
log_hint(_("%s"), hint);
|
|
PQfinish(local_conn);
|
|
terminate(ERR_BAD_CONFIG);
|
|
|
|
case FAILOVER_MANUAL:
|
|
log_warning(_("this node is marked as inactive and will be passively monitored only"));
|
|
log_hint(_("%s"), hint);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (config_file_options.failover == FAILOVER_AUTOMATIC)
|
|
{
|
|
/*
|
|
* check that promote/follow commands are defined, otherwise repmgrd
|
|
* won't be able to perform any useful action
|
|
*/
|
|
|
|
bool required_param_missing = false;
|
|
|
|
if (config_file_options.promote_command[0] == '\0')
|
|
{
|
|
log_error(_("\"promote_command\" must be defined in the configuration file"));
|
|
|
|
if (config_file_options.service_promote_command[0] != '\0')
|
|
{
|
|
/*
|
|
* if repmgrd executes "service_promote_command" directly, repmgr metadata
|
|
* won't get updated
|
|
*/
|
|
log_hint(_("\"service_promote_command\" is set, but can only be executed by \"repmgr standby promote\""));
|
|
}
|
|
|
|
required_param_missing = true;
|
|
}
|
|
if (config_file_options.follow_command[0] == '\0')
|
|
{
|
|
log_error(_("\"follow_command\" must be defined in the configuration file"));
|
|
required_param_missing = true;
|
|
}
|
|
|
|
if (required_param_missing == true)
|
|
{
|
|
log_hint(_("add the missing configuration parameter(s) and start repmgrd again"));
|
|
PQfinish(local_conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
* repmgrd running on the primary server
|
|
*/
|
|
/*
 * monitor_streaming_primary()
 *
 * Main monitoring loop executed when repmgrd is running on the cluster
 * primary. Logs a startup (or reload) event, then loops forever:
 * checks local node availability, attempts reconnection on failure,
 * switches to degraded monitoring mode if reconnection fails, and
 * terminates if the degraded monitoring timeout is exceeded.
 *
 * Never returns under normal operation (infinite loop); exits the
 * process via terminate() on degraded-monitoring timeout.
 */
void
monitor_streaming_primary(void)
{
#ifndef BDR_ONLY
	instr_time	log_status_interval_start;
	PQExpBufferData event_details;

	reset_node_voting_status();

	initPQExpBuffer(&event_details);

	appendPQExpBuffer(&event_details,
					  _("monitoring cluster primary \"%s\" (node ID: %i)"),
					  local_node_info.node_name,
					  local_node_info.node_id);

	/* Log startup event (or reload, if we've already logged startup once) */
	if (startup_event_logged == false)
	{
		create_event_notification(local_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_start",
								  true,
								  event_details.data);

		startup_event_logged = true;
	}
	else
	{
		create_event_notification(local_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_reload",
								  true,
								  event_details.data);
	}

	log_notice("%s", event_details.data);

	termPQExpBuffer(&event_details);

	INSTR_TIME_SET_CURRENT(log_status_interval_start);
	local_node_info.node_status = NODE_STATUS_UP;

	/* main monitoring loop - never exits except via terminate() */
	while (true)
	{
		// cache node list here, refresh at `node_list_refresh_interval`
		// also return reason for unavailability so we can log it
		if (is_server_available(local_node_info.conninfo) == false)
		{
			/* node is down, we were expecting it to be up */
			if (local_node_info.node_status == NODE_STATUS_UP)
			{
				/* NOTE: intentionally shadows the outer event_details */
				PQExpBufferData event_details;
				instr_time	local_node_unreachable_start;

				INSTR_TIME_SET_CURRENT(local_node_unreachable_start);

				initPQExpBuffer(&event_details);

				appendPQExpBuffer(&event_details,
								  _("unable to connect to local node"));

				log_warning("%s", event_details.data);

				local_node_info.node_status = NODE_STATUS_UNKNOWN;

				PQfinish(local_conn);

				/*
				 * as we're monitoring the primary, no point in trying to write
				 * the event to the database (hence NULL connection below)
				 *
				 * XXX possible pre-action event
				 */
				create_event_notification(NULL,
										  &config_file_options,
										  config_file_options.node_id,
										  "repmgrd_local_disconnect",
										  true,
										  event_details.data);

				termPQExpBuffer(&event_details);

				/*
				 * NOTE(review): try_reconnect() appears to update
				 * node_status as a side effect — the check below relies
				 * on that; confirm against its definition.
				 */
				local_conn = try_reconnect(&local_node_info);

				if (local_node_info.node_status == NODE_STATUS_UP)
				{
					int local_node_unreachable_elapsed = calculate_elapsed(local_node_unreachable_start);

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to local node after %i seconds"),
									  local_node_unreachable_elapsed);
					log_notice("%s", event_details.data);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_local_reconnect",
											  true,
											  event_details.data);
					termPQExpBuffer(&event_details);

					/* skip straight to the status-logging/sleep section */
					goto loop;
				}

				/* reconnection failed - enter degraded monitoring mode */
				monitoring_state = MS_DEGRADED;
				INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
			}

		}

		if (monitoring_state == MS_DEGRADED)
		{
			int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

			/* give up entirely once the configured timeout (if any) is exceeded */
			if (config_file_options.degraded_monitoring_timeout > 0
				&& degraded_monitoring_elapsed > config_file_options.degraded_monitoring_timeout)
			{
				initPQExpBuffer(&event_details);

				appendPQExpBuffer(&event_details,
								  _("degraded monitoring timeout (%i seconds) exceeded, terminating"),
								  degraded_monitoring_elapsed);

				log_notice("%s", event_details.data);

				create_event_notification(NULL,
										  &config_file_options,
										  config_file_options.node_id,
										  "repmgrd_terminate",
										  true,
										  event_details.data);

				termPQExpBuffer(&event_details);
				terminate(ERR_MONITORING_TIMEOUT);
			}

			log_debug("monitoring node in degraded state for %i seconds", degraded_monitoring_elapsed);

			/* poll for the local node's return; resume normal monitoring if it's back */
			if (is_server_available(local_node_info.conninfo) == true)
			{
				local_conn = establish_db_connection(local_node_info.conninfo, false);

				if (PQstatus(local_conn) == CONNECTION_OK)
				{
					local_node_info.node_status = NODE_STATUS_UP;
					monitoring_state = MS_NORMAL;

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to primary node after %i seconds, resuming monitoring"),
									  degraded_monitoring_elapsed);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_local_reconnect",
											  true,
											  event_details.data);

					log_notice("%s", event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}
			}

			// possibly attempt to find another node from cached list
			// check if there's a new primary - if so add hook for fencing?
			// loop, if starts up check status, switch monitoring mode
		}

loop:
		/* emit "still alive" log message at regular intervals, if requested */
		if (config_file_options.log_status_interval > 0)
		{
			int			log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);

			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
			{
				log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
						 local_node_info.node_name,
						 local_node_info.node_id,
						 print_monitoring_state(monitoring_state));

				if (monitoring_state == MS_DEGRADED)
				{
					log_detail(_("waiting for primary to reappear"));
				}

				INSTR_TIME_SET_CURRENT(log_status_interval_start);
			}
		}

		log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
					config_file_options.monitor_interval_secs);

		sleep(config_file_options.monitor_interval_secs);
	}
#endif
}
|
|
|
|
|
|
/*
 * monitor_streaming_standby()
 *
 * Main monitoring loop executed when repmgrd is running on a standby.
 * Determines and connects to the upstream node (and to the cluster
 * primary, if the upstream is a cascaded standby), logs a startup
 * event, then loops: checks upstream availability, attempts
 * reconnection, and initiates failover handling when the upstream is
 * confirmed down.
 *
 * Returns (rather than looping forever) when a failover completes or
 * when the local node is found to have become primary, so the caller
 * can restart monitoring in the appropriate mode. Exits the process
 * on unrecoverable configuration/connection errors.
 */
void
monitor_streaming_standby(void)
{
#ifndef BDR_ONLY
	RecordStatus record_status;
	instr_time	log_status_interval_start;
	PQExpBufferData event_details;

	reset_node_voting_status();

	log_debug("monitor_streaming_standby()");

	/*
	 * If no upstream node id is specified in the metadata, we'll try
	 * and determine the current cluster primary in the assumption we
	 * should connect to that by default.
	 */
	if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
	{
		local_node_info.upstream_node_id = get_primary_node_id(local_conn);

		/*
		 * Terminate if there doesn't appear to be an active cluster primary.
		 * There could be one or more nodes marked as inactive primaries, and one
		 * of them could actually be a primary, but we can't sensibly monitor
		 * in that state.
		 */
		if (local_node_info.upstream_node_id == NODE_NOT_FOUND)
		{
			// XXX check if there's an inactive record(s) and log detail/hint
			log_error(_("unable to determine an active primary for this cluster, terminating"));
			PQfinish(local_conn);
			exit(ERR_BAD_CONFIG);
		}
	}

	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);

	/*
	 * Terminate if we can't find the record for the node we're supposed
	 * to monitor. This is a "fix-the-config" situation, not a lot else we
	 * can do.
	 */
	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
				  local_node_info.upstream_node_id);
		PQfinish(local_conn);
		exit(ERR_DB_CONN);
	}

	log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);

	upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);

	/*
	 * Upstream node must be running.
	 *
	 * We could possibly have repmgrd skip to degraded monitoring mode until it
	 * comes up, but there doesn't seem to be much point in doing that.
	 */
	if (PQstatus(upstream_conn) != CONNECTION_OK)
	{
		log_error(_("unable connect to upstream node (ID: %i), terminating"),
				  local_node_info.upstream_node_id);
		log_hint(_("upstream node must be running before repmgrd can start"));

		PQfinish(local_conn);
		exit(ERR_DB_CONN);
	}

	/* refresh upstream node record from upstream node, so it's as up-to-date as possible */
	record_status = get_node_record(upstream_conn, upstream_node_info.node_id, &upstream_node_info);

	if (upstream_node_info.type == STANDBY)
	{
		/*
		 * Currently cascaded standbys need to be able to connect to the primary.
		 * We could possibly add a limited connection mode for cases where this isn't
		 * possible.
		 */
		primary_conn = establish_primary_db_connection(upstream_conn, false);

		if (PQstatus(primary_conn) != CONNECTION_OK)
		{
			log_error(_("unable to connect to primary node"));
			log_hint(_("ensure the primary node is reachable from this node"));
			exit(ERR_DB_CONN);
		}

		log_verbose(LOG_DEBUG, "connected to primary");
	}
	else
	{
		/* upstream is the primary itself - share the connection */
		primary_conn = upstream_conn;
	}

	/* Log startup event */
	if (startup_event_logged == false)
	{
		/* NOTE: intentionally shadows the function-scope event_details */
		PQExpBufferData event_details;
		initPQExpBuffer(&event_details);

		appendPQExpBuffer(&event_details,
						  _("monitoring upstream node \"%s\" (node ID: %i)"),
						  upstream_node_info.node_name,
						  upstream_node_info.node_id);

		create_event_notification(primary_conn,
								  &config_file_options,
								  config_file_options.node_id,
								  "repmgrd_start",
								  true,
								  event_details.data);

		startup_event_logged = true;

		log_notice("%s", event_details.data);

		termPQExpBuffer(&event_details);
	}

	monitoring_state = MS_NORMAL;
	INSTR_TIME_SET_CURRENT(log_status_interval_start);
	upstream_node_info.node_status = NODE_STATUS_UP;

	/* main monitoring loop - returns only when failover has been handled */
	while (true)
	{
		if (is_server_available(upstream_node_info.conninfo) == false)
		{
			/* upstream node is down, we were expecting it to be up */
			if (upstream_node_info.node_status == NODE_STATUS_UP)
			{
				instr_time	upstream_node_unreachable_start;

				INSTR_TIME_SET_CURRENT(upstream_node_unreachable_start);

				initPQExpBuffer(&event_details);

				upstream_node_info.node_status = NODE_STATUS_UNKNOWN;

				appendPQExpBuffer(&event_details,
								  _("unable to connect to upstream node \"%s\" (node ID: %i)"),
								  upstream_node_info.node_name, upstream_node_info.node_id);

				if (upstream_node_info.type == STANDBY)
				{
					/* XXX possible pre-action event */
					/*
					 * NOTE(review): this calls create_event_record() while the
					 * rest of the file uses create_event_notification() —
					 * confirm whether these are distinct APIs or an old name.
					 */
					create_event_record(primary_conn,
										&config_file_options,
										config_file_options.node_id,
										"repmgrd_upstream_disconnect",
										true,
										event_details.data);
				}

				log_warning("%s", event_details.data);
				termPQExpBuffer(&event_details);

				PQfinish(upstream_conn);
				/* NOTE(review): try_reconnect() appears to update node_status; confirm */
				upstream_conn = try_reconnect(&upstream_node_info);

				/* Node has recovered - log and continue */
				if (upstream_node_info.node_status == NODE_STATUS_UP)
				{
					int			upstream_node_unreachable_elapsed = calculate_elapsed(upstream_node_unreachable_start);

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to upstream node after %i seconds"),
									  upstream_node_unreachable_elapsed);
					log_notice("%s", event_details.data);

					create_event_notification(local_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_upstream_reconnect",
											  true,
											  event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}

				/* still down after reconnect attempt(s) */
				if (upstream_node_info.node_status == NODE_STATUS_DOWN)
				{
					bool		failover_done = false;

					if (upstream_node_info.type == PRIMARY)
					{
						failover_done = do_primary_failover();
					}
					else if (upstream_node_info.type == STANDBY)
					{
						failover_done = do_upstream_standby_failover();
					}

					// it's possible it will make sense to return in
					// all cases to restart monitoring
					if (failover_done == true)
						return;
				}
			}
		}

		if (monitoring_state == MS_DEGRADED)
		{
			int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

			log_debug("monitoring node %i in degraded state for %i seconds",
					  upstream_node_info.node_id,
					  degraded_monitoring_elapsed);

			if (is_server_available(upstream_node_info.conninfo) == true)
			{
				upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);

				if (PQstatus(upstream_conn) == CONNECTION_OK)
				{
					// XXX check here if upstream is still primary
					// -> will be a problem if another node was promoted in the meantime
					// and upstream is now former primary
					// XXX scan other nodes to see if any has become primary

					upstream_node_info.node_status = NODE_STATUS_UP;
					monitoring_state = MS_NORMAL;

					if (upstream_node_info.type == PRIMARY)
					{
						primary_conn = upstream_conn;
					}
					else
					{
						/* re-establish a primary connection if the old one has gone */
						if (primary_conn == NULL || PQstatus(primary_conn) != CONNECTION_OK)
						{
							primary_conn = establish_primary_db_connection(upstream_conn, false);
						}
					}

					initPQExpBuffer(&event_details);

					appendPQExpBuffer(&event_details,
									  _("reconnected to upstream node %i after %i seconds, resuming monitoring"),
									  upstream_node_info.node_id,
									  degraded_monitoring_elapsed);

					create_event_notification(primary_conn,
											  &config_file_options,
											  config_file_options.node_id,
											  "repmgrd_upstream_reconnect",
											  true,
											  event_details.data);

					log_notice("%s", event_details.data);
					termPQExpBuffer(&event_details);

					goto loop;
				}
			}
			else
			{
				/*
				 * unable to connect to former primary - check if another node has
				 * been promoted
				 */

				NodeInfoListCell *cell;
				int			follow_node_id = UNKNOWN_NODE_ID;

				/* local node has been promoted */
				if (get_recovery_type(local_conn) == RECTYPE_PRIMARY)
				{
					log_notice(_("local node is primary, checking local node record"));

					/*
					 * There may be a delay between the node being promoted and the local
					 * record being updated, so if the node record still shows it as a
					 * standby, do nothing, we'll catch the update during the next loop.
					 * (e.g. node was manually
					 * promoted) we'll do nothing, as the repmgr metadata is now out-of-sync.
					 * If it does get fixed, we'll catch it here on a future iteration.
					 */

					/* refresh own internal node record */
					record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);

					if (local_node_info.type == PRIMARY)
					{
						int			degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);

						log_notice(_("resuming monitoring as primary node after %i seconds"),
								   degraded_monitoring_elapsed);

						/* this will restart monitoring in primary mode */
						monitoring_state = MS_NORMAL;
						return;
					}
				}

				if (config_file_options.failover == FAILOVER_AUTOMATIC)
				{
					get_active_sibling_node_records(local_conn,
													local_node_info.node_id,
													local_node_info.upstream_node_id,
													&standby_nodes);

					if (standby_nodes.node_count > 0)
					{
						log_debug("scanning %i node records to detect new primary...", standby_nodes.node_count);
						for (cell = standby_nodes.head; cell; cell = cell->next)
						{
							/* skip local node check, we did that above */
							if (cell->node_info->node_id == local_node_info.node_id)
							{
								continue;
							}

							cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);

							if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
							{
								log_debug("unable to connect to %i ... ", cell->node_info->node_id);
								continue;
							}

							if (get_recovery_type(cell->node_info->conn) == RECTYPE_PRIMARY)
							{
								follow_node_id = cell->node_info->node_id;
								PQfinish(cell->node_info->conn);
								cell->node_info->conn = NULL;
								break;
							}
							PQfinish(cell->node_info->conn);
							cell->node_info->conn = NULL;
						}

						if (follow_node_id != UNKNOWN_NODE_ID)
						{
							follow_new_primary(follow_node_id);
						}
					}
				}

			}
		}

loop:

		/* emit "still alive" log message at regular intervals, if requested */
		if (config_file_options.log_status_interval > 0)
		{
			int			log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);

			if (log_status_interval_elapsed >= config_file_options.log_status_interval)
			{
				PQExpBufferData monitoring_summary;
				initPQExpBuffer(&monitoring_summary);

				appendPQExpBuffer(
					&monitoring_summary,
					_("node \"%s\" (node ID: %i) monitoring upstream node \"%s\" (node ID: %i) in %s state"),
					local_node_info.node_name,
					local_node_info.node_id,
					upstream_node_info.node_name,
					upstream_node_info.node_id,
					print_monitoring_state(monitoring_state));

				if (config_file_options.failover == FAILOVER_MANUAL)
				{
					appendPQExpBuffer(
						&monitoring_summary,
						_(" (automatic failover disabled)"));
				}

				log_info("%s", monitoring_summary.data);
				termPQExpBuffer(&monitoring_summary);
				if (monitoring_state == MS_DEGRADED && config_file_options.failover == FAILOVER_AUTOMATIC)
				{
					log_detail(_("waiting for upstream or another primary to reappear"));
				}

				INSTR_TIME_SET_CURRENT(log_status_interval_start);
			}
		}

		/*
		 * handle local node failure
		 *
		 * currently we'll just check the connection, and try to reconnect
		 *
		 * TODO: add timeout, after which we run in degraded state
		 */

		check_connection(&local_node_info, local_conn);

		sleep(config_file_options.monitor_interval_secs);
	}
#endif
}
|
|
|
|
#ifndef BDR_ONLY
|
|
/*
 * do_primary_failover()
 *
 * Handle failover when the monitored primary is confirmed down.
 *
 * Runs an election among the standbys; depending on the result this
 * node either promotes itself, follows the winning candidate, waits
 * for the new primary's notification, or (with failover=manual) drops
 * into degraded monitoring for operator intervention. The resulting
 * FailoverState is stored in the file-scope "failover_state" and then
 * dispatched in the final switch.
 *
 * Returns true when failover completed and the caller should return
 * to restart monitoring in the appropriate mode; false when monitoring
 * should continue (possibly in degraded state).
 */
static bool
do_primary_failover(void)
{
	/* attempt to initiate voting process */
	ElectionResult election_result = do_election();

	/* TODO add pre-event notification here */
	failover_state = FAILOVER_STATE_UNKNOWN;

	log_debug("election result: %s", _print_election_result(election_result));

	if (election_result == ELECTION_CANCELLED)
	{
		log_notice(_("election cancelled"));
		return false;
	}
	else if (election_result == ELECTION_WON)
	{
		log_notice("I am the winner, will now promote self and inform other nodes");

		failover_state = promote_self();
	}
	else if (election_result == ELECTION_LOST)
	{
		t_node_info *best_candidate;

		log_info(_("I am the candidate but did not get all votes; will now determine the best candidate"));

		/* standby_nodes is in the state created by do_election() */
		best_candidate = poll_best_candidate(&standby_nodes);

		/*
		 * this can occur in a tie-break situation, where this node establishes
		 * it is the best candidate
		 */
		if (best_candidate->node_id == local_node_info.node_id)
		{
			log_notice("I am the best candidate, will now promote self and inform other nodes");

			failover_state = promote_self();
		}
		else
		{
			PGconn	   *candidate_conn = NULL;

			log_info("node %i is the best candidate, waiting for it to confirm so I can follow it",
					 best_candidate->node_id);

			/* notify the best candidate so it can promote itself */

			candidate_conn = establish_db_connection(best_candidate->conninfo, false);

			if (PQstatus(candidate_conn) == CONNECTION_OK)
			{
				notify_follow_primary(candidate_conn, best_candidate->node_id);

				/* we'll wait for the candidate to get back to us */
				failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
			}
			else
			{
				log_error(_("unable to connect to candidate node (ID: %i)"), best_candidate->node_id);
				failover_state = FAILOVER_STATE_NODE_NOTIFICATION_ERROR;
			}
			/* safe on a failed connection too: PQfinish() frees the PGconn in any state */
			PQfinish(candidate_conn);
		}
	}
	else
	{
		/* ELECTION_NOT_CANDIDATE: this node did not stand in the election */

		if (standby_nodes.node_count == 0)
		{
			/* Node is not a candidate but no other nodes are available */
			log_notice(_("no other nodes are available as promotion candidate"));
			log_hint(_("use \"repmgr standby promote\" to manually promote this node"));

			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);

			failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
		}
		else
		{
			log_info(_("follower node awaiting notification from the candidate node"));
			failover_state = FAILOVER_STATE_WAITING_NEW_PRIMARY;
		}
	}

	/*
	 * node has decided it is a follower, so will await notification
	 * from the candidate that it has promoted itself and can be followed
	 */
	if (failover_state == FAILOVER_STATE_WAITING_NEW_PRIMARY)
	{
		int			new_primary_id;

		// --> need timeout in case new primary doesn't come up, then rerun election

		/* either follow or time out; either way resume monitoring */
		if (wait_primary_notification(&new_primary_id) == true)
		{
			/* if primary has reappeared, no action needed */
			if (new_primary_id == upstream_node_info.node_id)
			{
				failover_state = FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY;
			}
			/* if new_primary_id is self, promote */
			else if (new_primary_id == local_node_info.node_id)
			{
				log_notice(_("this node is promotion candidate, promoting"));

				failover_state = promote_self();

				/* refresh sibling list so the PROMOTED case below can notify them */
				get_active_sibling_node_records(local_conn,
												local_node_info.node_id,
												upstream_node_info.node_id,
												&standby_nodes);

			}
			else if (config_file_options.failover == FAILOVER_MANUAL)
			{
				/* automatic failover disabled */

				t_node_info new_primary = T_NODE_INFO_INITIALIZER;
				RecordStatus record_status = RECORD_NOT_FOUND;
				PGconn	   *new_primary_conn;

				record_status = get_node_record(local_conn, new_primary_id, &new_primary);

				if (record_status != RECORD_FOUND)
				{
					log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
							  new_primary_id);
				}
				else
				{
					PQExpBufferData event_details;
					initPQExpBuffer(&event_details);
					appendPQExpBuffer(&event_details,
									  _("node %i is in manual failover mode and is now disconnected from streaming replication"),
									  local_node_info.node_id);

					new_primary_conn = establish_db_connection(new_primary.conninfo, false);

					create_event_notification(
						new_primary_conn,
						&config_file_options,
						local_node_info.node_id,
						"standby_disconnect_manual",
						/* here "true" indicates the action has occurred as expected */
						true,
						event_details.data);
					PQfinish(new_primary_conn);
					termPQExpBuffer(&event_details);

				}
				failover_state = FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER;
			}
			else
			{
				failover_state = follow_new_primary(new_primary_id);
			}
		}
		else
		{
			failover_state = FAILOVER_STATE_NO_NEW_PRIMARY;
		}
	}

	/* dispatch on the final failover state */
	switch(failover_state)
	{
		case FAILOVER_STATE_PROMOTED:
			log_debug("failover state is PROMOTED");

			/* notify former siblings that they should now follow this node */
			notify_followers(&standby_nodes, local_node_info.node_id);

			/* we no longer care about our former siblings */
			clear_node_info_list(&standby_nodes);

			/* pass control back down to start_monitoring() */
			log_info(_("switching to primary monitoring mode"));

			failover_state = FAILOVER_STATE_NONE;
			return true;

		case FAILOVER_STATE_PRIMARY_REAPPEARED:
			log_debug("failover state is PRIMARY_REAPPEARED");

			/* notify siblings that they should resume following the original primary */
			notify_followers(&standby_nodes, upstream_node_info.node_id);

			/* we no longer care about our former siblings */
			clear_node_info_list(&standby_nodes);

			/* pass control back down to start_monitoring() */
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("original primary \"%s\" (node ID: %i) reappeared"),
					   upstream_node_info.node_name, upstream_node_info.node_id);

			failover_state = FAILOVER_STATE_NONE;
			return true;


		case FAILOVER_STATE_FOLLOWED_NEW_PRIMARY:
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("following new primary \"%s\" (node id: %i)"),
					   upstream_node_info.node_name, upstream_node_info.node_id);
			failover_state = FAILOVER_STATE_NONE;

			return true;

		case FAILOVER_STATE_FOLLOWING_ORIGINAL_PRIMARY:
			log_info(_("resuming standby monitoring mode"));
			log_detail(_("following original primary \"%s\" (node id: %i)"),
					   upstream_node_info.node_name, upstream_node_info.node_id);
			failover_state = FAILOVER_STATE_NONE;

			return true;

		case FAILOVER_STATE_PROMOTION_FAILED:
			log_debug("failover state is PROMOTION FAILED");
			return false;

		case FAILOVER_STATE_FOLLOW_FAIL:
			/*
			 * for whatever reason we were unable to follow the new primary -
			 * continue monitoring in degraded state
			 */
			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);

			return false;

		case FAILOVER_STATE_REQUIRES_MANUAL_FAILOVER:
			log_info(_("automatic failover disabled for this node, manual intervention required"));

			monitoring_state = MS_DEGRADED;
			INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
			return false;

		case FAILOVER_STATE_NO_NEW_PRIMARY:
		case FAILOVER_STATE_WAITING_NEW_PRIMARY:
			/* pass control back down to start_monitoring() */
			// -> should kick off new election
			return false;

		case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
		case FAILOVER_STATE_LOCAL_NODE_FAILURE:
		case FAILOVER_STATE_UNKNOWN:
		case FAILOVER_STATE_NONE:
			log_debug("failover state is %i", failover_state);
			return false;
	}

	/* should never reach here */
	return false;
}
|
|
|
|
|
|
/*
|
|
* do_upstream_standby_failover()
|
|
*
|
|
* Attach cascaded standby to primary
|
|
*
|
|
* Currently we will try to attach to the cluster primary, as "repmgr
|
|
* standby follow" doesn't support attaching to another node.
|
|
*
|
|
* If this becomes supported, it might be worth providing a selection
|
|
* of reconnection strategies as different behaviour might be desirable
|
|
* in different situations;
|
|
* or maybe the option not to reconnect might be required?
|
|
*
|
|
* XXX check this handles replication slots gracefully
|
|
*/
|
|
static bool
|
|
do_upstream_standby_failover(void)
|
|
{
|
|
PQExpBufferData event_details;
|
|
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
|
RecordStatus record_status;
|
|
int r;
|
|
|
|
PQfinish(upstream_conn);
|
|
upstream_conn = NULL;
|
|
|
|
if (get_primary_node_record(local_conn, &primary_node_info) == false)
|
|
{
|
|
log_error(_("unable to retrieve primary node record"));
|
|
return false;
|
|
}
|
|
/*
|
|
* Verify that we can still talk to the cluster primary, even though
|
|
* the node's upstream is not available
|
|
*/
|
|
|
|
check_connection(&primary_node_info, primary_conn);
|
|
|
|
|
|
/* grandparent upstream is inactive */
|
|
if (primary_node_info.active == false)
|
|
{
|
|
// XXX
|
|
}
|
|
|
|
/* Close the connection to this server */
|
|
PQfinish(local_conn);
|
|
local_conn = NULL;
|
|
|
|
initPQExpBuffer(&event_details);
|
|
|
|
log_debug(_("standby follow command is:\n \"%s\""),
|
|
config_file_options.follow_command);
|
|
|
|
r = system(config_file_options.follow_command);
|
|
|
|
if (r != 0)
|
|
{
|
|
appendPQExpBuffer(&event_details,
|
|
_("unable to execute follow command:\n %s"),
|
|
config_file_options.follow_command);
|
|
|
|
log_error("%s", event_details.data);
|
|
|
|
/* It may not possible to write to the event notification
|
|
* table but we should be able to generate an external notification
|
|
* if required.
|
|
*/
|
|
create_event_notification(
|
|
primary_conn,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
false,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
}
|
|
|
|
/* reconnect to local node */
|
|
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
|
|
|
if (update_node_record_set_upstream(primary_conn,
|
|
local_node_info.node_id,
|
|
primary_node_info.node_id) == false)
|
|
{
|
|
appendPQExpBuffer(&event_details,
|
|
_("unable to set node %i's new upstream ID to %i"),
|
|
local_node_info.node_id,
|
|
primary_node_info.node_id);
|
|
|
|
log_error("%s", event_details.data);
|
|
|
|
create_event_notification(
|
|
NULL,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
false,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
|
|
terminate(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/* refresh own internal node record */
|
|
record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
|
|
|
|
/*
|
|
* highly improbable this will happen, but in case we're unable to retrieve
|
|
* our node record from the primary, update it ourselves, and hope for the best
|
|
*/
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
local_node_info.upstream_node_id = primary_node_info.node_id;
|
|
}
|
|
|
|
appendPQExpBuffer(&event_details,
|
|
_("node %i is now following primary node %i"),
|
|
local_node_info.node_id,
|
|
primary_node_info.node_id);
|
|
|
|
log_notice("%s", event_details.data);
|
|
|
|
create_event_notification(
|
|
primary_conn,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
true,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
|
|
PQfinish(primary_conn);
|
|
primary_conn = NULL;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
/*
 * promote_self()
 *
 * Attempt to promote this standby to primary by running the configured
 * promote_command, then generate a "repmgrd_failover_promote" event.
 *
 * Returns one of:
 *   FAILOVER_STATE_PROMOTED            - promotion succeeded
 *   FAILOVER_STATE_PROMOTION_FAILED    - failed node record unavailable, or
 *                                        promote command returned non-zero
 *   FAILOVER_STATE_PRIMARY_REAPPEARED  - promote command failed because the
 *                                        original primary came back
 *   FAILOVER_STATE_LOCAL_NODE_FAILURE  - local connection lost and could not
 *                                        be re-established
 */
static FailoverState
promote_self(void)
{
	PQExpBufferData event_details;
	char *promote_command;
	int r;

	/* Store details of the failed node here */
	t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
	RecordStatus record_status;

	/*
	 * optionally add a delay before promoting the standby; this is mainly
	 * useful for testing (e.g. for reappearance of the original primary)
	 * and is not documented.
	 */
	if (config_file_options.promote_delay > 0)
	{
		log_debug("sleeping %i seconds before promoting standby",
				  config_file_options.promote_delay);
		sleep(config_file_options.promote_delay);
	}

	/* we need the failed upstream's record for event text and sanity checks below */
	record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);

	if (record_status != RECORD_FOUND)
	{
		log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
				  local_node_info.upstream_node_id);
		return FAILOVER_STATE_PROMOTION_FAILED;
	}

	/* the presence of either of this command has been established already */
	promote_command = config_file_options.promote_command;

	log_debug("promote command is:\n \"%s\"",
			  promote_command);

	/* flush our own output before the child process writes to stderr */
	if (log_type == REPMGR_STDERR && *config_file_options.log_file)
	{
		fflush(stderr);
	}

	r = system(promote_command);

	/* connection should stay up, but check just in case */
	if(PQstatus(local_conn) != CONNECTION_OK)
	{
		local_conn = establish_db_connection(local_node_info.conninfo, true);

		/* assume node failed */
		if(PQstatus(local_conn) != CONNECTION_OK)
		{
			log_error(_("unable to reconnect to local node"));
			// XXX handle this
			return FAILOVER_STATE_LOCAL_NODE_FAILURE;
		}
	}

	/* promote command failed - check whether the old primary reappeared */
	if (r != 0)
	{
		int primary_node_id;

		upstream_conn = get_primary_connection(local_conn,
											   &primary_node_id, NULL);

		/*
		 * If we can reach a primary and it is the same node we were
		 * previously attached to, the failover is aborted.
		 */
		if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
		{
			log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
					   failed_primary.node_id);

			initPQExpBuffer(&event_details);
			appendPQExpBuffer(&event_details,
							  _("original primary \"%s\" (node ID: %i) reappeared"),
							  failed_primary.node_name,
							  failed_primary.node_id);

			create_event_notification(upstream_conn,
									  &config_file_options,
									  local_node_info.node_id,
									  "repmgrd_failover_abort",
									  true,
									  event_details.data);

			termPQExpBuffer(&event_details);

			//primary_conn = NULL;

			// XXX handle this!
			// -> we'll need to let the other nodes know too....
			/* no failover occurred but we'll want to restart connections */
			//failover_done = true;
			return FAILOVER_STATE_PRIMARY_REAPPEARED;
		}

		// handle this
		// -> check if somehow primary; otherwise go for new election?
		log_error(_("promote command failed"));
		return FAILOVER_STATE_PROMOTION_FAILED;
	}

	initPQExpBuffer(&event_details);

	/* update own internal node record */
	/* NOTE(review): record_status is not checked here - TODO confirm intentional */
	record_status = get_node_record(local_conn, local_node_info.node_id, &local_node_info);

	/*
	 * XXX here we're assuming the promote command updated metadata
	 */
	appendPQExpBuffer(&event_details,
					  _("node %i promoted to primary; old primary %i marked as failed"),
					  local_node_info.node_id,
					  failed_primary.node_id);

	/* local_conn is now the primary connection */
	create_event_notification(local_conn,
							  &config_file_options,
							  local_node_info.node_id,
							  "repmgrd_failover_promote",
							  true,
							  event_details.data);

	termPQExpBuffer(&event_details);

	return FAILOVER_STATE_PROMOTED;
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
* Notify follower nodes about which node to follow. Normally this
|
|
* will be the current node, however if the original primary reappeared
|
|
* before this node could be promoted, we'll inform the followers they
|
|
* should resume monitoring the original primary.
|
|
*/
|
|
static void
|
|
notify_followers(NodeInfoList *standby_nodes, int follow_node_id)
|
|
{
|
|
NodeInfoListCell *cell;
|
|
|
|
log_debug("notify_followers()");
|
|
for (cell = standby_nodes->head; cell; cell = cell->next)
|
|
{
|
|
log_debug("intending to notify node %i... ", cell->node_info->node_id);
|
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
|
{
|
|
log_debug("reconnecting to node %i... ", cell->node_info->node_id);
|
|
|
|
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
|
}
|
|
|
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
|
{
|
|
log_debug("unable to reconnect to %i ... ", cell->node_info->node_id);
|
|
|
|
continue;
|
|
}
|
|
|
|
log_debug("notifying node %i to follow node %i",
|
|
cell->node_info->node_id, follow_node_id);
|
|
notify_follow_primary(cell->node_info->conn, follow_node_id);
|
|
}
|
|
}
|
|
|
|
|
|
static t_node_info *
|
|
poll_best_candidate(NodeInfoList *standby_nodes)
|
|
{
|
|
NodeInfoListCell *cell;
|
|
t_node_info *best_candidate = &local_node_info;
|
|
|
|
|
|
/*
|
|
* we need to definitively decide the best candidate, as in some corner
|
|
* cases we could end up with two candidate nodes, so they should each
|
|
* come to the same conclusion.
|
|
*
|
|
* XXX check there are no cases where the standby node's LSN is
|
|
* not set
|
|
*/
|
|
for (cell = standby_nodes->head; cell; cell = cell->next)
|
|
{
|
|
if (cell->node_info->last_wal_receive_lsn > best_candidate->last_wal_receive_lsn)
|
|
{
|
|
log_debug("node %i has higher LSN, now best candidate", cell->node_info->node_id);
|
|
best_candidate = cell->node_info;
|
|
}
|
|
else if (cell->node_info->last_wal_receive_lsn == best_candidate->last_wal_receive_lsn)
|
|
{
|
|
if (cell->node_info->priority > best_candidate->priority)
|
|
{
|
|
log_debug("node %i has higher priority, now best candidate", cell->node_info->node_id);
|
|
best_candidate = cell->node_info;
|
|
}
|
|
}
|
|
/* if all else fails, we decide by node_id */
|
|
else if (cell->node_info->node_id < best_candidate->node_id)
|
|
{
|
|
log_debug("node %i has lower node_id, now best candidate", cell->node_info->node_id);
|
|
best_candidate = cell->node_info;
|
|
}
|
|
|
|
if (cell->node_info->conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
|
|
{
|
|
PQfinish(cell->node_info->conn);
|
|
cell->node_info->conn = NULL;
|
|
}
|
|
}
|
|
|
|
log_info(_("best candidate is node %s (node ID: %i)"),
|
|
best_candidate->node_name,
|
|
best_candidate->node_id);
|
|
|
|
return best_candidate;
|
|
}
|
|
|
|
|
|
static bool
|
|
wait_primary_notification(int *new_primary_id)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < config_file_options.primary_notification_timeout; i++)
|
|
{
|
|
if (get_new_primary(local_conn, new_primary_id) == true)
|
|
{
|
|
log_debug("new primary is %i; elapsed: %i",
|
|
*new_primary_id, i);
|
|
return true;
|
|
}
|
|
sleep(1);
|
|
}
|
|
|
|
|
|
log_warning(_("no notification received from new primary after %i seconds"),
|
|
config_file_options.primary_notification_timeout);
|
|
|
|
monitoring_state = MS_DEGRADED;
|
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
|
|
|
return false;
|
|
}
|
|
|
|
|
|
static FailoverState
|
|
follow_new_primary(int new_primary_id)
|
|
{
|
|
PQExpBufferData event_details;
|
|
int r;
|
|
|
|
/* Store details of the failed node here */
|
|
t_node_info failed_primary = T_NODE_INFO_INITIALIZER;
|
|
t_node_info new_primary = T_NODE_INFO_INITIALIZER;
|
|
RecordStatus record_status = RECORD_NOT_FOUND;
|
|
bool new_primary_ok = false;
|
|
|
|
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
|
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record for new primary node (ID: %i)"),
|
|
new_primary_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
|
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record for failed primary (ID: %i)"),
|
|
local_node_info.upstream_node_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
// XXX check if new_primary_id == failed_primary.node_id?
|
|
|
|
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
|
|
{
|
|
fflush(stderr);
|
|
}
|
|
|
|
log_debug(_("standby follow command is:\n \"%s\""),
|
|
config_file_options.follow_command);
|
|
|
|
upstream_conn = establish_db_connection(new_primary.conninfo, false);
|
|
|
|
if (PQstatus(upstream_conn) == CONNECTION_OK)
|
|
{
|
|
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
|
|
if (primary_recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
new_primary_ok = true;
|
|
}
|
|
else
|
|
{
|
|
log_warning(_("new primary is not in recovery"));
|
|
PQfinish(upstream_conn);
|
|
}
|
|
}
|
|
|
|
if (new_primary_ok == false)
|
|
{
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
/*
|
|
* disconnect from local node, as follow operation will result in
|
|
* a server restart
|
|
*/
|
|
|
|
PQfinish(local_conn);
|
|
local_conn = NULL;
|
|
|
|
/* execute the follow command */
|
|
r = system(config_file_options.follow_command);
|
|
|
|
if (r != 0)
|
|
{
|
|
PGconn *old_primary_conn;
|
|
/*
|
|
* The follow action could still fail due to the original primary reappearing
|
|
* before the candidate could promote itself ("repmgr standby follow" will
|
|
* refuse to promote another node if the primary is available). However
|
|
* the new primary will only instruct use to follow it after it's successfully
|
|
* promoted itself, so that very likely won't be the reason for the failure.
|
|
*
|
|
*
|
|
* TODO: check the new primary too - we could have a split-brain
|
|
* situation where the old primary reappeared just after the new
|
|
* one promoted itself.
|
|
*/
|
|
old_primary_conn = establish_db_connection(failed_primary.conninfo, false);
|
|
|
|
if (PQstatus(old_primary_conn) == CONNECTION_OK)
|
|
{
|
|
// XXX add event notifications
|
|
RecoveryType upstream_recovery_type = get_recovery_type(old_primary_conn);
|
|
PQfinish(old_primary_conn);
|
|
|
|
if (upstream_recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
log_notice(_("original primary reappeared - no action taken"));
|
|
return FAILOVER_STATE_PRIMARY_REAPPEARED;
|
|
}
|
|
}
|
|
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
|
|
/*
|
|
* refresh local copy of local and primary node records - we get these
|
|
* directly from the primary to ensure they're the current version
|
|
*/
|
|
|
|
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
|
|
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record found for node %i"),
|
|
new_primary_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve metadata record found for node %i"),
|
|
local_node_info.node_id);
|
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
|
}
|
|
|
|
|
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
|
initPQExpBuffer(&event_details);
|
|
appendPQExpBuffer(&event_details,
|
|
_("node %i now following new upstream node %i"),
|
|
local_node_info.node_id,
|
|
upstream_node_info.node_id);
|
|
|
|
log_notice("%s\n", event_details.data);
|
|
|
|
create_event_notification(
|
|
upstream_conn,
|
|
&config_file_options,
|
|
local_node_info.node_id,
|
|
"repmgrd_failover_follow",
|
|
true,
|
|
event_details.data);
|
|
|
|
termPQExpBuffer(&event_details);
|
|
|
|
return FAILOVER_STATE_FOLLOWED_NEW_PRIMARY;
|
|
}
|
|
|
|
|
|
static const char *
|
|
_print_voting_status(NodeVotingStatus voting_status)
|
|
{
|
|
switch(voting_status)
|
|
{
|
|
case VS_NO_VOTE:
|
|
return "NO VOTE";
|
|
|
|
case VS_VOTE_REQUEST_RECEIVED:
|
|
return "VOTE REQUEST RECEIVED";
|
|
|
|
case VS_VOTE_INITIATED:
|
|
return "VOTE REQUEST INITIATED";
|
|
|
|
case VS_UNKNOWN:
|
|
return "VOTE REQUEST UNKNOWN";
|
|
}
|
|
|
|
return "UNKNOWN VOTE REQUEST STATE";
|
|
}
|
|
|
|
static const char *
|
|
_print_election_result(ElectionResult result)
|
|
{
|
|
switch(result)
|
|
{
|
|
case ELECTION_NOT_CANDIDATE:
|
|
return "NOT CANDIDATE";
|
|
|
|
case ELECTION_WON:
|
|
return "WON";
|
|
|
|
case ELECTION_LOST:
|
|
return "LOST";
|
|
|
|
case ELECTION_CANCELLED:
|
|
return "CANCELLED";
|
|
}
|
|
|
|
/* should never reach here */
|
|
return "UNKNOWN";
|
|
}
|
|
|
|
|
|
/*
|
|
* NB: this function sets standby_nodes; caller (do_primary_failover)
|
|
* expects to be able to read this list
|
|
*/
|
|
static ElectionResult
|
|
do_election(void)
|
|
{
|
|
int electoral_term = -1;
|
|
|
|
int votes_for_me = 0;
|
|
|
|
/* we're visible */
|
|
int visible_nodes = 1;
|
|
|
|
/*
|
|
* get voting status from shared memory - should be one of "VS_NO_VOTE"
|
|
* or "VS_VOTE_REQUEST_RECEIVED". If VS_NO_VOTE, we declare ourselves as
|
|
* candidate and initiate the voting process.
|
|
*/
|
|
NodeVotingStatus voting_status;
|
|
|
|
NodeInfoListCell *cell;
|
|
|
|
bool other_node_is_candidate = false;
|
|
bool other_node_is_ahead = false;
|
|
|
|
/*
|
|
* Check if at least one server in the primary's location is visible;
|
|
* if not we'll assume a network split between this node and the primary
|
|
* location, and not promote any standby.
|
|
*
|
|
* NOTE: this function is only ever called by standbys attached to the current
|
|
* (unreachable) primary, so "upstream_node_info" will always contain the
|
|
* primary node record.
|
|
*/
|
|
bool primary_location_seen = false;
|
|
|
|
/*
|
|
* sleep for a random period of 100 ~ 350 ms
|
|
*/
|
|
|
|
long unsigned rand_wait = (long) ((rand() % 35) + 10) * 10000;
|
|
|
|
/* get all active nodes attached to primary, excluding self */
|
|
get_active_sibling_node_records(local_conn,
|
|
local_node_info.node_id,
|
|
upstream_node_info.node_id,
|
|
&standby_nodes);
|
|
|
|
if (config_file_options.failover == FAILOVER_MANUAL)
|
|
{
|
|
log_notice(_("this node is not configured for automatic failover so will not be considered as promotion candidate"));
|
|
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
/* node priority is set to zero - don't ever become a candidate */
|
|
if (local_node_info.priority <= 0)
|
|
{
|
|
log_notice(_("this node's priority is %i so will not be considered as an automatic promotion candidate"),
|
|
local_node_info.priority);
|
|
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
|
|
log_debug("do_election(): sleeping %lu", rand_wait);
|
|
log_debug("do_election(): primary location is %s", upstream_node_info.location);
|
|
|
|
pg_usleep(rand_wait);
|
|
|
|
local_node_info.last_wal_receive_lsn = InvalidXLogRecPtr;
|
|
|
|
log_debug("do_election(): executing get_voting_status()");
|
|
voting_status = get_voting_status(local_conn);
|
|
log_debug("do_election(): node voting status is %s", _print_voting_status(voting_status));
|
|
|
|
if (voting_status == VS_VOTE_REQUEST_RECEIVED)
|
|
{
|
|
/* we've already been requested to vote, so can't become a candidate */
|
|
log_debug("vote request already received, not candidate");
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
/*
|
|
* Here we mark ourselves as candidate, so any further vote requests
|
|
* are rejected. However it's possible another node has done the
|
|
* same thing, so when announcing ourselves as candidate to the other
|
|
* nodes, we'll check for that and withdraw our candidature.
|
|
*/
|
|
electoral_term = set_voting_status_initiated(local_conn);
|
|
|
|
/* no other standbys - normally win by default */
|
|
if (standby_nodes.node_count == 0)
|
|
{
|
|
if (strncmp(upstream_node_info.location, local_node_info.location, MAXLEN) == 0)
|
|
{
|
|
log_debug("no other nodes - we win by default");
|
|
return ELECTION_WON;
|
|
}
|
|
else
|
|
{
|
|
log_debug("no other nodes, but primary and standby locations differ");
|
|
|
|
monitoring_state = MS_DEGRADED;
|
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
|
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
}
|
|
|
|
for (cell = standby_nodes.head; cell; cell = cell->next)
|
|
{
|
|
/* assume the worst case */
|
|
cell->node_info->node_status = NODE_STATUS_UNKNOWN;
|
|
|
|
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
|
|
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* tell the other node we're candidate - if the node has already declared
|
|
* itself, we withdraw
|
|
*
|
|
* XXX check for situations where more than one node could end up as candidate?
|
|
*
|
|
* XXX note it's possible some nodes accepted our candidature before we
|
|
* found out about the other candidate, check what happens in that situation
|
|
* -> other node will have info from all the nodes, even if not the vote,
|
|
* so it should be able to determine the best node anyway
|
|
*/
|
|
|
|
if (announce_candidature(cell->node_info->conn, &local_node_info, cell->node_info, electoral_term) == false)
|
|
{
|
|
log_debug("node %i is candidate", cell->node_info->node_id);
|
|
other_node_is_candidate = true;
|
|
|
|
/* don't notify any further standbys */
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* see if the node is in the primary's location (but skip the check
|
|
* if we've seen
|
|
*/
|
|
if (primary_location_seen == false)
|
|
{
|
|
if (strncmp(cell->node_info->location, upstream_node_info.location, MAXLEN) == 0)
|
|
{
|
|
primary_location_seen = true;
|
|
}
|
|
}
|
|
|
|
cell->node_info->node_status = NODE_STATUS_UP;
|
|
visible_nodes ++;
|
|
}
|
|
|
|
if (other_node_is_candidate == true)
|
|
{
|
|
reset_node_voting_status();
|
|
log_debug("other node is candidate, returning NOT CANDIDATE");
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
if (primary_location_seen == false)
|
|
{
|
|
log_notice(_("no nodes from the primary location \"%s\" visible - assuming network split"),
|
|
upstream_node_info.location);
|
|
log_detail(_("node will enter degraded monitoring state waiting for reconnect"));
|
|
|
|
monitoring_state = MS_DEGRADED;
|
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
|
|
|
reset_node_voting_status();
|
|
|
|
return ELECTION_CANCELLED;
|
|
}
|
|
|
|
|
|
/* get our lsn */
|
|
local_node_info.last_wal_receive_lsn = get_last_wal_receive_location(local_conn);
|
|
|
|
log_debug("last receive lsn = %X/%X", format_lsn(local_node_info.last_wal_receive_lsn));
|
|
|
|
/* request vote from each node */
|
|
|
|
for (cell = standby_nodes.head; cell; cell = cell->next)
|
|
{
|
|
VoteRequestResult vote_result;
|
|
|
|
log_debug("checking node %i...", cell->node_info->node_id);
|
|
/* ignore unreachable nodes */
|
|
if (cell->node_info->node_status != NODE_STATUS_UP)
|
|
continue;
|
|
|
|
vote_result = request_vote(cell->node_info->conn,
|
|
&local_node_info,
|
|
cell->node_info,
|
|
electoral_term);
|
|
|
|
switch (vote_result)
|
|
{
|
|
case VR_VOTE_REFUSED:
|
|
if (cell->node_info->node_id < local_node_info.node_id)
|
|
{
|
|
log_debug(_("node %i refused vote, their ID is lower, yielding"),
|
|
cell->node_info->node_id);
|
|
PQfinish(cell->node_info->conn);
|
|
cell->node_info->conn = NULL;
|
|
|
|
reset_node_voting_status();
|
|
log_debug("other node is candidate, returning NOT CANDIDATE");
|
|
return ELECTION_NOT_CANDIDATE;
|
|
}
|
|
|
|
log_debug(_("no vote received from %i, our ID is lower, not yielding"),
|
|
cell->node_info->node_id);
|
|
break;
|
|
|
|
case VR_POSITIVE_VOTE:
|
|
votes_for_me += 1;
|
|
break;
|
|
case VR_NEGATIVE_VOTE:
|
|
break;
|
|
}
|
|
|
|
if (cell->node_info->last_wal_receive_lsn > local_node_info.last_wal_receive_lsn)
|
|
{
|
|
/* register if another node is ahead of us */
|
|
other_node_is_ahead = true;
|
|
}
|
|
|
|
}
|
|
|
|
/* vote for myself, but only if I believe no-one else is ahead */
|
|
if (other_node_is_ahead == false)
|
|
{
|
|
votes_for_me += 1;
|
|
}
|
|
|
|
log_debug(_("%i of of %i votes"), votes_for_me, visible_nodes);
|
|
|
|
if (votes_for_me == visible_nodes)
|
|
return ELECTION_WON;
|
|
|
|
return ELECTION_LOST;
|
|
}
|
|
|
|
|
|
static void
|
|
reset_node_voting_status(void)
|
|
{
|
|
failover_state = FAILOVER_STATE_NONE;
|
|
|
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
|
{
|
|
log_error(_("reset_node_voting_status(): local_conn not set"));
|
|
return;
|
|
}
|
|
reset_voting_status(local_conn);
|
|
}
|
|
|
|
|
|
/*
 * check_connection()
 *
 * Check whether the server described by "node_info" is reachable; if not,
 * close the supplied connection and attempt one reconnect, logging the
 * outcome.
 *
 * NOTE(review): "conn" is passed by value, so the reassignments below
 * (setting it to NULL after PQfinish(), and storing the reconnect result)
 * are invisible to the caller - the caller's pointer may be left dangling
 * after PQfinish(), and any new connection established here is leaked.
 * Fixing this requires a PGconn ** parameter; confirm against all callers
 * before changing the signature.
 */
static void
check_connection(t_node_info *node_info, PGconn *conn)
{
	// consolidate below code
	if (is_server_available(node_info->conninfo) == false)
	{
		log_warning(_("connection to node %i lost"), node_info->node_id);

		if (conn != NULL)
		{
			/* caller's copy of the pointer is NOT cleared - see note above */
			PQfinish(conn);
			conn = NULL;
		}
	}

	if (PQstatus(conn) != CONNECTION_OK)
	{
		log_info(_("attempting to reconnect"));
		/* reconnect result is local only - see note above */
		conn = establish_db_connection(node_info->conninfo, false);

		if (PQstatus(conn) != CONNECTION_OK)
		{
			log_warning(_("reconnection failed"));
		}
		else
		{
			log_info(_("reconnected"));
		}
	}
}
|
|
|
|
|
|
#endif /* #ifndef BDR_ONLY */
|
|
|
|
void
|
|
close_connections_physical()
|
|
{
|
|
if (PQstatus(primary_conn) == CONNECTION_OK)
|
|
{
|
|
/* cancel any pending queries to the primary */
|
|
if (PQisBusy(primary_conn) == 1)
|
|
cancel_query(primary_conn, config_file_options.async_query_timeout);
|
|
PQfinish(primary_conn);
|
|
primary_conn = NULL;
|
|
}
|
|
|
|
if (upstream_conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
|
|
{
|
|
PQfinish(upstream_conn);
|
|
upstream_conn = NULL;
|
|
}
|
|
|
|
}
|