mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 16:46:28 +00:00
Handle various (unlikely) failure states
This commit is contained in:
@@ -952,7 +952,7 @@ _get_primary_connection(PGconn *conn,
|
|||||||
" SELECT node_id, conninfo, "
|
" SELECT node_id, conninfo, "
|
||||||
" CASE WHEN type = 'primary' THEN 1 ELSE 2 END AS type_priority"
|
" CASE WHEN type = 'primary' THEN 1 ELSE 2 END AS type_priority"
|
||||||
" FROM repmgr.nodes "
|
" FROM repmgr.nodes "
|
||||||
" WHERE type != 'witness' "
|
" WHERE active IS TRUE "
|
||||||
"ORDER BY active DESC, type_priority, priority, node_id");
|
"ORDER BY active DESC, type_priority, priority, node_id");
|
||||||
|
|
||||||
log_verbose(LOG_DEBUG, "get_primary_connection():\n%s", query.data);
|
log_verbose(LOG_DEBUG, "get_primary_connection():\n%s", query.data);
|
||||||
@@ -1030,7 +1030,6 @@ get_primary_connection(PGconn *conn,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PGconn *
|
PGconn *
|
||||||
get_primary_connection_quiet(PGconn *conn,
|
get_primary_connection_quiet(PGconn *conn,
|
||||||
int *primary_id, char *primary_conninfo_out)
|
int *primary_id, char *primary_conninfo_out)
|
||||||
@@ -1038,6 +1037,7 @@ get_primary_connection_quiet(PGconn *conn,
|
|||||||
return _get_primary_connection(conn, primary_id, primary_conninfo_out, true);
|
return _get_primary_connection(conn, primary_id, primary_conninfo_out, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return the id of the active primary node, or NODE_NOT_FOUND if no
|
* Return the id of the active primary node, or NODE_NOT_FOUND if no
|
||||||
* record available.
|
* record available.
|
||||||
|
|||||||
90
repmgrd.c
90
repmgrd.c
@@ -885,7 +885,9 @@ monitor_streaming_standby(void)
|
|||||||
{
|
{
|
||||||
int degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
|
int degraded_monitoring_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||||
|
|
||||||
log_debug("monitoring node in degraded state for %i seconds", degraded_monitoring_elapsed);
|
log_debug("monitoring node %i in degraded state for %i seconds",
|
||||||
|
upstream_node_info.node_id,
|
||||||
|
degraded_monitoring_elapsed);
|
||||||
|
|
||||||
if (is_server_available(upstream_node_info.conninfo) == true)
|
if (is_server_available(upstream_node_info.conninfo) == true)
|
||||||
{
|
{
|
||||||
@@ -908,7 +910,7 @@ monitor_streaming_standby(void)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
||||||
if (primary_conn == NULL ||PQstatus(primary_conn) != CONNECTION_OK)
|
if (primary_conn == NULL || PQstatus(primary_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
primary_conn = establish_primary_db_connection(upstream_conn, false);
|
primary_conn = establish_primary_db_connection(upstream_conn, false);
|
||||||
}
|
}
|
||||||
@@ -1180,12 +1182,24 @@ do_primary_failover(void)
|
|||||||
case FAILOVER_STATE_PROMOTION_FAILED:
|
case FAILOVER_STATE_PROMOTION_FAILED:
|
||||||
log_debug("failover state is PROMOTION FAILED");
|
log_debug("failover state is PROMOTION FAILED");
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
case FAILOVER_STATE_FOLLOW_FAIL:
|
||||||
|
/*
|
||||||
|
* for whatever reason we were unable to follow the new primary -
|
||||||
|
* continue monitoring in degraded state
|
||||||
|
*/
|
||||||
|
monitoring_state = MS_DEGRADED;
|
||||||
|
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
|
||||||
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
case FAILOVER_STATE_NO_NEW_PRIMARY:
|
||||||
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
case FAILOVER_STATE_WAITING_NEW_PRIMARY:
|
||||||
/* pass control back down to start_monitoring() */
|
/* pass control back down to start_monitoring() */
|
||||||
// -> should kick off new election
|
// -> should kick off new election
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
case FAILOVER_STATE_NODE_NOTIFICATION_ERROR:
|
||||||
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
case FAILOVER_STATE_LOCAL_NODE_FAILURE:
|
||||||
case FAILOVER_STATE_UNKNOWN:
|
case FAILOVER_STATE_UNKNOWN:
|
||||||
case FAILOVER_STATE_NONE:
|
case FAILOVER_STATE_NONE:
|
||||||
@@ -1378,9 +1392,15 @@ promote_self(void)
|
|||||||
sleep(config_file_options.promote_delay);
|
sleep(config_file_options.promote_delay);
|
||||||
}
|
}
|
||||||
|
|
||||||
// XXX check success
|
|
||||||
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
||||||
|
|
||||||
|
if (record_status != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve metadata record for failed upstream (ID: %i)"),
|
||||||
|
local_node_info.upstream_node_id);
|
||||||
|
return FAILOVER_STATE_PROMOTION_FAILED;
|
||||||
|
}
|
||||||
|
|
||||||
/* the presence of either of these commands has been established already */
|
/* the presence of either of these commands has been established already */
|
||||||
if (config_file_options.service_promote_command[0] != '\0')
|
if (config_file_options.service_promote_command[0] != '\0')
|
||||||
promote_command = config_file_options.service_promote_command;
|
promote_command = config_file_options.service_promote_command;
|
||||||
@@ -1597,11 +1617,24 @@ follow_new_primary(int new_primary_id)
|
|||||||
RecordStatus record_status;
|
RecordStatus record_status;
|
||||||
bool new_primary_ok = false;
|
bool new_primary_ok = false;
|
||||||
|
|
||||||
// XXX check success
|
|
||||||
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
record_status = get_node_record(local_conn, new_primary_id, &new_primary);
|
||||||
|
|
||||||
|
if (record_status != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve metadata record for upstream node (ID: %i)"),
|
||||||
|
new_primary_id);
|
||||||
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &failed_primary);
|
||||||
|
|
||||||
|
if (record_status != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve metadata record for failed primary (ID: %i)"),
|
||||||
|
local_node_info.upstream_node_id);
|
||||||
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
// XXX check if new_primary_id == failed_primary.node_id?
|
// XXX check if new_primary_id == failed_primary.node_id?
|
||||||
|
|
||||||
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
|
if (log_type == REPMGR_STDERR && *config_file_options.log_file)
|
||||||
@@ -1612,13 +1645,6 @@ follow_new_primary(int new_primary_id)
|
|||||||
log_debug(_("standby follow command is:\n \"%s\""),
|
log_debug(_("standby follow command is:\n \"%s\""),
|
||||||
config_file_options.follow_command);
|
config_file_options.follow_command);
|
||||||
|
|
||||||
/*
|
|
||||||
* disconnect from local node, as follow operation will result in
|
|
||||||
* a server restart
|
|
||||||
*/
|
|
||||||
PQfinish(local_conn);
|
|
||||||
local_conn = NULL;
|
|
||||||
|
|
||||||
upstream_conn = establish_db_connection(new_primary.conninfo, false);
|
upstream_conn = establish_db_connection(new_primary.conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(upstream_conn) == CONNECTION_OK)
|
if (PQstatus(upstream_conn) == CONNECTION_OK)
|
||||||
@@ -1635,12 +1661,20 @@ follow_new_primary(int new_primary_id)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (new_primary_ok == false)
|
if (new_primary_ok == false)
|
||||||
{
|
{
|
||||||
return FAILOVER_STATE_FOLLOW_FAIL;
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||||
}
|
}
|
||||||
// XXX check new primary is reachable and is not in recovery here
|
|
||||||
|
/*
|
||||||
|
* disconnect from local node, as follow operation will result in
|
||||||
|
* a server restart
|
||||||
|
*/
|
||||||
|
|
||||||
|
PQfinish(local_conn);
|
||||||
|
local_conn = NULL;
|
||||||
|
|
||||||
|
/* execute the follow command */
|
||||||
r = system(config_file_options.follow_command);
|
r = system(config_file_options.follow_command);
|
||||||
|
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
@@ -1682,10 +1716,23 @@ follow_new_primary(int new_primary_id)
|
|||||||
* directly from the primary to ensure they're the current version
|
* directly from the primary to ensure they're the current version
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// XXX check success
|
|
||||||
|
|
||||||
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
|
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
|
||||||
|
|
||||||
|
if (record_status != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve metadata record found for node %i"),
|
||||||
|
new_primary_id);
|
||||||
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
|
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
|
||||||
|
if (record_status != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve metadata record found for node %i"),
|
||||||
|
local_node_info.node_id);
|
||||||
|
return FAILOVER_STATE_FOLLOW_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
@@ -1836,8 +1883,6 @@ do_election(void)
|
|||||||
electoral_term = set_voting_status_initiated(local_conn);
|
electoral_term = set_voting_status_initiated(local_conn);
|
||||||
|
|
||||||
/* get all active nodes attached to primary, excluding self */
|
/* get all active nodes attached to primary, excluding self */
|
||||||
// XXX include barman node in results
|
|
||||||
|
|
||||||
get_active_sibling_node_records(local_conn,
|
get_active_sibling_node_records(local_conn,
|
||||||
local_node_info.node_id,
|
local_node_info.node_id,
|
||||||
upstream_node_info.node_id,
|
upstream_node_info.node_id,
|
||||||
@@ -1856,7 +1901,6 @@ do_election(void)
|
|||||||
/* assume the worst case */
|
/* assume the worst case */
|
||||||
cell->node_info->is_visible = false;
|
cell->node_info->is_visible = false;
|
||||||
|
|
||||||
// XXX handle witness-barman
|
|
||||||
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||||
@@ -2201,8 +2245,12 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
|
|||||||
{
|
{
|
||||||
log_notice(_("node has recovered, reconnecting"));
|
log_notice(_("node has recovered, reconnecting"));
|
||||||
|
|
||||||
// XXX how to handle case where node is reachable
|
/*
|
||||||
// but connection denied due to connection exhaustion
|
* XXX we should also handle the case where node is pingable
|
||||||
|
* but connection denied due to connection exhaustion
|
||||||
|
* - fall back to degraded monitoring?
|
||||||
|
* - make that configurable
|
||||||
|
*/
|
||||||
conn = establish_db_connection(conninfo, false);
|
conn = establish_db_connection(conninfo, false);
|
||||||
if (PQstatus(conn) == CONNECTION_OK)
|
if (PQstatus(conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
@@ -2213,7 +2261,7 @@ try_reconnect(const char *conninfo, NodeStatus *node_status)
|
|||||||
PQfinish(conn);
|
PQfinish(conn);
|
||||||
log_notice(_("unable to reconnect to node"));
|
log_notice(_("unable to reconnect to node"));
|
||||||
}
|
}
|
||||||
log_info(_("sleeping %i seconds until next reconnection_attempt"),
|
log_info(_("sleeping %i seconds until next reconnection attempt"),
|
||||||
config_file_options.reconnect_interval);
|
config_file_options.reconnect_interval);
|
||||||
sleep(config_file_options.reconnect_interval);
|
sleep(config_file_options.reconnect_interval);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user