mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgrd: improve BDR recovery handling
This commit is contained in:
70
dbutils.c
70
dbutils.c
@@ -3185,6 +3185,76 @@ bdr_node_exists(PGconn *conn, const char *node_name)
|
||||
}
|
||||
|
||||
|
||||
ReplSlotStatus
|
||||
get_bdr_node_replication_slot_status(PGconn *conn, const char *node_name)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res;
|
||||
ReplSlotStatus status;
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(
|
||||
&query,
|
||||
" SELECT s.active "
|
||||
" FROM pg_catalog.pg_replication_slots s "
|
||||
" WHERE slot_name = "
|
||||
" (SELECT bdr.bdr_format_slot_name(node_sysid, node_timeline, node_dboid, datoid) "
|
||||
" FROM bdr.bdr_nodes "
|
||||
" WHERE node_name = '%s') ",
|
||||
node_name);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
|
||||
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
status = SLOT_UNKNOWN;
|
||||
}
|
||||
else
|
||||
{
|
||||
status = (strcmp(PQgetvalue(res, 0, 0), "t") == 0)
|
||||
? SLOT_ACTIVE
|
||||
: SLOT_INACTIVE;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
get_bdr_other_node_name(PGconn *conn, int node_id, char *node_name)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res;
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(
|
||||
&query,
|
||||
" SELECT node_name "
|
||||
" FROM repmgr.nodes "
|
||||
" WHERE node_id != %i",
|
||||
node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "get_bdr_other_node_name():\n %s", query.data);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
termPQExpBuffer(&query);
|
||||
|
||||
if(PQresultStatus(res) == PGRES_TUPLES_OK)
|
||||
{
|
||||
strncpy(node_name, PQgetvalue(res, 0, 0), MAXLEN);
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
add_extension_tables_to_bdr_replication_set(PGconn *conn)
|
||||
{
|
||||
|
||||
@@ -60,6 +60,13 @@ typedef enum {
|
||||
VR_NEGATIVE_VOTE
|
||||
} VoteRequestResult;
|
||||
|
||||
|
||||
typedef enum {
|
||||
SLOT_UNKNOWN = -1,
|
||||
SLOT_INACTIVE,
|
||||
SLOT_ACTIVE
|
||||
} ReplSlotStatus;
|
||||
|
||||
/*
|
||||
* Struct to store node information
|
||||
*/
|
||||
@@ -356,6 +363,8 @@ bool add_table_to_bdr_replication_set(PGconn *conn, const char *tablename, con
|
||||
void add_extension_tables_to_bdr_replication_set(PGconn *conn);
|
||||
|
||||
bool bdr_node_exists(PGconn *conn, const char *node_name);
|
||||
ReplSlotStatus get_bdr_node_replication_slot_status(PGconn *conn, const char *node_name);
|
||||
void get_bdr_other_node_name(PGconn *conn, int node_id, char *name_buf);
|
||||
|
||||
bool am_bdr_failover_handler(PGconn *conn, int node_id);
|
||||
void unset_bdr_failover_handler(PGconn *conn);
|
||||
|
||||
@@ -224,7 +224,8 @@ do_node_status(void)
|
||||
{
|
||||
log_warning(_("following issue(s) were detected:"));
|
||||
print_item_list(&warnings);
|
||||
log_hint(_("execute \"repmgr node check\" for more details"));
|
||||
/* add this when functionality implemented */
|
||||
/* log_hint(_("execute \"repmgr node check\" for more details")); */
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1300,6 +1300,9 @@ action_name(const int action)
|
||||
; case BDR_UNREGISTER:
|
||||
return "BDR UNREGISTER";
|
||||
|
||||
case NODE_STATUS:
|
||||
return "NODE STATUS";
|
||||
|
||||
case CLUSTER_SHOW:
|
||||
return "CLUSTER SHOW";
|
||||
case CLUSTER_EVENT:
|
||||
@@ -1358,7 +1361,7 @@ do_help(void)
|
||||
#endif
|
||||
printf(_(" %s [OPTIONS] bdr {register|unregister}\n"), progname());
|
||||
printf(_(" %s [OPTIONS] node status\n"), progname());
|
||||
printf(_(" %s [OPTIONS] cluster {show|matrix|crosscheck|cleanup}\n"), progname());
|
||||
printf(_(" %s [OPTIONS] cluster {show|event|matrix|crosscheck}\n"), progname());
|
||||
|
||||
puts("");
|
||||
|
||||
|
||||
@@ -416,13 +416,16 @@ static void
|
||||
do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
{
|
||||
PGconn *recovered_node_conn;
|
||||
PGconn *slot_check_conn;
|
||||
|
||||
PQExpBufferData event_details;
|
||||
t_bdr_node_info bdr_record;
|
||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||
int i;
|
||||
bool node_recovered = false;
|
||||
int node_recovery_elapsed;
|
||||
|
||||
char node_name[MAXLEN] = "";
|
||||
|
||||
recovered_node_conn = establish_db_connection(monitored_node->conninfo, false);
|
||||
|
||||
if (PQstatus(recovered_node_conn) != CONNECTION_OK)
|
||||
@@ -431,23 +434,30 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
return;
|
||||
}
|
||||
|
||||
if (am_bdr_failover_handler(recovered_node_conn, local_node_info.node_id) == false)
|
||||
/* determine which replication slot to look fore */
|
||||
if (monitored_node->node_id == local_node_info.node_id)
|
||||
{
|
||||
PQfinish(recovered_node_conn);
|
||||
log_debug("other node's repmgrd is handling recovery");
|
||||
return;
|
||||
slot_check_conn = recovered_node_conn;
|
||||
get_bdr_other_node_name(recovered_node_conn, local_node_info.node_id, node_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
slot_check_conn = local_conn;
|
||||
strncpy(node_name, monitored_node->node_name, MAXLEN);
|
||||
}
|
||||
|
||||
for (i = 0; i < config_file_options.bdr_recovery_timeout; i++)
|
||||
{
|
||||
RecordStatus record_status = get_bdr_node_record_by_name(
|
||||
recovered_node_conn,
|
||||
monitored_node->node_name,
|
||||
&bdr_record);
|
||||
ReplSlotStatus slot_status;
|
||||
|
||||
if (record_status == RECORD_FOUND && bdr_record.node_status == 'r')
|
||||
log_debug("checking for state of replication slot for node \"%s\"", node_name);
|
||||
|
||||
slot_status = get_bdr_node_replication_slot_status(
|
||||
slot_check_conn,
|
||||
node_name);
|
||||
|
||||
if (slot_status == SLOT_ACTIVE)
|
||||
{
|
||||
// check pg_stat_replication
|
||||
node_recovered = true;
|
||||
break;
|
||||
}
|
||||
@@ -456,32 +466,46 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if (node_recovered == false)
|
||||
{
|
||||
log_warning(_("node did not come up"));
|
||||
log_warning(_("no active replication slot for node \"%s\" found after %i seconds"),
|
||||
node_name,
|
||||
config_file_options.bdr_recovery_timeout);
|
||||
log_detail(_("this probably means inter-node BDR connections have not been re-established"));
|
||||
PQfinish(recovered_node_conn);
|
||||
return;
|
||||
}
|
||||
|
||||
log_info(_("active replication slot for node \"%s\" found after %i seconds"),
|
||||
node_name,
|
||||
i);
|
||||
|
||||
// XXX check other node is attached to this one so we
|
||||
// don't end up monitoring a parted node; if not attached,
|
||||
// generate a failed bdr_recovery event
|
||||
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||
monitored_node->monitoring_state = MS_NORMAL;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node '%s' (ID: %i) has recovered after %i seconds"),
|
||||
_("node \"%s\" (ID: %i) has recovered after %i seconds"),
|
||||
monitored_node->node_name,
|
||||
monitored_node->node_id,
|
||||
node_recovery_elapsed);
|
||||
|
||||
monitored_node->monitoring_state = MS_NORMAL;
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
|
||||
/* other node will generate the event */
|
||||
if (monitored_node->node_id == local_node_info.node_id)
|
||||
{
|
||||
termPQExpBuffer(&event_details);
|
||||
PQfinish(recovered_node_conn);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
/* generate the event on the currently active node only */
|
||||
if (monitored_node->node_id != local_node_info.node_id)
|
||||
{
|
||||
@@ -513,8 +537,6 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
unset_bdr_failover_handler(recovered_node_conn);
|
||||
|
||||
PQfinish(recovered_node_conn);
|
||||
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user