repmgrd: in BDR mode, have each repmgrd monitor each node

This will cover both the case when an entire node including
repmgrd goes down, and when one PostgreSQL instance goes down
but repmgrd is still up (in which case only one of the repmgrds
will handle the failover).
This commit is contained in:
Ian Barwick
2017-07-14 15:01:18 +09:00
parent e3b3fb65f0
commit 951c7dbd07
9 changed files with 211 additions and 89 deletions

View File

@@ -16,7 +16,7 @@
/* mainly for use by repmgrd */ /* mainly for use by repmgrd */
int server_version_num = 0; int server_version_num = UNKNOWN_SERVER_VERSION_NUM;
static PGconn *_establish_db_connection(const char *conninfo, static PGconn *_establish_db_connection(const char *conninfo,
@@ -1247,6 +1247,7 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row)
node_info->is_ready = false; node_info->is_ready = false;
node_info->is_visible = false; node_info->is_visible = false;
node_info->last_wal_receive_lsn = InvalidXLogRecPtr; node_info->last_wal_receive_lsn = InvalidXLogRecPtr;
node_info->monitoring_state = MS_NORMAL;
} }
@@ -3135,7 +3136,7 @@ void _populate_bdr_node_records(PGresult *res, BdrNodeInfoList *node_list)
{ {
int i; int i;
clear_node_info_list(node_list); clear_node_info_list((NodeInfoList *)node_list);
if (PQresultStatus(res) != PGRES_TUPLES_OK) if (PQresultStatus(res) != PGRES_TUPLES_OK)
{ {
@@ -3180,3 +3181,37 @@ _populate_bdr_node_record(PGresult *res, t_bdr_node_info *node_info, int row)
} }
bool
am_bdr_failover_handler(PGconn *conn, int node_id)
{
PQExpBufferData query;
PGresult *res;
bool am_handler;
initPQExpBuffer(&query);
appendPQExpBuffer(
&query,
"SELECT repmgr.am_bdr_failover_handler(%i)",
node_id);
res = PQexec(conn, query.data);
termPQExpBuffer(&query);
am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0) ? true : false;
PQclear(res);
return am_handler;
}
void
unset_bdr_failover_handler(PGconn *conn)
{
PGresult *res;
res = PQexec(conn, "SELECT repmgr.unset_bdr_failover_handler()");
PQclear(res);
return;
}

View File

@@ -9,6 +9,7 @@
#include "access/xlogdefs.h" #include "access/xlogdefs.h"
#include "pqexpbuffer.h" #include "pqexpbuffer.h"
#include "portability/instr_time.h"
#include "config.h" #include "config.h"
#include "strutil.h" #include "strutil.h"
@@ -40,8 +41,10 @@ typedef enum {
RECORD_NOT_FOUND RECORD_NOT_FOUND
} RecordStatus; } RecordStatus;
typedef enum {
MS_NORMAL = 0,
MS_DEGRADED = 1
} MonitoringState;
/* /*
* Struct to store node information * Struct to store node information
@@ -62,6 +65,7 @@ typedef struct s_node_info
bool is_ready; bool is_ready;
bool is_visible; bool is_visible;
XLogRecPtr last_wal_receive_lsn; XLogRecPtr last_wal_receive_lsn;
MonitoringState monitoring_state;
PGconn *conn; PGconn *conn;
} t_node_info; } t_node_info;
@@ -80,6 +84,7 @@ typedef struct s_node_info
false, \ false, \
false, \ false, \
InvalidXLogRecPtr, \ InvalidXLogRecPtr, \
MS_NORMAL, \
NULL \ NULL \
} }
@@ -323,5 +328,7 @@ void add_extension_tables_to_bdr_replication_set(PGconn *conn);
bool bdr_node_exists(PGconn *conn, const char *node_name); bool bdr_node_exists(PGconn *conn, const char *node_name);
bool am_bdr_failover_handler(PGconn *conn, int node_id);
void unset_bdr_failover_handler(PGconn *conn);
#endif /* dbutils.h */ #endif /* dbutils.h */

View File

@@ -70,3 +70,15 @@ CREATE FUNCTION reset_voting_status()
RETURNS VOID RETURNS VOID
AS '$libdir/repmgr', 'reset_voting_status' AS '$libdir/repmgr', 'reset_voting_status'
LANGUAGE C STRICT; LANGUAGE C STRICT;
CREATE FUNCTION am_bdr_failover_handler(INT)
RETURNS BOOL
AS '$libdir/repmgr', 'am_bdr_failover_handler'
LANGUAGE C STRICT;
CREATE FUNCTION unset_bdr_failover_handler()
RETURNS VOID
AS '$libdir/repmgr', 'unset_bdr_failover_handler'
LANGUAGE C STRICT;

View File

@@ -37,8 +37,6 @@ typedef struct TablespaceDataList
static PGconn *primary_conn = NULL; static PGconn *primary_conn = NULL;
static PGconn *source_conn = NULL; static PGconn *source_conn = NULL;
static int server_version_num = UNKNOWN_SERVER_VERSION_NUM;
static char local_data_directory[MAXPGPATH]; static char local_data_directory[MAXPGPATH];
static bool local_data_directory_provided = false; static bool local_data_directory_provided = false;

View File

@@ -45,11 +45,14 @@ typedef enum {
typedef struct repmgrdSharedState typedef struct repmgrdSharedState
{ {
LWLockId lock; /* protects search/modification */ LWLockId lock; /* protects search/modification */
/* streaming failover */
NodeState node_state; NodeState node_state;
NodeVotingStatus voting_status; NodeVotingStatus voting_status;
int current_electoral_term; int current_electoral_term;
int candidate_node_id; int candidate_node_id;
bool follow_new_primary; bool follow_new_primary;
/* BDR failover */
int bdr_failover_handler;
} repmgrdSharedState; } repmgrdSharedState;
static repmgrdSharedState *shared_state = NULL; static repmgrdSharedState *shared_state = NULL;
@@ -83,6 +86,13 @@ PG_FUNCTION_INFO_V1(get_new_primary);
Datum reset_voting_status(PG_FUNCTION_ARGS); Datum reset_voting_status(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(reset_voting_status); PG_FUNCTION_INFO_V1(reset_voting_status);
Datum am_bdr_failover_handler(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(am_bdr_failover_handler);
Datum unset_bdr_failover_handler(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(unset_bdr_failover_handler);
/* /*
* Module load callback * Module load callback
*/ */
@@ -157,6 +167,7 @@ repmgr_shmem_startup(void)
shared_state->voting_status = VS_NO_VOTE; shared_state->voting_status = VS_NO_VOTE;
shared_state->candidate_node_id = UNKNOWN_NODE_ID; shared_state->candidate_node_id = UNKNOWN_NODE_ID;
shared_state->follow_new_primary = false; shared_state->follow_new_primary = false;
shared_state->bdr_failover_handler = UNKNOWN_NODE_ID;
} }
LWLockRelease(AddinShmemInitLock); LWLockRelease(AddinShmemInitLock);
@@ -330,3 +341,38 @@ reset_voting_status(PG_FUNCTION_ARGS)
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
Datum
am_bdr_failover_handler(PG_FUNCTION_ARGS)
{
int node_id = PG_GETARG_INT32(0);
bool am_handler = false;
LWLockAcquire(shared_state->lock, LW_SHARED);
if (shared_state->bdr_failover_handler == UNKNOWN_NODE_ID)
{
shared_state->bdr_failover_handler = node_id;
am_handler = true;
}
else if (shared_state->bdr_failover_handler == node_id)
{
am_handler = true;
}
LWLockRelease(shared_state->lock);
PG_RETURN_BOOL(am_handler);
}
Datum
unset_bdr_failover_handler(PG_FUNCTION_ARGS)
{
LWLockAcquire(shared_state->lock, LW_SHARED);
shared_state->bdr_failover_handler = UNKNOWN_NODE_ID;
LWLockRelease(shared_state->lock);
PG_RETURN_VOID();
}

View File

@@ -14,7 +14,7 @@
static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t got_SIGHUP = false;
static void do_bdr_failover(NodeInfoList *nodes); static void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node);
void void
@@ -31,6 +31,8 @@ monitor_bdr(void)
t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER; t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
RecordStatus record_status; RecordStatus record_status;
// t_node_info other_node_info = T_NODE_INFO_INITIALIZER;
/* sanity check local database */ /* sanity check local database */
log_info(_("connecting to local database '%s'"), log_info(_("connecting to local database '%s'"),
config_file_options.conninfo); config_file_options.conninfo);
@@ -61,7 +63,6 @@ monitor_bdr(void)
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false) if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false)
{ {
log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"), log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"),
@@ -125,7 +126,7 @@ monitor_bdr(void)
NULL); NULL);
/* /*
* retrieve list of nodes - we'll need these if the DB connection goes away, * retrieve list of all nodes - we'll need these if the DB connection goes away,
*/ */
get_all_node_records(local_conn, &nodes); get_all_node_records(local_conn, &nodes);
@@ -135,44 +136,56 @@ monitor_bdr(void)
while (true) while (true)
{ {
NodeInfoListCell *cell;
/* monitoring loop */ /* monitoring loop */
log_verbose(LOG_DEBUG, "bdr check loop..."); log_verbose(LOG_DEBUG, "bdr check loop...");
switch (monitoring_state) for (cell = nodes.head; cell; cell = cell->next)
{ {
case MS_NORMAL: if (cell->node_info->node_id == local_node_info.node_id)
{ {
if (is_server_available(local_node_info.conninfo) == false) log_debug("checking local node %i in %s state",
{ local_node_info.node_id,
// XXX improve print_monitoring_state(cell->node_info->monitoring_state));
log_warning("connection problem!");
do_bdr_failover(&nodes);
}
else
{
log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
config_file_options.monitor_interval_secs);
sleep(config_file_options.monitor_interval_secs);
}
} }
case MS_DEGRADED: else
{ {
/* degraded monitoring */ log_debug("checking other node %i in %s state",
if (is_server_available(local_node_info.conninfo) == true) cell->node_info->node_id,
print_monitoring_state(cell->node_info->monitoring_state));
}
switch (cell->node_info->monitoring_state)
{
case MS_NORMAL:
{ {
log_notice(_("monitored node %i has recovered"), local_node_info.node_id); if (is_server_available(cell->node_info->conninfo) == false)
// do_bdr_recovery() {
// XXX improve
log_warning("connection problem! to node %i", cell->node_info->node_id);
do_bdr_failover(&nodes, cell->node_info);
}
} }
else break;
case MS_DEGRADED:
{ {
log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", /* degraded monitoring */
config_file_options.monitor_interval_secs); if (is_server_available(cell->node_info->conninfo) == true)
sleep(config_file_options.monitor_interval_secs); {
log_notice(_("monitored node %i has recovered"), cell->node_info->node_id);
// do_bdr_recovery()
}
} }
break;
} }
} }
log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")",
config_file_options.monitor_interval_secs);
sleep(config_file_options.monitor_interval_secs);
if (got_SIGHUP) if (got_SIGHUP)
{ {
/* /*
@@ -186,9 +199,6 @@ monitor_bdr(void)
update_registration(local_conn); update_registration(local_conn);
} }
/* reload node list */
get_all_node_records(local_conn, &nodes);
got_SIGHUP = false; got_SIGHUP = false;
} }
@@ -199,14 +209,13 @@ monitor_bdr(void)
/* /*
* do_bdr_failover() * do_bdr_failover()
* *0
* Here we attempt to perform a BDR "failover". * Here we attempt to perform a BDR "failover".
* *
* As there's no equivalent of a physical replication failover, * As there's no equivalent of a physical replication failover,
* we'll do the following: * we'll do the following:
* *
* - attempt to find another node, to set our node record as inactive * - connect to active node
* (there should be only one other node)
* - generate an event log record on that node * - generate an event log record on that node
* - optionally execute `bdr_failover_command`, passing the conninfo string * - optionally execute `bdr_failover_command`, passing the conninfo string
* of that node to the command; this can be used for e.g. reconfiguring * of that node to the command; this can be used for e.g. reconfiguring
@@ -215,29 +224,33 @@ monitor_bdr(void)
*/ */
void void
do_bdr_failover(NodeInfoList *nodes) do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
{ {
PGconn *next_node_conn = NULL; PGconn *next_node_conn = NULL;
NodeInfoListCell *cell; NodeInfoListCell *cell;
bool failover_success = false; // bool failover_success = false;
PQExpBufferData event_details; PQExpBufferData event_details;
RecordStatus record_status; RecordStatus record_status;
t_event_info event_info = T_EVENT_INFO_INITIALIZER; t_event_info event_info = T_EVENT_INFO_INITIALIZER;
t_node_info target_node = T_NODE_INFO_INITIALIZER; t_node_info target_node = T_NODE_INFO_INITIALIZER;
initPQExpBuffer(&event_details); initPQExpBuffer(&event_details);
/* get next active node */ monitored_node->monitoring_state = MS_DEGRADED;
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
/* get other node */
for (cell = nodes->head; cell; cell = cell->next) for (cell = nodes->head; cell; cell = cell->next)
{ {
log_debug("do_bdr_failover() %s", cell->node_info->node_name); log_debug("do_bdr_failover() %s", cell->node_info->node_name);
/* don't attempt to connect to the current monitored node, as that's the one which has failed */ /* don't attempt to connect to the current monitored node, as that's the one which has failed */
if (cell->node_info->node_id == local_node_info.node_id) if (cell->node_info->node_id == monitored_node->node_id)
continue; continue;
/* XXX skip inactive node? */ /* XXX skip inactive node? */
// reuse local conn if local node is up
next_node_conn = establish_db_connection(cell->node_info->conninfo, false); next_node_conn = establish_db_connection(cell->node_info->conninfo, false);
if (PQstatus(next_node_conn) == CONNECTION_OK) if (PQstatus(next_node_conn) == CONNECTION_OK)
@@ -251,6 +264,7 @@ do_bdr_failover(NodeInfoList *nodes)
next_node_conn = NULL; next_node_conn = NULL;
} }
/* shouldn't happen, and if it does, it means everything is down */
if (next_node_conn == NULL) if (next_node_conn == NULL)
{ {
appendPQExpBuffer(&event_details, appendPQExpBuffer(&event_details,
@@ -258,33 +272,42 @@ do_bdr_failover(NodeInfoList *nodes)
log_error("%s", event_details.data); log_error("%s", event_details.data);
// no other nodes found /* no other nodes found - continue degraded monitoring */
// continue degraded monitoring until node is restored? return;
} }
else
// call: repmgr.am_bdr_failover_handler(node_id)
if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false)
{ {
log_info(_("connecting to target node %s"), target_node.node_name); log_debug("XXX am not failover handler");
PQfinish(next_node_conn);
failover_success = true; log_debug("other node's repmgrd is handling failover");
return;
event_info.conninfo_str = target_node.conninfo;
event_info.node_name = target_node.node_name;
/* update our own record on the other node */
update_node_record_set_active(next_node_conn, local_node_info.node_id, false);
appendPQExpBuffer(&event_details,
_("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
local_node_info.node_name,
local_node_info.node_id,
target_node.node_name,
target_node.node_id);
} }
monitoring_state = MS_DEGRADED; log_debug("YYYam the failover handler");
INSTR_TIME_SET_CURRENT(degraded_monitoring_start);
// check here that the node hasn't come back up... // check here that the node hasn't come back up...
log_info(_("connecting to target node %s"), target_node.node_name);
// failover_success = true;
event_info.conninfo_str = target_node.conninfo;
event_info.node_name = target_node.node_name;
/* update our own record on the other node */
update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
appendPQExpBuffer(&event_details,
_("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"),
monitored_node->node_name,
monitored_node->node_id,
target_node.node_name,
target_node.node_id);
/* /*
* Create an event record * Create an event record
@@ -301,12 +324,13 @@ do_bdr_failover(NodeInfoList *nodes)
&config_file_options, &config_file_options,
config_file_options.node_id, config_file_options.node_id,
"bdr_failover", "bdr_failover",
failover_success, true,
event_details.data, event_details.data,
&event_info); &event_info);
termPQExpBuffer(&event_details); termPQExpBuffer(&event_details);
unset_bdr_failover_handler(next_node_conn);
/* local monitoring mode - there's no new node to monitor */ /* local monitoring mode - there's no new node to monitor */
return; return;

View File

@@ -47,7 +47,6 @@ static PGconn *primary_conn = NULL;
static ElectionResult do_election(void); static ElectionResult do_election(void);
static const char *_print_voting_status(NodeVotingStatus voting_status); static const char *_print_voting_status(NodeVotingStatus voting_status);
static const char *_print_election_result(ElectionResult result); static const char *_print_election_result(ElectionResult result);
static const char *_print_monitoring_state(MonitoringState monitoring_state);
static FailoverState promote_self(void); static FailoverState promote_self(void);
static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id); static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
@@ -315,7 +314,7 @@ monitor_streaming_primary(void)
log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"), log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
local_node_info.node_name, local_node_info.node_name,
local_node_info.node_id, local_node_info.node_id,
_print_monitoring_state(monitoring_state)); print_monitoring_state(monitoring_state));
if (monitoring_state == MS_DEGRADED) if (monitoring_state == MS_DEGRADED)
{ {
@@ -617,7 +616,7 @@ monitor_streaming_standby(void)
local_node_info.node_id, local_node_info.node_id,
upstream_node_info.node_name, upstream_node_info.node_name,
upstream_node_info.node_id, upstream_node_info.node_id,
_print_monitoring_state(monitoring_state)); print_monitoring_state(monitoring_state));
if (monitoring_state == MS_DEGRADED) if (monitoring_state == MS_DEGRADED)
{ {
@@ -1459,22 +1458,6 @@ _print_election_result(ElectionResult result)
return "UNKNOWN"; return "UNKNOWN";
} }
static const char *
_print_monitoring_state(MonitoringState monitoring_state)
{
switch(monitoring_state)
{
case MS_NORMAL:
return "normal";
case MS_DEGRADED:
return "degraded";
}
/* should never reach here */
return "UNKNOWN";
}
static ElectionResult static ElectionResult

View File

@@ -665,6 +665,23 @@ calculate_elapsed(instr_time start_time)
} }
const char *
print_monitoring_state(MonitoringState monitoring_state)
{
switch(monitoring_state)
{
case MS_NORMAL:
return "normal";
case MS_DEGRADED:
return "degraded";
}
/* should never reach here */
return "UNKNOWN";
}
static void static void
close_connections() close_connections()
{ {
@@ -694,3 +711,5 @@ terminate(int retval)
exit(retval); exit(retval);
} }

View File

@@ -15,10 +15,6 @@ typedef enum {
NODE_STATUS_DOWN NODE_STATUS_DOWN
} NodeStatus; } NodeStatus;
typedef enum {
MS_NORMAL = 0,
MS_DEGRADED = 1
} MonitoringState;
extern MonitoringState monitoring_state; extern MonitoringState monitoring_state;
extern instr_time degraded_monitoring_start; extern instr_time degraded_monitoring_start;
@@ -31,6 +27,8 @@ extern bool startup_event_logged;
PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status); PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);
int calculate_elapsed(instr_time start_time); int calculate_elapsed(instr_time start_time);
const char *print_monitoring_state(MonitoringState monitoring_state);
void update_registration(PGconn *conn); void update_registration(PGconn *conn);
void terminate(int retval); void terminate(int retval);
#endif /* _REPMGRD_H_ */ #endif /* _REPMGRD_H_ */