diff --git a/dbutils.c b/dbutils.c index 11ff7ac5..85947d8b 100644 --- a/dbutils.c +++ b/dbutils.c @@ -16,7 +16,7 @@ /* mainly for use by repmgrd */ -int server_version_num = 0; +int server_version_num = UNKNOWN_SERVER_VERSION_NUM; static PGconn *_establish_db_connection(const char *conninfo, @@ -1247,6 +1247,7 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row) node_info->is_ready = false; node_info->is_visible = false; node_info->last_wal_receive_lsn = InvalidXLogRecPtr; + node_info->monitoring_state = MS_NORMAL; } @@ -3135,7 +3136,7 @@ void _populate_bdr_node_records(PGresult *res, BdrNodeInfoList *node_list) { int i; - clear_node_info_list(node_list); + clear_node_info_list((NodeInfoList *)node_list); if (PQresultStatus(res) != PGRES_TUPLES_OK) { @@ -3180,3 +3181,37 @@ _populate_bdr_node_record(PGresult *res, t_bdr_node_info *node_info, int row) } +bool +am_bdr_failover_handler(PGconn *conn, int node_id) +{ + PQExpBufferData query; + PGresult *res; + bool am_handler; + + initPQExpBuffer(&query); + + appendPQExpBuffer( + &query, + "SELECT repmgr.am_bdr_failover_handler(%i)", + node_id); + + + res = PQexec(conn, query.data); + termPQExpBuffer(&query); + + am_handler = (strcmp(PQgetvalue(res, 0, 0), "t") == 0) ? true : false; + + PQclear(res); + + return am_handler; +} + +void +unset_bdr_failover_handler(PGconn *conn) +{ + PGresult *res; + res = PQexec(conn, "SELECT repmgr.unset_bdr_failover_handler()"); + + PQclear(res); + return; +} diff --git a/dbutils.h b/dbutils.h index 1b9ede4d..adae6848 100644 --- a/dbutils.h +++ b/dbutils.h @@ -9,6 +9,7 @@ #include "access/xlogdefs.h" #include "pqexpbuffer.h" +#include "portability/instr_time.h" #include "config.h" #include "strutil.h" @@ -40,8 +41,10 @@ typedef enum { RECORD_NOT_FOUND } RecordStatus; - - +typedef enum { + MS_NORMAL = 0, + MS_DEGRADED = 1 +} MonitoringState; /* * Struct to store node information @@ -62,6 +65,7 @@ typedef struct s_node_info bool is_ready; bool is_visible; XLogRecPtr last_wal_receive_lsn; + MonitoringState monitoring_state; PGconn *conn; } t_node_info; @@ -80,6 +84,7 @@ typedef struct s_node_info false, \ false, \ InvalidXLogRecPtr, \ + MS_NORMAL, \ NULL \ } @@ -323,5 +328,7 @@ void add_extension_tables_to_bdr_replication_set(PGconn *conn); bool bdr_node_exists(PGconn *conn, const char *node_name); +bool am_bdr_failover_handler(PGconn *conn, int node_id); +void unset_bdr_failover_handler(PGconn *conn); #endif /* dbutils.h */ diff --git a/repmgr--4.0.sql b/repmgr--4.0.sql index eabf04d1..37c1ac26 100644 --- a/repmgr--4.0.sql +++ b/repmgr--4.0.sql @@ -70,3 +70,15 @@ CREATE FUNCTION reset_voting_status() RETURNS VOID AS '$libdir/repmgr', 'reset_voting_status' LANGUAGE C STRICT; + + +CREATE FUNCTION am_bdr_failover_handler(INT) + RETURNS BOOL + AS '$libdir/repmgr', 'am_bdr_failover_handler' + LANGUAGE C STRICT; + + +CREATE FUNCTION unset_bdr_failover_handler() + RETURNS VOID + AS '$libdir/repmgr', 'unset_bdr_failover_handler' + LANGUAGE C STRICT; diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index 66840464..f55df9ca 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -37,8 +37,6 @@ typedef struct TablespaceDataList static PGconn *primary_conn = NULL; static PGconn *source_conn = NULL; -static int server_version_num = UNKNOWN_SERVER_VERSION_NUM; - static char local_data_directory[MAXPGPATH]; static bool local_data_directory_provided = false; diff --git a/repmgr.c b/repmgr.c index 328c66db..93499563 100644 --- a/repmgr.c +++ b/repmgr.c @@ -45,11 +45,14 @@ typedef enum { typedef struct repmgrdSharedState { LWLockId lock; /* protects search/modification */ + /* streaming failover */ NodeState node_state; NodeVotingStatus voting_status; int current_electoral_term; int candidate_node_id; bool follow_new_primary; + /* BDR failover */ + int bdr_failover_handler; } repmgrdSharedState; static repmgrdSharedState *shared_state = NULL; @@ -83,6 +86,13 @@ PG_FUNCTION_INFO_V1(get_new_primary); Datum reset_voting_status(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(reset_voting_status); +Datum am_bdr_failover_handler(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(am_bdr_failover_handler); + +Datum unset_bdr_failover_handler(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(unset_bdr_failover_handler); + + /* * Module load callback */ @@ -157,6 +167,7 @@ repmgr_shmem_startup(void) shared_state->voting_status = VS_NO_VOTE; shared_state->candidate_node_id = UNKNOWN_NODE_ID; shared_state->follow_new_primary = false; + shared_state->bdr_failover_handler = UNKNOWN_NODE_ID; } LWLockRelease(AddinShmemInitLock); @@ -330,3 +341,38 @@ reset_voting_status(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + + +Datum +am_bdr_failover_handler(PG_FUNCTION_ARGS) +{ + int node_id = PG_GETARG_INT32(0); + bool am_handler = false; + + LWLockAcquire(shared_state->lock, LW_SHARED); + + if (shared_state->bdr_failover_handler == UNKNOWN_NODE_ID) + { + shared_state->bdr_failover_handler = node_id; + am_handler = true; + } + else if (shared_state->bdr_failover_handler == node_id) + { + am_handler = true; + } + + LWLockRelease(shared_state->lock); + + PG_RETURN_BOOL(am_handler); +} + + +Datum +unset_bdr_failover_handler(PG_FUNCTION_ARGS) +{ + LWLockAcquire(shared_state->lock, LW_SHARED); + shared_state->bdr_failover_handler = UNKNOWN_NODE_ID; + LWLockRelease(shared_state->lock); + + PG_RETURN_VOID(); +} diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c index 9a6f0656..267ea142 100644 --- a/repmgrd-bdr.c +++ b/repmgrd-bdr.c @@ -14,7 +14,7 @@ static volatile sig_atomic_t got_SIGHUP = false; -static void do_bdr_failover(NodeInfoList *nodes); +static void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node); void @@ -31,6 +31,8 @@ monitor_bdr(void) t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER; RecordStatus record_status; +// t_node_info other_node_info = T_NODE_INFO_INITIALIZER; + /* sanity check local database */ log_info(_("connecting to local database '%s'"), config_file_options.conninfo); @@ -61,7 +63,6 @@ monitor_bdr(void) exit(ERR_BAD_CONFIG); } - if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false) { log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"), @@ -125,7 +126,7 @@ monitor_bdr(void) NULL); /* - * retrieve list of nodes - we'll need these if the DB connection goes away, + * retrieve list of all nodes - we'll need these if the DB connection goes away, */ get_all_node_records(local_conn, &nodes); @@ -135,44 +136,56 @@ monitor_bdr(void) while (true) { - + NodeInfoListCell *cell; /* monitoring loop */ log_verbose(LOG_DEBUG, "bdr check loop..."); - switch (monitoring_state) + for (cell = nodes.head; cell; cell = cell->next) { - case MS_NORMAL: + if (cell->node_info->node_id == local_node_info.node_id) { - if (is_server_available(local_node_info.conninfo) == false) - { - // XXX improve - log_warning("connection problem!"); - do_bdr_failover(&nodes); - } - else - { - log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", - config_file_options.monitor_interval_secs); - sleep(config_file_options.monitor_interval_secs); - } + log_debug("checking local node %i in %s state", + local_node_info.node_id, + print_monitoring_state(cell->node_info->monitoring_state)); } - case MS_DEGRADED: + else { - /* degraded monitoring */ - if (is_server_available(local_node_info.conninfo) == true) + log_debug("checking other node %i in %s state", + cell->node_info->node_id, + print_monitoring_state(cell->node_info->monitoring_state)); + } + + + switch (cell->node_info->monitoring_state) + { + case MS_NORMAL: { - log_notice(_("monitored node %i has recovered"), local_node_info.node_id); - // do_bdr_recovery() + if (is_server_available(cell->node_info->conninfo) == false) + { + // XXX improve + log_warning("connection problem! to node %i", cell->node_info->node_id); + do_bdr_failover(&nodes, cell->node_info); + } } - else + break; + case MS_DEGRADED: { - log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", - config_file_options.monitor_interval_secs); - sleep(config_file_options.monitor_interval_secs); + /* degraded monitoring */ + if (is_server_available(cell->node_info->conninfo) == true) + { + log_notice(_("monitored node %i has recovered"), cell->node_info->node_id); + // do_bdr_recovery() + } + } + break; } } + log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", + config_file_options.monitor_interval_secs); + sleep(config_file_options.monitor_interval_secs); + if (got_SIGHUP) { /* @@ -186,9 +199,6 @@ monitor_bdr(void) update_registration(local_conn); } - /* reload node list */ - get_all_node_records(local_conn, &nodes); - got_SIGHUP = false; } @@ -199,14 +209,13 @@ monitor_bdr(void) /* * do_bdr_failover() - * + *0 * Here we attempt to perform a BDR "failover". * * As there's no equivalent of a physical replication failover, * we'll do the following: * - * - attempt to find another node, to set our node record as inactive - * (there should be only one other node) + * - connect to active node * - generate an event log record on that node * - optionally execute `bdr_failover_command`, passing the conninfo string * of that node to the command; this can be used for e.g. reconfiguring @@ -215,29 +224,33 @@ monitor_bdr(void) */ void -do_bdr_failover(NodeInfoList *nodes) +do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) { PGconn *next_node_conn = NULL; NodeInfoListCell *cell; - bool failover_success = false; +// bool failover_success = false; PQExpBufferData event_details; RecordStatus record_status; t_event_info event_info = T_EVENT_INFO_INITIALIZER; - t_node_info target_node = T_NODE_INFO_INITIALIZER; + t_node_info target_node = T_NODE_INFO_INITIALIZER; initPQExpBuffer(&event_details); - /* get next active node */ + monitored_node->monitoring_state = MS_DEGRADED; + INSTR_TIME_SET_CURRENT(degraded_monitoring_start); + + /* get other node */ for (cell = nodes->head; cell; cell = cell->next) { log_debug("do_bdr_failover() %s", cell->node_info->node_name); /* don't attempt to connect to the current monitored node, as that's the one which has failed */ - if (cell->node_info->node_id == local_node_info.node_id) + if (cell->node_info->node_id == monitored_node->node_id) continue; /* XXX skip inactive node? */ + // reuse local conn if local node is up next_node_conn = establish_db_connection(cell->node_info->conninfo, false); if (PQstatus(next_node_conn) == CONNECTION_OK) @@ -251,6 +264,7 @@ do_bdr_failover(NodeInfoList *nodes) next_node_conn = NULL; } + /* shouldn't happen, and if it does, it means everything is down */ if (next_node_conn == NULL) { appendPQExpBuffer(&event_details, @@ -258,33 +272,42 @@ do_bdr_failover(NodeInfoList *nodes) log_error("%s", event_details.data); - // no other nodes found - // continue degraded monitoring until node is restored? + /* no other nodes found - continue degraded monitoring */ + return; } - else + + + // call: repmgr.am_bdr_failover_handler(node_id) + if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false) { - log_info(_("connecting to target node %s"), target_node.node_name); - - failover_success = true; - - event_info.conninfo_str = target_node.conninfo; - event_info.node_name = target_node.node_name; - - /* update our own record on the other node */ - update_node_record_set_active(next_node_conn, local_node_info.node_id, false); - - appendPQExpBuffer(&event_details, - _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"), - local_node_info.node_name, - local_node_info.node_id, - target_node.node_name, - target_node.node_id); + log_debug("XXX am not failover handler"); + PQfinish(next_node_conn); + log_debug("other node's repmgrd is handling failover"); + return; } - monitoring_state = MS_DEGRADED; - INSTR_TIME_SET_CURRENT(degraded_monitoring_start); + log_debug("YYYam the failover handler"); // check here that the node hasn't come back up... + log_info(_("connecting to target node %s"), target_node.node_name); + +// failover_success = true; + + event_info.conninfo_str = target_node.conninfo; + event_info.node_name = target_node.node_name; + + /* update our own record on the other node */ + update_node_record_set_active(next_node_conn, monitored_node->node_id, false); + + appendPQExpBuffer(&event_details, + _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"), + monitored_node->node_name, + monitored_node->node_id, + target_node.node_name, + target_node.node_id); + + + /* * Create an event record @@ -301,12 +324,13 @@ do_bdr_failover(NodeInfoList *nodes) &config_file_options, config_file_options.node_id, "bdr_failover", - failover_success, + true, event_details.data, &event_info); termPQExpBuffer(&event_details); + unset_bdr_failover_handler(next_node_conn); /* local monitoring mode - there's no new node to monitor */ return; diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 4efa4d4b..b1370b15 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -47,7 +47,6 @@ static PGconn *primary_conn = NULL; static ElectionResult do_election(void); static const char *_print_voting_status(NodeVotingStatus voting_status); static const char *_print_election_result(ElectionResult result); -static const char *_print_monitoring_state(MonitoringState monitoring_state); static FailoverState promote_self(void); static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id); @@ -315,7 +314,7 @@ monitor_streaming_primary(void) log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"), local_node_info.node_name, local_node_info.node_id, - _print_monitoring_state(monitoring_state)); + print_monitoring_state(monitoring_state)); if (monitoring_state == MS_DEGRADED) { @@ -617,7 +616,7 @@ monitor_streaming_standby(void) local_node_info.node_id, upstream_node_info.node_name, upstream_node_info.node_id, - _print_monitoring_state(monitoring_state)); + print_monitoring_state(monitoring_state)); if (monitoring_state == MS_DEGRADED) { @@ -1459,22 +1458,6 @@ _print_election_result(ElectionResult result) return "UNKNOWN"; } -static const char * -_print_monitoring_state(MonitoringState monitoring_state) -{ - switch(monitoring_state) - { - case MS_NORMAL: - return "normal"; - - case MS_DEGRADED: - return "degraded"; - } - - /* should never reach here */ - return "UNKNOWN"; -} - static ElectionResult diff --git a/repmgrd.c b/repmgrd.c index 00335f75..59d21e6f 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -665,6 +665,23 @@ calculate_elapsed(instr_time start_time) } +const char * +print_monitoring_state(MonitoringState monitoring_state) +{ + switch(monitoring_state) + { + case MS_NORMAL: + return "normal"; + + case MS_DEGRADED: + return "degraded"; + } + + /* should never reach here */ + return "UNKNOWN"; +} + + static void close_connections() { @@ -694,3 +711,5 @@ terminate(int retval) exit(retval); } + + diff --git a/repmgrd.h b/repmgrd.h index 56b6b3b8..2a6caea9 100644 --- a/repmgrd.h +++ b/repmgrd.h @@ -15,10 +15,6 @@ typedef enum { NODE_STATUS_DOWN } NodeStatus; -typedef enum { - MS_NORMAL = 0, - MS_DEGRADED = 1 -} MonitoringState; extern MonitoringState monitoring_state; extern instr_time degraded_monitoring_start; @@ -31,6 +27,8 @@ extern bool startup_event_logged; PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status); int calculate_elapsed(instr_time start_time); +const char *print_monitoring_state(MonitoringState monitoring_state); + void update_registration(PGconn *conn); void terminate(int retval); #endif /* _REPMGRD_H_ */