diff --git a/config.c b/config.c index 403d177f..7b8b25ec 100644 --- a/config.c +++ b/config.c @@ -435,23 +435,6 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList * else if (strcmp(name, "event_notifications") == 0) parse_event_notifications_list(options, value); - /* bdr settings */ - else if (strcmp(name, "bdr_monitoring_mode") == 0) - { - if (strncmp(value, "local", MAXLEN) == 0) - { - options->bdr_monitoring_mode = BDR_MONITORING_LOCAL; - } - else if (strcmp(value, "highest_priority") == 0) - { - options->bdr_monitoring_mode = BDR_MONITORING_PRIORITY; - } - else - { - item_list_append(error_list, _("value for 'bdr_monitoring_mode' must be 'local' or 'highest_priority'\n")); - } - } - /* barman settings */ else if (strcmp(name, "barman_host") == 0) strncpy(options->barman_host, value, MAXLEN); diff --git a/config.h b/config.h index 80c6685d..77e34edf 100644 --- a/config.h +++ b/config.h @@ -97,9 +97,6 @@ typedef struct char event_notification_command[MAXLEN]; EventNotificationList event_notifications; - /* bdr settings */ - int bdr_monitoring_mode; - /* barman settings */ char barman_host[MAXLEN]; char barman_server[MAXLEN]; @@ -132,8 +129,6 @@ typedef struct "", "", "", "", "", "", \ /* event notification settings */ \ "", { NULL, NULL }, \ - /* bdr settings */ \ - BDR_MONITORING_LOCAL, \ /* barman settings */ \ "", "", "", \ /* undocumented test settings */ \ diff --git a/dbutils.c b/dbutils.c index 8f488e5a..11ff7ac5 100644 --- a/dbutils.c +++ b/dbutils.c @@ -14,6 +14,11 @@ #include "catalog/pg_control.h" + +/* mainly for use by repmgrd */ +int server_version_num = 0; + + static PGconn *_establish_db_connection(const char *conninfo, const bool exit_on_error, const bool log_notice, @@ -1501,8 +1506,6 @@ get_active_sibling_node_records(PGconn *conn, int node_id, int upstream_node_id, PQExpBufferData query; PGresult *res; - clear_node_info_list(node_list); - initPQExpBuffer(&query); appendPQExpBuffer(&query, @@ -1535,8 +1538,6 @@ get_node_records_by_priority(PGconn *conn, NodeInfoList *node_list) PQExpBufferData query; PGresult *res; - clear_node_info_list(node_list); - initPQExpBuffer(&query); appendPQExpBuffer( @@ -1696,7 +1697,7 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active) appendPQExpBuffer( &query, "UPDATE repmgr.nodes SET active = %s " - " WHERE id = %i", + " WHERE node_id = %i", active == true ? "TRUE" : "FALSE", this_node_id); @@ -1878,7 +1879,7 @@ update_node_record_conn_priority(PGconn *conn, t_configuration_options *options) "UPDATE repmgr.nodes " " SET conninfo = '%s', " " priority = %d " - " WHERE id = %d ", + " WHERE node_id = %d ", options->conninfo, options->priority, options->node_id); @@ -1929,14 +1930,13 @@ delete_node_record(PGconn *conn, int node) } - void clear_node_info_list(NodeInfoList *nodes) { NodeInfoListCell *cell; NodeInfoListCell *next_cell; - log_debug("clear_node_info_list() - closing open connections"); + log_verbose(LOG_DEBUG, "clear_node_info_list() - closing open connections"); /* close any open connections */ for (cell = nodes->head; cell; cell = cell->next) @@ -1948,7 +1948,7 @@ clear_node_info_list(NodeInfoList *nodes) } } - log_debug("clear_node_info_list() - unlinking"); + log_verbose(LOG_DEBUG, "clear_node_info_list() - unlinking"); cell = nodes->head; @@ -1959,6 +1959,7 @@ clear_node_info_list(NodeInfoList *nodes) pfree(cell); cell = next_cell; } + nodes->head = NULL; nodes->tail = NULL; nodes->node_count = 0; @@ -3134,9 +3135,7 @@ void _populate_bdr_node_records(PGresult *res, BdrNodeInfoList *node_list) { int i; - node_list->head = NULL; - node_list->tail = NULL; - node_list->node_count = 0; + clear_node_info_list(node_list); if (PQresultStatus(res) != PGRES_TUPLES_OK) { diff --git a/dbutils.h b/dbutils.h index 2df11eb5..1b9ede4d 100644 --- a/dbutils.h +++ b/dbutils.h @@ -186,6 +186,9 @@ typedef struct BdrNodeInfoList 0 \ } + +extern int server_version_num; + /* utility functions */ XLogRecPtr parse_lsn(const char *str); diff --git a/repmgr-client.c b/repmgr-client.c index 234f6362..ac4b2948 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -1467,12 +1467,12 @@ create_repmgr_extension(PGconn *conn) int check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *server_version_string) { - int server_version_num = 0; + int conn_server_version_num = 0; - server_version_num = get_server_version(conn, server_version_string); - if (server_version_num < MIN_SUPPORTED_VERSION_NUM) + conn_server_version_num = get_server_version(conn, server_version_string); + if (conn_server_version_num < MIN_SUPPORTED_VERSION_NUM) { - if (server_version_num > 0) + if (conn_server_version_num > 0) log_error(_("%s requires %s to be PostgreSQL %s or later"), progname(), server_type, @@ -1488,7 +1488,7 @@ check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char * return -1; } - return server_version_num; + return conn_server_version_num; } diff --git a/repmgr.conf.sample b/repmgr.conf.sample index a8c890dc..217895fc 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -13,7 +13,7 @@ # repmgr and repmgrd require the following items to be configured. -#node_id=1 # A unique integer greater than zero +#node_id= # A unique integer greater than zero #node_name='' # An arbitrary (but unique) string; we recommend # using the server's hostname or another identifier # unambiguously associated with the server to avoid @@ -28,6 +28,7 @@ # # For details on conninfo strings, see: # https://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-CONNSTRING + # # If repmgrd is in use, consider explicitly setting # "connect_timeout" in the conninfo string to determine # the length of time which elapses before a network @@ -44,8 +45,9 @@ # Replication settings #------------------------------------------------------------------------------ +#replication_type=physical # Must be one of 'physical' or 'bdr' -#upstream_node_id=1 # When using cascading replication, a standby +#upstream_node_id= # When using cascading replication, a standby # can connect to another upstream standby node # which is specified by setting 'upstream_node_id'. # In that case, the upstream node must exist diff --git a/repmgr.h b/repmgr.h index 23a090fd..a627e413 100644 --- a/repmgr.h +++ b/repmgr.h @@ -30,8 +30,6 @@ #define REPLICATION_TYPE_PHYSICAL 1 #define REPLICATION_TYPE_BDR 2 -#define BDR_MONITORING_LOCAL 1 -#define BDR_MONITORING_PRIORITY 2 #define DEFAULT_LOCATION "default" #define DEFAULT_PRIORITY 100 @@ -52,4 +50,5 @@ #define ERRBUFF_SIZE 512 + #endif /* _REPMGR_H_ */ diff --git a/repmgrd-bdr.c b/repmgrd-bdr.c index a0df1964..9a6f0656 100644 --- a/repmgrd-bdr.c +++ b/repmgrd-bdr.c @@ -14,6 +14,9 @@ static volatile sig_atomic_t got_SIGHUP = false; +static void do_bdr_failover(NodeInfoList *nodes); + + void do_bdr_node_check(void) { @@ -25,12 +28,8 @@ void monitor_bdr(void) { NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER; - PGconn *monitoring_conn = NULL; - t_node_info *monitored_node = NULL; t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER; - RecordStatus record_status; - bool failover_done = false; /* sanity check local database */ log_info(_("connecting to local database '%s'"), @@ -118,7 +117,6 @@ monitor_bdr(void) } /* Log startup event */ - create_event_record(local_conn, &config_file_options, config_file_options.node_id, @@ -128,158 +126,72 @@ monitor_bdr(void) /* * retrieve list of nodes - we'll need these if the DB connection goes away, - * or if we're monitoring a non-local node */ - get_node_records_by_priority(local_conn, &nodes); + get_all_node_records(local_conn, &nodes); - /* decided which node to monitor */ + log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id); - if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_LOCAL) - { - // if local, reuse local_conn and node info - //record_status = get_node_record(local_conn, config_file_options.node_id, &monitored_node); - monitored_node = &local_node_info; - - monitoring_conn = establish_db_connection(monitored_node->conninfo, false); - log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id); - } - else - { - NodeInfoListCell *cell; - - for (cell = nodes.head; cell; cell = cell->next) - { - log_debug("main_loop_bdr() checking node %s %i", cell->node_info->node_name, cell->node_info->priority); - - monitoring_conn = establish_db_connection(cell->node_info->conninfo, false); - if (PQstatus(monitoring_conn) == CONNECTION_OK) - { - log_debug("main_loop_bdr() monitoring node '%s' (ID %i, priority %i)", - cell->node_info->node_name, cell->node_info->node_id, cell->node_info->priority); - /* fetch the record again, as the node list is transient */ - monitored_node = get_node_record_pointer(monitoring_conn, cell->node_info->node_id); - - break; - } - } - } - - // check monitored_node not null! + log_info(_("starting continuous bdr node monitoring")); while (true) { - /* normal state - connection active */ - if (PQstatus(monitoring_conn) == CONNECTION_OK) + + /* monitoring loop */ + log_verbose(LOG_DEBUG, "bdr check loop..."); + + switch (monitoring_state) { - // XXX detail - log_info(_("starting continuous bdr node monitoring")); - - /* monitoring loop */ - do + case MS_NORMAL: { - log_verbose(LOG_DEBUG, "bdr check loop..."); - + if (is_server_available(local_node_info.conninfo) == false) { - NodeInfoListCell *cell; - - for (cell = nodes.head; cell; cell = cell->next) - { - log_debug("bdr_monitor() %s", cell->node_info->node_name); - } - } - - if (is_server_available(monitored_node->conninfo) == false) - { - t_node_info *new_monitored_node; - // XXX improve log_warning("connection problem!"); - new_monitored_node = do_bdr_failover(&nodes, monitored_node); - - if (new_monitored_node != NULL) - { - pfree(monitored_node); - monitored_node = new_monitored_node; - } - log_notice(_("monitored_node->node_name is now '%s' \n"), monitored_node->node_name); + do_bdr_failover(&nodes); } else { + log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", + config_file_options.monitor_interval_secs); sleep(config_file_options.monitor_interval_secs); } - - if (got_SIGHUP) - { - /* - * if we can reload, then could need to change - * local_conn - */ - if (reload_config(&config_file_options)) - { - PQfinish(local_conn); - local_conn = establish_db_connection(config_file_options.conninfo, true); - update_registration(local_conn); - } - - /* reload node list */ - get_node_records_by_priority(local_conn, &nodes); - - got_SIGHUP = false; - } - - } while (!failover_done); - } - /* local connection inactive - periodically try and connect */ - /* TODO: make this an option */ - else - { - - monitoring_conn = establish_db_connection(monitored_node->conninfo, false); - - if (PQstatus(monitoring_conn) == CONNECTION_OK) + } + case MS_DEGRADED: { - // XXX event bdr_node_recovered -> if monitored == local node - - if (monitored_node->node_id == config_file_options.node_id) + /* degraded monitoring */ + if (is_server_available(local_node_info.conninfo) == true) { - log_notice(_("local connection has returned, resuming monitoring")); + log_notice(_("monitored node %i has recovered"), local_node_info.node_id); + // do_bdr_recovery() } else { - log_notice(_("connection to '%s' has returned, resuming monitoring"), monitored_node->node_name); + log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", + config_file_options.monitor_interval_secs); + sleep(config_file_options.monitor_interval_secs); } } - else - { - sleep(config_file_options.monitor_interval_secs); - } - - - if (got_SIGHUP) - { - /* - * if we can reload, then could need to change - * local_conn - */ - if (reload_config(&config_file_options)) - { - if (PQstatus(local_conn) == CONNECTION_OK) - { - PQfinish(local_conn); - local_conn = establish_db_connection(config_file_options.conninfo, true); - update_registration(local_conn); - } - } - - /* reload node list */ - if (PQstatus(local_conn) == CONNECTION_OK) - get_node_records_by_priority(local_conn, &nodes); - - got_SIGHUP = false; - } } - failover_done = false; + if (got_SIGHUP) + { + /* + * if we can reload, then could need to change + * local_conn + */ + if (reload_config(&config_file_options)) + { + PQfinish(local_conn); + local_conn = establish_db_connection(config_file_options.conninfo, true); + update_registration(local_conn); + } + + /* reload node list */ + get_all_node_records(local_conn, &nodes); + + got_SIGHUP = false; + } + } return; @@ -294,43 +206,44 @@ monitor_bdr(void) * we'll do the following: * * - attempt to find another node, to set our node record as inactive + * (there should be only one other node) * - generate an event log record on that node * - optionally execute `bdr_failover_command`, passing the conninfo string * of that node to the command; this can be used for e.g. reconfiguring * pgbouncer. - * - if mode is 'BDR_MONITORING_PRIORITY', redirect monitoring to that node. * */ -t_node_info * -do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) + +void +do_bdr_failover(NodeInfoList *nodes) { PGconn *next_node_conn = NULL; NodeInfoListCell *cell; bool failover_success = false; PQExpBufferData event_details; + RecordStatus record_status; t_event_info event_info = T_EVENT_INFO_INITIALIZER; - t_node_info *new_monitored_node = NULL; + t_node_info target_node = T_NODE_INFO_INITIALIZER; initPQExpBuffer(&event_details); - /* get next active priority node */ + /* get next active node */ for (cell = nodes->head; cell; cell = cell->next) { log_debug("do_bdr_failover() %s", cell->node_info->node_name); /* don't attempt to connect to the current monitored node, as that's the one which has failed */ - if (cell->node_info->node_id == monitored_node->node_id) + if (cell->node_info->node_id == local_node_info.node_id) continue; /* XXX skip inactive node? */ - next_node_conn = establish_db_connection(cell->node_info->conninfo, false); if (PQstatus(next_node_conn) == CONNECTION_OK) { // XXX check if record returned - new_monitored_node = get_node_record_pointer(next_node_conn, cell->node_info->node_id); + record_status = get_node_record(next_node_conn, cell->node_info->node_id, &target_node); break; } @@ -345,40 +258,34 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) log_error("%s", event_details.data); - // no other nodes found // continue degraded monitoring until node is restored? } else { - log_info(_("connecting to target node %s"), cell->node_info->node_name); + log_info(_("connecting to target node %s"), target_node.node_name); failover_success = true; - event_info.conninfo_str = cell->node_info->conninfo; - event_info.node_name = cell->node_info->node_name; + event_info.conninfo_str = target_node.conninfo; + event_info.node_name = target_node.node_name; /* update our own record on the other node */ - if (monitored_node->node_id == config_file_options.node_id) - { - update_node_record_set_active(next_node_conn, monitored_node->node_id, false); - } - - if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_PRIORITY) - { - log_notice(_("monitoring next available node by prioriy: %s (ID %i)"), - new_monitored_node->node_name, - new_monitored_node->node_id); - } + update_node_record_set_active(next_node_conn, local_node_info.node_id, false); appendPQExpBuffer(&event_details, _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"), - monitored_node->node_name, - monitored_node->node_id, - cell->node_info->node_name, - cell->node_info->node_id); + local_node_info.node_name, + local_node_info.node_id, + target_node.node_name, + target_node.node_id); } + monitoring_state = MS_DEGRADED; + INSTR_TIME_SET_CURRENT(degraded_monitoring_start); + + // check here that the node hasn't come back up... + /* * Create an event record * @@ -400,11 +307,7 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) termPQExpBuffer(&event_details); - //failover_done = true; - - if (config_file_options.bdr_monitoring_mode == BDR_MONITORING_PRIORITY) - return new_monitored_node; /* local monitoring mode - there's no new node to monitor */ - return NULL; + return; } diff --git a/repmgrd-bdr.h b/repmgrd-bdr.h index 6bbe6257..ecc0986e 100644 --- a/repmgrd-bdr.h +++ b/repmgrd-bdr.h @@ -8,6 +8,5 @@ extern void do_bdr_node_check(void); extern void monitor_bdr(void); -extern t_node_info *do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node); #endif /* _REPMGRD_BDR_H_ */ diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 1384a876..4efa4d4b 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -325,7 +325,11 @@ monitor_streaming_primary(void) INSTR_TIME_SET_CURRENT(log_status_interval_start); } } - sleep(1); + + log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", + config_file_options.monitor_interval_secs); + + sleep(config_file_options.monitor_interval_secs); } } diff --git a/repmgrd.c b/repmgrd.c index 18faad3f..00335f75 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -37,7 +37,6 @@ PGconn *local_conn = NULL; /* Collate command line errors here for friendlier reporting */ static ItemList cli_errors = { NULL, NULL }; -int server_version_num = 0; bool startup_event_logged = false; MonitoringState monitoring_state = MS_NORMAL; @@ -57,7 +56,6 @@ static void show_usage(void); static void daemonize_process(void); static void check_and_create_pid_file(const char *pid_file); - static void start_monitoring(void); diff --git a/repmgrd.h b/repmgrd.h index eb651743..56b6b3b8 100644 --- a/repmgrd.h +++ b/repmgrd.h @@ -27,7 +27,6 @@ extern t_configuration_options config_file_options; extern t_node_info local_node_info; extern PGconn *local_conn; extern bool startup_event_logged; -extern int server_version_num; PGconn *try_reconnect(const char *conninfo, NodeStatus *node_status);