/* * repmgrd-bdr.c - BDR functionality for repmgrd * * Copyright (c) 2ndQuadrant, 2010-2017 */ #include #include "repmgr.h" #include "repmgrd.h" #include "repmgrd-bdr.h" #include "configfile.h" static volatile sig_atomic_t got_SIGHUP = false; static void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node); static void do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node); void do_bdr_node_check(void) { /* nothing to do at the moment */ } void monitor_bdr(void) { NodeInfoList nodes = T_NODE_INFO_LIST_INITIALIZER; t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER; RecordStatus record_status; NodeInfoListCell *cell; PQExpBufferData event_details; /* sanity check local database */ log_info(_("connecting to local database '%s'"), config_file_options.conninfo); local_conn = establish_db_connection(config_file_options.conninfo, true); /* * Local node must be running */ if (PQstatus(local_conn) != CONNECTION_OK) { log_error(_("unable connect to local node (ID: %i), terminating"), local_node_info.node_id); log_hint(_("local node must be running before repmgrd can start")); PQfinish(local_conn); exit(ERR_DB_CONN); } /* * Verify that database is a BDR one * TODO: check if supported BDR version? */ log_info(_("connected to database, checking for BDR")); if (!is_bdr_db(local_conn)) { log_error(_("database is not BDR-enabled")); exit(ERR_BAD_CONFIG); } if (is_table_in_bdr_replication_set(local_conn, "nodes", "repmgr") == false) { log_error(_("repmgr metadata table 'repmgr.%s' is not in the 'repmgr' replication set"), "nodes"); /* TODO: add `repmgr bdr sync` or similar for this situation, and hint here */ exit(ERR_BAD_CONFIG); } record_status = get_bdr_node_record_by_name(local_conn, local_node_info.node_name, &bdr_node_info); if (record_status != RECORD_FOUND) { log_error(_("unable to retrieve BDR record for node %s, terminating"), local_node_info.node_name); PQfinish(local_conn); exit(ERR_BAD_CONFIG); } /* Retrieve record for this node from the local database */ record_status = get_node_record(local_conn, config_file_options.node_id, &local_node_info); /* * Terminate if we can't find the local node record. This is a "fix-the-config" * situation, not a lot else we can do. */ if (record_status != RECORD_FOUND) { log_error(_("unable to retrieve record for local node (ID: %i), terminating"), local_node_info.node_id); log_hint(_("check that \"repmgr bdr register\" was executed for this node")); PQfinish(local_conn); exit(ERR_BAD_CONFIG); } if (local_node_info.active == false) { log_error(_("local node (ID: %i) is marked as inactive in repmgr"), local_node_info.node_id); log_hint(_("if the node has been reactivated, run \"repmgr bdr register --force\" and restart repmgrd")); PQfinish(local_conn); exit(ERR_BAD_CONFIG); } if (is_active_bdr_node(local_conn, local_node_info.node_name)) { log_error(_("BDR node %s is not active, terminating"), local_node_info.node_name); PQfinish(local_conn); exit(ERR_BAD_CONFIG); } /* Log startup event */ create_event_record(local_conn, &config_file_options, config_file_options.node_id, "repmgrd_start", true, NULL); /* * retrieve list of all nodes - we'll need these if the DB connection goes away */ get_all_node_records(local_conn, &nodes); /* we're expecting all (both) nodes to be up */ for (cell = nodes.head; cell; cell = cell->next) { cell->node_info->node_status = NODE_STATUS_UP; } log_debug("main_loop_bdr() monitoring local node %i", config_file_options.node_id); log_info(_("starting continuous bdr node monitoring")); while (true) { /* monitoring loop */ log_verbose(LOG_DEBUG, "bdr check loop..."); for (cell = nodes.head; cell; cell = cell->next) { if (config_file_options.bdr_local_monitoring_only == true && cell->node_info->node_id != local_node_info.node_id) { continue; } if (cell->node_info->node_id == local_node_info.node_id) { log_debug("checking local node %i in %s state", local_node_info.node_id, print_monitoring_state(cell->node_info->monitoring_state)); } else { log_debug("checking other node %i in %s state", cell->node_info->node_id, print_monitoring_state(cell->node_info->monitoring_state)); } switch (cell->node_info->monitoring_state) { case MS_NORMAL: { if (is_server_available(cell->node_info->conninfo) == false) { /* node is down, we were expecting it to be up */ if (cell->node_info->node_status == NODE_STATUS_UP) { instr_time node_unreachable_start; INSTR_TIME_SET_CURRENT(node_unreachable_start); cell->node_info->node_status = NODE_STATUS_DOWN; if (cell->node_info->conn != NULL) { PQfinish(cell->node_info->conn); cell->node_info->conn = NULL; } cell->node_info->conn = try_reconnect(cell->node_info); /* node has recovered - log and continue */ if (cell->node_info->node_status == NODE_STATUS_UP) { int node_unreachable_elapsed = calculate_elapsed(node_unreachable_start); initPQExpBuffer(&event_details); appendPQExpBuffer(&event_details, _("reconnected to node %i after %i seconds"), cell->node_info->node_id, node_unreachable_elapsed); log_notice("%s", event_details.data); create_event_notification(cell->node_info->conn, &config_file_options, config_file_options.node_id, "repmgrd_upstream_reconnect", true, event_details.data); termPQExpBuffer(&event_details); goto loop; } /* still down after reconnect attempt(s) */ if (cell->node_info->node_status == NODE_STATUS_DOWN) { do_bdr_failover(&nodes, cell->node_info); goto loop; } } } } break; case MS_DEGRADED: { /* degraded monitoring */ if (is_server_available(cell->node_info->conninfo) == true) { do_bdr_recovery(&nodes, cell->node_info); } } break; } } loop: if (got_SIGHUP) { /* * if we can reload, then could need to change * local_conn */ if (reload_config(&config_file_options)) { PQfinish(local_conn); local_conn = establish_db_connection(config_file_options.conninfo, true); update_registration(local_conn); } got_SIGHUP = false; } log_verbose(LOG_DEBUG, "sleeping %i seconds (\"monitor_interval_secs\")", config_file_options.monitor_interval_secs); sleep(config_file_options.monitor_interval_secs); } return; } /* * do_bdr_failover() * * Here we attempt to perform a BDR "failover". * * As there's no equivalent of a physical replication failover, * we'll do the following: * * - connect to active node * - generate an event log record on that node * - optionally execute `bdr_failover_command`, passing the conninfo string * of that node to the command; this can be used for e.g. reconfiguring * pgbouncer. * */ void do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node) { PGconn *next_node_conn = NULL; NodeInfoListCell *cell; PQExpBufferData event_details; t_event_info event_info = T_EVENT_INFO_INITIALIZER; t_node_info target_node = T_NODE_INFO_INITIALIZER; t_node_info failed_node = T_NODE_INFO_INITIALIZER; RecordStatus record_status; /* if one of the two nodes is down, cluster will be in a degraded state */ monitored_node->monitoring_state = MS_DEGRADED; INSTR_TIME_SET_CURRENT(degraded_monitoring_start); /* get other node */ for (cell = nodes->head; cell; cell = cell->next) { log_debug("do_bdr_failover() %s", cell->node_info->node_name); /* don't attempt to connect to the current monitored node, as that's the one which has failed */ if (cell->node_info->node_id == monitored_node->node_id) continue; /* TODO: reuse local conn if local node is up */ next_node_conn = establish_db_connection(cell->node_info->conninfo, false); if (PQstatus(next_node_conn) == CONNECTION_OK) { record_status = get_node_record(next_node_conn, cell->node_info->node_id, &target_node); if (record_status == RECORD_FOUND) { break; } } next_node_conn = NULL; } /* shouldn't happen, and if it does, it means everything is down */ if (next_node_conn == NULL) { log_error(_("no other available node found")); /* no other nodes found - continue degraded monitoring */ return; } /* * check if the node record for the failed node is still marked as active, * if not it means the other node has done the "failover" already */ record_status = get_node_record(next_node_conn, monitored_node->node_id, &failed_node); if (record_status == RECORD_FOUND && failed_node.active == false) { PQfinish(next_node_conn); log_notice(_("record for node %i has already been set inactive"), cell->node_info->node_id); return; } if (am_bdr_failover_handler(next_node_conn, local_node_info.node_id) == false) { PQfinish(next_node_conn); log_notice(_("other node's repmgrd is handling failover")); return; } /* check here that the node hasn't come back up */ if (is_server_available(monitored_node->conninfo) == true) { log_notice(_("node %i has reappeared, aborting failover"), monitored_node->node_id); monitored_node->monitoring_state = MS_NORMAL; PQfinish(next_node_conn); } log_debug("this node is the failover handler"); initPQExpBuffer(&event_details); event_info.conninfo_str = target_node.conninfo; event_info.node_name = target_node.node_name; /* update node record on the active node */ update_node_record_set_active(next_node_conn, monitored_node->node_id, false); appendPQExpBuffer(&event_details, _("node '%s' (ID: %i) detected as failed; next available node is '%s' (ID: %i)"), monitored_node->node_name, monitored_node->node_id, target_node.node_name, target_node.node_id); /* * Create an event record * * If we were able to connect to another node, we'll update the * event log there. * * In any case the event notification command will be triggered * with the event "bdr_failover" */ create_event_notification_extended( next_node_conn, &config_file_options, monitored_node->node_id, "bdr_failover", true, event_details.data, &event_info); termPQExpBuffer(&event_details); unset_bdr_failover_handler(next_node_conn); PQfinish(next_node_conn); return; } static void do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node) { PGconn *recovered_node_conn; PQExpBufferData event_details; t_bdr_node_info bdr_record; t_event_info event_info = T_EVENT_INFO_INITIALIZER; int i; bool node_recovered = false; int node_recovery_elapsed; recovered_node_conn = establish_db_connection(monitored_node->conninfo, false); if (PQstatus(recovered_node_conn) != CONNECTION_OK) { PQfinish(recovered_node_conn); return; } if (am_bdr_failover_handler(recovered_node_conn, local_node_info.node_id) == false) { PQfinish(recovered_node_conn); log_debug("other node's repmgrd is handling recovery"); return; } for (i = 0; i < config_file_options.bdr_recovery_timeout; i++) { RecordStatus record_status = get_bdr_node_record_by_name( recovered_node_conn, monitored_node->node_name, &bdr_record); if (record_status == RECORD_FOUND && bdr_record.node_status == 'r') { // check pg_stat_replication node_recovered = true; break; } sleep(1); continue; } if (node_recovered == false) { log_warning(_("node did not come up")); PQfinish(recovered_node_conn); return; } // XXX check other node is attached to this one so we // don't end up monitoring a parted node; if not attached, // generate a failed bdr_recovery event initPQExpBuffer(&event_details); node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start); appendPQExpBuffer(&event_details, _("node '%s' (ID: %i) has recovered after %i seconds"), monitored_node->node_name, monitored_node->node_id, node_recovery_elapsed); monitored_node->monitoring_state = MS_NORMAL; log_notice("%s", event_details.data); /* generate the event on the currently active node only */ if (monitored_node->node_id != local_node_info.node_id) { if (config_file_options.bdr_active_node_recovery == true) { event_info.conninfo_str = monitored_node->conninfo; event_info.node_name = monitored_node->node_name; create_event_notification_extended( local_conn, &config_file_options, config_file_options.node_id, "bdr_recovery", true, event_details.data, &event_info); } else { create_event_record( local_conn, &config_file_options, config_file_options.node_id, "bdr_recovery", true, event_details.data); } } termPQExpBuffer(&event_details); unset_bdr_failover_handler(recovered_node_conn); PQfinish(recovered_node_conn); return; }