/* * repmgr-action-node.c * * Implements actions available for any kind of node * * Copyright (c) EnterpriseDB Corporation, 2010-2021 * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include "repmgr.h" #include "controldata.h" #include "dirutil.h" #include "dbutils.h" #include "compat.h" #include "repmgr-client-global.h" #include "repmgr-action-node.h" #include "repmgr-action-standby.h" static bool copy_file(const char *src_file, const char *dest_file); static void format_archive_dir(PQExpBufferData *archive_dir); static t_server_action parse_server_action(const char *action); static const char *output_repmgrd_status(CheckStatus status); static void exit_optformat_error(const char *error, int errcode); static void _do_node_service_list_actions(t_server_action action); static void _do_node_status_is_shutdown_cleanly(void); static void _do_node_archive_config(void); static void _do_node_restore_config(void); static void do_node_check_replication_connection(void); static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output); static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, 
t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_repmgrd(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_db_connection(PGconn *conn, OutputMode mode); /* * NODE STATUS * * Can only be run on the local node, as it needs to be able to * read the data directory. 
* * Parameters: * --is-shutdown-cleanly (for internal use only) * --csv */ void do_node_status(void) { PGconn *conn = NULL; t_node_info node_info = T_NODE_INFO_INITIALIZER; char cluster_size[MAXLEN]; PQExpBufferData output; KeyValueList node_status = {NULL, NULL}; KeyValueListCell *cell = NULL; NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER; ItemList warnings = {NULL, NULL}; RecoveryType recovery_type = RECTYPE_UNKNOWN; ReplInfo replication_info; t_recovery_conf recovery_conf = T_RECOVERY_CONF_INITIALIZER; char data_dir[MAXPGPATH] = ""; char server_version_str[MAXVERSIONSTR] = ""; /* * A database connection is *not* required for this check */ if (runtime_options.is_shutdown_cleanly == true) { return _do_node_status_is_shutdown_cleanly(); } init_replication_info(&replication_info); /* config file required, so we should have "conninfo" and "data_directory" */ conn = establish_db_connection(config_file_options.conninfo, true); strncpy(data_dir, config_file_options.data_directory, MAXPGPATH); (void)get_server_version(conn, server_version_str); /* check node exists */ if (get_node_record_with_upstream(conn, config_file_options.node_id, &node_info) != RECORD_FOUND) { log_error(_("no record found for node %i"), config_file_options.node_id); PQfinish(conn); exit(ERR_BAD_CONFIG); } if (get_cluster_size(conn, cluster_size) == false) strncpy(cluster_size, _("unknown"), MAXLEN); recovery_type = get_recovery_type(conn); get_node_replication_stats(conn, &node_info); key_value_list_set(&node_status, "PostgreSQL version", server_version_str); key_value_list_set(&node_status, "Total data size", cluster_size); key_value_list_set(&node_status, "Conninfo", node_info.conninfo); if (runtime_options.verbose == true) { uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory); if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER) { key_value_list_set(&node_status, "System identifier", "unknown"); item_list_append_format(&warnings, _("unable 
to retrieve system identifier from pg_control")); } else { key_value_list_set_format(&node_status, "System identifier", "%lu", local_system_identifier); } } key_value_list_set(&node_status, "Role", get_node_type_string(node_info.type)); switch (node_info.type) { case PRIMARY: if (recovery_type == RECTYPE_STANDBY) { item_list_append(&warnings, _("- node is registered as primary but running as standby")); } break; case STANDBY: if (recovery_type == RECTYPE_PRIMARY) { item_list_append(&warnings, _("- node is registered as standby but running as primary")); } break; default: break; } if (guc_set(conn, "archive_mode", "=", "off")) { key_value_list_set(&node_status, "WAL archiving", "off"); key_value_list_set(&node_status, "Archive command", "(none)"); } else { /* "archive_mode" is not "off", i.e. one of "on", "always" */ bool enabled = true; PQExpBufferData archiving_status; char archive_command[MAXLEN] = ""; initPQExpBuffer(&archiving_status); /* * if the node is a standby, and "archive_mode" is "on", archiving will * actually be disabled. 
*/ if (recovery_type == RECTYPE_STANDBY) { if (guc_set(conn, "archive_mode", "=", "on")) enabled = false; } if (enabled == true) { appendPQExpBufferStr(&archiving_status, "enabled"); } else { appendPQExpBufferStr(&archiving_status, "disabled"); } if (enabled == false && recovery_type == RECTYPE_STANDBY) { if (PQserverVersion(conn) >= 90500) { appendPQExpBufferStr(&archiving_status, " (on standbys \"archive_mode\" must be set to \"always\" to be effective)"); } else { appendPQExpBufferStr(&archiving_status, " (\"archive_mode\" has no effect on standbys)"); } } key_value_list_set(&node_status, "WAL archiving", archiving_status.data); termPQExpBuffer(&archiving_status); get_pg_setting(conn, "archive_command", archive_command); key_value_list_set(&node_status, "Archive command", archive_command); } { int ready_files; ready_files = get_ready_archive_files(conn, data_dir); if (ready_files == ARCHIVE_STATUS_DIR_ERROR) { item_list_append_format(&warnings, "- unable to check archive_status directory\n"); } else { if (runtime_options.output_mode == OM_CSV) { key_value_list_set_format(&node_status, "WALs pending archiving", "%i", ready_files); } else { key_value_list_set_format(&node_status, "WALs pending archiving", "%i pending files", ready_files); } } if (guc_set(conn, "archive_mode", "=", "off")) { key_value_list_set_output_mode(&node_status, "WALs pending archiving", OM_CSV); } } if (node_info.max_wal_senders >= 0) { /* In CSV mode, raw values supplied as well */ key_value_list_set_format(&node_status, "Replication connections", "%i (of maximal %i)", node_info.attached_wal_receivers, node_info.max_wal_senders); } else if (node_info.max_wal_senders == 0) { key_value_list_set_format(&node_status, "Replication connections", "disabled"); } /* check for attached nodes */ { NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER; NodeInfoListCell *node_cell = NULL; ItemList missing_nodes = {NULL, NULL}; int missing_nodes_count = 0; int expected_nodes_count = 0; 
get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes); /* if a witness node is present, we'll need to remove this from the total */ expected_nodes_count = downstream_nodes.node_count; for (node_cell = downstream_nodes.head; node_cell; node_cell = node_cell->next) { /* skip witness server */ if (node_cell->node_info->type == WITNESS) { expected_nodes_count --; continue; } if (is_downstream_node_attached(conn, node_cell->node_info->node_name, NULL) != NODE_ATTACHED) { missing_nodes_count++; item_list_append_format(&missing_nodes, "%s (ID: %i)", node_cell->node_info->node_name, node_cell->node_info->node_id); } } if (missing_nodes_count) { ItemListCell *missing_cell = NULL; item_list_append_format(&warnings, _("- %i of %i downstream nodes not attached:"), missing_nodes_count, expected_nodes_count); for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next) { item_list_append_format(&warnings, " - %s\n", missing_cell->string); } } } if (node_info.max_replication_slots == 0) { key_value_list_set(&node_status, "Replication slots", "disabled"); } else { PQExpBufferData slotinfo; /* * check for missing replication slots - we do this regardless of * what "max_replication_slots" is set to, in case the downstream * node was configured with "use_replication_slots=true" and is * expecting a replication slot to be available */ get_downstream_nodes_with_missing_slot(conn, config_file_options.node_id, &missing_slots); if (missing_slots.node_count > 0) { NodeInfoListCell *missing_slot_cell = NULL; item_list_append_format(&warnings, _("- replication slots missing for following %i node(s):"), missing_slots.node_count); for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next) { item_list_append_format(&warnings, _(" - %s (ID: %i, slot name: \"%s\")"), missing_slot_cell->node_info->node_name, missing_slot_cell->node_info->node_id, missing_slot_cell->node_info->slot_name); } } 
initPQExpBuffer(&slotinfo); appendPQExpBuffer(&slotinfo, "%i physical (of maximal %i; %i missing)", node_info.active_replication_slots + node_info.inactive_replication_slots, node_info.max_replication_slots, missing_slots.node_count); if (node_info.inactive_replication_slots > 0) { KeyValueList inactive_replication_slots = {NULL, NULL}; (void) get_inactive_replication_slots(conn, &inactive_replication_slots); appendPQExpBuffer(&slotinfo, "; %i inactive", node_info.inactive_replication_slots); item_list_append_format(&warnings, _("- node has %i inactive physical replication slots"), node_info.inactive_replication_slots); for (cell = inactive_replication_slots.head; cell; cell = cell->next) { item_list_append_format(&warnings, " - %s", cell->key); } key_value_list_free(&inactive_replication_slots); } key_value_list_set(&node_status, "Replication slots", slotinfo.data); termPQExpBuffer(&slotinfo); } if (node_info.type == STANDBY) { key_value_list_set_format(&node_status, "Upstream node", "%s (ID: %i)", node_info.upstream_node_name, node_info.upstream_node_id); get_replication_info(conn, node_info.type, &replication_info); key_value_list_set_format(&node_status, "Replication lag", "%i seconds", replication_info.replication_lag_time); key_value_list_set_format(&node_status, "Last received LSN", "%X/%X", format_lsn(replication_info.last_wal_receive_lsn)); key_value_list_set_format(&node_status, "Last replayed LSN", "%X/%X", format_lsn(replication_info.last_wal_replay_lsn)); } else { key_value_list_set(&node_status, "Upstream node", "(none)"); key_value_list_set_output_mode(&node_status, "Upstream node", OM_CSV); key_value_list_set(&node_status, "Replication lag", "n/a"); key_value_list_set(&node_status, "Last received LSN", "(none)"); key_value_list_set_output_mode(&node_status, "Last received LSN", OM_CSV); key_value_list_set(&node_status, "Last replayed LSN", "(none)"); key_value_list_set_output_mode(&node_status, "Last replayed LSN", OM_CSV); } 
parse_recovery_conf(data_dir, &recovery_conf); /* format output */ initPQExpBuffer(&output); if (runtime_options.output_mode == OM_CSV) { appendPQExpBuffer(&output, "\"Node name\",\"%s\"\n", node_info.node_name); appendPQExpBuffer(&output, "\"Node ID\",\"%i\"\n", node_info.node_id); for (cell = node_status.head; cell; cell = cell->next) { appendPQExpBuffer(&output, "\"%s\",\"%s\"\n", cell->key, cell->value); } /* we'll add the raw data as well */ appendPQExpBuffer(&output, "\"max_wal_senders\",%i\n", node_info.max_wal_senders); appendPQExpBuffer(&output, "\"occupied_wal_senders\",%i\n", node_info.attached_wal_receivers); appendPQExpBuffer(&output, "\"max_replication_slots\",%i\n", node_info.max_replication_slots); appendPQExpBuffer(&output, "\"active_replication_slots\",%i\n", node_info.active_replication_slots); /* output inactive slot information */ appendPQExpBuffer(&output, "\"inactive_replication_slots\",%i", node_info.inactive_replication_slots); if (node_info.inactive_replication_slots) { KeyValueList inactive_replication_slots = {NULL, NULL}; (void) get_inactive_replication_slots(conn, &inactive_replication_slots); for (cell = inactive_replication_slots.head; cell; cell = cell->next) { appendPQExpBuffer(&output, ",\"%s\"", cell->key); } key_value_list_free(&inactive_replication_slots); } /* output missing slot information */ appendPQExpBufferChar(&output, '\n'); appendPQExpBuffer(&output, "\"missing_replication_slots\",%i", missing_slots.node_count); if (missing_slots.node_count > 0) { NodeInfoListCell *missing_slot_cell = NULL; for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next) { appendPQExpBuffer(&output, ",\"%s\"", missing_slot_cell->node_info->slot_name); } } } else { appendPQExpBuffer(&output, "Node \"%s\":\n", node_info.node_name); for (cell = node_status.head; cell; cell = cell->next) { if (cell->output_mode == OM_NOT_SET) appendPQExpBuffer(&output, "\t%s: %s\n", cell->key, cell->value); } } 
puts(output.data); termPQExpBuffer(&output); if (warnings.head != NULL && runtime_options.terse == false && runtime_options.output_mode == OM_TEXT) { log_warning(_("following issue(s) were detected:")); print_item_list(&warnings); log_hint(_("execute \"repmgr node check\" for more details")); } clear_node_info_list(&missing_slots); key_value_list_free(&node_status); item_list_free(&warnings); PQfinish(conn); /* * If warnings were noted, even if they're not displayed (e.g. in --csv node), * that means something's not right so we need to emit a non-zero exit code. */ if (warnings.head != NULL) { exit(ERR_NODE_STATUS); } return; } /* * Returns information about the running state of the node. * For internal use during "standby switchover". * * Returns "longopt" output: * * --status=(RUNNING|SHUTDOWN|UNCLEAN_SHUTDOWN|UNKNOWN) * --last-checkpoint=... */ static void _do_node_status_is_shutdown_cleanly(void) { PGPing ping_status; PQExpBufferData output; DBState db_state; XLogRecPtr checkPoint = InvalidXLogRecPtr; NodeStatus node_status = NODE_STATUS_UNKNOWN; initPQExpBuffer(&output); appendPQExpBufferStr(&output, "--state="); /* sanity-check we're dealing with a PostgreSQL directory */ if (is_pg_dir(config_file_options.data_directory) == false) { appendPQExpBufferStr(&output, "UNKNOWN"); printf("%s\n", output.data); termPQExpBuffer(&output); return; } ping_status = PQping(config_file_options.conninfo); switch (ping_status) { case PQPING_OK: node_status = NODE_STATUS_UP; break; case PQPING_REJECT: node_status = NODE_STATUS_UP; break; case PQPING_NO_ATTEMPT: case PQPING_NO_RESPONSE: /* status not yet clear */ break; } /* check what pg_control says */ if (get_db_state(config_file_options.data_directory, &db_state) == false) { /* * Unable to retrieve the database state from pg_control */ node_status = NODE_STATUS_UNKNOWN; log_verbose(LOG_DEBUG, "unable to determine db state"); goto return_state; } log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state)); if 
(db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY) { if (node_status != NODE_STATUS_UP) { node_status = NODE_STATUS_UNCLEAN_SHUTDOWN; } /* server is still responding but shutting down */ else if (db_state == DB_SHUTDOWNING) { node_status = NODE_STATUS_SHUTTING_DOWN; } } checkPoint = get_latest_checkpoint_location(config_file_options.data_directory); if (checkPoint == InvalidXLogRecPtr) { /* unable to read pg_control, don't know what's happening */ node_status = NODE_STATUS_UNKNOWN; } else if (node_status == NODE_STATUS_UNKNOWN) { /* * if still "UNKNOWN" at this point, then the node must be cleanly shut * down */ node_status = NODE_STATUS_DOWN; } return_state: log_verbose(LOG_DEBUG, "node status determined as: %s", print_node_status(node_status)); appendPQExpBuffer(&output, "%s", print_node_status(node_status)); if (node_status == NODE_STATUS_DOWN) { appendPQExpBuffer(&output, " --last-checkpoint-lsn=%X/%X", format_lsn(checkPoint)); } printf("%s\n", output.data); termPQExpBuffer(&output); return; } static void exit_optformat_error(const char *error, int errcode) { PQExpBufferData output; Assert(runtime_options.output_mode == OM_OPTFORMAT); initPQExpBuffer(&output); appendPQExpBuffer(&output, "--error=%s", error); printf("%s\n", output.data); termPQExpBuffer(&output); exit(errcode); } /* * Configuration file required */ void do_node_check(void) { PGconn *conn = NULL; PQExpBufferData output; t_node_info node_info = T_NODE_INFO_INITIALIZER; CheckStatus return_code; CheckStatusList status_list = {NULL, NULL}; CheckStatusListCell *cell = NULL; bool issue_detected = false; bool exit_on_connection_error = true; /* for internal use */ if (runtime_options.has_passfile == true) { return_code = has_passfile() ? 
0 : 1; exit(return_code); } /* for use by "standby switchover" */ if (runtime_options.replication_connection == true) { do_node_check_replication_connection(); exit(SUCCESS); } if (runtime_options.db_connection == true) { exit_on_connection_error = false; } /* * If --optformat was provided, we'll assume this is a remote invocation * and instead of exiting with an error, we'll return an error string to * so the remote invoker will know what's happened. */ if (runtime_options.output_mode == OM_OPTFORMAT) { exit_on_connection_error = false; } if (config_file_options.conninfo[0] != '\0') { t_conninfo_param_list node_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER; char *errmsg = NULL; bool parse_success = false; initialize_conninfo_params(&node_conninfo, false); parse_success = parse_conninfo_string(config_file_options.conninfo, &node_conninfo, &errmsg, false); if (parse_success == false) { if (runtime_options.output_mode == OM_OPTFORMAT) { exit_optformat_error("CONNINFO_PARSE", ERR_BAD_CONFIG); } log_error(_("unable to parse conninfo string \"%s\" for local node"), config_file_options.conninfo); log_detail("%s", errmsg); exit(ERR_BAD_CONFIG); } /* * If --superuser option provided, attempt to connect as the specified user */ if (runtime_options.superuser[0] != '\0') { conn = establish_db_connection_with_replacement_param( config_file_options.conninfo, "user", runtime_options.superuser, exit_on_connection_error); } else { conn = establish_db_connection_by_params(&node_conninfo, exit_on_connection_error); } } else { conn = establish_db_connection_by_params(&source_conninfo, exit_on_connection_error); } /* * --db-connection option provided */ if (runtime_options.db_connection == true) { return_code = do_node_check_db_connection(conn, runtime_options.output_mode); PQfinish(conn); exit(return_code); } /* * If we've reached here, and the connection is invalid, then --optformat was provided */ if (PQstatus(conn) != CONNECTION_OK) { exit_optformat_error("DB_CONNECTION", 
ERR_DB_CONN); } if (get_node_record(conn, config_file_options.node_id, &node_info) != RECORD_FOUND) { log_error(_("no record found for node %i"), config_file_options.node_id); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* add replication statistics to node record */ get_node_replication_stats(conn, &node_info); /* * handle specific checks ====================== */ if (runtime_options.archive_ready == true) { return_code = do_node_check_archive_ready(conn, runtime_options.output_mode, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.upstream == true) { return_code = do_node_check_upstream(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.downstream == true) { return_code = do_node_check_downstream(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.replication_lag == true) { return_code = do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.role == true) { return_code = do_node_check_role(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.slots == true) { return_code = do_node_check_slots(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.missing_slots == true) { return_code = do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.data_directory_config == true) { return_code = do_node_check_data_directory(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.repmgrd == true) { return_code = do_node_check_repmgrd(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.replication_config_owner == true) { return_code = 
do_node_check_replication_config_owner(conn, runtime_options.output_mode, &node_info, NULL); PQfinish(conn); exit(return_code); } if (runtime_options.output_mode == OM_NAGIOS) { log_error(_("--nagios can only be used with a specific check")); log_hint(_("execute \"repmgr node --help\" for details")); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* output general overview */ initPQExpBuffer(&output); /* order functions are called is also output order */ if (do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (do_node_check_upstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (do_node_check_downstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (do_node_check_data_directory(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK) issue_detected = true; if (runtime_options.output_mode == OM_CSV) { appendPQExpBuffer(&output, "\"Node name\",\"%s\"\n", node_info.node_name); appendPQExpBuffer(&output, "\"Node ID\",\"%i\"\n", node_info.node_id); for (cell = status_list.head; cell; cell = cell->next) { appendPQExpBuffer(&output, "\"%s\",\"%s\"", cell->item, output_check_status(cell->status)); if (strlen(cell->details)) { appendPQExpBuffer(&output, ",\"%s\"", cell->details); } appendPQExpBufferChar(&output, '\n'); } } else { 
appendPQExpBuffer(&output, "Node \"%s\":\n", node_info.node_name); for (cell = status_list.head; cell; cell = cell->next) { appendPQExpBuffer(&output, "\t%s: %s", cell->item, output_check_status(cell->status)); if (strlen(cell->details)) { appendPQExpBuffer(&output, " (%s)", cell->details); } appendPQExpBufferChar(&output, '\n'); } } printf("%s", output.data); termPQExpBuffer(&output); check_status_list_free(&status_list); PQfinish(conn); if (issue_detected == true) { exit(ERR_NODE_STATUS); } } static void do_node_check_replication_connection(void) { PGconn *local_conn = NULL; PGconn *repl_conn = NULL; t_node_info node_record = T_NODE_INFO_INITIALIZER; RecordStatus record_status = RECORD_NOT_FOUND; PQExpBufferData output; initPQExpBuffer(&output); appendPQExpBufferStr(&output, "--connection="); if (runtime_options.remote_node_id == UNKNOWN_NODE_ID) { appendPQExpBufferStr(&output, "UNKNOWN"); printf("%s\n", output.data); termPQExpBuffer(&output); return; } /* retrieve remote node record from local database */ local_conn = establish_db_connection(config_file_options.conninfo, false); if (PQstatus(local_conn) != CONNECTION_OK) { appendPQExpBufferStr(&output, "CONNECTION_ERROR"); printf("%s\n", output.data); termPQExpBuffer(&output); return; } record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record); PQfinish(local_conn); if (record_status != RECORD_FOUND) { appendPQExpBufferStr(&output, "UNKNOWN"); printf("%s\n", output.data); termPQExpBuffer(&output); return; } repl_conn = establish_replication_connection_from_conninfo(node_record.conninfo, node_record.repluser); if (PQstatus(repl_conn) != CONNECTION_OK) { appendPQExpBufferStr(&output, "BAD"); printf("%s\n", output.data); termPQExpBuffer(&output); return; } PQfinish(repl_conn); appendPQExpBufferStr(&output, "OK"); printf("%s\n", output.data); termPQExpBuffer(&output); return; } static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output) 
{ int ready_archive_files = 0; CheckStatus status = CHECK_STATUS_UNKNOWN; PQExpBufferData details; if (mode == OM_CSV && list_output == NULL) { log_error(_("--csv output not provided with --archive-ready option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory); if (ready_archive_files > config_file_options.archive_ready_critical) { status = CHECK_STATUS_CRITICAL; switch (mode) { case OM_OPTFORMAT: appendPQExpBuffer(&details, "--files=%i --threshold=%i", ready_archive_files, config_file_options.archive_ready_critical); break; case OM_NAGIOS: appendPQExpBuffer(&details, "%i pending archive ready files | files=%i;%i;%i", ready_archive_files, ready_archive_files, config_file_options.archive_ready_warning, config_file_options.archive_ready_critical); break; case OM_TEXT: appendPQExpBuffer(&details, "%i pending archive ready files, critical threshold: %i", ready_archive_files, config_file_options.archive_ready_critical); break; default: break; } } else if (ready_archive_files > config_file_options.archive_ready_warning) { status = CHECK_STATUS_WARNING; switch (mode) { case OM_OPTFORMAT: appendPQExpBuffer(&details, "--files=%i --threshold=%i", ready_archive_files, config_file_options.archive_ready_warning); break; case OM_NAGIOS: appendPQExpBuffer(&details, "%i pending archive ready files | files=%i;%i;%i", ready_archive_files, ready_archive_files, config_file_options.archive_ready_warning, config_file_options.archive_ready_critical); break; case OM_TEXT: appendPQExpBuffer(&details, "%i pending archive ready files (threshold: %i)", ready_archive_files, config_file_options.archive_ready_warning); break; default: break; } } else if (ready_archive_files < 0) { status = CHECK_STATUS_UNKNOWN; switch (mode) { case OM_OPTFORMAT: break; case OM_NAGIOS: case OM_TEXT: appendPQExpBufferStr(&details, "unable to check archive_status directory"); break; default: break; } } else 
{ status = CHECK_STATUS_OK; switch (mode) { case OM_OPTFORMAT: appendPQExpBuffer(&details, "--files=%i", ready_archive_files); break; case OM_NAGIOS: appendPQExpBuffer(&details, "%i pending archive ready files | files=%i;%i;%i", ready_archive_files, ready_archive_files, config_file_options.archive_ready_warning, config_file_options.archive_ready_critical); break; case OM_TEXT: appendPQExpBuffer(&details, "%i pending archive ready files", ready_archive_files); break; default: break; } } switch (mode) { case OM_OPTFORMAT: { printf("--status=%s %s\n", output_check_status(status), details.data); } break; case OM_NAGIOS: printf("REPMGR_ARCHIVE_READY %s: %s\n", output_check_status(status), details.data); break; case OM_CSV: case OM_TEXT: if (list_output != NULL) { check_status_list_set(list_output, "WAL archiving", status, details.data); } else { printf("%s (%s)\n", output_check_status(status), details.data); } default: break; } termPQExpBuffer(&details); return status; } static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER; NodeInfoListCell *cell = NULL; int missing_nodes_count = 0; int expected_nodes_count = 0; CheckStatus status = CHECK_STATUS_OK; ItemList missing_nodes = {NULL, NULL}; ItemList attached_nodes = {NULL, NULL}; PQExpBufferData details; if (mode == OM_CSV && list_output == NULL) { log_error(_("--csv output not provided with --downstream option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes); /* if a witness node is present, we'll need to remove this from the total */ expected_nodes_count = downstream_nodes.node_count; for (cell = downstream_nodes.head; cell; cell = cell->next) { /* skip witness server */ if (cell->node_info->type == WITNESS) { expected_nodes_count --; continue; } if 
(is_downstream_node_attached_quiet(conn, cell->node_info->node_name, NULL) != NODE_ATTACHED) { missing_nodes_count++; item_list_append_format(&missing_nodes, "%s (ID: %i)", cell->node_info->node_name, cell->node_info->node_id); } else { item_list_append_format(&attached_nodes, "%s (ID: %i)", cell->node_info->node_name, cell->node_info->node_id); } } if (node_info->type == WITNESS) { /* witness is not connecting to any upstream */ appendPQExpBufferStr(&details, _("N/A - node is a witness")); } else if (missing_nodes_count == 0) { if (expected_nodes_count == 0) appendPQExpBufferStr(&details, "this node has no downstream nodes"); else appendPQExpBuffer(&details, "%i of %i downstream nodes attached", expected_nodes_count - missing_nodes_count, expected_nodes_count); } else { ItemListCell *missing_cell = NULL; bool first = true; status = CHECK_STATUS_CRITICAL; appendPQExpBuffer(&details, "%i of %i downstream nodes not attached", missing_nodes_count, expected_nodes_count); if (mode != OM_NAGIOS) { appendPQExpBufferStr(&details, "; missing: "); for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next) { if (first == false) appendPQExpBufferStr(&details, ", "); else first = false; if (first == false) appendPQExpBufferStr(&details, missing_cell->string); } } } switch (mode) { case OM_NAGIOS: { if (missing_nodes_count) { ItemListCell *missing_cell = NULL; bool first = true; appendPQExpBufferStr(&details, " (missing: "); for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next) { if (first == false) appendPQExpBufferStr(&details, ", "); else first = false; if (first == false) appendPQExpBufferStr(&details, missing_cell->string); } appendPQExpBufferChar(&details, ')'); } printf("REPMGR_DOWNSTREAM_SERVERS %s: %s | attached=%i, missing=%i\n", output_check_status(status), details.data, expected_nodes_count - missing_nodes_count, missing_nodes_count); } break; case OM_CSV: case OM_TEXT: if (list_output != NULL) { 
check_status_list_set(list_output, "Downstream servers", status, details.data); } else { printf("%s (%s)\n", output_check_status(status), details.data); } default: break; } termPQExpBuffer(&details); clear_node_info_list(&downstream_nodes); return status; } static CheckStatus do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { PGconn *upstream_conn = NULL; t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER; PQExpBufferData details; CheckStatus status = CHECK_STATUS_OK; if (mode == OM_CSV && list_output == NULL) { log_error(_("--csv output not provided with --upstream option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); if (node_info->type == WITNESS) { /* witness is not connecting to any upstream */ appendPQExpBufferStr(&details, _("N/A - node is a witness")); } else if (get_node_record(conn, node_info->upstream_node_id, &upstream_node_info) != RECORD_FOUND) { if (get_recovery_type(conn) == RECTYPE_STANDBY) { appendPQExpBuffer(&details, _("node \"%s\" (ID: %i) is a standby but no upstream record found"), node_info->node_name, node_info->node_id); status = CHECK_STATUS_CRITICAL; } else { appendPQExpBufferStr(&details, _("N/A - node is primary")); } } else { upstream_conn = establish_db_connection(upstream_node_info.conninfo, true); /* check our node is connected */ if (is_downstream_node_attached(upstream_conn, config_file_options.node_name, NULL) != NODE_ATTACHED) { appendPQExpBuffer(&details, _("node \"%s\" (ID: %i) is not attached to expected upstream node \"%s\" (ID: %i)"), node_info->node_name, node_info->node_id, upstream_node_info.node_name, upstream_node_info.node_id); status = CHECK_STATUS_CRITICAL; } else { appendPQExpBuffer(&details, _("node \"%s\" (ID: %i) is attached to expected upstream node \"%s\" (ID: %i)"), node_info->node_name, node_info->node_id, upstream_node_info.node_name, upstream_node_info.node_id); } } switch (mode) { case OM_NAGIOS: { 
printf("REPMGR_UPSTREAM_SERVER %s: %s\n", output_check_status(status), details.data); } break; case OM_TEXT: if (list_output != NULL) { check_status_list_set(list_output, "Upstream connection", status, details.data); } else { printf("%s (%s)\n", output_check_status(status), details.data); } default: break; } termPQExpBuffer(&details); return status; } static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { CheckStatus status = CHECK_STATUS_OK; int lag_seconds = 0; PQExpBufferData details; if (mode == OM_CSV && list_output == NULL) { log_error(_("--csv output not provided with --replication-lag option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); if (node_info->recovery_type == RECTYPE_PRIMARY) { switch (mode) { case OM_OPTFORMAT: appendPQExpBufferStr(&details, "--lag=0"); break; case OM_NAGIOS: appendPQExpBuffer(&details, "0 seconds | lag=0;%i;%i", config_file_options.replication_lag_warning, config_file_options.replication_lag_critical); break; case OM_TEXT: if (node_info->type == WITNESS) { appendPQExpBufferStr(&details, "N/A - node is witness"); } else { appendPQExpBufferStr(&details, "N/A - node is primary"); } break; default: break; } } else { lag_seconds = get_replication_lag_seconds(conn); log_debug("lag seconds: %i", lag_seconds); if (lag_seconds >= config_file_options.replication_lag_critical) { status = CHECK_STATUS_CRITICAL; switch (mode) { case OM_OPTFORMAT: appendPQExpBuffer(&details, "--lag=%i --threshold=%i", lag_seconds, config_file_options.replication_lag_critical); break; case OM_NAGIOS: appendPQExpBuffer(&details, "%i seconds | lag=%i;%i;%i", lag_seconds, lag_seconds, config_file_options.replication_lag_warning, config_file_options.replication_lag_critical); break; case OM_TEXT: appendPQExpBuffer(&details, "%i seconds, critical threshold: %i)", lag_seconds, config_file_options.replication_lag_critical); break; default: break; } } else if 
(lag_seconds > config_file_options.replication_lag_warning) { status = CHECK_STATUS_WARNING; switch (mode) { case OM_OPTFORMAT: appendPQExpBuffer(&details, "--lag=%i --threshold=%i", lag_seconds, config_file_options.replication_lag_warning); break; case OM_NAGIOS: appendPQExpBuffer(&details, "%i seconds | lag=%i;%i;%i", lag_seconds, lag_seconds, config_file_options.replication_lag_warning, config_file_options.replication_lag_critical); break; case OM_TEXT: appendPQExpBuffer(&details, "%i seconds, warning threshold: %i)", lag_seconds, config_file_options.replication_lag_warning); break; default: break; } } else if (lag_seconds == UNKNOWN_REPLICATION_LAG) { status = CHECK_STATUS_UNKNOWN; switch (mode) { case OM_OPTFORMAT: break; case OM_NAGIOS: case OM_TEXT: appendPQExpBufferStr(&details, "unable to query replication lag"); break; default: break; } } else { status = CHECK_STATUS_OK; switch (mode) { case OM_OPTFORMAT: appendPQExpBuffer(&details, "--lag=%i", lag_seconds); break; case OM_NAGIOS: appendPQExpBuffer(&details, "%i seconds | lag=%i;%i;%i", lag_seconds, lag_seconds, config_file_options.replication_lag_warning, config_file_options.replication_lag_critical); break; case OM_TEXT: appendPQExpBuffer(&details, "%i seconds", lag_seconds); break; default: break; } } } switch (mode) { case OM_OPTFORMAT: printf("--status=%s %s\n", output_check_status(status), details.data); break; case OM_NAGIOS: printf("REPMGR_REPLICATION_LAG %s: %s\n", output_check_status(status), details.data); break; case OM_CSV: case OM_TEXT: if (list_output != NULL) { check_status_list_set(list_output, "Replication lag", status, details.data); } else { printf("%s (%s)\n", output_check_status(status), details.data); } default: break; } termPQExpBuffer(&details); return status; } static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { CheckStatus status = CHECK_STATUS_OK; PQExpBufferData details; RecoveryType recovery_type = 
get_recovery_type(conn); if (mode == OM_CSV && list_output == NULL) { log_error(_("--csv output not provided with --role option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); switch (node_info->type) { case PRIMARY: if (recovery_type == RECTYPE_STANDBY) { status = CHECK_STATUS_CRITICAL; appendPQExpBufferStr(&details, _("node is registered as primary but running as standby")); } else { appendPQExpBufferStr(&details, _("node is primary")); } break; case STANDBY: if (recovery_type == RECTYPE_PRIMARY) { status = CHECK_STATUS_CRITICAL; appendPQExpBufferStr(&details, _("node is registered as standby but running as primary")); } else { appendPQExpBufferStr(&details, _("node is standby")); } break; case WITNESS: if (recovery_type == RECTYPE_STANDBY) { status = CHECK_STATUS_CRITICAL; appendPQExpBufferStr(&details, _("node is registered as witness but running as standby")); } else { appendPQExpBufferStr(&details, _("node is witness")); } break; default: break; } switch (mode) { case OM_NAGIOS: printf("REPMGR_SERVER_ROLE %s: %s\n", output_check_status(status), details.data); break; case OM_CSV: case OM_TEXT: if (list_output != NULL) { check_status_list_set(list_output, "Server role", status, details.data); } else { printf("%s (%s)\n", output_check_status(status), details.data); } default: break; } termPQExpBuffer(&details); return status; }

/*
 * Report on the physical replication slot counts held in the node record.
 *
 * The counts are read from node_info (total_replication_slots and
 * inactive_replication_slots); no query is issued here.
 *
 * Returns CHECK_STATUS_CRITICAL if at least one slot is inactive, otherwise
 * CHECK_STATUS_OK. The result is printed (Nagios / plain text) or stored in
 * list_output when that is not NULL (text and CSV modes).
 *
 * Exits with ERR_BAD_CONFIG if CSV output is requested without a list.
 */
static CheckStatus
do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
{
	PQExpBufferData details;
	CheckStatus status = CHECK_STATUS_OK;

	if (list_output == NULL && mode == OM_CSV)
	{
		log_error(_("--csv output not provided with --slots option"));
		PQfinish(conn);
		exit(ERR_BAD_CONFIG);
	}

	initPQExpBuffer(&details);

	if (node_info->total_replication_slots != 0)
	{
		if (node_info->inactive_replication_slots > 0)
		{
			/* any inactive slot is treated as critical */
			status = CHECK_STATUS_CRITICAL;

			appendPQExpBuffer(&details,
							  _("%i of %i physical replication slots are inactive"),
							  node_info->inactive_replication_slots,
							  node_info->total_replication_slots);
		}
		else if (node_info->inactive_replication_slots == 0)
		{
			appendPQExpBuffer(&details,
							  _("%i of %i physical replication slots are active"),
							  node_info->total_replication_slots,
							  node_info->total_replication_slots);
		}
	}
	else
	{
		appendPQExpBufferStr(&details,
							 _("node has no physical replication slots"));
	}

	if (mode == OM_NAGIOS)
	{
		printf("REPMGR_INACTIVE_SLOTS %s: %s | slots=%i;%i\n",
			   output_check_status(status),
			   details.data,
			   node_info->total_replication_slots,
			   node_info->inactive_replication_slots);
	}
	else if (mode == OM_CSV || mode == OM_TEXT)
	{
		if (list_output == NULL)
		{
			printf("%s (%s)\n",
				   output_check_status(status),
				   details.data);
		}
		else
		{
			check_status_list_set(list_output,
								  "Replication slots",
								  status,
								  details.data);
		}
	}

	termPQExpBuffer(&details);

	return status;
}

static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { CheckStatus status = CHECK_STATUS_OK; PQExpBufferData details; NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER; if (mode == OM_CSV && list_output == NULL) { log_error(_("--csv output not provided with --missing-slots option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); get_downstream_nodes_with_missing_slot(conn, config_file_options.node_id, &missing_slots); if (missing_slots.node_count == 0) { appendPQExpBufferStr(&details, _("node has no missing physical replication slots")); } else { NodeInfoListCell *missing_slot_cell = NULL; bool first_element = true; status = CHECK_STATUS_CRITICAL; appendPQExpBuffer(&details, _("%i physical replication slots are missing"), missing_slots.node_count); if (missing_slots.node_count) { appendPQExpBufferStr(&details, ": "); for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next) { if (first_element == true) { first_element = false; } else { appendPQExpBufferStr(&details, ", "); } appendPQExpBufferStr(&details,
missing_slot_cell->node_info->slot_name); } } } switch (mode) { case OM_NAGIOS: { printf("REPMGR_MISSING_SLOTS %s: %s | missing_slots=%i", output_check_status(status), details.data, missing_slots.node_count); if (missing_slots.node_count) { NodeInfoListCell *missing_slot_cell = NULL; bool first_element = true; printf(";"); for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next) { if (first_element == true) { first_element = false; } else { printf(","); } printf("%s", missing_slot_cell->node_info->slot_name); } } printf("\n"); break; } case OM_CSV: case OM_TEXT: if (list_output != NULL) { check_status_list_set(list_output, "Missing physical replication slots", status, details.data); } else { printf("%s (%s)\n", output_check_status(status), details.data); } default: break; } clear_node_info_list(&missing_slots); termPQExpBuffer(&details); return status; } CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { CheckStatus status = CHECK_STATUS_OK; char actual_data_directory[MAXPGPATH] = ""; PQExpBufferData details; if (mode == OM_CSV && list_output == NULL) { log_error(_("--csv output not provided with --data-directory-config option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); /* * Check actual data directory matches that in repmgr.conf; note this requires * a superuser connection */ if (connection_has_pg_monitor_role(conn, "pg_read_all_settings") == true) { /* we expect to have a database connection */ if (get_pg_setting(conn, "data_directory", actual_data_directory) == false) { appendPQExpBuffer(&details, _("unable to determine current \"data_directory\"")); status = CHECK_STATUS_UNKNOWN; } if (strncmp(actual_data_directory, config_file_options.data_directory, MAXPGPATH) != 0) { if (mode != OM_NAGIOS) { appendPQExpBuffer(&details, _("configured \"data_directory\" is \"%s\"; "), config_file_options.data_directory); } 
appendPQExpBuffer(&details, "actual data directory is \"%s\"", actual_data_directory); status = CHECK_STATUS_CRITICAL; } else { appendPQExpBuffer(&details, _("configured \"data_directory\" is \"%s\""), config_file_options.data_directory); } } /* * If no superuser connection available, sanity-check that the configuration directory looks * like a PostgreSQL directory and hope it's the right one. */ else { if (mode == OM_TEXT) { log_info(_("connection is not a superuser connection, falling back to simple check")); if (PQserverVersion(conn) >= 100000) { log_hint(_("provide a superuser with -S/--superuser, or add the \"%s\" user to role \"pg_read_all_settings\" or \"pg_monitor\""), PQuser(conn)); } } if (is_pg_dir(config_file_options.data_directory) == false) { if (mode == OM_NAGIOS) { appendPQExpBufferStr(&details, _("configured \"data_directory\" is not a PostgreSQL data directory")); } else { appendPQExpBuffer(&details, _("configured \"data_directory\" \"%s\" is not a PostgreSQL data directory"), actual_data_directory); } status = CHECK_STATUS_CRITICAL; } else { appendPQExpBuffer(&details, _("configured \"data_directory\" is \"%s\""), config_file_options.data_directory); } } switch (mode) { case OM_OPTFORMAT: printf("--configured-data-directory=%s\n", output_check_status(status)); break; case OM_NAGIOS: printf("REPMGR_DATA_DIRECTORY %s: %s", output_check_status(status), config_file_options.data_directory); if (status == CHECK_STATUS_CRITICAL) { printf(" | %s", details.data); } puts(""); break; case OM_CSV: case OM_TEXT: if (list_output != NULL) { check_status_list_set(list_output, "Configured data directory", status, details.data); } else { printf("%s (%s)\n", output_check_status(status), details.data); } default: break; } termPQExpBuffer(&details); return status; } CheckStatus do_node_check_repmgrd(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { CheckStatus status = CHECK_STATUS_OK; if (mode == OM_CSV && list_output == NULL) { 
log_error(_("--csv output not provided with --repmgrd option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } status = get_repmgrd_status(conn); switch (mode) { case OM_OPTFORMAT: printf("--repmgrd=%s\n", output_check_status(status)); break; case OM_NAGIOS: printf("REPMGRD %s: %s\n", output_check_status(status), output_repmgrd_status(status)); break; case OM_CSV: case OM_TEXT: if (list_output != NULL) { check_status_list_set(list_output, "repmgrd", status, output_repmgrd_status(status)); } else { printf("%s (%s)\n", output_check_status(status), output_repmgrd_status(status)); } default: break; } return status; } /* * This is not included in the general list output */ static CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) { CheckStatus status = CHECK_STATUS_OK; PQExpBufferData errmsg; PQExpBufferData details; if (mode != OM_OPTFORMAT) { log_error(_("--replication-config-owner option can only be used with --optformat")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&errmsg); initPQExpBuffer(&details); if (check_replication_config_owner(PQserverVersion(conn), config_file_options.data_directory, &errmsg, &details) == false) { status = CHECK_STATUS_CRITICAL; } printf("--replication-config-owner=%s\n", output_check_status(status)); return status; } /* * This is not included in the general list output */ static CheckStatus do_node_check_db_connection(PGconn *conn, OutputMode mode) { CheckStatus status = CHECK_STATUS_OK; PQExpBufferData details; if (mode == OM_CSV) { log_error(_("--csv output not provided with --db-connection option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* This check is for configuration diagnostics only */ if (mode == OM_NAGIOS) { log_error(_("--nagios output not provided with --db-connection option")); PQfinish(conn); exit(ERR_BAD_CONFIG); } initPQExpBuffer(&details); if (PQstatus(conn) != CONNECTION_OK) { t_conninfo_param_list conninfo = 
T_CONNINFO_PARAM_LIST_INITIALIZER; int c; status = CHECK_STATUS_CRITICAL; initialize_conninfo_params(&conninfo, false); conn_to_param_list(conn, &conninfo); appendPQExpBufferStr(&details, "connection parameters used:"); for (c = 0; c < conninfo.size && conninfo.keywords[c] != NULL; c++) { if (conninfo.values[c] != NULL && conninfo.values[c][0] != '\0') { appendPQExpBuffer(&details, " %s=%s", conninfo.keywords[c], conninfo.values[c]); } } } if (mode == OM_OPTFORMAT) { printf("--db-connection=%s\n", output_check_status(status)); } else if (mode == OM_TEXT) { printf("%s (%s)\n", output_check_status(status), details.data); } termPQExpBuffer(&details); return status; } void do_node_service(void) { t_server_action action = ACTION_UNKNOWN; char data_dir[MAXPGPATH] = ""; char command[MAXLEN] = ""; PQExpBufferData output; action = parse_server_action(runtime_options.action); if (action == ACTION_UNKNOWN) { log_error(_("unknown value \"%s\" provided for parameter --action"), runtime_options.action); log_hint(_("valid values are \"start\", \"stop\", \"restart\", \"reload\" and \"promote\"")); exit(ERR_BAD_CONFIG); } if (runtime_options.list_actions == true) { return _do_node_service_list_actions(action); } if (data_dir_required_for_action(action)) { get_node_config_directory(data_dir); if (data_dir[0] == '\0') { log_error(_("unable to determine data directory for action")); exit(ERR_BAD_CONFIG); } } if ((action == ACTION_STOP || action == ACTION_RESTART) && runtime_options.checkpoint == true) { PGconn *conn = NULL; if (config_file_options.conninfo[0] != '\0') { /* * If --superuser option provided, attempt to connect as the specified user */ if (runtime_options.superuser[0] != '\0') { conn = establish_db_connection_with_replacement_param( config_file_options.conninfo, "user", runtime_options.superuser, true); } else { conn = establish_db_connection(config_file_options.conninfo, true); } } else { conn = establish_db_connection_by_params(&source_conninfo, true); } if 
(can_execute_checkpoint(conn) == false) { if (runtime_options.dry_run == true) { log_warning(_("a CHECKPOINT would be issued here but no authorized connection is available")); } else { log_warning(_("an authorized connection is required to issue a CHECKPOINT")); } if (PQserverVersion(conn) >= 150000) { log_hint(_("provide a superuser with -S/--superuser or grant pg_checkpoint role to repmgr user")); } else { log_hint(_("provide a superuser with -S/--superuser")); } } else { if (runtime_options.dry_run == true) { log_info(_("a CHECKPOINT would be issued here")); } else { log_notice(_("issuing CHECKPOINT on node \"%s\" (ID: %i) "), config_file_options.node_name, config_file_options.node_id); checkpoint(conn); } } PQfinish(conn); } get_server_action(action, command, data_dir); if (runtime_options.dry_run == true) { log_info(_("would execute server command \"%s\""), command); return; } /* * log level is "DETAIL" here as this command is intended to be executed * by another repmgr process (e.g. during standby switchover); that repmgr * should emit a "NOTICE" about the intent of the command. */ log_detail(_("executing server command \"%s\""), command); initPQExpBuffer(&output); if (local_command(command, &output) == false) { termPQExpBuffer(&output); exit(ERR_LOCAL_COMMAND); } termPQExpBuffer(&output); } static void _do_node_service_list_actions(t_server_action action) { char command[MAXLEN] = ""; char data_dir[MAXPGPATH] = ""; bool data_dir_required = false; /* do we need to provide a data directory for any of the actions? 
*/ if (data_dir_required_for_action(ACTION_START)) data_dir_required = true; if (data_dir_required_for_action(ACTION_STOP)) data_dir_required = true; if (data_dir_required_for_action(ACTION_RESTART)) data_dir_required = true; if (data_dir_required_for_action(ACTION_RELOAD)) data_dir_required = true; if (data_dir_required_for_action(ACTION_PROMOTE)) data_dir_required = true; if (data_dir_required == true) { get_node_config_directory(data_dir); } /* show command for specific action only */ if (action != ACTION_NONE) { get_server_action(action, command, data_dir); printf("%s\n", command); return; } puts(_("Following commands would be executed for each action:")); puts(""); get_server_action(ACTION_START, command, data_dir); printf(" start: \"%s\"\n", command); get_server_action(ACTION_STOP, command, data_dir); printf(" stop: \"%s\"\n", command); get_server_action(ACTION_RESTART, command, data_dir); printf(" restart: \"%s\"\n", command); get_server_action(ACTION_RELOAD, command, data_dir); printf(" reload: \"%s\"\n", command); get_server_action(ACTION_PROMOTE, command, data_dir); printf(" promote: \"%s\"\n", command); puts(""); }

/*
 * Map the text supplied with --action to a t_server_action value.
 *
 * Matching is case-insensitive. An empty string yields ACTION_NONE; any
 * text which is not one of "start", "stop", "restart", "reload" or
 * "promote" yields ACTION_UNKNOWN.
 */
static t_server_action
parse_server_action(const char *action_name)
{
	static const struct
	{
		const char *label;
		t_server_action value;
	}			known_actions[] = {
		{"start", ACTION_START},
		{"stop", ACTION_STOP},
		{"restart", ACTION_RESTART},
		{"reload", ACTION_RELOAD},
		{"promote", ACTION_PROMOTE}
	};
	const size_t known_action_count = sizeof(known_actions) / sizeof(known_actions[0]);
	size_t		idx;

	/* no action name given at all */
	if (*action_name == '\0')
		return ACTION_NONE;

	for (idx = 0; idx < known_action_count; idx++)
	{
		if (strcasecmp(action_name, known_actions[idx].label) == 0)
			return known_actions[idx].value;
	}

	return ACTION_UNKNOWN;
}

/* * Rejoin a dormant (shut down) node to the replication cluster; this * is typically a former primary which needs to be demoted to a standby. * * Note that "repmgr node rejoin" is also executed by * "repmgr standby switchover" after promoting the new primary.
* * Parameters: * --dry-run * --force-rewind[=VALUE] * --config-files * --config-archive-dir * -W/--no-wait */ void do_node_rejoin(void) { PGconn *upstream_conn = NULL; RecoveryType primary_recovery_type = RECTYPE_UNKNOWN; PGconn *primary_conn = NULL; DBState db_state; PGPing status; bool is_shutdown = true; int server_version_num = UNKNOWN_SERVER_VERSION_NUM; bool hide_standby_signal = false; KeyValueListCell *cell = NULL; PQExpBufferData command; PQExpBufferData command_output; PQExpBufferData follow_output; struct stat statbuf; t_node_info primary_node_record = T_NODE_INFO_INITIALIZER; t_node_info local_node_record = T_NODE_INFO_INITIALIZER; bool success = true; int follow_error_code = SUCCESS; /* check node is not actually running */ status = PQping(config_file_options.conninfo); switch (status) { case PQPING_NO_ATTEMPT: log_error(_("unable to determine status of server")); exit(ERR_BAD_CONFIG); case PQPING_OK: is_shutdown = false; break; case PQPING_REJECT: is_shutdown = false; break; case PQPING_NO_RESPONSE: /* status not yet clear */ break; } if (get_db_state(config_file_options.data_directory, &db_state) == false) { log_error(_("unable to determine database state from pg_control")); exit(ERR_BAD_CONFIG); } if (is_shutdown == false) { log_error(_("database is still running in state \"%s\""), describe_db_state(db_state)); log_hint(_("\"repmgr node rejoin\" cannot be executed on a running node")); exit(ERR_REJOIN_FAIL); } /* * Server version number required to determine whether pg_rewind will run * crash recovery (Pg 13 and later). 
*/ server_version_num = get_pg_version(config_file_options.data_directory, NULL); if (server_version_num == UNKNOWN_SERVER_VERSION_NUM) { /* This is very unlikely to happen */ log_error(_("unable to determine database version")); exit(ERR_BAD_CONFIG); } log_verbose(LOG_DEBUG, "server version number is: %i", server_version_num); /* check if cleanly shut down */ if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY) { if (db_state == DB_SHUTDOWNING) { log_error(_("database is still shutting down")); } else if (server_version_num >= 130000 && runtime_options.force_rewind_used == true) { log_warning(_("database is not shut down cleanly")); log_detail(_("--force-rewind provided, pg_rewind will automatically perform recovery")); /* * If pg_rewind is executed, the first change it will make * is to start the server in single user mode, which will fail * in the presence of "standby.signal", so we'll "hide" it * (actually delete and recreate). */ hide_standby_signal = true; } else { /* * If the database was not shut down cleanly, it *might* rejoin correctly * after starting up and recovering, but better to ensure the database * can recover before trying anything else. 
*/ log_error(_("database is not shut down cleanly")); if (server_version_num >= 130000) { log_hint(_("provide --force-rewind to run recovery")); } else { if (runtime_options.force_rewind_used == true) { log_detail(_("pg_rewind will not be able to run")); } log_hint(_("database should be restarted then shut down cleanly after crash recovery completes")); } exit(ERR_REJOIN_FAIL); } } /* check provided upstream connection */ upstream_conn = establish_db_connection_by_params(&source_conninfo, true); if (get_primary_node_record(upstream_conn, &primary_node_record) == false) { log_error(_("unable to retrieve primary node record")); log_hint(_("check the provided database connection string is for a \"repmgr\" database")); PQfinish(upstream_conn); exit(ERR_BAD_CONFIG); } /* * Emit a notice about the identity of the rejoin target */ log_notice(_("rejoin target is node \"%s\" (ID: %i)"), primary_node_record.node_name, primary_node_record.node_id); /* connect to registered primary and check it's not in recovery */ primary_conn = establish_db_connection(primary_node_record.conninfo, false); if (PQstatus(primary_conn) != CONNECTION_OK) { RecoveryType upstream_recovery_type = get_recovery_type(upstream_conn); log_error(_("unable to connect to current registered primary \"%s\" (ID: %i)"), primary_node_record.node_name, primary_node_record.node_id); log_detail(_("registered primary node conninfo is: \"%s\""), primary_node_record.conninfo); /* * Catch case where provided upstream is not in recovery, but is also * not registered as primary */ if (upstream_recovery_type == RECTYPE_PRIMARY) { log_warning(_("provided upstream connection string is for a server which is not in recovery, but not registered as primary")); log_hint(_("fix repmgr metadata configuration before continuing")); } PQfinish(upstream_conn); exit(ERR_BAD_CONFIG); } PQfinish(upstream_conn); primary_recovery_type = get_recovery_type(primary_conn); if (primary_recovery_type != RECTYPE_PRIMARY) { log_error(_("primary 
server is registered as node \"%s\" (ID: %i), but server is not a primary"), primary_node_record.node_name, primary_node_record.node_id); /* TODO: hint about checking cluster */ PQfinish(primary_conn); exit(ERR_BAD_CONFIG); } /* * Fetch the local node record - we'll need this later, and it acts as an * additional sanity-check that the node is known to the primary. */ if (get_node_record(primary_conn, config_file_options.node_id, &local_node_record) != RECORD_FOUND) { log_error(_("unable to retrieve node record for the local node")); log_hint(_("check the local node is registered with the current primary \"%s\" (ID: %i)"), primary_node_record.node_name, primary_node_record.node_id); PQfinish(primary_conn); exit(ERR_BAD_CONFIG); } /* * Sanity-check replication slot availability */ if (config_file_options.use_replication_slots) { bool slots_available = check_replication_slots_available(primary_node_record.node_id, primary_conn); if (slots_available == false) { PQfinish(primary_conn); exit(ERR_BAD_CONFIG); } } /* * sanity-check that it will actually be possible to stream from the new upstream */ { bool can_rejoin; TimeLineID tli = get_min_recovery_end_timeline(config_file_options.data_directory); XLogRecPtr min_recovery_location = get_min_recovery_location(config_file_options.data_directory); /* * It's possible this was a former primary, so the minRecoveryPoint* * fields may be empty. 
*/ if (min_recovery_location == InvalidXLogRecPtr) min_recovery_location = get_latest_checkpoint_location(config_file_options.data_directory); if (tli == 0) tli = get_timeline(config_file_options.data_directory); can_rejoin = check_node_can_attach(tli, min_recovery_location, primary_conn, &primary_node_record, true); if (can_rejoin == false) { PQfinish(primary_conn); exit(ERR_REJOIN_FAIL); } } /* * --force-rewind specified - check prerequisites, and attempt to execute * (if --dry-run provided, just output the command which would be executed) */ if (runtime_options.force_rewind_used == true) { PQExpBufferData msg; PQExpBufferData filebuf; int ret; /* * Check that pg_rewind can be used */ initPQExpBuffer(&msg); if (can_use_pg_rewind(primary_conn, config_file_options.data_directory, &msg) == false) { log_error(_("--force-rewind specified but pg_rewind cannot be used")); log_detail("%s", msg.data); termPQExpBuffer(&msg); PQfinish(primary_conn); exit(ERR_BAD_CONFIG); } appendPQExpBufferStr(&msg, _("prerequisites for using pg_rewind are met")); if (runtime_options.dry_run == true) { log_info("%s", msg.data); } else { log_verbose(LOG_INFO, "%s", msg.data); } termPQExpBuffer(&msg); /* * Archive requested configuration files. * * In --dry-run mode this acts as a check that the files can be archived, though * errors will only be logged; any copied files will be deleted and --dry-run * execution will continue. 
*/ _do_node_archive_config(); /* execute pg_rewind */ initPQExpBuffer(&command); if (runtime_options.force_rewind_path[0] != '\0') { appendPQExpBuffer(&command, "%s -D ", runtime_options.force_rewind_path); } else { make_pg_path(&command, "pg_rewind"); appendPQExpBufferStr(&command, " -D "); } appendShellString(&command, config_file_options.data_directory); appendPQExpBuffer(&command, " --source-server='%s'", primary_node_record.conninfo); if (runtime_options.dry_run == true) { log_info(_("pg_rewind would now be executed")); log_detail(_("pg_rewind command is:\n %s"), command.data); } else { log_notice(_("executing pg_rewind")); log_detail(_("pg_rewind command is \"%s\""), command.data); /* * In Pg13 and later, pg_rewind will attempt to start up a server which * was not cleanly shut down in single user mode. This will fail if * "standby.signal" is present. We'll remove it and restore it after * pg_rewind runs. */ if (hide_standby_signal == true) { char standby_signal_file_path[MAXPGPATH] = ""; log_notice(_("temporarily removing \"standby.signal\"")); log_detail(_("this is required so pg_rewind can fix the unclean shutdown")); make_standby_signal_path(config_file_options.data_directory, standby_signal_file_path); if (unlink(standby_signal_file_path) < 0 && errno != ENOENT) { log_error(_("unable to remove \"standby.signal\" file in data directory \"%s\""), standby_signal_file_path); log_detail("%s", strerror(errno)); exit(ERR_REJOIN_FAIL); } } initPQExpBuffer(&command_output); ret = local_command(command.data, &command_output); termPQExpBuffer(&command); if (hide_standby_signal == true) { /* * Restore standby.signal if we previously removed it, regardless * of whether the pg_rewind operation failed. 
*/ log_notice(_("recreating \"standby.signal\"")); write_standby_signal(config_file_options.data_directory); } if (ret == false) { log_error(_("pg_rewind execution failed")); log_detail("%s", command_output.data); termPQExpBuffer(&command_output); exit(ERR_REJOIN_FAIL); } termPQExpBuffer(&command_output); /* Restore any previously archived config files */ _do_node_restore_config(); initPQExpBuffer(&filebuf); /* remove any recovery.done file copied in by pg_rewind */ appendPQExpBuffer(&filebuf, "%s/recovery.done", config_file_options.data_directory); if (stat(filebuf.data, &statbuf) == 0) { log_verbose(LOG_INFO, _("deleting \"recovery.done\"")); if (unlink(filebuf.data) == -1) { log_warning(_("unable to delete \"%s\""), filebuf.data); log_detail("%s", strerror(errno)); } } termPQExpBuffer(&filebuf); /* * Delete any replication slots copied in by pg_rewind. * * TODO: * - from PostgreSQL 11, this will be handled by pg_rewind, so * we can skip this step from that version; see commit * 266b6acb312fc440c1c1a2036aa9da94916beac6 * - possibly delete contents of various other directories * as per the above commit for pre-PostgreSQL 11 */ { PQExpBufferData slotdir_path; DIR *slotdir; struct dirent *slotdir_ent; initPQExpBuffer(&slotdir_path); appendPQExpBuffer(&slotdir_path, "%s/pg_replslot", config_file_options.data_directory); slotdir = opendir(slotdir_path.data); if (slotdir == NULL) { log_warning(_("unable to open replication slot directory \"%s\""), slotdir_path.data); log_detail("%s", strerror(errno)); } else { while ((slotdir_ent = readdir(slotdir)) != NULL) { struct stat local_statbuf; PQExpBufferData slotdir_ent_path; if (strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0) continue; initPQExpBuffer(&slotdir_ent_path); appendPQExpBuffer(&slotdir_ent_path, "%s/%s", slotdir_path.data, slotdir_ent->d_name); if (stat(slotdir_ent_path.data, &local_statbuf) == 0 && !S_ISDIR(local_statbuf.st_mode)) { termPQExpBuffer(&slotdir_ent_path); continue; 
} log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data); if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST) { log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data); log_detail("%s", strerror(errno)); log_hint(_("directory may need to be manually removed")); } termPQExpBuffer(&slotdir_ent_path); } closedir(slotdir); } termPQExpBuffer(&slotdir_path); } } } if (runtime_options.dry_run == true) { log_info(_("prerequisites for executing NODE REJOIN are met")); exit(SUCCESS); } initPQExpBuffer(&follow_output); /* * do_standby_follow_internal() can handle situations where the follow * target is not the primary, so requires database handles to both * (even if they point to the same node). For the time being, * "node rejoin" will only attach a standby to the primary. */ success = do_standby_follow_internal(primary_conn, primary_conn, &primary_node_record, &follow_output, ERR_REJOIN_FAIL, &follow_error_code); if (success == false) { log_error(_("NODE REJOIN failed")); if (strlen(follow_output.data)) log_detail("%s", follow_output.data); create_event_notification(primary_conn, &config_file_options, config_file_options.node_id, "node_rejoin", success, follow_output.data); PQfinish(primary_conn); termPQExpBuffer(&follow_output); exit(follow_error_code); } /* * Actively check that node actually started and connected to primary, * if not exit with ERR_REJOIN_FAIL. * * This check can be overridden with -W/--no-wait, in which case a one-time * check will be carried out. */ if (runtime_options.no_wait == false) { standy_join_status join_success = check_standby_join(primary_conn, &primary_node_record, &local_node_record); create_event_notification(primary_conn, &config_file_options, config_file_options.node_id, "node_rejoin", join_success == JOIN_SUCCESS ? 
true : false, follow_output.data); if (join_success != JOIN_SUCCESS) { termPQExpBuffer(&follow_output); log_error(_("NODE REJOIN failed")); if (join_success == JOIN_FAIL_NO_PING) { log_detail(_("local node \"%s\" did not become available start after %i seconds"), config_file_options.node_name, config_file_options.node_rejoin_timeout); } else { log_detail(_("no active record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"), config_file_options.node_name, primary_node_record.node_name); } log_hint(_("check the PostgreSQL log on the local node")); exit(ERR_REJOIN_FAIL); } } else { /* -W/--no-wait provided - check once */ NodeAttached node_attached = is_downstream_node_attached(primary_conn, config_file_options.node_name, NULL); if (node_attached == NODE_ATTACHED) success = true; } /* * Handle replication slots: * - if a slot for the new upstream exists, delete that * - warn about any other inactive replication slots */ if (runtime_options.force_rewind_used == false && config_file_options.use_replication_slots) { PGconn *local_conn = NULL; local_conn = establish_db_connection(config_file_options.conninfo, false); if (PQstatus(local_conn) != CONNECTION_OK) { log_warning(_("unable to connect to local node to check replication slot status")); log_hint(_("execute \"repmgr node check\" to check inactive slots and drop manually if necessary")); } else { KeyValueList inactive_replication_slots = {NULL, NULL}; int inactive_count = 0; PQExpBufferData slotinfo; drop_replication_slot_if_exists(local_conn, config_file_options.node_id, primary_node_record.slot_name); (void) get_inactive_replication_slots(local_conn, &inactive_replication_slots); initPQExpBuffer(&slotinfo); for (cell = inactive_replication_slots.head; cell; cell = cell->next) { appendPQExpBuffer(&slotinfo, " - %s (%s)", cell->key, cell->value); inactive_count++; } if (inactive_count > 0) { log_warning(_("%i inactive replication slots detected"), inactive_count); log_detail(_("inactive 
replication slots:\n%s"), slotinfo.data); log_hint(_("these replication slots may need to be removed manually")); } termPQExpBuffer(&slotinfo); PQfinish(local_conn); } } if (success == true) { log_notice(_("NODE REJOIN successful")); log_detail("%s", follow_output.data); } else { /* * if we reach here, no record found in upstream node's pg_stat_replication */ log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream")); log_hint(_("you will need to manually check the node's replication status")); } termPQExpBuffer(&follow_output); return; } /* * Currently for testing purposes only, not documented; * use at own risk! */ void do_node_control(void) { PGconn *conn = NULL; pid_t wal_receiver_pid = UNKNOWN_PID; conn = establish_db_connection(config_file_options.conninfo, true); if (runtime_options.disable_wal_receiver == true) { wal_receiver_pid = disable_wal_receiver(conn); PQfinish(conn); if (wal_receiver_pid == UNKNOWN_PID) exit(ERR_BAD_CONFIG); exit(SUCCESS); } if (runtime_options.enable_wal_receiver == true) { wal_receiver_pid = enable_wal_receiver(conn, true); PQfinish(conn); if (wal_receiver_pid == UNKNOWN_PID) exit(ERR_BAD_CONFIG); exit(SUCCESS); } log_error(_("no option provided")); PQfinish(conn); } /* * For "internal" use by `node rejoin` on the local node when * called by "standby switchover" from the remote node. * * This archives any configuration files in the data directory, which may be * overwritten by pg_rewind. 
* * Requires configuration file, optionally --config-archive-dir */ static void _do_node_archive_config(void) { PQExpBufferData archive_dir; struct stat statbuf; struct dirent *arcdir_ent; DIR *arcdir; KeyValueList config_files = {NULL, NULL}; KeyValueListCell *cell = NULL; int copied_count = 0; initPQExpBuffer(&archive_dir); format_archive_dir(&archive_dir); /* sanity-check directory path */ if (stat(archive_dir.data, &statbuf) == -1) { if (errno != ENOENT) { log_error(_("error encountered when checking archive directory \"%s\""), archive_dir.data); log_detail("%s", strerror(errno)); termPQExpBuffer(&archive_dir); exit(ERR_BAD_CONFIG); } /* attempt to create and open the directory */ if (mkdir(archive_dir.data, S_IRWXU) != 0 && errno != EEXIST) { log_error(_("unable to create temporary archive directory \"%s\""), archive_dir.data); log_detail("%s", strerror(errno)); termPQExpBuffer(&archive_dir); exit(ERR_BAD_CONFIG); } if (runtime_options.dry_run == true) { log_verbose(LOG_INFO, "temporary archive directory \"%s\" created", archive_dir.data); } } else if (!S_ISDIR(statbuf.st_mode)) { log_error(_("\"%s\" exists but is not a directory"), archive_dir.data); termPQExpBuffer(&archive_dir); exit(ERR_BAD_CONFIG); } arcdir = opendir(archive_dir.data); /* always attempt to open the directory */ if (arcdir == NULL) { log_error(_("unable to open archive directory \"%s\""), archive_dir.data); log_detail("%s", strerror(errno)); termPQExpBuffer(&archive_dir); exit(ERR_BAD_CONFIG); } if (runtime_options.dry_run == false) { /* * attempt to remove any existing files in the directory * TODO: collate problem files into list */ while ((arcdir_ent = readdir(arcdir)) != NULL) { PQExpBufferData arcdir_ent_path; initPQExpBuffer(&arcdir_ent_path); appendPQExpBuffer(&arcdir_ent_path, "%s/%s", archive_dir.data, arcdir_ent->d_name); if (stat(arcdir_ent_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) { termPQExpBuffer(&arcdir_ent_path); continue; } if (unlink(arcdir_ent_path.data) == 
-1) { log_error(_("unable to delete file in temporary archive directory")); log_detail(_("file is: \"%s\""), arcdir_ent_path.data); log_detail("%s", strerror(errno)); closedir(arcdir); termPQExpBuffer(&arcdir_ent_path); exit(ERR_BAD_CONFIG); } termPQExpBuffer(&arcdir_ent_path); } } closedir(arcdir); /* * extract list of config files from --config-files */ { int i = 0; int j = 0; int config_file_len = strlen(runtime_options.config_files); char filenamebuf[MAXPGPATH] = ""; PQExpBufferData pathbuf; for (j = 0; j < config_file_len; j++) { if (runtime_options.config_files[j] == ',') { int filename_len = j - i; if (filename_len >= MAXPGPATH) filename_len = MAXPGPATH - 1; strncpy(filenamebuf, runtime_options.config_files + i, filename_len); filenamebuf[filename_len] = '\0'; initPQExpBuffer(&pathbuf); appendPQExpBuffer(&pathbuf, "%s/%s", config_file_options.data_directory, filenamebuf); key_value_list_set(&config_files, filenamebuf, pathbuf.data); termPQExpBuffer(&pathbuf); i = j + 1; } } if (i < config_file_len) { int filename_len = config_file_len - i; strncpy(filenamebuf, runtime_options.config_files + i, filename_len); filenamebuf[filename_len] = '\0'; initPQExpBuffer(&pathbuf); appendPQExpBuffer(&pathbuf, "%s/%s", config_file_options.data_directory, filenamebuf); key_value_list_set(&config_files, filenamebuf, pathbuf.data); termPQExpBuffer(&pathbuf); } } for (cell = config_files.head; cell; cell = cell->next) { PQExpBufferData dest_file; initPQExpBuffer(&dest_file); appendPQExpBuffer(&dest_file, "%s/%s", archive_dir.data, cell->key); if (stat(cell->value, &statbuf) == -1) { log_warning(_("specified file \"%s\" not found, skipping"), cell->value); } else { if (runtime_options.dry_run == true) { log_info("file \"%s\" would be copied to \"%s\"", cell->key, dest_file.data); copied_count++; } else { log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"", cell->key, dest_file.data); copy_file(cell->value, dest_file.data); copied_count++; } } termPQExpBuffer(&dest_file); } if 
(runtime_options.dry_run == true) { log_verbose(LOG_INFO, _("%i files would have been copied to \"%s\""), copied_count, archive_dir.data); } else { log_verbose(LOG_INFO, _("%i files copied to \"%s\""), copied_count, archive_dir.data); } if (runtime_options.dry_run == true) { /* * Delete directory in --dry-run mode - it should be empty unless it's been * interfered with for some reason, in which case manual intervention is * required */ if (rmdir(archive_dir.data) != 0 && errno != EEXIST) { log_warning(_("unable to delete directory \"%s\""), archive_dir.data); log_detail("%s", strerror(errno)); log_hint(_("directory may need to be manually removed")); } else { log_verbose(LOG_INFO, "temporary archive directory \"%s\" deleted", archive_dir.data); } } termPQExpBuffer(&archive_dir); } /* * Intended mainly for "internal" use by `standby switchover`, which * calls this on the target server to restore any configuration files * to the data directory, which may have been overwritten by an operation * like pg_rewind * * Not designed to be called if the instance is running, but does * not currently check. 
* * Requires -D/--pgdata, optionally --config-archive-dir * * Removes --config-archive-dir after successful copy */ static void _do_node_restore_config(void) { PQExpBufferData archive_dir; DIR *arcdir; struct dirent *arcdir_ent; int copied_count = 0; bool copy_ok = true; initPQExpBuffer(&archive_dir); format_archive_dir(&archive_dir); arcdir = opendir(archive_dir.data); if (arcdir == NULL) { log_error(_("unable to open archive directory \"%s\""), archive_dir.data); log_detail("%s", strerror(errno)); termPQExpBuffer(&archive_dir); exit(ERR_BAD_CONFIG); } while ((arcdir_ent = readdir(arcdir)) != NULL) { struct stat statbuf; PQExpBufferData src_file_path; PQExpBufferData dest_file_path; initPQExpBuffer(&src_file_path); appendPQExpBuffer(&src_file_path, "%s/%s", archive_dir.data, arcdir_ent->d_name); /* skip non-files */ if (stat(src_file_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) { termPQExpBuffer(&src_file_path); continue; } initPQExpBuffer(&dest_file_path); appendPQExpBuffer(&dest_file_path, "%s/%s", config_file_options.data_directory, arcdir_ent->d_name); log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"", src_file_path.data, dest_file_path.data); if (copy_file(src_file_path.data, dest_file_path.data) == false) { copy_ok = false; log_warning(_("unable to copy \"%s\" to \"%s\""), arcdir_ent->d_name, runtime_options.data_dir); } else { unlink(src_file_path.data); copied_count++; } termPQExpBuffer(&dest_file_path); termPQExpBuffer(&src_file_path); } closedir(arcdir); log_notice(_("%i files copied to %s"), copied_count, config_file_options.data_directory); if (copy_ok == false) { log_warning(_("unable to copy all files from \"%s\""), archive_dir.data); } else { /* * Finally, delete directory - it should be empty unless it's been * interfered with for some reason, in which case manual intervention is * required */ if (rmdir(archive_dir.data) != 0 && errno != EEXIST) { log_warning(_("unable to delete directory \"%s\""), archive_dir.data); log_detail("%s", 
strerror(errno)); log_hint(_("directory may need to be manually removed")); } else { log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data); } } termPQExpBuffer(&archive_dir); return; } static void format_archive_dir(PQExpBufferData *archive_dir) { appendPQExpBuffer(archive_dir, "%s/repmgr-config-archive-%s", runtime_options.config_archive_dir, config_file_options.node_name); log_verbose(LOG_DEBUG, "using archive directory \"%s\"", archive_dir->data); } static bool copy_file(const char *src_file, const char *dest_file) { FILE *ptr_old, *ptr_new; int a = 0; ptr_old = fopen(src_file, "r"); if (ptr_old == NULL) return false; ptr_new = fopen(dest_file, "w"); if (ptr_new == NULL) { fclose(ptr_old); return false; } chmod(dest_file, S_IRUSR | S_IWUSR); while (1) { a = fgetc(ptr_old); if (!feof(ptr_old)) { fputc(a, ptr_new); } else { break; } } fclose(ptr_new); fclose(ptr_old); return true; } static const char * output_repmgrd_status(CheckStatus status) { switch (status) { case CHECK_STATUS_OK: return "repmgrd running"; case CHECK_STATUS_WARNING: return "repmgrd running but paused"; case CHECK_STATUS_CRITICAL: return "repmgrd not running"; case CHECK_STATUS_UNKNOWN: return "repmgrd status unknown"; } return "UNKNOWN"; } void do_node_help(void) { print_help_header(); printf(_("Usage:\n")); printf(_(" %s [OPTIONS] node status\n"), progname()); printf(_(" %s [OPTIONS] node check\n"), progname()); printf(_(" %s [OPTIONS] node rejoin\n"), progname()); printf(_(" %s [OPTIONS] node service\n"), progname()); puts(""); printf(_("NODE STATUS\n")); puts(""); printf(_(" \"node status\" displays an overview of a node's basic information and replication status.\n")); puts(""); printf(_(" Configuration file required, runs on local node only.\n")); puts(""); printf(_(" --csv emit output as CSV\n")); puts(""); printf(_("NODE CHECK\n")); puts(""); printf(_(" \"node check\" performs some health checks on a node from a replication perspective.\n")); puts(""); printf(_(" 
Configuration file required, runs on local node only.\n")); puts(""); printf(_(" Connection options:\n")); printf(_(" -S, --superuser=USERNAME superuser to use, if repmgr user is not superuser\n")); puts(""); printf(_(" Output options:\n")); printf(_(" --csv emit output as CSV (not available for individual check output)\n")); printf(_(" --nagios emit output in Nagios format (individual check output only)\n")); puts(""); printf(_(" Following options check an individual status:\n")); printf(_(" --archive-ready number of WAL files ready for archiving\n")); printf(_(" --downstream whether all downstream nodes are connected\n")); printf(_(" --upstream whether the node is connected to its upstream\n")); printf(_(" --replication-lag replication lag in seconds (standbys only)\n")); printf(_(" --role check node has expected role\n")); printf(_(" --slots check for inactive replication slots\n")); printf(_(" --missing-slots check for missing replication slots\n")); printf(_(" --repmgrd check if repmgrd is running\n")); printf(_(" --data-directory-config check repmgr's data directory configuration\n")); puts(""); printf(_("NODE REJOIN\n")); puts(""); printf(_(" \"node rejoin\" enables a dormant (stopped) node to be rejoined to the replication cluster.\n")); puts(""); printf(_(" Configuration file required, runs on local node only.\n")); puts(""); printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \ " (including usability of \"pg_rewind\" if requested)\n")); printf(_(" --force-rewind[=VALUE] execute \"pg_rewind\" if necessary\n")); printf(_(" (PostgreSQL 9.4 - provide full \"pg_rewind\" path)\n")); printf(_(" --config-files comma-separated list of configuration files to retain\n" \ " after executing \"pg_rewind\"\n")); printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \ " (default: /tmp)\n")); printf(_(" -W, --no-wait don't wait for the node to rejoin cluster\n")); puts(""); printf(_("NODE 
SERVICE\n")); puts(""); printf(_(" \"node service\" executes a system service command to stop/start/restart/reload a node\n" \ " or optionally display which command would be executed\n")); puts(""); printf(_(" Configuration file required, runs on local node only.\n")); puts(""); printf(_(" --dry-run show what action would be performed, but don't execute it\n")); printf(_(" --action action to perform (one of \"start\", \"stop\", \"restart\" or \"reload\")\n")); printf(_(" --list-actions show what command would be performed for each action\n")); printf(_(" --checkpoint issue a CHECKPOINT before stopping or restarting the node\n")); printf(_(" -S, --superuser=USERNAME superuser to use, if repmgr user is not superuser\n")); puts(""); printf(_("%s home page: <%s>\n"), "repmgr", REPMGR_URL); }