/* * repmgr.c - Command interpreter for the repmgr * Copyright (C) 2ndQuadrant, 2010-2015 * * This module is a command-line utility to easily setup a cluster of * hot standby servers for an HA environment * * Commands implemented are. * MASTER REGISTER * STANDBY REGISTER, STANDBY CLONE, STANDBY FOLLOW, STANDBY PROMOTE * CLUSTER SHOW, CLUSTER CLEANUP * WITNESS CREATE * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * */ #include "repmgr.h" #include #include #include #include #include #include "log.h" #include "config.h" #include "check_dir.h" #include "strutil.h" #include "version.h" #define RECOVERY_FILE "recovery.conf" #define NO_ACTION 0 /* Not a real action, just to initialize */ #define MASTER_REGISTER 1 #define STANDBY_REGISTER 2 #define STANDBY_CLONE 3 #define STANDBY_PROMOTE 4 #define STANDBY_FOLLOW 5 #define WITNESS_CREATE 6 #define CLUSTER_SHOW 7 #define CLUSTER_CLEANUP 8 static bool create_recovery_file(const char *data_dir); static int test_ssh_connection(char *host, char *remote_user); static int copy_remote_files(char *host, char *remote_user, char *remote_path, char *local_path); static int run_basebackup(void); static bool check_parameters_for_action(const int action); static bool create_schema(PGconn *conn); static bool copy_configuration(PGconn *masterconn, PGconn *witnessconn); static void write_primary_conninfo(char *line); static bool write_recovery_file_line(FILE *recovery_file, char *recovery_file_path, char *line); static int check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *server_version_string); static bool check_master_config(PGconn *conn, bool exit_on_error); static bool create_node_record(PGconn *conn, char *action, int node, char *cluster_name, char *node_name, char *conninfo, int priority, bool witness); static void do_master_register(void); static void do_standby_register(void); static void do_standby_clone(void); static void do_standby_promote(void); static void do_standby_follow(void); static void do_witness_create(void); static void do_cluster_show(void); static void do_cluster_cleanup(void); static void do_check_master_config(void); static void usage(void); static void help(const char *progname); /* Global variables */ static const char *progname; static const char *keywords[6]; static const char *values[6]; bool need_a_node = true; /* XXX This should be mapped into a command line option */ bool require_password = false; /* Initialization of runtime options */ t_runtime_options runtime_options = T_RUNTIME_OPTIONS_INITIALIZER; t_configuration_options options = T_CONFIGURATION_OPTIONS_INITIALIZER; static char *server_mode = NULL; static char *server_cmd = NULL; int main(int argc, char **argv) { static struct option long_options[] = { {"dbname", required_argument, NULL, 'd'}, {"host", required_argument, NULL, 'h'}, {"port", required_argument, NULL, 'p'}, {"username", required_argument, NULL, 'U'}, {"superuser", required_argument, NULL, 'S'}, {"dest-dir", required_argument, NULL, 'D'}, {"local-port", required_argument, NULL, 'l'}, {"config-file", required_argument, NULL, 'f'}, {"remote-user", required_argument, NULL, 'R'}, {"wal-keep-segments", required_argument, NULL, 'w'}, {"keep-history", required_argument, NULL, 'k'}, {"force", no_argument, NULL, 'F'}, {"wait", no_argument, NULL, 'W'}, {"min-recovery-apply-delay", required_argument, NULL, 'r'}, {"verbose", no_argument, NULL, 'v'}, {"initdb-no-pwprompt", no_argument, NULL, 1}, {"check-master-config", no_argument, NULL, 2}, {NULL, 0, NULL, 0} }; int optindex; int c, targ; int action = NO_ACTION; bool check_master_config = false; char *ptr = NULL; progname = get_progname(argv[0]); if (argc > 1) { if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) { help(progname); exit(SUCCESS); } if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) { printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION); exit(SUCCESS); } } while ((c = getopt_long(argc, argv, "d:h:p:U:S:D:l:f:R:w:k:FWIvr:", long_options, &optindex)) != -1) { switch (c) { case 'd': strncpy(runtime_options.dbname, optarg, MAXLEN); break; case 'h': strncpy(runtime_options.host, optarg, MAXLEN); break; case 'p': if (atoi(optarg) > 0) strncpy(runtime_options.masterport, optarg, MAXLEN); break; case 'U': strncpy(runtime_options.username, optarg, MAXLEN); break; case 'S': strncpy(runtime_options.superuser, optarg, MAXLEN); break; case 'D': strncpy(runtime_options.dest_dir, optarg, MAXFILENAME); break; case 'l': if (atoi(optarg) > 0) strncpy(runtime_options.localport, optarg, MAXLEN); break; case 'f': strncpy(runtime_options.config_file, optarg, MAXLEN); break; case 'R': strncpy(runtime_options.remote_user, optarg, MAXLEN); break; case 'w': if (atoi(optarg) > 0) strncpy(runtime_options.wal_keep_segments, optarg, MAXLEN); break; case 'k': if (atoi(optarg) > 0) runtime_options.keep_history = atoi(optarg); else runtime_options.keep_history = 0; break; case 'F': runtime_options.force = true; break; case 'W': runtime_options.wait_for_master = true; break; case 'I': runtime_options.ignore_rsync_warn = true; break; case 'r': targ = strtol(optarg, &ptr, 10); if(targ < 0) { usage(); exit(ERR_BAD_CONFIG); } if(ptr && *ptr) { if(strcmp(ptr, "ms") != 0 && strcmp(ptr, "s") != 0 && strcmp(ptr, "min") != 0 && strcmp(ptr, "h") != 0 && strcmp(ptr, "d") != 0) { usage(); exit(ERR_BAD_CONFIG); } } strncpy(runtime_options.min_recovery_apply_delay, optarg, MAXLEN); break; case 'v': runtime_options.verbose = true; break; case 1: runtime_options.initdb_no_pwprompt = true; break; case 2: check_master_config = true; break; default: usage(); exit(ERR_BAD_CONFIG); } } if(check_master_config == true) { do_check_master_config(); exit(SUCCESS); } /* * Now we need to obtain the action, this comes in one of these forms: * MASTER REGISTER | STANDBY {REGISTER | CLONE [node] | PROMOTE | FOLLOW * [node]} | WITNESS CREATE CLUSTER {SHOW | CLEANUP} * * the node part is optional, if we receive it then we shouldn't have * received a -h option */ if (optind < argc) { server_mode = argv[optind++]; if (strcasecmp(server_mode, "STANDBY") != 0 && strcasecmp(server_mode, "MASTER") != 0 && strcasecmp(server_mode, "WITNESS") != 0 && strcasecmp(server_mode, "CLUSTER") != 0) { usage(); exit(ERR_BAD_CONFIG); } } if (optind < argc) { server_cmd = argv[optind++]; /* check posibilities for all server modes */ if (strcasecmp(server_mode, "MASTER") == 0) { if (strcasecmp(server_cmd, "REGISTER") == 0) action = MASTER_REGISTER; } else if (strcasecmp(server_mode, "STANDBY") == 0) { if (strcasecmp(server_cmd, "REGISTER") == 0) action = STANDBY_REGISTER; else if (strcasecmp(server_cmd, "CLONE") == 0) action = STANDBY_CLONE; else if (strcasecmp(server_cmd, "PROMOTE") == 0) action = STANDBY_PROMOTE; else if (strcasecmp(server_cmd, "FOLLOW") == 0) action = STANDBY_FOLLOW; } else if (strcasecmp(server_mode, "CLUSTER") == 0) { if (strcasecmp(server_cmd, "SHOW") == 0) action = CLUSTER_SHOW; else if (strcasecmp(server_cmd, "CLEANUP") == 0) action = CLUSTER_CLEANUP; } else if (strcasecmp(server_mode, "WITNESS") == 0) if (strcasecmp(server_cmd, "CREATE") == 0) action = WITNESS_CREATE; } if (action == NO_ACTION) { usage(); exit(ERR_BAD_CONFIG); } /* For some actions we still can receive a last argument */ if (action == STANDBY_CLONE) { if (optind < argc) { if (runtime_options.host[0]) { log_err(_("Conflicting parameters: you can't use -h while providing a node separately.\n")); usage(); exit(ERR_BAD_CONFIG); } strncpy(runtime_options.host, argv[optind++], MAXLEN); } } if (optind < argc) { log_err(_("%s: too many command-line arguments (first extra is \"%s\")\n"), progname, argv[optind]); usage(); exit(ERR_BAD_CONFIG); } if (!check_parameters_for_action(action)) exit(ERR_BAD_CONFIG); if (!runtime_options.dbname[0]) { if (getenv("PGDATABASE")) strncpy(runtime_options.dbname, getenv("PGDATABASE"), MAXLEN); else if (getenv("PGUSER")) strncpy(runtime_options.dbname, getenv("PGUSER"), MAXLEN); else strncpy(runtime_options.dbname, DEFAULT_DBNAME, MAXLEN); } /* We check that port number is not null */ if (!runtime_options.dbname[0]) { strncpy(runtime_options.masterport, DEFAULT_MASTER_PORT, MAXLEN); } /* Read the configuration file, normally repmgr.conf */ if (!runtime_options.config_file[0]) strncpy(runtime_options.config_file, DEFAULT_CONFIG_FILE, MAXLEN); if (runtime_options.verbose) printf(_("Opening configuration file: %s\n"), runtime_options.config_file); /* * XXX Do not read config files for action where it is not required (clone * for example). */ parse_config(runtime_options.config_file, &options); keywords[2] = "user"; values[2] = (runtime_options.username[0]) ? runtime_options.username : NULL; keywords[3] = "dbname"; values[3] = runtime_options.dbname; keywords[4] = "application_name"; values[4] = (char *) progname; keywords[5] = NULL; values[5] = NULL; /* * Initialize the logger. If verbose command line parameter was input, * make sure that the log level is at least INFO. This is mainly useful * for STANDBY CLONE. That doesn't require a configuration file where a * logging level might be specified at, but it often requires detailed * logging to troubleshoot problems. */ logger_init(&options, progname, options.loglevel, options.logfacility); if (runtime_options.verbose) logger_min_verbose(LOG_INFO); /* * Node configuration information is not needed for all actions, with * STANDBY CLONE being the main exception. */ if (need_a_node) { if (options.node == -1) { log_err(_("Node information is missing. " "Check the configuration file.\n")); exit(ERR_BAD_CONFIG); } } /* Initialise the repmgr schema name */ maxlen_snprintf(repmgr_schema, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX, options.cluster_name); switch (action) { case MASTER_REGISTER: do_master_register(); break; case STANDBY_REGISTER: do_standby_register(); break; case STANDBY_CLONE: do_standby_clone(); break; case STANDBY_PROMOTE: do_standby_promote(); break; case STANDBY_FOLLOW: do_standby_follow(); break; case WITNESS_CREATE: do_witness_create(); break; case CLUSTER_SHOW: do_cluster_show(); break; case CLUSTER_CLEANUP: do_cluster_cleanup(); break; default: usage(); exit(ERR_BAD_CONFIG); } logger_shutdown(); return 0; } static void do_cluster_show(void) { PGconn *conn; PGresult *res; char sqlquery[QUERY_STR_LEN]; char node_role[MAXLEN]; int i; /* We need to connect to check configuration */ log_info(_("%s connecting to database\n"), progname); conn = establish_db_connection(options.conninfo, true); sqlquery_snprintf(sqlquery, "SELECT conninfo, witness " " FROM %s.repl_nodes ", get_repmgr_schema_quoted(conn)); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("Can't get nodes information, have you registered them?\n%s\n"), PQerrorMessage(conn)); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQfinish(conn); printf("Role | Connection String \n"); for (i = 0; i < PQntuples(res); i++) { conn = establish_db_connection(PQgetvalue(res, i, 0), false); if (PQstatus(conn) != CONNECTION_OK) strcpy(node_role, " FAILED"); else if (strcmp(PQgetvalue(res, i, 1), "t") == 0) strcpy(node_role, " witness"); else if (is_standby(conn)) strcpy(node_role, " standby"); else strcpy(node_role, "* master"); printf("%-10s", node_role); printf("| %s\n", PQgetvalue(res, i, 0)); PQfinish(conn); } PQclear(res); } static void do_cluster_cleanup(void) { int master_id; PGconn *conn = NULL; PGconn *master_conn = NULL; PGresult *res; char sqlquery[QUERY_STR_LEN]; /* We need to connect to check configuration */ log_info(_("%s connecting to database\n"), progname); conn = establish_db_connection(options.conninfo, true); /* check if there is a master in this cluster */ log_info(_("%s connecting to master database\n"), progname); master_conn = get_master_connection(conn, options.cluster_name, &master_id, NULL); if (!master_conn) { log_err(_("cluster cleanup: cannot connect to master\n")); PQfinish(conn); exit(ERR_DB_CON); } PQfinish(conn); if (runtime_options.keep_history > 0) { sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_monitor " " WHERE age(now(), last_monitor_time) >= '%d days'::interval ", get_repmgr_schema_quoted(master_conn), runtime_options.keep_history); } else { sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_monitor", get_repmgr_schema_quoted(master_conn)); } res = PQexec(master_conn, sqlquery); if (PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("cluster cleanup: Couldn't clean history\n%s\n"), PQerrorMessage(master_conn)); PQclear(res); PQfinish(master_conn); exit(ERR_BAD_CONFIG); } PQclear(res); /* * Let's VACUUM the table to avoid autovacuum to be launched in an * unexpected hour */ sqlquery_snprintf(sqlquery, "VACUUM %s.repl_monitor", get_repmgr_schema_quoted(master_conn)); res = PQexec(master_conn, sqlquery); /* XXX There is any need to check this VACUUM happens without problems? */ PQclear(res); PQfinish(master_conn); } static void do_master_register(void) { PGconn *conn; PGresult *res; char sqlquery[QUERY_STR_LEN]; bool schema_exists = false; int ret; bool node_record_created; conn = establish_db_connection(options.conninfo, true); /* Verify that master is a supported server version */ log_info(_("%s connecting to master database\n"), progname); check_server_version(conn, "master", true, NULL); /* Check we are a master */ log_info(_("%s connected to master, checking its state\n"), progname); ret = is_standby(conn); if (ret) { log_err(_(ret == 1 ? "Trying to register a standby node as a master\n" : "Connection to node lost!\n")); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* Check if there is a schema for this cluster */ sqlquery_snprintf(sqlquery, "SELECT 1 FROM pg_namespace " "WHERE nspname = '%s' ", get_repmgr_schema()); log_debug(_("master register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("Can't get info about schemas: %s\n"), PQerrorMessage(conn)); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } if (PQntuples(res) > 0) /* schema exists */ { if (!runtime_options.force) /* and we are not forcing so error */ { log_notice(_("Schema %s already exists.\n"), get_repmgr_schema()); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } schema_exists = true; } PQclear(res); if (!schema_exists) { log_info(_("master register: creating database objects inside the %s schema\n"), get_repmgr_schema()); /* ok, create the schema */ if (!create_schema(conn)) return; } else { PGconn *master_conn; int id; if (runtime_options.force) { sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_nodes " " WHERE id = %d ", get_repmgr_schema_quoted(conn), options.node); log_debug(_("master register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_warning(_("Cannot delete node details, %s\n"), PQerrorMessage(conn)); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); } /* Ensure there isn't any other master already registered */ master_conn = get_master_connection(conn, options.cluster_name, &id, NULL); if (master_conn != NULL) { PQfinish(master_conn); log_warning(_("There is a master already in cluster %s\n"), options.cluster_name); exit(ERR_BAD_CONFIG); } } /* Now register the master */ node_record_created = create_node_record(conn, "master register", options.node, options.cluster_name, options.node_name, options.conninfo, options.priority, false); PQfinish(conn); if(node_record_created == false) exit(ERR_DB_QUERY); log_notice(_("Master node correctly registered for cluster %s with id %d (conninfo: %s)\n"), options.cluster_name, options.node, options.conninfo); return; } static void do_standby_register(void) { PGconn *conn; PGconn *master_conn; int master_id, ret; PGresult *res; char sqlquery[QUERY_STR_LEN]; char master_version[MAXVERSIONSTR]; int master_version_num = 0; char standby_version[MAXVERSIONSTR]; int standby_version_num = 0; bool node_record_created; /* XXX: A lot of copied code from do_master_register! Refactor */ log_info(_("%s connecting to standby database\n"), progname); conn = establish_db_connection(options.conninfo, true); /* Verify that standby is a supported server version */ standby_version_num = check_server_version(conn, "standby", true, standby_version); /* Check we are a standby */ ret = is_standby(conn); if (ret == 0 || ret == -1) { log_err(_(ret == 0 ? "repmgr: This node should be a standby (%s)\n" : "repmgr: connection to node (%s) lost\n"), options.conninfo); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* Check if there is a schema for this cluster */ sqlquery_snprintf(sqlquery, "SELECT 1 FROM pg_namespace WHERE nspname = '%s'", get_repmgr_schema()); log_debug(_("standby register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { /* XXX error message does not match query */ log_err(_("Can't get info about tablespaces: %s\n"), PQerrorMessage(conn)); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } if (PQntuples(res) == 0) { /* schema doesn't exist */ log_err(_("Schema %s doesn't exist.\n"), get_repmgr_schema()); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); /* check if there is a master in this cluster */ log_info(_("%s connecting to master database\n"), progname); master_conn = get_master_connection(conn, options.cluster_name, &master_id, NULL); if (!master_conn) { log_err(_("A master must be defined before configuring a slave\n")); exit(ERR_BAD_CONFIG); } /* Verify that master is a supported server version */ log_info(_("%s connected to master, checking its state\n"), progname); master_version_num = check_server_version(conn, "master", false, master_version); if(master_version_num < 0) { PQfinish(conn); PQfinish(master_conn); exit(ERR_BAD_CONFIG); } /* master and standby version should match */ if ((master_version_num / 100) != (standby_version_num / 100)) { PQfinish(conn); PQfinish(master_conn); log_err(_("%s needs versions of both master (%s) and standby (%s) to match.\n"), progname, master_version, standby_version); exit(ERR_BAD_CONFIG); } /* Now register the standby */ log_info(_("%s registering the standby\n"), progname); if (runtime_options.force) { sqlquery_snprintf(sqlquery, "DELETE FROM %s.repl_nodes " " WHERE id = %d", get_repmgr_schema_quoted(master_conn), options.node); log_debug(_("standby register: %s\n"), sqlquery); res = PQexec(master_conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Cannot delete node details, %s\n"), PQerrorMessage(master_conn)); PQfinish(master_conn); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); } node_record_created = create_node_record(master_conn, "standby register", options.node, options.cluster_name, options.node_name, options.conninfo, options.priority, false); PQfinish(master_conn); PQfinish(conn); if(node_record_created == false) exit(ERR_BAD_CONFIG); log_info(_("%s registering the standby complete\n"), progname); log_notice(_("Standby node correctly registered for cluster %s with id %d (conninfo: %s)\n"), options.cluster_name, options.node, options.conninfo); return; } static void do_standby_clone(void) { PGconn *conn; PGresult *res; char sqlquery[QUERY_STR_LEN]; char cluster_size[MAXLEN]; int r = 0, retval = SUCCESS; int i; bool test_mode = false; bool config_file_copy_required = false; char tblspc_dir[MAXFILENAME]; char master_data_directory[MAXFILENAME]; char local_data_directory[MAXFILENAME]; char master_config_file[MAXFILENAME] = ""; char local_config_file[MAXFILENAME] = ""; char master_hba_file[MAXFILENAME] = ""; char local_hba_file[MAXFILENAME] = ""; char master_ident_file[MAXFILENAME] = ""; char local_ident_file[MAXFILENAME] = ""; /* * if dest_dir has been provided, we copy everything in the same path if * dest_dir is set and the master have tablespace, repmgr will stop * because it is more complex to remap the path for the tablespaces and it * does not look useful at the moment * * XXX test_mode a bit of a misnomer */ if (runtime_options.dest_dir[0]) { test_mode = true; log_notice(_("%s Destination directory %s provided, try to clone everything in it.\n"), progname, runtime_options.dest_dir); } /* Connection parameters for master only */ keywords[0] = "host"; values[0] = runtime_options.host; keywords[1] = "port"; values[1] = runtime_options.masterport; /* We need to connect to check configuration and start a backup */ log_info(_("%s connecting to master database\n"), progname); conn = establish_db_connection_by_params(keywords, values, true); /* Verify that master is a supported server version */ log_info(_("%s connected to master, checking its state\n"), progname); check_server_version(conn, "master", true, NULL); check_master_config(conn, true); sqlquery_snprintf(sqlquery, "SELECT pg_tablespace_location(oid) spclocation " " FROM pg_tablespace " "WHERE spcname NOT IN ('pg_default', 'pg_global') "); log_debug("standby clone: %s\n", sqlquery); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("Can't get info about tablespaces: %s\n"), PQerrorMessage(conn)); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } for (i = 0; i < PQntuples(res); i++) { if (test_mode) { log_err("Can't clone in test mode when master have tablespace\n"); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } strncpy(tblspc_dir, PQgetvalue(res, i, 0), MAXFILENAME); /* * Check this directory could be used for tablespace this will create * the directory a bit too early XXX build an array of tablespace to * create later in the backup */ if (!create_pg_dir(tblspc_dir, runtime_options.force)) { PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } } PQclear(res); if(get_cluster_size(conn, cluster_size) == false) exit(ERR_DB_QUERY); log_info(_("Successfully connected to master. Current installation size is %s\n"), cluster_size); /* * Obtain data directory and configuration file locations * We'll check to see whether the configuration files are in the data * directory - if not we'll have to copy them via SSH * * XXX: if configuration files are symlinks to targets outside the data * directory, they won't be copied by pg_basebackup, but we can't tell * this from the below query; we'll probably need to add a check for their * presence and if missing force copy by SSH */ sqlquery_snprintf(sqlquery, " WITH dd AS ( " " SELECT setting " " FROM pg_settings " " WHERE name = 'data_directory' " " ) " " SELECT ps.name, ps.setting, " " ps.setting ~ ('^' || dd.setting) AS in_data_dir " " FROM dd, pg_settings ps " " WHERE ps.name IN ('data_directory', 'config_file', 'hba_file', 'ident_file') " " ORDER BY 1 " ); log_debug(_("standby clone: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { log_err(_("Can't get info about data directory and configuration files: %s\n"), PQerrorMessage(conn)); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* We need all 4 parameters, and they can be retrieved only by superusers */ if (PQntuples(res) != 4) { log_err("%s: STANDBY CLONE should be run by a SUPERUSER\n", progname); PQclear(res); PQfinish(conn); exit(ERR_BAD_CONFIG); } log_debug(_("standby clone: %i tuples\n"), PQntuples(res)); for (i = 0; i < PQntuples(res); i++) { if (strcmp(PQgetvalue(res, i, 0), "data_directory") == 0) { strncpy(master_data_directory, PQgetvalue(res, i, 1), MAXFILENAME); } else if (strcmp(PQgetvalue(res, i, 0), "config_file") == 0) { if(strcmp(PQgetvalue(res, i, 2), "f") == 0) { config_file_copy_required = true; strncpy(master_config_file, PQgetvalue(res, i, 1), MAXFILENAME); } } else if (strcmp(PQgetvalue(res, i, 0), "hba_file") == 0) { if(strcmp(PQgetvalue(res, i, 2), "f") == 0) { config_file_copy_required = true; strncpy(master_hba_file, PQgetvalue(res, i, 1), MAXFILENAME); } } else if (strcmp(PQgetvalue(res, i, 0), "ident_file") == 0) { if(strcmp(PQgetvalue(res, i, 2), "f") == 0) { config_file_copy_required = true; strncpy(master_ident_file, PQgetvalue(res, i, 1), MAXFILENAME); } } else log_warning(_("unknown parameter: %s\n"), PQgetvalue(res, i, 0)); } PQclear(res); if (test_mode) { strncpy(local_data_directory, runtime_options.dest_dir, MAXFILENAME); strncpy(local_config_file, runtime_options.dest_dir, MAXFILENAME); strncpy(local_hba_file, runtime_options.dest_dir, MAXFILENAME); strncpy(local_ident_file, runtime_options.dest_dir, MAXFILENAME); } else { strncpy(local_data_directory, master_data_directory, MAXFILENAME); strncpy(local_config_file, master_config_file, MAXFILENAME); strncpy(local_hba_file, master_hba_file, MAXFILENAME); strncpy(local_ident_file, master_ident_file, MAXFILENAME); } log_notice(_("Starting backup...\n")); /* Check the directory could be used as a PGDATA dir */ /* ZZZ maybe check tablespace, xlog dirs too */ if (!create_pg_dir(local_data_directory, runtime_options.force)) { log_err(_("%s: couldn't use directory %s ...\nUse --force option to force\n"), progname, local_data_directory); r = ERR_BAD_CONFIG; retval = ERR_BAD_CONFIG; goto stop_backup; } r = run_basebackup(); if (r != 0) { log_warning(_("standby clone: base backup failed\n")); retval = ERR_BAD_BASEBACKUP; goto stop_backup; } /* * If configuration files were not in the data directory, we need to copy * them via SSH * * TODO: add option to place these files in the same location on the * standby server? */ if(config_file_copy_required == true) { log_notice(_("Copying configuration files from master\n")); r = test_ssh_connection(runtime_options.host, runtime_options.remote_user); if (r != 0) { log_err(_("%s: Aborting, remote host %s is not reachable.\n"), progname, runtime_options.host); PQfinish(conn); retval = ERR_BAD_SSH; goto stop_backup; } if(strlen(master_config_file)) { log_info(_("standby clone: master config file '%s'\n"), master_config_file); r = copy_remote_files(runtime_options.host, runtime_options.remote_user, master_config_file, local_config_file); if (r != 0) { log_warning(_("standby clone: failed copying master config file '%s'\n"), master_config_file); retval = ERR_BAD_SSH; goto stop_backup; } } if(strlen(master_hba_file)) { log_info(_("standby clone: master hba file '%s'\n"), master_hba_file); r = copy_remote_files(runtime_options.host, runtime_options.remote_user, master_hba_file, local_hba_file); if (r != 0) { log_warning(_("standby clone: failed copying master hba file '%s'\n"), master_hba_file); retval = ERR_BAD_SSH; goto stop_backup; } } if(strlen(master_ident_file)) { log_info(_("standby clone: master ident file '%s'\n"), master_ident_file); r = copy_remote_files(runtime_options.host, runtime_options.remote_user, master_ident_file, local_ident_file); if (r != 0) { log_warning(_("standby clone: failed copying master ident file '%s'\n"), master_ident_file); retval = ERR_BAD_SSH; goto stop_backup; } } } stop_backup: /* If the backup failed then exit */ if (r != 0) { log_err(_("Unable to take a base backup of the master server\n")); log_warning(_("The destination directory (%s) will need to be cleaned up manually\n"), local_data_directory); exit(retval); } /* Finally, write the recovery.conf file */ create_recovery_file(local_data_directory); log_notice(_("%s base backup of standby complete\n"), progname); log_notice("HINT: You can now start your postgresql server\n"); if (test_mode) { log_notice(_("for example : pg_ctl -D %s start\n"), local_data_directory); } else { log_notice("for example : /etc/init.d/postgresql start\n"); } exit(retval); } static void do_standby_promote(void) { PGconn *conn; char script[MAXLEN]; PGconn *old_master_conn; int old_master_id; int r, retval; char data_dir[MAXLEN]; int i, promote_check_timeout = 60, promote_check_interval = 2; bool promote_sucess = false; bool success; /* We need to connect to check configuration */ log_info(_("%s connecting to standby database\n"), progname); conn = establish_db_connection(options.conninfo, true); /* Verify that standby is a supported server version */ log_info(_("%s connected to standby, checking its state\n"), progname); check_server_version(conn, "standby", true, NULL); /* Check we are in a standby node */ retval = is_standby(conn); if (retval == 0 || retval == -1) { log_err(_(retval == 0 ? "%s: The command should be executed on a standby node\n" : "%s: connection to node lost!\n"), progname); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* we also need to check if there isn't any master already */ old_master_conn = get_master_connection(conn, options.cluster_name, &old_master_id, NULL); if (old_master_conn != NULL) { log_err(_("This cluster already has an active master server\n")); PQfinish(old_master_conn); PQfinish(conn); exit(ERR_BAD_CONFIG); } log_notice(_("%s: Promoting standby\n"), progname); /* Get the data directory */ success = get_pg_setting(conn, "data_directory", data_dir); PQfinish(conn); if (success == false) { log_err(_("Unable to determine data directory\n")); exit(ERR_BAD_CONFIG); } /* * Promote standby to master. * * `pg_ctl promote` returns immediately and has no -w option, so we * can't be sure when or if the promotion completes. * For now we'll poll the server until the default timeout (60 seconds) */ maxlen_snprintf(script, "%s/pg_ctl -D %s promote", options.pg_bindir, data_dir); log_notice(_("%s: promoting server using '%s'\n"), progname, script); r = system(script); if (r != 0) { log_err(_("Unable to promote server from standby to master\n")); exit(ERR_NO_RESTART); } /* reconnect to check we got promoted */ log_info(_("%s reconnecting to promoted server\n"), progname); conn = establish_db_connection(options.conninfo, true); for(i = 0; i < promote_check_timeout; i += promote_check_interval) { retval = is_standby(conn); if(!retval) { promote_sucess = true; break; } sleep(promote_check_interval); } if (promote_sucess == false) { /* XXX exit with error? */ log_err(_(retval == 1 ? "%s: STANDBY PROMOTE failed, this is still a standby node.\n" : "%s: connection to node lost!\n"), progname); } else { log_notice(_("%s: STANDBY PROMOTE successful. You should REINDEX any hash indexes you have.\n"), progname); } PQfinish(conn); return; } static void do_standby_follow(void) { PGconn *conn; char script[MAXLEN]; char master_conninfo[MAXLEN]; PGconn *master_conn; int master_id; int r, retval; char data_dir[MAXLEN]; char master_version[MAXVERSIONSTR]; int master_version_num = 0; char standby_version[MAXVERSIONSTR]; int standby_version_num = 0; bool success; /* We need to connect to check configuration */ log_info(_("%s connecting to standby database\n"), progname); conn = establish_db_connection(options.conninfo, true); log_info(_("%s connected to standby, checking its state\n"), progname); /* Verify that standby is a supported server version */ standby_version_num = check_server_version(conn, "standby", true, standby_version); /* Check we are in a standby node */ retval = is_standby(conn); if (retval == 0 || retval == -1) { log_err(_(retval == 0 ? "%s: The command should be executed in a standby node\n" : "%s: connection to node lost!\n"), progname); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* * we also need to check if there is any master in the cluster or wait for * one to appear if we have set the wait option */ log_info(_("%s discovering new master...\n"), progname); do { if (!is_pgup(conn, options.master_response_timeout)) { conn = establish_db_connection(options.conninfo, true); } master_conn = get_master_connection(conn, options.cluster_name, &master_id, (char *) &master_conninfo); } while (master_conn == NULL && runtime_options.wait_for_master); if (master_conn == NULL) { log_err(_("There isn't a master to follow in this cluster\n")); PQfinish(conn); exit(ERR_BAD_CONFIG); } /* Check we are going to point to a master */ retval = is_standby(master_conn); if (retval) { log_err(_(retval == 1 ? "%s: The node to follow should be a master\n" : "%s: connection to node lost!\n"), progname); PQfinish(conn); PQfinish(master_conn); exit(ERR_BAD_CONFIG); } /* Verify that master is a supported server version */ log_info(_("%s connected to master, checking its state\n"), progname); master_version_num = check_server_version(conn, "master", false, master_version); if(master_version_num < 0) { PQfinish(conn); PQfinish(master_conn); exit(ERR_BAD_CONFIG); } /* master and standby version should match */ if ((master_version_num / 100) != (standby_version_num / 100)) { PQfinish(conn); PQfinish(master_conn); log_err(_("%s needs versions of both master (%s) and standby (%s) to match.\n"), progname, master_version, standby_version); exit(ERR_BAD_CONFIG); } /* * set the host and masterport variables with the master ones before * closing the connection because we will need them to recreate the * recovery.conf file */ strncpy(runtime_options.host, PQhost(master_conn), MAXLEN); strncpy(runtime_options.masterport, PQport(master_conn), MAXLEN); strncpy(runtime_options.username, PQuser(master_conn), MAXLEN); PQfinish(master_conn); log_info(_("%s Changing standby's master\n"), progname); /* Get the data directory full path */ success = get_pg_setting(conn, "data_directory", data_dir); PQfinish(conn); if (success == false) { log_err(_("Unable to determine data directory\n")); exit(ERR_BAD_CONFIG); } /* write the recovery.conf file */ if (!create_recovery_file(data_dir)) exit(ERR_BAD_CONFIG); /* Finally, restart the service */ maxlen_snprintf(script, "%s/pg_ctl %s -w -D %s -m fast restart", options.pg_bindir, options.pgctl_options, data_dir); log_notice(_("%s: restarting server using '%s'\n"), progname, script); r = system(script); if (r != 0) { log_err(_("Can't restart server\n")); exit(ERR_NO_RESTART); } return; } static void do_witness_create(void) { PGconn *masterconn; PGconn *witnessconn; PGresult *res; char sqlquery[QUERY_STR_LEN]; char script[MAXLEN]; char buf[MAXLEN]; FILE *pg_conf = NULL; int r = 0, retval; char master_hba_file[MAXLEN]; bool success; bool node_record_created; /* Connection parameters for master only */ keywords[0] = "host"; values[0] = runtime_options.host; keywords[1] = "port"; values[1] = runtime_options.masterport; /* We need to connect to check configuration and copy it */ masterconn = establish_db_connection_by_params(keywords, values, true); if (!masterconn) { log_err(_("%s: could not connect to master\n"), progname); exit(ERR_DB_CON); } /* Verify that master is a supported server version */ check_server_version(masterconn, "master", true, NULL); /* Check we are connecting to a primary node */ retval = is_standby(masterconn); if (retval) { log_err(_(retval == 1 ? "The command should not run on a standby node\n" : "Connection to node lost!\n")); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } log_info(_("Successfully connected to master.\n")); r = test_ssh_connection(runtime_options.host, runtime_options.remote_user); if (r != 0) { log_err(_("%s: Aborting, remote host %s is not reachable.\n"), progname, runtime_options.host); PQfinish(masterconn); exit(ERR_BAD_SSH); } /* Check this directory could be used as a PGDATA dir */ if (!create_pg_dir(runtime_options.dest_dir, runtime_options.force)) { log_err(_("witness create: couldn't create data directory (\"%s\") for witness"), runtime_options.dest_dir); exit(ERR_BAD_CONFIG); } /* * To create a witness server we need to: 1) initialize the cluster 2) * register the witness in repl_nodes 3) copy configuration from master */ /* Create the cluster for witness */ if (!runtime_options.superuser[0]) strncpy(runtime_options.superuser, "postgres", MAXLEN); sprintf(script, "%s/pg_ctl %s -D %s init -o \"%s-U %s\"", options.pg_bindir, options.pgctl_options, runtime_options.dest_dir, runtime_options.initdb_no_pwprompt ? "" : "-W ", runtime_options.superuser); log_info("Initialize cluster for witness: %s.\n", script); r = system(script); if (r != 0) { log_err("Can't initialize cluster for witness server\n"); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } /* * default port for the witness is 5499, but user can provide a different * one */ xsnprintf(buf, sizeof(buf), "%s/postgresql.conf", runtime_options.dest_dir); pg_conf = fopen(buf, "a"); if (pg_conf == NULL) { log_err(_("%s: could not open \"%s\" for adding extra config: %s\n"), progname, buf, strerror(errno)); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } xsnprintf(buf, sizeof(buf), "\n#Configuration added by %s\n", progname); fputs(buf, pg_conf); if (!runtime_options.localport[0]) strncpy(runtime_options.localport, "5499", MAXLEN); xsnprintf(buf, sizeof(buf), "port = %s\n", runtime_options.localport); fputs(buf, pg_conf); xsnprintf(buf, sizeof(buf), "shared_preload_libraries = 'repmgr_funcs'\n"); fputs(buf, pg_conf); xsnprintf(buf, sizeof(buf), "listen_addresses = '*'\n"); fputs(buf, pg_conf); fclose(pg_conf); /* start new instance */ sprintf(script, "%s/pg_ctl %s -w -D %s start", options.pg_bindir, options.pgctl_options, runtime_options.dest_dir); log_info(_("Start cluster for witness: %s"), script); r = system(script); if (r != 0) { log_err(_("Can't start cluster for witness server\n")); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } /* check if we need to create a user */ if (runtime_options.username[0] && runtime_options.localport[0] && strcmp(runtime_options.username,"postgres")!=0 ) { /* create required user needs to be superuser to create untrusted language function in c */ sprintf(script, "%s/createuser -p %s --superuser --login -U %s %s", options.pg_bindir, runtime_options.localport, runtime_options.superuser, runtime_options.username); log_info("Create user for witness db: %s.\n", script); r = system(script); if (r != 0) { log_err("Can't create user for witness server\n"); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } } /* check if we need to create a database */ if(runtime_options.dbname[0] && strcmp(runtime_options.dbname,"postgres")!=0 && runtime_options.localport[0]) { /* create required db */ sprintf(script, "%s/createdb -p %s -U %s --owner=%s %s", options.pg_bindir, runtime_options.localport, runtime_options.superuser, runtime_options.username, runtime_options.dbname); log_info("Create database for witness db: %s.\n", script); r = system(script); if (r != 0) { log_err("Can't create database for witness server\n"); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } } /* Get the pg_hba.conf full path */ success = get_pg_setting(masterconn, "hba_file", master_hba_file); if (success == false) { log_err(_("Can't get info about pg_hba.conf\n")); exit(ERR_DB_QUERY); } r = copy_remote_files(runtime_options.host, runtime_options.remote_user, master_hba_file, runtime_options.dest_dir); if (r != 0) { log_err(_("Can't rsync the pg_hba.conf file from master\n")); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } /* reload to adapt for changed pg_hba.conf */ sprintf(script, "%s/pg_ctl %s -w -D %s reload", options.pg_bindir, options.pgctl_options, runtime_options.dest_dir); log_info(_("Reload cluster config for witness: %s"), script); r = system(script); if (r != 0) { log_err(_("Can't reload cluster for witness server\n")); PQfinish(masterconn); exit(ERR_BAD_CONFIG); } /* register ourselves in the master */ node_record_created = create_node_record(masterconn, "witness create", options.node, options.cluster_name, options.node_name, options.conninfo, options.priority, true); if(node_record_created == false) { PQfinish(masterconn); exit(ERR_DB_QUERY); } /* establish a connection to the witness, and create the schema */ witnessconn = establish_db_connection(options.conninfo, true); log_info(_("Starting copy of configuration from master...\n")); if (!create_schema(witnessconn)) { PQfinish(masterconn); PQfinish(witnessconn); exit(ERR_BAD_CONFIG); } /* copy configuration from master, only repl_nodes is needed */ if (!copy_configuration(masterconn, witnessconn)) { PQfinish(masterconn); PQfinish(witnessconn); exit(ERR_BAD_CONFIG); } /* drop superuser powers if needed */ if (runtime_options.username[0] && runtime_options.localport[0] && strcmp(runtime_options.username,"postgres")!=0 ) { sqlquery_snprintf(sqlquery, "ALTER ROLE %s NOSUPERUSER", runtime_options.username); log_info("Drop superuser powers on user for witness db: %s.\n", sqlquery); log_debug(_("witness create: %s\n"), sqlquery); res = PQexec(witnessconn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Cannot alter user privileges, %s\n"), PQerrorMessage(witnessconn)); PQfinish(masterconn); PQfinish(witnessconn); exit(ERR_DB_QUERY); } } PQfinish(masterconn); PQfinish(witnessconn); log_notice(_("Configuration has been successfully copied to the witness\n")); } static void usage(void) { fprintf(stderr, _("\n\n%s: Replicator manager \n"), progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); } static void help(const char *progname) { printf(_("\n%s: Replicator manager \n"), progname); printf(_("Usage:\n")); printf(_(" %s [OPTIONS] master {register}\n"), progname); printf(_(" %s [OPTIONS] standby {register|clone|promote|follow}\n"), progname); printf(_(" %s [OPTIONS] cluster {show|cleanup}\n"), progname); printf(_("\nGeneral options:\n")); printf(_(" --help show this help, then exit\n")); printf(_(" --version output version information, then exit\n")); printf(_(" --verbose output verbose activity information\n")); printf(_("\nConnection options:\n")); printf(_(" -d, --dbname=DBNAME database to connect to\n")); printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); printf(_(" -p, --port=PORT database server port\n")); printf(_(" -U, --username=USERNAME database user name to connect as\n")); printf(_("\nConfiguration options:\n")); printf(_(" -D, --data-dir=DIR local directory where the files will be\n" \ " copied to\n")); printf(_(" -l, --local-port=PORT standby or witness server local port\n")); printf(_(" -f, --config-file=PATH path to the configuration file\n")); printf(_(" -R, --remote-user=USERNAME database server username for rsync\n")); printf(_(" -S, --superuser=USERNAME superuser username for witness database\n" \ " (default: postgres)\n")); printf(_(" -w, --wal-keep-segments=VALUE minimum value for the GUC\n" \ " wal_keep_segments (default: 5000)\n")); printf(_(" -k, --keep-history=VALUE keeps indicated number of days of\n" \ " history\n")); printf(_(" -F, --force force potentially dangerous operations\n" \ " to happen\n")); printf(_(" -W, --wait wait for a master to appear\n")); printf(_(" -r, --min-recovery-apply-delay=VALUE enable recovery time delay, value has to be a valid time atom (e.g. 5min)\n")); printf(_(" --initdb-no-pwprompt don't require superuser password when running initdb\n")); printf(_(" --check-master-config report problems with master configuration\n")); printf(_("\n%s performs some tasks like clone a node, promote it or making follow\n"), progname); printf(_("another node and then exits.\n\n")); printf(_("COMMANDS:\n")); printf(_(" master register - registers the master in a cluster\n")); printf(_(" standby register - registers a standby in a cluster\n")); printf(_(" standby clone [node] - allows creation of a new standby\n")); printf(_(" standby promote - allows manual promotion of a specific standby into\n" \ " a new master in the event of a failover\n")); printf(_(" standby follow - allows the standby to re-point itself to a new\n" \ " master\n")); printf(_(" cluster show - print node information\n")); printf(_(" cluster cleanup - cleans monitor's history\n")); } /* * Creates a recovery file for a standby. */ static bool create_recovery_file(const char *data_dir) { FILE *recovery_file; char recovery_file_path[MAXLEN]; char line[MAXLEN]; maxlen_snprintf(recovery_file_path, "%s/%s", data_dir, RECOVERY_FILE); recovery_file = fopen(recovery_file_path, "w"); if (recovery_file == NULL) { log_err(_("Unable to create recovery.conf file at '%s'\n"), recovery_file_path); return false; } /* standby_mode = 'on' */ maxlen_snprintf(line, "standby_mode = 'on'\n"); if(write_recovery_file_line(recovery_file, recovery_file_path, line) == false) return false; /* primary_conninfo = '...' */ write_primary_conninfo(line); if(write_recovery_file_line(recovery_file, recovery_file_path, line) == false) return false; /* recovery_target_timeline = 'latest' */ maxlen_snprintf(line, "recovery_target_timeline = 'latest'\n"); if(write_recovery_file_line(recovery_file, recovery_file_path, line) == false) return false; /* min_recovery_apply_delay = ... (optional) */ if(*runtime_options.min_recovery_apply_delay) { maxlen_snprintf(line, "\nmin_recovery_apply_delay = %s\n", runtime_options.min_recovery_apply_delay); if(write_recovery_file_line(recovery_file, recovery_file_path, line) == false) return false; } fclose(recovery_file); return true; } static bool write_recovery_file_line(FILE *recovery_file, char *recovery_file_path, char *line) { if (fputs(line, recovery_file) == EOF) { log_err(_("Unable to write to recovery file at '%s'\n"), recovery_file_path); fclose(recovery_file); return false; } return true; } static int test_ssh_connection(char *host, char *remote_user) { char script[MAXLEN]; int r = 1, i; /* On some OS, true is located in a different place than in Linux * we have to try them all until all alternatives are gone or we * found `true' because the target OS may differ from the source * OS */ const char *truebin_paths[] = { "/bin/true", "/usr/bin/true", NULL }; /* Check if we have ssh connectivity to host before trying to rsync */ for(i = 0; truebin_paths[i] && r != 0; ++i) { if (!remote_user[0]) maxlen_snprintf(script, "ssh -o Batchmode=yes %s %s %s", options.ssh_options, host, truebin_paths[i]); else maxlen_snprintf(script, "ssh -o Batchmode=yes %s %s -l %s %s", options.ssh_options, host, remote_user, truebin_paths[i]); log_debug(_("command is: %s\n"), script); r = system(script); } if (r != 0) log_info(_("Can not connect to the remote host (%s)\n"), host); return r; } static int copy_remote_files(char *host, char *remote_user, char *remote_path, char *local_path) { char script[MAXLEN]; char rsync_flags[MAXLEN]; char host_string[MAXLEN]; int r; if (*options.rsync_options == '\0') maxlen_snprintf( rsync_flags, "%s", "--archive --checksum --compress --progress --rsh=ssh"); else maxlen_snprintf(rsync_flags, "%s", options.rsync_options); if (runtime_options.force) strcat(rsync_flags, " --delete"); if (!remote_user[0]) { maxlen_snprintf(host_string, "%s", host); } else { maxlen_snprintf(host_string, "%s@%s", remote_user, host); } maxlen_snprintf(script, "rsync %s %s:%s %s", rsync_flags, host_string, remote_path, local_path); log_info(_("rsync command line: '%s'\n"), script); r = system(script); if (r != 0) log_err(_("Can't rsync from remote file (%s:%s)\n"), host_string, remote_path); return r; } static int run_basebackup() { char script[MAXLEN]; int r = 0; maxlen_snprintf(script, "%s/pg_basebackup -h %s -p %s -U %s -D %s -l \"repmgr base backup\"", options.pg_bindir, runtime_options.host, runtime_options.masterport, runtime_options.username, runtime_options.dest_dir ); log_info(_("Executing: '%s'\n"), script); r = system(script); /* * As of 9.4, pg_basebackup et al only ever return 0 or 1 */ log_debug(_("r = %i, %i\n"), r, WEXITSTATUS(r)); return r; } /* * Tries to avoid useless or conflicting parameters */ static bool check_parameters_for_action(const int action) { bool ok = true; switch (action) { case MASTER_REGISTER: /* * To register a master we only need the repmgr.conf all other * parameters are at least useless and could be confusing so * reject them */ if (runtime_options.host[0] || runtime_options.masterport[0] || runtime_options.username[0] || runtime_options.dbname[0]) { log_err(_("You can't use connection parameters to the master when issuing a MASTER REGISTER command.\n")); usage(); ok = false; } if (runtime_options.dest_dir[0]) { log_err(_("You don't need a destination directory for MASTER REGISTER command\n")); usage(); ok = false; } break; case STANDBY_REGISTER: /* * To register a standby we only need the repmgr.conf we don't * need connection parameters to the master because we can detect * the master in repl_nodes */ if (runtime_options.host[0] || runtime_options.masterport[0] || runtime_options.username[0] || runtime_options.dbname[0]) { log_err(_("You can't use connection parameters to the master when issuing a STANDBY REGISTER command.\n")); usage(); ok = false; } if (runtime_options.dest_dir[0]) { log_err(_("You don't need a destination directory for STANDBY REGISTER command\n")); usage(); ok = false; } break; case STANDBY_PROMOTE: /* * To promote a standby we only need the repmgr.conf we don't want * connection parameters to the master because we will try to * detect the master in repl_nodes if we can't find it then the * promote action will be cancelled */ if (runtime_options.host[0] || runtime_options.masterport[0] || runtime_options.username[0] || runtime_options.dbname[0]) { log_err(_("You can't use connection parameters to the master when issuing a STANDBY PROMOTE command.\n")); usage(); ok = false; } if (runtime_options.dest_dir[0]) { log_err(_("You don't need a destination directory for STANDBY PROMOTE command\n")); usage(); ok = false; } break; case STANDBY_FOLLOW: /* * To make a standby follow a master we only need the repmgr.conf * we don't want connection parameters to the new master because * we will try to detect the master in repl_nodes if we can't find * it then the follow action will be cancelled */ if (runtime_options.host[0] || runtime_options.masterport[0] || runtime_options.username[0] || runtime_options.dbname[0]) { log_err(_("You can't use connection parameters to the master when issuing a STANDBY FOLLOW command.\n")); usage(); ok = false; } if (runtime_options.dest_dir[0]) { log_err(_("You don't need a destination directory for STANDBY FOLLOW command\n")); usage(); ok = false; } break; case STANDBY_CLONE: /* * Issue a friendly notice that the configuration file is not * necessary nor read at all in when performing a STANDBY CLONE * action. */ if (runtime_options.config_file[0]) { log_notice(_("Only command line parameters for the connection " "to the master are used when issuing a STANDBY CLONE command. " "The passed configuration file is neither required nor used for " "its node configuration portions\n\n")); } /* * To clone a master into a standby we need connection parameters * repmgr.conf is useless because we don't have a server running * in the standby; warn the user, but keep going. */ if (runtime_options.host == NULL) { log_notice(_("You need to use connection parameters to " "the master when issuing a STANDBY CLONE command.")); ok = false; } need_a_node = false; break; case WITNESS_CREATE: /* allow all parameters to be supplied */ break; case CLUSTER_SHOW: /* allow all parameters to be supplied */ break; case CLUSTER_CLEANUP: /* allow all parameters to be supplied */ break; } return ok; } static bool create_schema(PGconn *conn) { char sqlquery[QUERY_STR_LEN]; PGresult *res; sqlquery_snprintf(sqlquery, "CREATE SCHEMA %s", get_repmgr_schema_quoted(conn)); log_debug(_("master register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Cannot create the schema %s: %s\n"), get_repmgr_schema(), PQerrorMessage(conn)); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); /* * to avoid confusion of the time_lag field and provide a consistent UI we * use these functions for providing the latest update timestamp */ sqlquery_snprintf(sqlquery, "CREATE FUNCTION %s.repmgr_update_last_updated() " " RETURNS TIMESTAMP WITH TIME ZONE " " AS '$libdir/repmgr_funcs', 'repmgr_update_last_updated' " " LANGUAGE C STRICT ", get_repmgr_schema_quoted(conn)); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, "Cannot create the function repmgr_update_last_updated: %s\n", PQerrorMessage(conn)); return false; } PQclear(res); sqlquery_snprintf(sqlquery, "CREATE FUNCTION %s.repmgr_get_last_updated() " " RETURNS TIMESTAMP WITH TIME ZONE " " AS '$libdir/repmgr_funcs', 'repmgr_get_last_updated' " " LANGUAGE C STRICT ", get_repmgr_schema_quoted(conn)); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, "Cannot create the function repmgr_get_last_updated: %s\n", PQerrorMessage(conn)); return false; } PQclear(res); /* ... the tables */ sqlquery_snprintf(sqlquery, "CREATE TABLE %s.repl_nodes ( " " id INTEGER PRIMARY KEY, " " cluster TEXT NOT NULL, " " name TEXT NOT NULL, " " conninfo TEXT NOT NULL, " " priority INTEGER NOT NULL, " " witness BOOLEAN NOT NULL DEFAULT FALSE) ", get_repmgr_schema_quoted(conn)); log_debug(_("master register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Cannot create the table %s.repl_nodes: %s\n"), get_repmgr_schema_quoted(conn), PQerrorMessage(conn)); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); sqlquery_snprintf(sqlquery, "CREATE TABLE %s.repl_monitor ( " " primary_node INTEGER NOT NULL, " " standby_node INTEGER NOT NULL, " " last_monitor_time TIMESTAMP WITH TIME ZONE NOT NULL, " " last_apply_time TIMESTAMP WITH TIME ZONE, " " last_wal_primary_location TEXT NOT NULL, " " last_wal_standby_location TEXT, " " replication_lag BIGINT NOT NULL, " " apply_lag BIGINT NOT NULL) ", get_repmgr_schema_quoted(conn)); log_debug(_("master register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Cannot create the table %s.repl_monitor: %s\n"), get_repmgr_schema_quoted(conn), PQerrorMessage(conn)); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); /* a view */ sqlquery_snprintf(sqlquery, "CREATE VIEW %s.repl_status AS " " SELECT primary_node, standby_node, name AS standby_name, last_monitor_time, " " last_wal_primary_location, last_wal_standby_location, " " pg_size_pretty(replication_lag) replication_lag, " " age(now(), last_apply_time) AS replication_time_lag, " " pg_size_pretty(apply_lag) apply_lag, " " age(now(), CASE WHEN pg_is_in_recovery() THEN %s.repmgr_get_last_updated() ELSE last_monitor_time END) AS communication_time_lag " " FROM %s.repl_monitor " " JOIN %s.repl_nodes ON standby_node = id " " WHERE (standby_node, last_monitor_time) IN ( " " SELECT standby_node, MAX(last_monitor_time) " " FROM %s.repl_monitor GROUP BY 1 " " )", get_repmgr_schema_quoted(conn), get_repmgr_schema_quoted(conn), get_repmgr_schema_quoted(conn), get_repmgr_schema_quoted(conn), get_repmgr_schema_quoted(conn)); log_debug(_("master register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Cannot create the view %s.repl_status: %s\n"), get_repmgr_schema_quoted(conn), PQerrorMessage(conn)); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); /* an index to improve performance of the view */ sqlquery_snprintf(sqlquery, "CREATE INDEX idx_repl_status_sort " " ON %s.repl_monitor (last_monitor_time, standby_node) ", get_repmgr_schema_quoted(conn)); log_debug(_("master register: %s\n"), sqlquery); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_err(_("Can't index table %s.repl_monitor: %s\n"), get_repmgr_schema_quoted(conn), PQerrorMessage(conn)); PQfinish(conn); exit(ERR_BAD_CONFIG); } PQclear(res); /* * XXX Here we MUST try to load the repmgr_function.sql not hardcode it * here */ sqlquery_snprintf(sqlquery, "CREATE OR REPLACE FUNCTION %s.repmgr_update_standby_location(text) " " RETURNS boolean " " AS '$libdir/repmgr_funcs', 'repmgr_update_standby_location' " " LANGUAGE C STRICT ", get_repmgr_schema_quoted(conn)); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, "Cannot create the function repmgr_update_standby_location: %s\n", PQerrorMessage(conn)); return false; } PQclear(res); sqlquery_snprintf(sqlquery, "CREATE OR REPLACE FUNCTION %s.repmgr_get_last_standby_location() " " RETURNS text " " AS '$libdir/repmgr_funcs', 'repmgr_get_last_standby_location' " " LANGUAGE C STRICT ", get_repmgr_schema_quoted(conn)); res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, "Cannot create the function repmgr_get_last_standby_location: %s\n", PQerrorMessage(conn)); return false; } PQclear(res); return true; } /* * copy_configuration() * * Copy records in master's `repl_nodes` table to witness database */ static bool copy_configuration(PGconn *masterconn, PGconn *witnessconn) { char sqlquery[MAXLEN]; PGresult *res; int i; sqlquery_snprintf(sqlquery, "TRUNCATE TABLE %s.repl_nodes", get_repmgr_schema_quoted(witnessconn)); log_debug("copy_configuration: %s\n", sqlquery); res = PQexec(witnessconn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, "Cannot clean node details in the witness, %s\n", PQerrorMessage(witnessconn)); return false; } sqlquery_snprintf(sqlquery, "SELECT id, name, conninfo, priority, witness FROM %s.repl_nodes", get_repmgr_schema_quoted(masterconn)); res = PQexec(masterconn, sqlquery); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "Can't get configuration from master: %s\n", PQerrorMessage(masterconn)); PQclear(res); return false; } for (i = 0; i < PQntuples(res); i++) { bool node_record_created; char *witness = PQgetvalue(res, i, 4); log_debug(_("copy_configuration(): %s\n"), witness); node_record_created = create_node_record(witnessconn, "copy_configuration", atoi(PQgetvalue(res, i, 0)), options.cluster_name, PQgetvalue(res, i, 1), PQgetvalue(res, i, 2), atoi(PQgetvalue(res, i, 3)), strcmp(witness, "t") == 0 ? true : false); if (node_record_created == false) { fprintf(stderr, "Unable to create node record for witness: %s\n", PQerrorMessage(witnessconn)); return false; } } PQclear(res); return true; } /* This function uses global variables to determine connection settings. Special * usage of the PGPASSWORD variable is handled, but strongly discouraged */ static void write_primary_conninfo(char *line) { char host_buf[MAXLEN] = ""; char conn_buf[MAXLEN] = ""; char user_buf[MAXLEN] = ""; char appname_buf[MAXLEN] = ""; char password_buf[MAXLEN] = ""; /* Environment variable for password (UGLY, please use .pgpass!) */ const char *password = getenv("PGPASSWORD"); if (password != NULL) { maxlen_snprintf(password_buf, " password=%s", password); } else if (require_password) { log_err(_("%s: PGPASSWORD not set, but having one is required\n"), progname); exit(ERR_BAD_PASSWORD); } if (runtime_options.host[0]) { maxlen_snprintf(host_buf, " host=%s", runtime_options.host); } if (runtime_options.username[0]) { maxlen_snprintf(user_buf, " user=%s", runtime_options.username); } if (options.node_name[0]) { maxlen_snprintf(appname_buf, " application_name=%s", options.node_name); } maxlen_snprintf(conn_buf, "port=%s%s%s%s%s", (runtime_options.masterport[0]) ? runtime_options.masterport : "5432", host_buf, user_buf, password_buf, appname_buf); maxlen_snprintf(line, "primary_conninfo = '%s'\n", conn_buf); } /** * check_server_version() * * Verify that the server is MIN_SUPPORTED_VERSION_NUM or later * * PGconn *conn: * the connection to check * * char *server_type: * either "master" or "standby"; used to format error message * * bool exit_on_error: * exit if reported server version is too low; optional to enable some callers * to perform additional cleanup * * char *server_version_string * passed to get_server_version(), which will place the human-readble * server version string there (e.g. "9.4.0") */ static int check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *server_version_string) { int server_version_num = 0; server_version_num = get_server_version(conn, server_version_string); if(server_version_num < MIN_SUPPORTED_VERSION_NUM) { if (server_version_num > 0) log_err(_("%s needs %s to be PostgreSQL %s or better\n"), progname, server_type, MIN_SUPPORTED_VERSION ); if(exit_on_error == true) { PQfinish(conn); exit(ERR_BAD_CONFIG); } return -1; } return server_version_num; } /* * check_master_config() * * Perform sanity check on master configuration * * TODO: check replication connection is possble */ static bool check_master_config(PGconn *conn, bool exit_on_error) { int i, is_standby_retval; bool config_ok = true; /* Check we are cloning a primary node */ is_standby_retval = is_standby(conn); if (is_standby_retval) { log_err(_(is_standby_retval == 1 ? "The specified node is in standby state and cannot be used as a master\n" : "Connection to node lost!\n")); if(exit_on_error == true) { PQfinish(conn); exit(ERR_BAD_CONFIG); } config_ok = false; } /* XXX check user is qualified to perform base backup */ /* And check if it is well configured */ i = guc_set(conn, "wal_level", "=", "hot_standby"); if (i == 0 || i == -1) { if (i == 0) log_err(_("%s needs parameter 'wal_level' to be set to 'hot_standby'\n"), progname); if(exit_on_error == true) { PQfinish(conn); exit(ERR_BAD_CONFIG); } config_ok = false; } /* XXX we may need to modify this */ i = guc_set_typed(conn, "wal_keep_segments", ">=", runtime_options.wal_keep_segments, "integer"); if (i == 0 || i == -1) { if (i == 0) log_err(_("%s needs parameter 'wal_keep_segments' to be set to %s or greater (see the '-w' option or edit the postgresql.conf of the PostgreSQL master.)\n"), progname, runtime_options.wal_keep_segments); if(exit_on_error == true) { PQfinish(conn); exit(ERR_BAD_CONFIG); } config_ok = false; } i = guc_set(conn, "archive_mode", "=", "on"); if (i == 0 || i == -1) { if (i == 0) log_err(_("%s needs parameter 'archive_mode' to be set to 'on'\n"), progname); if(exit_on_error == true) { PQfinish(conn); exit(ERR_BAD_CONFIG); } config_ok = false; } i = guc_set(conn, "hot_standby", "=", "on"); if (i == 0 || i == -1) { if (i == 0) log_err(_("%s needs parameter 'hot_standby' to be set to 'on'\n"), progname); if(exit_on_error == true) { PQfinish(conn); exit(ERR_BAD_CONFIG); } config_ok = false; } i = guc_set_typed(conn, "max_wal_senders", ">", "0", "integer"); if (i == 0 || i == -1) { if (i == 0) log_err(_("%s needs parameter 'max_wal_senders' to be set to be at least 1\n"), progname); if(exit_on_error == true) { PQfinish(conn); exit(ERR_BAD_CONFIG); } config_ok = false; } return config_ok; } static void do_check_master_config(void) { PGconn *conn; bool config_ok; /* Connection parameters for master only */ keywords[0] = "host"; values[0] = runtime_options.host; keywords[1] = "port"; values[1] = runtime_options.masterport; keywords[2] = "dbname"; values[2] = runtime_options.dbname; /* We need to connect to check configuration and start a backup */ log_info(_("%s connecting to master database\n"), progname); conn = establish_db_connection_by_params(keywords, values, true); /* Verify that master is a supported server version */ log_info(_("%s connected to master, checking its state\n"), progname); check_server_version(conn, "master", false, NULL); config_ok = check_master_config(conn, false); if(config_ok == true) { puts(_("No master configuration problems found")); } PQfinish(conn); } static bool create_node_record(PGconn *conn, char *action, int node, char *cluster_name, char *node_name, char *conninfo, int priority, bool witness) { char sqlquery[QUERY_STR_LEN]; PGresult *res; sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes " " (id, cluster, name, conninfo, priority, witness) " "VALUES (%d, '%s', '%s', '%s', %d, %s) ", get_repmgr_schema_quoted(conn), node, cluster_name, node_name, conninfo, priority, witness == true ? "TRUE" : "FALSE"); if(action != NULL) { log_debug(_("%s: %s\n"), action, sqlquery); } res = PQexec(conn, sqlquery); if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) { log_warning(_("Cannot insert node details, %s\n"), PQerrorMessage(conn)); return false; } PQclear(res); return true; }