From 8f6058c676350969cfb988006ab1e696169dc6f6 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Wed, 4 Mar 2020 11:35:52 +0900 Subject: [PATCH] standby switchover: check replication configuration file ownership Within a PostgreSQL data directory, all files should have the same ownership as the data directory itself. PostgreSQL itself expects this, and ownership of files by another user is likely to cause problems. In PostgreSQL 11 or earlier, if "recovery.conf" cannot be moved by PostgreSQL (because e.g. it is owned by root), it will not be possible to promote the standby to primary. In PostgreSQL 12 and later, if "postgresql.auto.conf" on the demotion candidate (current primary) has incorrect ownership (e.g. owned by root), repmgr will very likely not be able to modify this file and write the replication configuration required for the node to rejoin the cluster as a standby. Checks added to catch both cases before a switchover is executed. --- repmgr-action-node.c | 48 ++++++++++++- repmgr-action-standby.c | 147 ++++++++++++++++++++++++++++++++++++++++ repmgr-client-global.h | 4 +- repmgr-client.c | 89 ++++++++++++++++++++++++ repmgr-client.h | 2 + 5 files changed, 287 insertions(+), 3 deletions(-) diff --git a/repmgr-action-node.c b/repmgr-action-node.c index c640c999..5e399f4a 100644 --- a/repmgr-action-node.c +++ b/repmgr-action-node.c @@ -49,7 +49,7 @@ static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); static CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); - +static CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output); /* * NODE STATUS * @@ -803,6 +803,16 @@ do_node_check(void) exit(return_code); } + if (runtime_options.replication_config_owner == true) + { + return_code = do_node_check_replication_config_owner(conn, + runtime_options.output_mode, + &node_info, + NULL); + PQfinish(conn); + exit(return_code); + } + if (runtime_options.output_mode == OM_NAGIOS) { @@ -1799,11 +1809,11 @@ do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_in } initPQExpBuffer(&details); + /* * Check actual data directory matches that in repmgr.conf; note this requires * a superuser connection */ - if (connection_has_pg_settings(conn) == true) { /* we expect to have a database connection */ @@ -1913,6 +1923,40 @@ do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_in return status; } +/* + * This is not included in the general list output + */ +static +CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output) +{ + CheckStatus status = CHECK_STATUS_OK; + + PQExpBufferData errmsg; + PQExpBufferData details; + + if (mode != OM_OPTFORMAT) + { + log_error(_("--replication-config-owner option can only be used with --optformat")); + PQfinish(conn); + exit(ERR_BAD_CONFIG); + } + + initPQExpBuffer(&errmsg); + initPQExpBuffer(&details); + + if (check_replication_config_owner(PQserverVersion(conn), + config_file_options.data_directory, + &errmsg, &details) == false) + { + status = CHECK_STATUS_CRITICAL; + } + + printf("--replication-config-owner=%s\n", + output_check_status(status)); + + return status; +} + void do_node_service(void) diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c index ae954c14..de772a31 100644 --- a/repmgr-action-standby.c +++ b/repmgr-action-standby.c @@ -133,6 +133,8 @@ static NodeStatus parse_node_status_is_shutdown_cleanly(const char *node_status_ static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold); static ConnectionStatus parse_remote_node_replication_connection(const char *node_check_output); static bool parse_data_directory_config(const char *node_check_output); +static bool parse_replication_config_owner(const char *node_check_output); + /* * STANDBY CLONE @@ -3434,6 +3436,8 @@ do_standby_switchover(void) PQExpBufferData remote_command_str; PQExpBufferData command_output; PQExpBufferData node_rejoin_options; + PQExpBufferData errmsg; + PQExpBufferData detailmsg; int r, i; @@ -3526,6 +3530,39 @@ do_standby_switchover(void) exit(ERR_SWITCHOVER_FAIL); } + /* + * Check that the local replication configuration file is owned by the data + * directory owner. + * + * For PostgreSQL 11 and earlier, if PostgreSQL is not able rename "recovery.conf", + * promotion will fail. + * + * For PostgreSQL 12 and later, promotion will not fail even if "postgresql.auto.conf" + * is owned by another user, but we'll check just in case, as it is indicative of a + * poorly configured setup. In any case we will need to check "postgresql.auto.conf" on + * the demotion candidate as the rejoin will fail if we are unable to to write to that. + */ + + initPQExpBuffer(&errmsg); + initPQExpBuffer(&detailmsg); + + if (check_replication_config_owner(PQserverVersion(local_conn), + config_file_options.data_directory, + &errmsg, &detailmsg) == false) + { + log_error("%s", errmsg.data); + log_detail("%s", detailmsg.data); + + termPQExpBuffer(&errmsg); + termPQExpBuffer(&detailmsg); + + PQfinish(local_conn); + exit(ERR_BAD_CONFIG); + } + + termPQExpBuffer(&errmsg); + termPQExpBuffer(&detailmsg); + /* check remote server connection and retrieve its record */ remote_conn = get_primary_connection(local_conn, &remote_node_id, remote_conninfo); @@ -3897,6 +3934,63 @@ do_standby_switchover(void) remote_host); } + /* + * For PostgreSQL 12 and later, check "postgresql.auto.conf" is owned by the + * correct user, otherwise the node will probably not be able to attach to + * the promotion candidate (and is a sign of bad configuration anyway) so we + * will complain vocally. + */ + + if (PQserverVersion(local_conn) >= 120000) + { + initPQExpBuffer(&remote_command_str); + make_remote_repmgr_path(&remote_command_str, &remote_node_record); + + appendPQExpBufferStr(&remote_command_str, "node check --replication-config-owner --optformat -LINFO 2>/dev/null"); + + initPQExpBuffer(&command_output); + command_success = remote_command(remote_host, + runtime_options.remote_user, + remote_command_str.data, + config_file_options.ssh_options, + &command_output); + + termPQExpBuffer(&remote_command_str); + + if (command_success == false) + { + log_error(_("unable to execute \"%s node check --replication-config-owner\" on \"%s\":"), + progname(), remote_host); + log_detail("%s", command_output.data); + + PQfinish(remote_conn); + PQfinish(local_conn); + + termPQExpBuffer(&command_output); + + exit(ERR_BAD_CONFIG); + } + + if (parse_replication_config_owner(command_output.data) == false) + { + log_error(_("\"%s\" file on \"%s\" has incorrect ownership"), + PG_AUTOCONF_FILENAME, + remote_node_record.node_name); + + log_hint(_("check the file has the same owner/group as the data directory")); + + PQfinish(remote_conn); + PQfinish(local_conn); + + termPQExpBuffer(&command_output); + + exit(ERR_BAD_CONFIG); + } + + termPQExpBuffer(&command_output); + } + + /* * populate local node record with current state of various replication-related * values, so we can check for sufficient walsenders and replication slots @@ -8145,6 +8239,59 @@ parse_data_directory_config(const char *node_check_output) } +static bool +parse_replication_config_owner(const char *node_check_output) +{ + bool config_ok = true; + + int c = 0, + argc_item = 0; + char **argv_array = NULL; + int optindex = 0; + + /* We're only interested in this option */ + struct option node_check_options[] = + { + {"replication-config-owner", required_argument, NULL, 'C'}, + {NULL, 0, NULL, 0} + }; + + /* Don't attempt to tokenise an empty string */ + if (!strlen(node_check_output)) + { + return false; + } + + argc_item = parse_output_to_argv(node_check_output, &argv_array); + + /* Reset getopt's optind variable */ + optind = 0; + + /* Prevent getopt from emitting errors */ + opterr = 0; + + while ((c = getopt_long(argc_item, argv_array, "C:", node_check_options, + &optindex)) != -1) + { + switch (c) + { + /* --configured-data-directory */ + case 'C': + { + /* we only care whether it's "OK" or not */ + if (strncmp(optarg, "OK", 2) != 0) + config_ok = false; + } + break; + } + } + + free_parsed_argv(&argv_array); + + return config_ok; +} + + void do_standby_help(void) { diff --git a/repmgr-client-global.h b/repmgr-client-global.h index b6f2fc88..a10e7da2 100644 --- a/repmgr-client-global.h +++ b/repmgr-client-global.h @@ -118,6 +118,7 @@ typedef struct bool has_passfile; bool replication_connection; bool data_directory_config; + bool replication_config_owner; /* "node rejoin" options */ char config_files[MAXLEN]; @@ -170,7 +171,7 @@ typedef struct /* "node status" options */ \ false, \ /* "node check" options */ \ - false, false, false, false, false, false, false, false, false, \ + false, false, false, false, false, false, false, false, false, false, \ /* "node rejoin" options */ \ "", \ /* "node service" options */ \ @@ -281,6 +282,7 @@ extern bool drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slo extern standy_join_status check_standby_join(PGconn *primary_conn, t_node_info *primary_node_record, t_node_info *standby_node_record); extern bool check_replication_slots_available(int node_id, PGconn* conn); extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin); +extern bool check_replication_config_owner(int pg_version, const char *data_directory, PQExpBufferData *error_msg, PQExpBufferData *detail_msg); extern void check_shared_library(PGconn *conn); extern bool is_repmgrd_running(PGconn *conn); diff --git a/repmgr-client.c b/repmgr-client.c index 17ea8c4a..470ba165 100644 --- a/repmgr-client.c +++ b/repmgr-client.c @@ -533,6 +533,10 @@ main(int argc, char **argv) runtime_options.data_directory_config = true; break; + case OPT_REPLICATION_CONFIG_OWNER: + runtime_options.replication_config_owner = true; + break; + /*-------------------- * "node rejoin" options *-------------------- @@ -4205,6 +4209,91 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo } +/* + * Check that the replication configuration file is owned by the user who + * owns the data directory. + */ +extern bool +check_replication_config_owner(int pg_version, const char *data_directory, PQExpBufferData *error_msg, PQExpBufferData *detail_msg) +{ + PQExpBufferData replication_config_file; + struct stat dirstat; + struct stat confstat; + + if (stat(data_directory, &dirstat)) + { + if (error_msg != NULL) + { + appendPQExpBuffer(error_msg, + "unable to check ownership of data directory \"%s\"", + data_directory); + appendPQExpBufferStr(detail_msg, + strerror(errno)); + } + return false; + } + + initPQExpBuffer(&replication_config_file); + + appendPQExpBuffer(&replication_config_file, + "%s/%s", + config_file_options.data_directory, + pg_version >= 120000 ? PG_AUTOCONF_FILENAME : RECOVERY_COMMAND_FILE); + + stat(replication_config_file.data, &confstat); + + if (confstat.st_uid == dirstat.st_uid) + { + termPQExpBuffer(&replication_config_file); + return true; + } + + if (error_msg != NULL) + { + char conf_owner[MAXLEN]; + char dir_owner[MAXLEN]; + struct passwd *pw; + + pw = getpwuid(confstat.st_uid); + if (!pw) + { + maxlen_snprintf(conf_owner, + "(unknown user %i)", + confstat.st_uid); + } + else + { + strncpy(conf_owner, pw->pw_name, MAXLEN); + } + + pw = getpwuid(dirstat.st_uid); + + if (!pw) + { + maxlen_snprintf(conf_owner, + "(unknown user %i)", + dirstat.st_uid); + } + else + { + strncpy(dir_owner, pw->pw_name, MAXLEN); + } + + appendPQExpBuffer(error_msg, + "ownership error for file \"%s\"", + replication_config_file.data); + appendPQExpBuffer(detail_msg, + "file owner is \"%s\", data directory owner is \"%s\"", + conf_owner, + dir_owner); + } + + termPQExpBuffer(&replication_config_file); + + return false; +} + + /* * Simple check to see if "shared_preload_libraries" includes "repmgr". * Parsing "shared_preload_libraries" is non-trivial, as it's potentially diff --git a/repmgr-client.h b/repmgr-client.h index 95abb55a..8131e0c1 100644 --- a/repmgr-client.h +++ b/repmgr-client.h @@ -98,6 +98,7 @@ #define OPT_ENABLE_WAL_RECEIVER 1045 #define OPT_DETAIL 1046 #define OPT_REPMGRD_FORCE_UNPAUSE 1047 +#define OPT_REPLICATION_CONFIG_OWNER 1048 /* deprecated since 4.0 */ #define OPT_CHECK_UPSTREAM_CONFIG 999 @@ -185,6 +186,7 @@ static struct option long_options[] = {"has-passfile", no_argument, NULL, OPT_HAS_PASSFILE}, {"replication-connection", no_argument, NULL, OPT_REPL_CONN}, {"data-directory-config", no_argument, NULL, OPT_DATA_DIRECTORY_CONFIG}, + {"replication-config-owner", no_argument, NULL, OPT_REPLICATION_CONFIG_OWNER}, /* "node rejoin" options */ {"config-files", required_argument, NULL, OPT_CONFIG_FILES},