From 51da33a260afa60195fa5cd5c4a045b627b0da69 Mon Sep 17 00:00:00 2001
From: Ian Barwick
Date: Wed, 10 May 2017 09:24:51 +0900
Subject: [PATCH] Initial implementation of "standby promote"

---
Review notes (this section is ignored by git-am):
 * line breaks and indentation of this copy were reconstructed; whitespace
   in context lines is a best guess, so apply with
   "git apply --ignore-whitespace" if the context does not match
 * update_node_record_set_master(): both UPDATEs now target repmgr.nodes,
   the promoted node is also flagged active, and the first query buffer is
   freed before the early error return
 * do_standby_promote(): the connection and the details buffer are now
   released on the remaining exit paths
 * the "index" lines are unchanged and no longer describe the post-images

 dbutils.c               |  84 ++++++++++++++++++-
 dbutils.h               |   2 +
 errcode.h               |   4 +-
 repmgr-action-standby.c | 184 ++++++++++++++++++++++++++++++++++++++-
 4 files changed, 266 insertions(+), 8 deletions(-)

diff --git a/dbutils.c b/dbutils.c
index 0cdcc6df..428e6709 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -81,7 +81,7 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
 		if (exit_on_error)
 		{
 			PQfinish(conn);
-			exit(ERR_DB_CON);
+			exit(ERR_DB_CONN);
 		}
 	}
 
@@ -96,7 +96,7 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
 		if (exit_on_error)
 		{
 			PQfinish(conn);
-			exit(ERR_DB_CON);
+			exit(ERR_DB_CONN);
 		}
 	}
 
@@ -172,7 +172,7 @@ establish_db_connection_by_params(const char *keywords[], const char *values[],
 		if (exit_on_error)
 		{
 			PQfinish(conn);
-			exit(ERR_DB_CON);
+			exit(ERR_DB_CONN);
 		}
 	}
 	else
@@ -193,7 +193,7 @@ establish_db_connection_by_params(const char *keywords[], const char *values[],
 			if (exit_on_error)
 			{
 				PQfinish(conn);
-				exit(ERR_DB_CON);
+				exit(ERR_DB_CONN);
 			}
 		}
 	}
@@ -1360,6 +1360,82 @@ _create_update_node_record(PGconn *conn, char *action, t_node_info *node_info)
 	return true;
 }
 
+/*
+ * update_node_record_set_master()
+ *
+ * In a single transaction, mark every active record of type "master" as
+ * inactive, then set node "this_node_id" to be the active master with no
+ * upstream node.
+ *
+ * Returns false after rolling back if either UPDATE fails; otherwise returns
+ * the result of commit_transaction().
+ */
+bool
+update_node_record_set_master(PGconn *conn, int this_node_id)
+{
+	PQExpBufferData query;
+	PGresult   *res;
+
+	log_debug(_("setting node %i as master and marking existing master as failed"),
+			  this_node_id);
+
+	begin_transaction(conn);
+
+	initPQExpBuffer(&query);
+
+	appendPQExpBuffer(&query,
+					  " UPDATE repmgr.nodes "
+					  " SET active = FALSE "
+					  " WHERE type = 'master' "
+					  " AND active IS TRUE ");
+
+	res = PQexec(conn, query.data);
+
+	/* the query text is no longer needed; free it before any early return */
+	termPQExpBuffer(&query);
+
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	{
+		log_error(_("unable to set old master node as inactive:\n %s"),
+				  PQerrorMessage(conn));
+		PQclear(res);
+
+		rollback_transaction(conn);
+		return false;
+	}
+
+	PQclear(res);
+
+	initPQExpBuffer(&query);
+
+	appendPQExpBuffer(&query,
+					  " UPDATE repmgr.nodes"
+					  " SET type = 'master', "
+					  " upstream_node_id = NULL, "
+					  " active = TRUE "
+					  " WHERE node_id = %i ",
+					  this_node_id);
+
+	res = PQexec(conn, query.data);
+	termPQExpBuffer(&query);
+
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	{
+		log_error(_("unable to set current node %i as active master:\n %s"),
+				  this_node_id,
+				  PQerrorMessage(conn));
+		PQclear(res);
+
+		rollback_transaction(conn);
+		return false;
+	}
+
+	PQclear(res);
+
+	return commit_transaction(conn);
+}
+
+
 
 bool
 delete_node_record(PGconn *conn, int node)
diff --git a/dbutils.h b/dbutils.h
index 3420b64b..396b92c6 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -185,6 +185,8 @@ bool create_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_in
 bool update_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_info);
 bool delete_node_record(PGconn *conn, int node);
 
+bool update_node_record_set_master(PGconn *conn, int this_node_id);
+
 /* event record functions */
 bool create_event_record(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details);
 bool create_event_record_extended(PGconn *conn, t_configuration_options *options, int node_id, char *event, bool successful, char *details, t_event_info *event_info);
diff --git a/errcode.h b/errcode.h
index 90ee6985..6f9239c2 100644
--- a/errcode.h
+++ b/errcode.h
@@ -12,9 +12,9 @@
 #define ERR_BAD_CONFIG 1
 #define ERR_BAD_RSYNC 2
 #define ERR_NO_RESTART 4
-#define ERR_DB_CON 6
+#define ERR_DB_CONN 6
 #define ERR_DB_QUERY 7
-#define ERR_PROMOTED 8
+#define ERR_PROMOTION_FAIL 8
 #define ERR_STR_OVERFLOW 10
 #define ERR_FAILOVER_FAIL 11
 #define ERR_BAD_SSH 12
diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c
index d3391070..a33833c6 100644
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -258,7 +258,7 @@ do_standby_clone(void)
 			parse_success = parse_conninfo_string(recovery_conninfo_str, &recovery_conninfo, errmsg, true);
 			if (parse_success == false)
 			{
-				log_error(_("unable to parse conninfo string \"%s\" for upstream node:\n%s"),
+				log_error(_("unable to parse conninfo string \"%s\" for upstream node:\n %s"),
 						  recovery_conninfo_str, errmsg);
 
 				PQfinish(source_conn);
@@ -1016,6 +1016,186 @@ do_standby_unregister(void)
 void
 do_standby_promote(void)
 {
+	/*
+	 * Promote the local standby to master:
+	 *
+	 * 1. verify the local node is a standby and that no connection to an
+	 *    existing master can be obtained
+	 * 2. run the configured promote command, or "pg_ctl promote"
+	 * 3. poll is_standby() until the node is no longer a standby, giving up
+	 *    once promote_check_timeout is reached
+	 * 4. update the node record and create a "standby_promote" event
+	 */
+
+	PGconn	   *conn;
+	PGconn	   *current_master_conn;
+
+	char		script[MAXLEN];
+
+
+	int			r,
+				retval;
+	char		data_dir[MAXLEN];
+
+	int			i,
+				promote_check_timeout = 60,
+				promote_check_interval = 2;
+	bool		promote_success = false;
+	bool		success;
+	PQExpBufferData details;
+
+	log_info(_("connecting to standby database"));
+	conn = establish_db_connection(config_file_options.conninfo, true);
+
+	log_verbose(LOG_INFO, _("connected to standby, checking its state"));
+
+	/* Verify that standby is a supported server version */
+	check_server_version(conn, "standby", true, NULL);
+
+	/* Check we are in a standby node */
+	retval = is_standby(conn);
+
+	switch (retval)
+	{
+		case 0:
+			log_error(_("STANDBY PROMOTE can only be executed on a standby node"));
+			PQfinish(conn);
+			exit(ERR_BAD_CONFIG);
+		case -1:
+			log_error(_("connection to node lost"));
+			PQfinish(conn);
+			exit(ERR_DB_CONN);
+	}
+
+
+	/* we also need to check if there isn't any master already */
+	current_master_conn = get_master_connection(conn, NULL, NULL);
+
+	if (PQstatus(current_master_conn) == CONNECTION_OK)
+	{
+		log_error(_("this cluster already has an active master server"));
+		/* TODO: say which one as detail */
+		PQfinish(current_master_conn);
+		PQfinish(conn);
+		exit(ERR_PROMOTION_FAIL);
+	}
+
+
+	/* Get the data directory */
+	/* XXX do we need a superuser check? */
+	success = get_pg_setting(conn, "data_directory", data_dir);
+	PQfinish(conn);
+
+	if (success == false)
+	{
+		log_error(_("unable to determine data directory"));
+		exit(ERR_PROMOTION_FAIL);
+	}
+
+	log_notice(_("promoting standby"));
+
+	/*
+	 * Promote standby to master.
+	 *
+	 * `pg_ctl promote` returns immediately and (prior to 10.0) has no -w option
+	 * so we can't be sure when or if the promotion completes.
+	 * For now we'll poll the server until the default timeout (60 seconds)
+	 */
+
+	if (*config_file_options.service_promote_command)
+	{
+		maxlen_snprintf(script, "%s", config_file_options.service_promote_command);
+	}
+	else
+	{
+		maxlen_snprintf(script, "%s -D %s promote",
+						make_pg_path("pg_ctl"), data_dir);
+	}
+
+	log_notice(_("promoting server using '%s'"),
+			   script);
+
+	r = system(script);
+	if (r != 0)
+	{
+		log_error(_("unable to promote server from standby to master"));
+		exit(ERR_PROMOTION_FAIL);
+	}
+
+	/* reconnect to check we got promoted */
+
+	log_info(_("reconnecting to promoted server"));
+	conn = establish_db_connection(config_file_options.conninfo, true);
+
+	for (i = 0; i < promote_check_timeout; i += promote_check_interval)
+	{
+		retval = is_standby(conn);
+		if (!retval)
+		{
+			promote_success = true;
+			break;
+		}
+		sleep(promote_check_interval);
+	}
+
+	if (promote_success == false)
+	{
+		switch (retval)
+		{
+			case 1:
+				log_error(_("STANDBY PROMOTE failed, node is still a standby"));
+				PQfinish(conn);
+				exit(ERR_PROMOTION_FAIL);
+			default:
+				log_error(_("connection to node lost"));
+				PQfinish(conn);
+				exit(ERR_DB_CONN);
+		}
+	}
+
+
+	/* update node information to reflect new status */
+	if (update_node_record_set_master(conn, config_file_options.node_id) == false)
+	{
+		initPQExpBuffer(&details);
+		appendPQExpBuffer(&details,
+						  _("unable to update node record for node %i"),
+						  config_file_options.node_id);
+
+		log_error("%s", details.data);
+
+		create_event_record(NULL,
+							&config_file_options,
+							config_file_options.node_id,
+							"standby_promote",
+							false,
+							details.data);
+
+		termPQExpBuffer(&details);
+		PQfinish(conn);
+		exit(ERR_DB_QUERY);
+	}
+
+
+	initPQExpBuffer(&details);
+	appendPQExpBuffer(&details,
+					  _("node %i was successfully promoted to master"),
+					  config_file_options.node_id);
+
+	log_notice(_("STANDBY PROMOTE successful"));
+	log_detail("%s", details.data);
+
+	/* Log the event */
+	create_event_record(conn,
+						&config_file_options,
+						config_file_options.node_id,
+						"standby_promote",
+						true,
+						details.data);
+
+	termPQExpBuffer(&details);
+	PQfinish(conn);
+
 	return;
 }
 
@@ -1074,7 +1254,7 @@ check_source_server()
 		if (mode == barman)
 			return;
 		else
-			exit(ERR_DB_CON);
+			exit(ERR_DB_CONN);
 	}
 
 	/*