After standby promotion, ensure metadata is updated by repmgr

Previously this was handled by repmgrd but if a standby is promoted
directly this will leave the metadata in an incorrect state.
This commit is contained in:
Ian Barwick
2015-04-14 13:39:48 +09:00
parent 34eaf94b2b
commit a01fefa7d0
2 changed files with 109 additions and 21 deletions

110
repmgr.c
View File

@@ -77,6 +77,7 @@ static bool write_recovery_file_line(FILE *recovery_file, char *recovery_file_pa
static void check_master_standby_version_match(PGconn *conn, PGconn *master_conn);
static int check_server_version(PGconn *conn, char *server_type, bool exit_on_error, char *server_version_string);
static bool check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error);
static bool update_node_record_set_master(PGconn *conn, int this_node_id);
static char *make_pg_path(char *file);
@@ -1626,6 +1627,28 @@ do_standby_promote(void)
exit(ERR_FAILOVER_FAIL);
}
/* update node information to reflect new status */
if(update_node_record_set_master(conn, options.node) == false)
{
initPQExpBuffer(&details);
appendPQExpBuffer(&details,
_("unable to update node record for node %i"),
options.node);
log_err("%s\n", details.data);
create_event_record(NULL,
&options,
options.node,
"repmgrd_failover_promote",
false,
details.data);
exit(ERR_DB_QUERY);
}
initPQExpBuffer(&details);
appendPQExpBuffer(&details,
"Node %i was successfully promoted to master",
@@ -3230,6 +3253,92 @@ check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error)
}
static bool
update_node_record_set_master(PGconn *conn, int this_node_id)
{
PGresult *res;
char sqlquery[QUERY_STR_LEN];
log_debug(_("Setting %i as master and marking existing master as failed\n"), this_node_id);
res = PQexec(conn, "BEGIN");
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_err(_("Unable to begin transaction: %s\n"),
PQerrorMessage(conn));
PQclear(res);
return false;
}
PQclear(res);
sqlquery_snprintf(sqlquery,
" UPDATE %s.repl_nodes "
" SET active = FALSE "
" WHERE cluster = '%s' "
" AND type = 'master' "
" AND active IS TRUE ",
get_repmgr_schema_quoted(conn),
options.cluster_name);
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_err(_("Unable to set old master node as inactive: %s\n"),
PQerrorMessage(conn));
PQclear(res);
PQexec(conn, "ROLLBACK");
return false;
}
PQclear(res);
sqlquery_snprintf(sqlquery,
" UPDATE %s.repl_nodes "
" SET type = 'master', "
" upstream_node_id = NULL "
" WHERE cluster = '%s' "
" AND id = %i ",
get_repmgr_schema_quoted(conn),
options.cluster_name,
this_node_id);
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_err(_("Unable to set current node %i as active master: %s\n"),
this_node_id,
PQerrorMessage(conn));
PQclear(res);
PQexec(conn, "ROLLBACK");
return false;
}
PQclear(res);
res = PQexec(conn, "COMMIT");
if (PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_err(_("Unable to set commit transaction: %s\n"),
PQerrorMessage(conn));
PQclear(res);
return false;
}
PQclear(res);
return true;
}
static void
do_check_upstream_config(void)
{
@@ -3266,7 +3375,6 @@ do_check_upstream_config(void)
}
static char *
make_pg_path(char *file)
{

View File

@@ -91,7 +91,6 @@ static void witness_monitor(void);
static bool check_connection(PGconn *conn, const char *type);
static bool set_local_node_failed(void);
static bool update_node_record_set_master(PGconn *conn, int this_node_id, int old_master_node_id);
static bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
static void update_shared_memory(char *last_wal_standby_applied);
@@ -1442,25 +1441,6 @@ do_master_failover(void)
/* and reconnect to the local database */
my_local_conn = establish_db_connection(local_options.conninfo, true);
/* update node information to reflect new status */
if(update_node_record_set_master(my_local_conn, node_info.node_id, failed_master.node_id) == false)
{
appendPQExpBuffer(&event_details,
_("unable to update node record for node %i (promoted to master following failure of node %i)"),
node_info.node_id,
failed_master.node_id);
log_err("%s\n", event_details.data);
create_event_record(NULL,
&local_options,
node_info.node_id,
"repmgrd_failover_promote",
false,
event_details.data);
terminate(ERR_DB_QUERY);
}
/* update internal record for this node */
node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);