mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 00:46:29 +00:00
repmgrd: initial code for cascaded standby failover
This commit is contained in:
36
dbutils.c
36
dbutils.c
@@ -1632,6 +1632,42 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
|
|||||||
return commit_transaction(conn);
|
return commit_transaction(conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id)
|
||||||
|
{
|
||||||
|
PQExpBufferData query;
|
||||||
|
PGresult *res;
|
||||||
|
|
||||||
|
log_debug(_("update_node_record_set_upstream(): Updating node %i's upstream node to %i"),
|
||||||
|
this_node_id, new_upstream_node_id);
|
||||||
|
|
||||||
|
initPQExpBuffer(&query);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&query,
|
||||||
|
" UPDATE repmgr.nodes "
|
||||||
|
" SET upstream_node_id = %i "
|
||||||
|
" WHERE id = %i ",
|
||||||
|
new_upstream_node_id,
|
||||||
|
this_node_id);
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "update_node_record_set_upstream():\n%s\n", query.data);
|
||||||
|
|
||||||
|
res = PQexec(conn, query.data);
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||||
|
{
|
||||||
|
log_error(_("unable to set new upstream node id:\n %s"),
|
||||||
|
PQerrorMessage(conn));
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Update node record following change of status
|
* Update node record following change of status
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ PGconn *establish_db_connection_by_params(const char *keywords[],
|
|||||||
const char *values[],
|
const char *values[],
|
||||||
const bool exit_on_error);
|
const bool exit_on_error);
|
||||||
PGconn *establish_primary_db_connection(PGconn *conn,
|
PGconn *establish_primary_db_connection(PGconn *conn,
|
||||||
const bool exit_on_error);
|
const bool exit_on_error);
|
||||||
|
|
||||||
PGconn *get_primary_connection(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
PGconn *get_primary_connection(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
||||||
PGconn *get_primary_connection_quiet(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
PGconn *get_primary_connection_quiet(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
||||||
@@ -221,6 +221,7 @@ bool update_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_in
|
|||||||
bool delete_node_record(PGconn *conn, int node);
|
bool delete_node_record(PGconn *conn, int node);
|
||||||
|
|
||||||
bool update_node_record_set_primary(PGconn *conn, int this_node_id);
|
bool update_node_record_set_primary(PGconn *conn, int this_node_id);
|
||||||
|
bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
|
||||||
bool update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active);
|
bool update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active);
|
||||||
|
|
||||||
void clear_node_info_list(NodeInfoList *nodes);
|
void clear_node_info_list(NodeInfoList *nodes);
|
||||||
|
|||||||
181
repmgrd.c
181
repmgrd.c
@@ -651,8 +651,15 @@ monitor_streaming_standby(void)
|
|||||||
// handle failure - do we want to loop here?
|
// handle failure - do we want to loop here?
|
||||||
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||||
|
|
||||||
// fix for cascaded standbys
|
if (upstream_node_info.type == STANDBY)
|
||||||
primary_conn = upstream_conn;
|
{
|
||||||
|
// XXX check result
|
||||||
|
primary_conn = establish_primary_db_connection(local_conn, false);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
primary_conn = upstream_conn;
|
||||||
|
}
|
||||||
|
|
||||||
/* Log startup event */
|
/* Log startup event */
|
||||||
if (startup_event_logged == false)
|
if (startup_event_logged == false)
|
||||||
@@ -980,12 +987,145 @@ do_primary_failover(void)
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* do_upstream_standby_failover()
|
||||||
|
*
|
||||||
|
* Attach cascaded standby to primary
|
||||||
|
*
|
||||||
|
* Currently we will try to attach to the cluster primary, as "repmgr
|
||||||
|
* standby follow" doesn't support attaching to another node.
|
||||||
|
*
|
||||||
|
* If this becomes supported, it might be worth providing a selection
|
||||||
|
* of reconnection strategies as different behaviour might be desirable
|
||||||
|
* in different situations;
|
||||||
|
* or maybe the option not to reconnect might be required?
|
||||||
|
*
|
||||||
|
* XXX check this handles replication slots gracefully
|
||||||
|
*/
|
||||||
static bool
|
static bool
|
||||||
do_upstream_standby_failover(void)
|
do_upstream_standby_failover(void)
|
||||||
{
|
{
|
||||||
// not implemented yet
|
PQExpBufferData event_details;
|
||||||
return false;
|
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
||||||
|
RecordStatus record_status;
|
||||||
|
int r;
|
||||||
|
|
||||||
|
// check status
|
||||||
|
record_status = get_primary_node_record(local_conn, &primary_node_info);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Verify that we can still talk to the cluster primary, even though
|
||||||
|
* the node's upstream is not available
|
||||||
|
*/
|
||||||
|
|
||||||
|
// consolidate below code
|
||||||
|
if (is_server_available(primary_node_info.conninfo) == false)
|
||||||
|
{
|
||||||
|
log_warning(_("connection to primary %i lost"), primary_node_info.node_id);
|
||||||
|
|
||||||
|
if (primary_conn != NULL)
|
||||||
|
{
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
primary_conn = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_info(_("attempting to reconnect"));
|
||||||
|
primary_conn = establish_db_connection(primary_node_info.conninfo, false);
|
||||||
|
|
||||||
|
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
log_warning(_("reconnection failed"));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_info(_("reconnected"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* grandparent upstream is inactive */
|
||||||
|
if (primary_node_info.active == false)
|
||||||
|
{
|
||||||
|
// XXX
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Close the connection to this server */
|
||||||
|
PQfinish(local_conn);
|
||||||
|
local_conn = NULL;
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
log_debug(_("standby follow command is:\n \"%s\""),
|
||||||
|
config_file_options.follow_command);
|
||||||
|
|
||||||
|
r = system(config_file_options.follow_command);
|
||||||
|
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("unable to execute follow command:\n %s"),
|
||||||
|
config_file_options.follow_command);
|
||||||
|
|
||||||
|
log_error("%s", event_details.data);
|
||||||
|
|
||||||
|
/* It may not possible to write to the event notification
|
||||||
|
* table but we should be able to generate an external notification
|
||||||
|
* if required.
|
||||||
|
*/
|
||||||
|
create_event_record(primary_conn,
|
||||||
|
&config_file_options,
|
||||||
|
local_node_info.node_id,
|
||||||
|
"repmgrd_failover_follow",
|
||||||
|
false,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (update_node_record_set_upstream(primary_conn,
|
||||||
|
local_node_info.node_id,
|
||||||
|
primary_node_info.node_id) == false)
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("unable to set node %i's new upstream ID to %i"),
|
||||||
|
local_node_info.node_id,
|
||||||
|
primary_node_info.node_id);
|
||||||
|
|
||||||
|
log_error("%s", event_details.data);
|
||||||
|
|
||||||
|
create_event_record(NULL,
|
||||||
|
&config_file_options,
|
||||||
|
local_node_info.node_id,
|
||||||
|
"repmgrd_failover_follow",
|
||||||
|
false,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
terminate(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("node %i is now following primary node %i"),
|
||||||
|
local_node_info.node_id,
|
||||||
|
primary_node_info.node_id);
|
||||||
|
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
|
||||||
|
create_event_record(primary_conn,
|
||||||
|
&config_file_options,
|
||||||
|
local_node_info.node_id,
|
||||||
|
"repmgrd_failover_follow",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1049,10 +1189,10 @@ promote_self(void)
|
|||||||
{
|
{
|
||||||
int primary_node_id;
|
int primary_node_id;
|
||||||
|
|
||||||
primary_conn = get_primary_connection(local_conn,
|
upstream_conn = get_primary_connection(local_conn,
|
||||||
&primary_node_id, NULL);
|
&primary_node_id, NULL);
|
||||||
|
|
||||||
if (PQstatus(primary_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
|
if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
|
||||||
{
|
{
|
||||||
log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
|
log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
|
||||||
failed_primary.node_id);
|
failed_primary.node_id);
|
||||||
@@ -1063,7 +1203,7 @@ promote_self(void)
|
|||||||
failed_primary.node_name,
|
failed_primary.node_name,
|
||||||
failed_primary.node_id);
|
failed_primary.node_id);
|
||||||
|
|
||||||
create_event_record(primary_conn,
|
create_event_record(upstream_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
local_node_info.node_id,
|
local_node_info.node_id,
|
||||||
"repmgrd_failover_abort",
|
"repmgrd_failover_abort",
|
||||||
@@ -1072,7 +1212,6 @@ promote_self(void)
|
|||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
//PQfinish(primary_conn);
|
|
||||||
//primary_conn = NULL;
|
//primary_conn = NULL;
|
||||||
|
|
||||||
// XXX handle this!
|
// XXX handle this!
|
||||||
@@ -1254,11 +1393,11 @@ follow_new_primary(int new_primary_id)
|
|||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
local_conn = NULL;
|
local_conn = NULL;
|
||||||
|
|
||||||
primary_conn = establish_db_connection(new_primary.conninfo, false);
|
upstream_conn = establish_db_connection(new_primary.conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
if (PQstatus(upstream_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
RecoveryType primary_recovery_type = get_recovery_type(primary_conn);
|
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
|
||||||
if (primary_recovery_type == RECTYPE_PRIMARY)
|
if (primary_recovery_type == RECTYPE_PRIMARY)
|
||||||
{
|
{
|
||||||
new_primary_ok = true;
|
new_primary_ok = true;
|
||||||
@@ -1266,7 +1405,7 @@ follow_new_primary(int new_primary_id)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
log_warning(_("new primary is not in recovery"));
|
log_warning(_("new primary is not in recovery"));
|
||||||
PQfinish(primary_conn);
|
PQfinish(upstream_conn);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1319,8 +1458,8 @@ follow_new_primary(int new_primary_id)
|
|||||||
|
|
||||||
// XXX check success
|
// XXX check success
|
||||||
|
|
||||||
record_status = get_node_record(primary_conn, new_primary_id, &upstream_node_info);
|
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
|
||||||
record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
|
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
|
||||||
|
|
||||||
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
@@ -1331,7 +1470,7 @@ follow_new_primary(int new_primary_id)
|
|||||||
|
|
||||||
log_notice("%s\n", event_details.data);
|
log_notice("%s\n", event_details.data);
|
||||||
|
|
||||||
create_event_record(primary_conn,
|
create_event_record(upstream_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
local_node_info.node_id,
|
local_node_info.node_id,
|
||||||
"repmgrd_failover_follow",
|
"repmgrd_failover_follow",
|
||||||
@@ -1808,10 +1947,20 @@ close_connections()
|
|||||||
if (PQisBusy(primary_conn) == 1)
|
if (PQisBusy(primary_conn) == 1)
|
||||||
cancel_query(primary_conn, config_file_options.primary_response_timeout);
|
cancel_query(primary_conn, config_file_options.primary_response_timeout);
|
||||||
PQfinish(primary_conn);
|
PQfinish(primary_conn);
|
||||||
|
primary_conn = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (upstream_conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
PQfinish(upstream_conn);
|
||||||
|
upstream_conn = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (PQstatus(local_conn) == CONNECTION_OK)
|
if (PQstatus(local_conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
|
local_conn = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user