mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgrd: initial code for cascaded standby failover
This commit is contained in:
36
dbutils.c
36
dbutils.c
@@ -1632,6 +1632,42 @@ update_node_record_set_primary(PGconn *conn, int this_node_id)
|
||||
return commit_transaction(conn);
|
||||
}
|
||||
|
||||
bool
|
||||
update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id)
|
||||
{
|
||||
PQExpBufferData query;
|
||||
PGresult *res;
|
||||
|
||||
log_debug(_("update_node_record_set_upstream(): Updating node %i's upstream node to %i"),
|
||||
this_node_id, new_upstream_node_id);
|
||||
|
||||
initPQExpBuffer(&query);
|
||||
|
||||
appendPQExpBuffer(&query,
|
||||
" UPDATE repmgr.nodes "
|
||||
" SET upstream_node_id = %i "
|
||||
" WHERE id = %i ",
|
||||
new_upstream_node_id,
|
||||
this_node_id);
|
||||
|
||||
log_verbose(LOG_DEBUG, "update_node_record_set_upstream():\n%s\n", query.data);
|
||||
|
||||
res = PQexec(conn, query.data);
|
||||
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK)
|
||||
{
|
||||
log_error(_("unable to set new upstream node id:\n %s"),
|
||||
PQerrorMessage(conn));
|
||||
PQclear(res);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
PQclear(res);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Update node record following change of status
|
||||
|
||||
@@ -160,7 +160,7 @@ PGconn *establish_db_connection_by_params(const char *keywords[],
|
||||
const char *values[],
|
||||
const bool exit_on_error);
|
||||
PGconn *establish_primary_db_connection(PGconn *conn,
|
||||
const bool exit_on_error);
|
||||
const bool exit_on_error);
|
||||
|
||||
PGconn *get_primary_connection(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
||||
PGconn *get_primary_connection_quiet(PGconn *standby_conn, int *primary_id, char *primary_conninfo_out);
|
||||
@@ -221,6 +221,7 @@ bool update_node_record(PGconn *conn, char *repmgr_action, t_node_info *node_in
|
||||
bool delete_node_record(PGconn *conn, int node);
|
||||
|
||||
bool update_node_record_set_primary(PGconn *conn, int this_node_id);
|
||||
bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
|
||||
bool update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active);
|
||||
|
||||
void clear_node_info_list(NodeInfoList *nodes);
|
||||
|
||||
181
repmgrd.c
181
repmgrd.c
@@ -651,8 +651,15 @@ monitor_streaming_standby(void)
|
||||
// handle failure - do we want to loop here?
|
||||
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||
|
||||
// fix for cascaded standbys
|
||||
primary_conn = upstream_conn;
|
||||
if (upstream_node_info.type == STANDBY)
|
||||
{
|
||||
// XXX check result
|
||||
primary_conn = establish_primary_db_connection(local_conn, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
primary_conn = upstream_conn;
|
||||
}
|
||||
|
||||
/* Log startup event */
|
||||
if (startup_event_logged == false)
|
||||
@@ -980,12 +987,145 @@ do_primary_failover(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* do_upstream_standby_failover()
|
||||
*
|
||||
* Attach cascaded standby to primary
|
||||
*
|
||||
* Currently we will try to attach to the cluster primary, as "repmgr
|
||||
* standby follow" doesn't support attaching to another node.
|
||||
*
|
||||
* If this becomes supported, it might be worth providing a selection
|
||||
* of reconnection strategies as different behaviour might be desirable
|
||||
* in different situations;
|
||||
* or maybe the option not to reconnect might be required?
|
||||
*
|
||||
* XXX check this handles replication slots gracefully
|
||||
*/
|
||||
static bool
|
||||
do_upstream_standby_failover(void)
|
||||
{
|
||||
// not implemented yet
|
||||
return false;
|
||||
PQExpBufferData event_details;
|
||||
t_node_info primary_node_info = T_NODE_INFO_INITIALIZER;
|
||||
RecordStatus record_status;
|
||||
int r;
|
||||
|
||||
// check status
|
||||
record_status = get_primary_node_record(local_conn, &primary_node_info);
|
||||
|
||||
/*
|
||||
* Verify that we can still talk to the cluster primary, even though
|
||||
* the node's upstream is not available
|
||||
*/
|
||||
|
||||
// consolidate below code
|
||||
if (is_server_available(primary_node_info.conninfo) == false)
|
||||
{
|
||||
log_warning(_("connection to primary %i lost"), primary_node_info.node_id);
|
||||
|
||||
if (primary_conn != NULL)
|
||||
{
|
||||
PQfinish(primary_conn);
|
||||
primary_conn = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_info(_("attempting to reconnect"));
|
||||
primary_conn = establish_db_connection(primary_node_info.conninfo, false);
|
||||
|
||||
if (PQstatus(primary_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_warning(_("reconnection failed"));
|
||||
}
|
||||
else
|
||||
{
|
||||
log_info(_("reconnected"));
|
||||
}
|
||||
}
|
||||
|
||||
/* grandparent upstream is inactive */
|
||||
if (primary_node_info.active == false)
|
||||
{
|
||||
// XXX
|
||||
}
|
||||
|
||||
/* Close the connection to this server */
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
|
||||
log_debug(_("standby follow command is:\n \"%s\""),
|
||||
config_file_options.follow_command);
|
||||
|
||||
r = system(config_file_options.follow_command);
|
||||
|
||||
if (r != 0)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to execute follow command:\n %s"),
|
||||
config_file_options.follow_command);
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
|
||||
/* It may not possible to write to the event notification
|
||||
* table but we should be able to generate an external notification
|
||||
* if required.
|
||||
*/
|
||||
create_event_record(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
}
|
||||
|
||||
if (update_node_record_set_upstream(primary_conn,
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id) == false)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("unable to set node %i's new upstream ID to %i"),
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id);
|
||||
|
||||
log_error("%s", event_details.data);
|
||||
|
||||
create_event_record(NULL,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("node %i is now following primary node %i"),
|
||||
local_node_info.node_id,
|
||||
primary_node_info.node_id);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
create_event_record(primary_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
local_conn = establish_db_connection(config_file_options.conninfo, true);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -1049,10 +1189,10 @@ promote_self(void)
|
||||
{
|
||||
int primary_node_id;
|
||||
|
||||
primary_conn = get_primary_connection(local_conn,
|
||||
upstream_conn = get_primary_connection(local_conn,
|
||||
&primary_node_id, NULL);
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
|
||||
if (PQstatus(upstream_conn) == CONNECTION_OK && primary_node_id == failed_primary.node_id)
|
||||
{
|
||||
log_notice(_("original primary (id: %i) reappeared before this standby was promoted - no action taken"),
|
||||
failed_primary.node_id);
|
||||
@@ -1063,7 +1203,7 @@ promote_self(void)
|
||||
failed_primary.node_name,
|
||||
failed_primary.node_id);
|
||||
|
||||
create_event_record(primary_conn,
|
||||
create_event_record(upstream_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_abort",
|
||||
@@ -1072,7 +1212,6 @@ promote_self(void)
|
||||
|
||||
termPQExpBuffer(&event_details);
|
||||
|
||||
//PQfinish(primary_conn);
|
||||
//primary_conn = NULL;
|
||||
|
||||
// XXX handle this!
|
||||
@@ -1254,11 +1393,11 @@ follow_new_primary(int new_primary_id)
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
|
||||
primary_conn = establish_db_connection(new_primary.conninfo, false);
|
||||
upstream_conn = establish_db_connection(new_primary.conninfo, false);
|
||||
|
||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||
if (PQstatus(upstream_conn) == CONNECTION_OK)
|
||||
{
|
||||
RecoveryType primary_recovery_type = get_recovery_type(primary_conn);
|
||||
RecoveryType primary_recovery_type = get_recovery_type(upstream_conn);
|
||||
if (primary_recovery_type == RECTYPE_PRIMARY)
|
||||
{
|
||||
new_primary_ok = true;
|
||||
@@ -1266,7 +1405,7 @@ follow_new_primary(int new_primary_id)
|
||||
else
|
||||
{
|
||||
log_warning(_("new primary is not in recovery"));
|
||||
PQfinish(primary_conn);
|
||||
PQfinish(upstream_conn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1319,8 +1458,8 @@ follow_new_primary(int new_primary_id)
|
||||
|
||||
// XXX check success
|
||||
|
||||
record_status = get_node_record(primary_conn, new_primary_id, &upstream_node_info);
|
||||
record_status = get_node_record(primary_conn, local_node_info.node_id, &local_node_info);
|
||||
record_status = get_node_record(upstream_conn, new_primary_id, &upstream_node_info);
|
||||
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
|
||||
|
||||
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||
initPQExpBuffer(&event_details);
|
||||
@@ -1331,7 +1470,7 @@ follow_new_primary(int new_primary_id)
|
||||
|
||||
log_notice("%s\n", event_details.data);
|
||||
|
||||
create_event_record(primary_conn,
|
||||
create_event_record(upstream_conn,
|
||||
&config_file_options,
|
||||
local_node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
@@ -1808,10 +1947,20 @@ close_connections()
|
||||
if (PQisBusy(primary_conn) == 1)
|
||||
cancel_query(primary_conn, config_file_options.primary_response_timeout);
|
||||
PQfinish(primary_conn);
|
||||
primary_conn = NULL;
|
||||
}
|
||||
|
||||
if (upstream_conn != NULL && PQstatus(upstream_conn) == CONNECTION_OK)
|
||||
{
|
||||
PQfinish(upstream_conn);
|
||||
upstream_conn = NULL;
|
||||
}
|
||||
|
||||
if (PQstatus(local_conn) == CONNECTION_OK)
|
||||
{
|
||||
PQfinish(local_conn);
|
||||
local_conn = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user