mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
Make code to check standby join status available globally
This makes it possible to check the standby join status from another node, e.g. the promotion candidate during a switchover operation.
This commit is contained in:
@@ -2118,6 +2118,7 @@ do_node_rejoin(void)
|
||||
PQExpBufferData follow_output;
|
||||
struct stat statbuf;
|
||||
t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
|
||||
t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
|
||||
|
||||
bool success = true;
|
||||
int follow_error_code = SUCCESS;
|
||||
@@ -2228,6 +2229,21 @@ do_node_rejoin(void)
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fetch the local node record - we'll need this later, and it acts as an
|
||||
* additional sanity-check that the node is known to the primary.
|
||||
*/
|
||||
if (get_node_record(primary_conn, config_file_options.node_id, &local_node_record) != RECORD_FOUND)
|
||||
{
|
||||
log_error(_("unable to retrieve node record for the local node"));
|
||||
log_hint(_("check the local node is registered with the current primary \"%s\" (ID: %i)"),
|
||||
primary_node_record.node_name,
|
||||
primary_node_record.node_id);
|
||||
PQfinish(upstream_conn);
|
||||
PQfinish(primary_conn);
|
||||
exit(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/*
|
||||
* Sanity-check replication slot availability
|
||||
*/
|
||||
@@ -2520,79 +2536,34 @@ do_node_rejoin(void)
|
||||
*/
|
||||
if (runtime_options.no_wait == false)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
||||
{
|
||||
if (is_server_available(config_file_options.conninfo))
|
||||
{
|
||||
log_verbose(LOG_INFO, _("demoted primary is pingable"));
|
||||
break;
|
||||
}
|
||||
|
||||
if (i % 5 == 0)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.node_rejoin_timeout);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.node_rejoin_timeout);
|
||||
}
|
||||
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
for (; i < config_file_options.node_rejoin_timeout; i++)
|
||||
{
|
||||
NodeAttached node_attached = is_downstream_node_attached(primary_conn,
|
||||
config_file_options.node_name);
|
||||
|
||||
if (node_attached == NODE_ATTACHED)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("node %i has attached to its upstream node"),
|
||||
config_file_options.node_id);
|
||||
break;
|
||||
}
|
||||
|
||||
if (i % 5 == 0)
|
||||
{
|
||||
log_info(_("waiting for node \"%s\" (ID: %i) to connect to new primary; %i of max %i attempts"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.node_rejoin_timeout);
|
||||
log_detail(_("checking for record in node \"%s\"'s \"pg_stat_replication\" table where \"application_name\" is \"%s\""),
|
||||
primary_node_record.node_name,
|
||||
config_file_options.node_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
|
||||
config_file_options.node_id,
|
||||
i + 1, config_file_options.node_rejoin_timeout);
|
||||
}
|
||||
|
||||
sleep(1);
|
||||
}
|
||||
standy_join_status join_success = check_standby_join(primary_conn,
|
||||
&primary_node_record,
|
||||
&local_node_record);
|
||||
|
||||
create_event_notification(primary_conn,
|
||||
&config_file_options,
|
||||
config_file_options.node_id,
|
||||
"node_rejoin",
|
||||
success,
|
||||
join_success == JOIN_SUCCESS ? true : false,
|
||||
follow_output.data);
|
||||
|
||||
if (success == false)
|
||||
if (join_success != JOIN_SUCCESS)
|
||||
{
|
||||
termPQExpBuffer(&follow_output);
|
||||
log_error(_("NODE REJOIN failed"));
|
||||
log_detail(_("no record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
|
||||
config_file_options.node_name,
|
||||
primary_node_record.node_name);
|
||||
|
||||
if (join_success == JOIN_FAIL_NO_PING) {
|
||||
log_detail(_("local node \"%s\" did not become available start after %i seconds"),
|
||||
config_file_options.node_name,
|
||||
config_file_options.node_rejoin_timeout);
|
||||
}
|
||||
else {
|
||||
log_detail(_("no record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
|
||||
config_file_options.node_name,
|
||||
primary_node_record.node_name);
|
||||
}
|
||||
log_hint(_("check the PostgreSQL log on the local node"));
|
||||
|
||||
exit(ERR_REJOIN_FAIL);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
/* default value for "cluster event --limit"*/
|
||||
#define CLUSTER_EVENT_LIMIT 20
|
||||
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/* configuration metadata */
|
||||
@@ -210,6 +212,13 @@ typedef enum
|
||||
SUPERUSER
|
||||
} t_user_type;
|
||||
|
||||
/*
 * Result of check_standby_join(): reports whether a standby was seen to
 * attach to its upstream, and if not, at which stage the wait gave up.
 */
typedef enum
|
||||
{
|
||||
JOIN_SUCCESS, /* standby was reported as attached to the upstream */
|
||||
JOIN_FAIL_NO_PING, /* standby never responded to pings within the timeout */
|
||||
JOIN_FAIL_NO_REPLICATION /* standby was pingable but was not seen attached before the timeout */
|
||||
} standy_join_status;
|
||||
|
||||
|
||||
typedef struct ColHeader
|
||||
{
|
||||
@@ -269,8 +278,10 @@ extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBuf
|
||||
extern bool create_replication_slot(PGconn *conn, char *slot_name, t_node_info *upstream_node_record, PQExpBufferData *error_msg);
|
||||
extern bool drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||
|
||||
extern standy_join_status check_standby_join(PGconn *primary_conn, t_node_info *primary_node_record, t_node_info *standby_node_record);
|
||||
extern bool check_replication_slots_available(int node_id, PGconn* conn);
|
||||
extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin);
|
||||
|
||||
extern void check_shared_library(PGconn *conn);
|
||||
extern bool is_repmgrd_running(PGconn *conn);
|
||||
extern int parse_repmgr_version(const char *version_string);
|
||||
|
||||
@@ -3866,6 +3866,98 @@ check_replication_slots_available(int node_id, PGconn* conn)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Check whether the specified standby has joined to its upstream.
|
||||
*
|
||||
* This is used by "standby switchover" and "node rejoin" to check
|
||||
* the success of a node rejoin operation.
|
||||
*
|
||||
* IMPORTANT: the timeout settings will be taken from the node where the check
|
||||
* is performed, which might not be the standby itself.
|
||||
*/
|
||||
standy_join_status
|
||||
check_standby_join(PGconn *upstream_conn, t_node_info *upstream_node_record, t_node_info *standby_node_record)
|
||||
{
|
||||
int i;
|
||||
bool available = false;
|
||||
|
||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
||||
{
|
||||
if (is_server_available(config_file_options.conninfo))
|
||||
{
|
||||
log_verbose(LOG_INFO, _("node \"%s\" (ID: %i) is pingable"),
|
||||
standby_node_record->node_name,
|
||||
standby_node_record->node_id);
|
||||
available = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (i % 5 == 0)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("waiting for node \"%s\" (ID: %i) to respond to pings; %i of max %i attempts (parameter \"node_rejoin_timeout\")"),
|
||||
standby_node_record->node_name,
|
||||
standby_node_record->node_id,
|
||||
i + 1,
|
||||
config_file_options.node_rejoin_timeout);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug("sleeping 1 second waiting for node \"%s\" (ID: %i) to respond to pings; %i of max %i attempts",
|
||||
standby_node_record->node_name,
|
||||
standby_node_record->node_id,
|
||||
i + 1,
|
||||
config_file_options.node_rejoin_timeout);
|
||||
}
|
||||
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
/* node did not become available */
|
||||
if (available == false)
|
||||
{
|
||||
return JOIN_FAIL_NO_PING;
|
||||
}
|
||||
|
||||
for (; i < config_file_options.node_rejoin_timeout; i++)
|
||||
{
|
||||
NodeAttached node_attached = is_downstream_node_attached(upstream_conn,
|
||||
standby_node_record->node_name);
|
||||
if (node_attached == NODE_ATTACHED)
|
||||
{
|
||||
log_verbose(LOG_INFO, _("node \"%s\" (ID: %i) has attached to its upstream node"),
|
||||
standby_node_record->node_name,
|
||||
standby_node_record->node_id);
|
||||
return JOIN_SUCCESS;
|
||||
}
|
||||
|
||||
if (i % 5 == 0)
|
||||
{
|
||||
log_info(_("waiting for node \"%s\" (ID: %i) to connect to new primary; %i of max %i attempts (parameter \"node_rejoin_timeout\")"),
|
||||
standby_node_record->node_name,
|
||||
standby_node_record->node_id,
|
||||
i + 1,
|
||||
config_file_options.node_rejoin_timeout);
|
||||
|
||||
log_detail(_("checking for record in node \"%s\"'s \"pg_stat_replication\" table where \"application_name\" is \"%s\""),
|
||||
upstream_node_record->node_name,
|
||||
standby_node_record->node_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug("sleeping 1 second waiting for node \"%s\" (ID: %i) to connect to new primary; %i of max %i attempts",
|
||||
standby_node_record->node_name,
|
||||
standby_node_record->node_id,
|
||||
i + 1,
|
||||
config_file_options.node_rejoin_timeout);
|
||||
}
|
||||
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
return JOIN_FAIL_NO_REPLICATION;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Here we'll perform some timeline sanity checks to ensure the follow target
|
||||
* can actually be followed.
|
||||
|
||||
Reference in New Issue
Block a user