mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-26 08:36:30 +00:00
Make code to check standby join status available globally
This makes it possible to check the standby join status from another node, e.g. the promotion candidate during a switchover operation.
This commit is contained in:
@@ -2118,6 +2118,7 @@ do_node_rejoin(void)
|
|||||||
PQExpBufferData follow_output;
|
PQExpBufferData follow_output;
|
||||||
struct stat statbuf;
|
struct stat statbuf;
|
||||||
t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
|
t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
|
||||||
|
t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
|
||||||
|
|
||||||
bool success = true;
|
bool success = true;
|
||||||
int follow_error_code = SUCCESS;
|
int follow_error_code = SUCCESS;
|
||||||
@@ -2228,6 +2229,21 @@ do_node_rejoin(void)
|
|||||||
exit(ERR_BAD_CONFIG);
|
exit(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fetch the local node record - we'll need this later, and it acts as an
|
||||||
|
* additional sanity-check that the node is known to the primary.
|
||||||
|
*/
|
||||||
|
if (get_node_record(primary_conn, config_file_options.node_id, &local_node_record) != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve node record for the local node"));
|
||||||
|
log_hint(_("check the local node is registered with the current primary \"%s\" (ID: %i)"),
|
||||||
|
primary_node_record.node_name,
|
||||||
|
primary_node_record.node_id);
|
||||||
|
PQfinish(upstream_conn);
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Sanity-check replication slot availability
|
* Sanity-check replication slot availability
|
||||||
*/
|
*/
|
||||||
@@ -2520,79 +2536,34 @@ do_node_rejoin(void)
|
|||||||
*/
|
*/
|
||||||
if (runtime_options.no_wait == false)
|
if (runtime_options.no_wait == false)
|
||||||
{
|
{
|
||||||
int i;
|
standy_join_status join_success = check_standby_join(primary_conn,
|
||||||
|
&primary_node_record,
|
||||||
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
&local_node_record);
|
||||||
{
|
|
||||||
if (is_server_available(config_file_options.conninfo))
|
|
||||||
{
|
|
||||||
log_verbose(LOG_INFO, _("demoted primary is pingable"));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i % 5 == 0)
|
|
||||||
{
|
|
||||||
log_verbose(LOG_INFO, _("waiting for node %i to respond to pings; %i of max %i attempts"),
|
|
||||||
config_file_options.node_id,
|
|
||||||
i + 1, config_file_options.node_rejoin_timeout);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_debug("sleeping 1 second waiting for node %i to respond to pings; %i of max %i attempts",
|
|
||||||
config_file_options.node_id,
|
|
||||||
i + 1, config_file_options.node_rejoin_timeout);
|
|
||||||
}
|
|
||||||
|
|
||||||
sleep(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (; i < config_file_options.node_rejoin_timeout; i++)
|
|
||||||
{
|
|
||||||
NodeAttached node_attached = is_downstream_node_attached(primary_conn,
|
|
||||||
config_file_options.node_name);
|
|
||||||
|
|
||||||
if (node_attached == NODE_ATTACHED)
|
|
||||||
{
|
|
||||||
log_verbose(LOG_INFO, _("node %i has attached to its upstream node"),
|
|
||||||
config_file_options.node_id);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i % 5 == 0)
|
|
||||||
{
|
|
||||||
log_info(_("waiting for node \"%s\" (ID: %i) to connect to new primary; %i of max %i attempts"),
|
|
||||||
config_file_options.node_name,
|
|
||||||
config_file_options.node_id,
|
|
||||||
i + 1, config_file_options.node_rejoin_timeout);
|
|
||||||
log_detail(_("checking for record in node \"%s\"'s \"pg_stat_replication\" table where \"application_name\" is \"%s\""),
|
|
||||||
primary_node_record.node_name,
|
|
||||||
config_file_options.node_name);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_debug("sleeping 1 second waiting for node %i to connect to new primary; %i of max %i attempts",
|
|
||||||
config_file_options.node_id,
|
|
||||||
i + 1, config_file_options.node_rejoin_timeout);
|
|
||||||
}
|
|
||||||
|
|
||||||
sleep(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
create_event_notification(primary_conn,
|
create_event_notification(primary_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"node_rejoin",
|
"node_rejoin",
|
||||||
success,
|
join_success == JOIN_SUCCESS ? true : false,
|
||||||
follow_output.data);
|
follow_output.data);
|
||||||
|
|
||||||
if (success == false)
|
if (join_success != JOIN_SUCCESS)
|
||||||
{
|
{
|
||||||
termPQExpBuffer(&follow_output);
|
termPQExpBuffer(&follow_output);
|
||||||
log_error(_("NODE REJOIN failed"));
|
log_error(_("NODE REJOIN failed"));
|
||||||
log_detail(_("no record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
|
|
||||||
config_file_options.node_name,
|
if (join_success == JOIN_FAIL_NO_PING) {
|
||||||
primary_node_record.node_name);
|
log_detail(_("local node \"%s\" did not become available start after %i seconds"),
|
||||||
|
config_file_options.node_name,
|
||||||
|
config_file_options.node_rejoin_timeout);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
log_detail(_("no record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
|
||||||
|
config_file_options.node_name,
|
||||||
|
primary_node_record.node_name);
|
||||||
|
}
|
||||||
log_hint(_("check the PostgreSQL log on the local node"));
|
log_hint(_("check the PostgreSQL log on the local node"));
|
||||||
|
|
||||||
exit(ERR_REJOIN_FAIL);
|
exit(ERR_REJOIN_FAIL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,6 +28,8 @@
|
|||||||
/* default value for "cluster event --limit"*/
|
/* default value for "cluster event --limit"*/
|
||||||
#define CLUSTER_EVENT_LIMIT 20
|
#define CLUSTER_EVENT_LIMIT 20
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
/* configuration metadata */
|
/* configuration metadata */
|
||||||
@@ -210,6 +212,13 @@ typedef enum
|
|||||||
SUPERUSER
|
SUPERUSER
|
||||||
} t_user_type;
|
} t_user_type;
|
||||||
|
|
||||||
|
/*
 * Result of waiting for a standby to join its upstream node, as returned
 * by check_standby_join().
 *
 * NOTE: the type name is spelled "standy" (sic); it is part of the public
 * interface, so it is kept as-is.
 */
typedef enum
{
	/* the standby was reported as attached to its upstream */
	JOIN_SUCCESS = 0,
	/* the standby never responded to pings within the allowed attempts */
	JOIN_FAIL_NO_PING,
	/* the standby was pingable but was never reported as attached */
	JOIN_FAIL_NO_REPLICATION
} standy_join_status;
|
||||||
|
|
||||||
|
|
||||||
typedef struct ColHeader
|
typedef struct ColHeader
|
||||||
{
|
{
|
||||||
@@ -269,8 +278,10 @@ extern bool can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBuf
|
|||||||
extern bool create_replication_slot(PGconn *conn, char *slot_name, t_node_info *upstream_node_record, PQExpBufferData *error_msg);
|
extern bool create_replication_slot(PGconn *conn, char *slot_name, t_node_info *upstream_node_record, PQExpBufferData *error_msg);
|
||||||
extern bool drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
extern bool drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name);
|
||||||
|
|
||||||
|
extern standy_join_status check_standby_join(PGconn *primary_conn, t_node_info *primary_node_record, t_node_info *standby_node_record);
|
||||||
extern bool check_replication_slots_available(int node_id, PGconn* conn);
|
extern bool check_replication_slots_available(int node_id, PGconn* conn);
|
||||||
extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin);
|
extern bool check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin);
|
||||||
|
|
||||||
extern void check_shared_library(PGconn *conn);
|
extern void check_shared_library(PGconn *conn);
|
||||||
extern bool is_repmgrd_running(PGconn *conn);
|
extern bool is_repmgrd_running(PGconn *conn);
|
||||||
extern int parse_repmgr_version(const char *version_string);
|
extern int parse_repmgr_version(const char *version_string);
|
||||||
|
|||||||
@@ -3866,6 +3866,98 @@ check_replication_slots_available(int node_id, PGconn* conn)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check whether the specified standby has joined to its upstream.
|
||||||
|
*
|
||||||
|
* This is used by "standby switchover" and "node rejoin" to check
|
||||||
|
* the success of a node rejoin operation.
|
||||||
|
*
|
||||||
|
* IMPORTANT: the timeout settings will be taken from the node where the check
|
||||||
|
* is performed, which might not be the standby itself.
|
||||||
|
*/
|
||||||
|
standy_join_status
|
||||||
|
check_standby_join(PGconn *upstream_conn, t_node_info *upstream_node_record, t_node_info *standby_node_record)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
bool available = false;
|
||||||
|
|
||||||
|
for (i = 0; i < config_file_options.standby_reconnect_timeout; i++)
|
||||||
|
{
|
||||||
|
if (is_server_available(config_file_options.conninfo))
|
||||||
|
{
|
||||||
|
log_verbose(LOG_INFO, _("node \"%s\" (ID: %i) is pingable"),
|
||||||
|
standby_node_record->node_name,
|
||||||
|
standby_node_record->node_id);
|
||||||
|
available = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i % 5 == 0)
|
||||||
|
{
|
||||||
|
log_verbose(LOG_INFO, _("waiting for node \"%s\" (ID: %i) to respond to pings; %i of max %i attempts (parameter \"node_rejoin_timeout\")"),
|
||||||
|
standby_node_record->node_name,
|
||||||
|
standby_node_record->node_id,
|
||||||
|
i + 1,
|
||||||
|
config_file_options.node_rejoin_timeout);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_debug("sleeping 1 second waiting for node \"%s\" (ID: %i) to respond to pings; %i of max %i attempts",
|
||||||
|
standby_node_record->node_name,
|
||||||
|
standby_node_record->node_id,
|
||||||
|
i + 1,
|
||||||
|
config_file_options.node_rejoin_timeout);
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* node did not become available */
|
||||||
|
if (available == false)
|
||||||
|
{
|
||||||
|
return JOIN_FAIL_NO_PING;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; i < config_file_options.node_rejoin_timeout; i++)
|
||||||
|
{
|
||||||
|
NodeAttached node_attached = is_downstream_node_attached(upstream_conn,
|
||||||
|
standby_node_record->node_name);
|
||||||
|
if (node_attached == NODE_ATTACHED)
|
||||||
|
{
|
||||||
|
log_verbose(LOG_INFO, _("node \"%s\" (ID: %i) has attached to its upstream node"),
|
||||||
|
standby_node_record->node_name,
|
||||||
|
standby_node_record->node_id);
|
||||||
|
return JOIN_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i % 5 == 0)
|
||||||
|
{
|
||||||
|
log_info(_("waiting for node \"%s\" (ID: %i) to connect to new primary; %i of max %i attempts (parameter \"node_rejoin_timeout\")"),
|
||||||
|
standby_node_record->node_name,
|
||||||
|
standby_node_record->node_id,
|
||||||
|
i + 1,
|
||||||
|
config_file_options.node_rejoin_timeout);
|
||||||
|
|
||||||
|
log_detail(_("checking for record in node \"%s\"'s \"pg_stat_replication\" table where \"application_name\" is \"%s\""),
|
||||||
|
upstream_node_record->node_name,
|
||||||
|
standby_node_record->node_name);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_debug("sleeping 1 second waiting for node \"%s\" (ID: %i) to connect to new primary; %i of max %i attempts",
|
||||||
|
standby_node_record->node_name,
|
||||||
|
standby_node_record->node_id,
|
||||||
|
i + 1,
|
||||||
|
config_file_options.node_rejoin_timeout);
|
||||||
|
}
|
||||||
|
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return JOIN_FAIL_NO_REPLICATION;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Here we'll perform some timeline sanity checks to ensure the follow target
|
* Here we'll perform some timeline sanity checks to ensure the follow target
|
||||||
* can actually be followed.
|
* can actually be followed.
|
||||||
|
|||||||
Reference in New Issue
Block a user