mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-25 16:16:29 +00:00
Refactor code for executing --siblings-follow
This will enable provision of "--siblings-follow" to "repmgr standby promote"
This commit is contained in:
@@ -47,6 +47,23 @@ typedef struct TablespaceDataList
|
|||||||
} TablespaceDataList;
|
} TablespaceDataList;
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
int reachable_sibling_node_count;
|
||||||
|
int reachable_sibling_nodes_with_slot_count;
|
||||||
|
int unreachable_sibling_node_count;
|
||||||
|
int min_required_wal_senders;
|
||||||
|
int min_required_free_slots;
|
||||||
|
} SiblingNodeStats;
|
||||||
|
|
||||||
|
#define T_SIBLING_NODES_STATS_INITIALIZER { \
|
||||||
|
0, \
|
||||||
|
0, \
|
||||||
|
0, \
|
||||||
|
0, \
|
||||||
|
0 \
|
||||||
|
}
|
||||||
|
|
||||||
static PGconn *primary_conn = NULL;
|
static PGconn *primary_conn = NULL;
|
||||||
static PGconn *source_conn = NULL;
|
static PGconn *source_conn = NULL;
|
||||||
|
|
||||||
@@ -97,6 +114,8 @@ static char *make_barman_ssh_command(char *buf);
|
|||||||
|
|
||||||
static bool create_recovery_file(t_node_info *node_record, t_conninfo_param_list *recovery_conninfo, char *dest, bool as_file);
|
static bool create_recovery_file(t_node_info *node_record, t_conninfo_param_list *recovery_conninfo, char *dest, bool as_file);
|
||||||
static void write_primary_conninfo(PQExpBufferData *dest, t_conninfo_param_list *param_list);
|
static void write_primary_conninfo(PQExpBufferData *dest, t_conninfo_param_list *param_list);
|
||||||
|
static bool check_sibling_nodes(NodeInfoList *sibling_nodes, SiblingNodeStats *sibling_nodes_stats);
|
||||||
|
static void sibling_nodes_follow(t_node_info *local_node_record, NodeInfoList *sibling_nodes, SiblingNodeStats *sibling_nodes_stats);
|
||||||
|
|
||||||
static NodeStatus parse_node_status_is_shutdown_cleanly(const char *node_status_output, XLogRecPtr *checkPoint);
|
static NodeStatus parse_node_status_is_shutdown_cleanly(const char *node_status_output, XLogRecPtr *checkPoint);
|
||||||
static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold);
|
static CheckStatus parse_node_check_archiver(const char *node_check_output, int *files, int *threshold);
|
||||||
@@ -3167,19 +3186,11 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
/* store list of sibling nodes if --siblings-follow specified */
|
/* store list of sibling nodes if --siblings-follow specified */
|
||||||
NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
NodeInfoList sibling_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
||||||
int reachable_sibling_node_count = 0;
|
SiblingNodeStats sibling_nodes_stats = T_SIBLING_NODES_STATS_INITIALIZER;
|
||||||
int reachable_sibling_nodes_with_slot_count = 0;
|
|
||||||
int unreachable_sibling_node_count = 0;
|
|
||||||
|
|
||||||
/* number of free walsenders required on promotion candidate */
|
|
||||||
int min_required_wal_senders = 1;
|
|
||||||
|
|
||||||
/* this will be calculated as max_wal_senders - COUNT(*) FROM pg_stat_replication */
|
/* this will be calculated as max_wal_senders - COUNT(*) FROM pg_stat_replication */
|
||||||
int available_wal_senders = 0;
|
int available_wal_senders = 0;
|
||||||
|
|
||||||
/* number of free replication slots required on promotion candidate */
|
|
||||||
int min_required_free_slots = 0;
|
|
||||||
|
|
||||||
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
t_event_info event_info = T_EVENT_INFO_INITIALIZER;
|
||||||
|
|
||||||
/* used for handling repmgrd pause/unpause */
|
/* used for handling repmgrd pause/unpause */
|
||||||
@@ -3187,6 +3198,11 @@ do_standby_switchover(void)
|
|||||||
RepmgrdInfo **repmgrd_info = NULL;
|
RepmgrdInfo **repmgrd_info = NULL;
|
||||||
int repmgrd_running_count = 0;
|
int repmgrd_running_count = 0;
|
||||||
|
|
||||||
|
/* number of free walsenders required on promotion candidate
|
||||||
|
* (at least one will be required for the demotion candidate)
|
||||||
|
*/
|
||||||
|
sibling_nodes_stats.min_required_wal_senders = 1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* SANITY CHECKS
|
* SANITY CHECKS
|
||||||
*
|
*
|
||||||
@@ -3375,7 +3391,7 @@ do_standby_switchover(void)
|
|||||||
/* keep a running total of how many nodes will require a replication slot */
|
/* keep a running total of how many nodes will require a replication slot */
|
||||||
if (remote_node_record.slot_name[0] != '\0')
|
if (remote_node_record.slot_name[0] != '\0')
|
||||||
{
|
{
|
||||||
min_required_free_slots++;
|
sibling_nodes_stats.min_required_free_slots++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -3596,130 +3612,11 @@ do_standby_switchover(void)
|
|||||||
local_node_record.upstream_node_id,
|
local_node_record.upstream_node_id,
|
||||||
&sibling_nodes);
|
&sibling_nodes);
|
||||||
|
|
||||||
if (runtime_options.siblings_follow == false)
|
if (check_sibling_nodes(&sibling_nodes, &sibling_nodes_stats) == false)
|
||||||
{
|
{
|
||||||
if (sibling_nodes.node_count > 0)
|
PQfinish(local_conn);
|
||||||
{
|
exit(ERR_BAD_CONFIG);
|
||||||
PQExpBufferData nodes;
|
|
||||||
NodeInfoListCell *cell;
|
|
||||||
|
|
||||||
initPQExpBuffer(&nodes);
|
|
||||||
|
|
||||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
|
||||||
{
|
|
||||||
appendPQExpBuffer(&nodes,
|
|
||||||
" %s (ID: %i)",
|
|
||||||
cell->node_info->node_name,
|
|
||||||
cell->node_info->node_id);
|
|
||||||
if (cell->next)
|
|
||||||
appendPQExpBufferStr(&nodes, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
|
||||||
sibling_nodes.node_count);
|
|
||||||
log_detail(_("these nodes will remain attached to the current primary:\n%s"), nodes.data);
|
|
||||||
|
|
||||||
termPQExpBuffer(&nodes);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
char host[MAXLEN] = "";
|
|
||||||
NodeInfoListCell *cell;
|
|
||||||
|
|
||||||
log_verbose(LOG_INFO, _("%i active sibling nodes found"),
|
|
||||||
sibling_nodes.node_count);
|
|
||||||
|
|
||||||
if (sibling_nodes.node_count == 0)
|
|
||||||
{
|
|
||||||
log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist"));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* include walsender for promotion candidate in total */
|
|
||||||
|
|
||||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
|
||||||
{
|
|
||||||
/* get host from node record */
|
|
||||||
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
|
||||||
r = test_ssh_connection(host, runtime_options.remote_user);
|
|
||||||
|
|
||||||
if (r != 0)
|
|
||||||
{
|
|
||||||
cell->node_info->reachable = false;
|
|
||||||
unreachable_sibling_node_count++;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
cell->node_info->reachable = true;
|
|
||||||
reachable_sibling_node_count++;
|
|
||||||
min_required_wal_senders++;
|
|
||||||
|
|
||||||
if (cell->node_info->slot_name[0] != '\0')
|
|
||||||
{
|
|
||||||
reachable_sibling_nodes_with_slot_count++;
|
|
||||||
min_required_free_slots++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (unreachable_sibling_node_count > 0)
|
|
||||||
{
|
|
||||||
if (runtime_options.force == false)
|
|
||||||
{
|
|
||||||
log_error(_("%i of %i sibling nodes unreachable via SSH:"),
|
|
||||||
unreachable_sibling_node_count,
|
|
||||||
sibling_nodes.node_count);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_warning(_("%i of %i sibling nodes unreachable via SSH:"),
|
|
||||||
unreachable_sibling_node_count,
|
|
||||||
sibling_nodes.node_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* display list of unreachable sibling nodes */
|
|
||||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
|
||||||
{
|
|
||||||
if (cell->node_info->reachable == true)
|
|
||||||
continue;
|
|
||||||
log_detail(" \"%s\" (ID: %i)",
|
|
||||||
cell->node_info->node_name,
|
|
||||||
cell->node_info->node_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (runtime_options.force == false)
|
|
||||||
{
|
|
||||||
log_hint(_("use -F/--force to proceed in any case"));
|
|
||||||
PQfinish(local_conn);
|
|
||||||
exit(ERR_BAD_CONFIG);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (runtime_options.dry_run == true)
|
|
||||||
{
|
|
||||||
log_detail(_("F/--force specified, would proceed anyway"));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_detail(_("F/--force specified, proceeding anyway"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
char *msg = _("all sibling nodes are reachable via SSH");
|
|
||||||
|
|
||||||
if (runtime_options.dry_run == true)
|
|
||||||
{
|
|
||||||
log_info("%s", msg);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_verbose(LOG_INFO, "%s", msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* check there are sufficient free walsenders - obviously there's potential
|
* check there are sufficient free walsenders - obviously there's potential
|
||||||
@@ -3729,13 +3626,13 @@ do_standby_switchover(void)
|
|||||||
* performing a switchover in such an unstable environment, they only have
|
* performing a switchover in such an unstable environment, they only have
|
||||||
* themselves to blame).
|
* themselves to blame).
|
||||||
*/
|
*/
|
||||||
if (available_wal_senders < min_required_wal_senders)
|
if (available_wal_senders < sibling_nodes_stats.min_required_wal_senders)
|
||||||
{
|
{
|
||||||
if (runtime_options.force == false || runtime_options.dry_run == true)
|
if (runtime_options.force == false || runtime_options.dry_run == true)
|
||||||
{
|
{
|
||||||
log_error(_("insufficient free walsenders on promotion candidate"));
|
log_error(_("insufficient free walsenders on promotion candidate"));
|
||||||
log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"),
|
log_detail(_("at least %i walsenders required but only %i free walsenders on promotion candidate"),
|
||||||
min_required_wal_senders,
|
sibling_nodes_stats.min_required_wal_senders,
|
||||||
available_wal_senders);
|
available_wal_senders);
|
||||||
log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case"));
|
log_hint(_("increase parameter \"max_wal_senders\" or use -F/--force to proceed in any case"));
|
||||||
|
|
||||||
@@ -3749,7 +3646,7 @@ do_standby_switchover(void)
|
|||||||
{
|
{
|
||||||
log_warning(_("insufficient free walsenders on promotion candidate"));
|
log_warning(_("insufficient free walsenders on promotion candidate"));
|
||||||
log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"),
|
log_detail(_("at least %i walsenders required but only %i free walsender(s) on promotion candidate"),
|
||||||
min_required_wal_senders,
|
sibling_nodes_stats.min_required_wal_senders,
|
||||||
available_wal_senders);
|
available_wal_senders);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3758,7 +3655,7 @@ do_standby_switchover(void)
|
|||||||
if (runtime_options.dry_run == true)
|
if (runtime_options.dry_run == true)
|
||||||
{
|
{
|
||||||
log_info(_("%i walsenders required, %i available"),
|
log_info(_("%i walsenders required, %i available"),
|
||||||
min_required_wal_senders,
|
sibling_nodes_stats.min_required_wal_senders,
|
||||||
available_wal_senders);
|
available_wal_senders);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -4014,23 +3911,23 @@ do_standby_switchover(void)
|
|||||||
* check the promotion candidate has sufficient free slots
|
* check the promotion candidate has sufficient free slots
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (min_required_free_slots > 0 )
|
if (sibling_nodes_stats.min_required_free_slots > 0 )
|
||||||
{
|
{
|
||||||
int available_slots = local_node_record.max_replication_slots -
|
int available_slots = local_node_record.max_replication_slots -
|
||||||
local_node_record.active_replication_slots;
|
local_node_record.active_replication_slots;
|
||||||
|
|
||||||
log_debug("minimum of %i free slots (%i for siblings) required; %i available",
|
log_debug("minimum of %i free slots (%i for siblings) required; %i available",
|
||||||
min_required_free_slots,
|
sibling_nodes_stats.min_required_free_slots,
|
||||||
reachable_sibling_nodes_with_slot_count,
|
sibling_nodes_stats.reachable_sibling_nodes_with_slot_count,
|
||||||
available_slots);
|
available_slots);
|
||||||
|
|
||||||
if (available_slots < min_required_free_slots)
|
if (available_slots < sibling_nodes_stats.min_required_free_slots)
|
||||||
{
|
{
|
||||||
if (runtime_options.force == false || runtime_options.dry_run == true)
|
if (runtime_options.force == false || runtime_options.dry_run == true)
|
||||||
{
|
{
|
||||||
log_error(_("insufficient free replication slots to attach all nodes"));
|
log_error(_("insufficient free replication slots to attach all nodes"));
|
||||||
log_detail(_("at least %i additional replication slots required but only %i free slots available on promotion candidate"),
|
log_detail(_("at least %i additional replication slots required but only %i free slots available on promotion candidate"),
|
||||||
min_required_free_slots,
|
sibling_nodes_stats.min_required_free_slots,
|
||||||
available_slots);
|
available_slots);
|
||||||
log_hint(_("increase parameter \"max_replication_slots\" or use -F/--force to proceed in any case"));
|
log_hint(_("increase parameter \"max_replication_slots\" or use -F/--force to proceed in any case"));
|
||||||
|
|
||||||
@@ -4046,7 +3943,7 @@ do_standby_switchover(void)
|
|||||||
if (runtime_options.dry_run == true)
|
if (runtime_options.dry_run == true)
|
||||||
{
|
{
|
||||||
log_info(_("%i replication slots required, %i available"),
|
log_info(_("%i replication slots required, %i available"),
|
||||||
min_required_free_slots,
|
sibling_nodes_stats.min_required_free_slots,
|
||||||
available_slots);
|
available_slots);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -4645,104 +4542,7 @@ do_standby_switchover(void)
|
|||||||
*/
|
*/
|
||||||
if (runtime_options.siblings_follow == true && sibling_nodes.node_count > 0)
|
if (runtime_options.siblings_follow == true && sibling_nodes.node_count > 0)
|
||||||
{
|
{
|
||||||
int failed_follow_count = 0;
|
sibling_nodes_follow(&local_node_record, &sibling_nodes, &sibling_nodes_stats);
|
||||||
char host[MAXLEN] = "";
|
|
||||||
NodeInfoListCell *cell = NULL;
|
|
||||||
|
|
||||||
log_notice(_("executing STANDBY FOLLOW on %i of %i siblings"),
|
|
||||||
sibling_nodes.node_count - unreachable_sibling_node_count,
|
|
||||||
sibling_nodes.node_count);
|
|
||||||
|
|
||||||
for (cell = sibling_nodes.head; cell; cell = cell->next)
|
|
||||||
{
|
|
||||||
bool success = false;
|
|
||||||
t_node_info sibling_node_record = T_NODE_INFO_INITIALIZER;
|
|
||||||
|
|
||||||
/* skip nodes previously determined as unreachable */
|
|
||||||
if (cell->node_info->reachable == false)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
record_status = get_node_record(local_conn,
|
|
||||||
cell->node_info->node_id,
|
|
||||||
&sibling_node_record);
|
|
||||||
|
|
||||||
initPQExpBuffer(&remote_command_str);
|
|
||||||
make_remote_repmgr_path(&remote_command_str, &sibling_node_record);
|
|
||||||
|
|
||||||
if (sibling_node_record.type == WITNESS)
|
|
||||||
{
|
|
||||||
PGconn *witness_conn = NULL;
|
|
||||||
|
|
||||||
/* TODO: create "repmgr witness resync" or similar */
|
|
||||||
appendPQExpBuffer(&remote_command_str,
|
|
||||||
"witness register -d \\'%s\\' --force 2>/dev/null && echo \"1\" || echo \"0\"",
|
|
||||||
local_node_record.conninfo);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Notify the witness repmgrd about the new primary, as at this point it will be assuming
|
|
||||||
* a failover situation is in place. It will detect the new primary at some point, this
|
|
||||||
* just speeds up the process.
|
|
||||||
*
|
|
||||||
* In the unlikely event repmgrd is not running or not in use, this will have no effect.
|
|
||||||
*/
|
|
||||||
witness_conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
|
||||||
|
|
||||||
if (PQstatus(witness_conn) == CONNECTION_OK)
|
|
||||||
{
|
|
||||||
notify_follow_primary(witness_conn, local_node_record.node_id);
|
|
||||||
}
|
|
||||||
PQfinish(witness_conn);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
appendPQExpBufferStr(&remote_command_str,
|
|
||||||
"standby follow 2>/dev/null && echo \"1\" || echo \"0\"");
|
|
||||||
}
|
|
||||||
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
|
||||||
log_debug("executing:\n %s", remote_command_str.data);
|
|
||||||
|
|
||||||
initPQExpBuffer(&command_output);
|
|
||||||
|
|
||||||
success = remote_command(host,
|
|
||||||
runtime_options.remote_user,
|
|
||||||
remote_command_str.data,
|
|
||||||
config_file_options.ssh_options,
|
|
||||||
&command_output);
|
|
||||||
|
|
||||||
termPQExpBuffer(&remote_command_str);
|
|
||||||
|
|
||||||
if (success == false || command_output.data[0] == '0')
|
|
||||||
{
|
|
||||||
if (sibling_node_record.type == WITNESS)
|
|
||||||
{
|
|
||||||
log_warning(_("WITNESS REGISTER failed on node \"%s\""),
|
|
||||||
cell->node_info->node_name);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_warning(_("STANDBY FOLLOW failed on node \"%s\""),
|
|
||||||
cell->node_info->node_name);
|
|
||||||
}
|
|
||||||
failed_follow_count++;
|
|
||||||
}
|
|
||||||
|
|
||||||
termPQExpBuffer(&command_output);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (failed_follow_count == 0)
|
|
||||||
{
|
|
||||||
log_info(_("STANDBY FOLLOW successfully executed on all reachable sibling nodes"));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
log_warning(_("execution of STANDBY FOLLOW failed on %i sibling nodes"),
|
|
||||||
failed_follow_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* TODO: double-check all expected nodes are in pg_stat_replication
|
|
||||||
* and entries in repmgr.nodes match
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
clear_node_info_list(&sibling_nodes);
|
clear_node_info_list(&sibling_nodes);
|
||||||
@@ -7175,6 +6975,241 @@ write_primary_conninfo(PQExpBufferData *dest, t_conninfo_param_list *param_list)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool
|
||||||
|
check_sibling_nodes(NodeInfoList *sibling_nodes, SiblingNodeStats *sibling_nodes_stats)
|
||||||
|
{
|
||||||
|
char host[MAXLEN] = "";
|
||||||
|
NodeInfoListCell *cell;
|
||||||
|
int r;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If --siblings-follow not specified, warn about any extant
|
||||||
|
* siblings which will not follow the new primary
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (runtime_options.siblings_follow == false)
|
||||||
|
{
|
||||||
|
if (sibling_nodes->node_count > 0)
|
||||||
|
{
|
||||||
|
PQExpBufferData nodes;
|
||||||
|
NodeInfoListCell *cell;
|
||||||
|
|
||||||
|
initPQExpBuffer(&nodes);
|
||||||
|
|
||||||
|
for (cell = sibling_nodes->head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&nodes,
|
||||||
|
" %s (node ID: %i)",
|
||||||
|
cell->node_info->node_name,
|
||||||
|
cell->node_info->node_id);
|
||||||
|
if (cell->next)
|
||||||
|
appendPQExpBufferStr(&nodes, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warning(_("%i sibling nodes found, but option \"--siblings-follow\" not specified"),
|
||||||
|
sibling_nodes->node_count);
|
||||||
|
log_detail(_("these nodes will remain attached to the current primary:\n%s"), nodes.data);
|
||||||
|
|
||||||
|
termPQExpBuffer(&nodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
log_verbose(LOG_INFO, _("%i active sibling nodes found"),
|
||||||
|
sibling_nodes->node_count);
|
||||||
|
|
||||||
|
if (sibling_nodes->node_count == 0)
|
||||||
|
{
|
||||||
|
log_warning(_("option \"--sibling-nodes\" specified, but no sibling nodes exist"));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (cell = sibling_nodes->head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
/* get host from node record */
|
||||||
|
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
||||||
|
r = test_ssh_connection(host, runtime_options.remote_user);
|
||||||
|
|
||||||
|
if (r != 0)
|
||||||
|
{
|
||||||
|
cell->node_info->reachable = false;
|
||||||
|
sibling_nodes_stats->unreachable_sibling_node_count++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cell->node_info->reachable = true;
|
||||||
|
sibling_nodes_stats->reachable_sibling_node_count++;
|
||||||
|
sibling_nodes_stats->min_required_wal_senders++;
|
||||||
|
|
||||||
|
if (cell->node_info->slot_name[0] != '\0')
|
||||||
|
{
|
||||||
|
sibling_nodes_stats->reachable_sibling_nodes_with_slot_count++;
|
||||||
|
sibling_nodes_stats->min_required_free_slots++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sibling_nodes_stats->unreachable_sibling_node_count > 0)
|
||||||
|
{
|
||||||
|
if (runtime_options.force == false)
|
||||||
|
{
|
||||||
|
log_error(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||||
|
sibling_nodes_stats->unreachable_sibling_node_count,
|
||||||
|
sibling_nodes->node_count);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_warning(_("%i of %i sibling nodes unreachable via SSH:"),
|
||||||
|
sibling_nodes_stats->unreachable_sibling_node_count,
|
||||||
|
sibling_nodes->node_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* display list of unreachable sibling nodes */
|
||||||
|
for (cell = sibling_nodes->head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
if (cell->node_info->reachable == true)
|
||||||
|
continue;
|
||||||
|
log_detail(" %s (ID: %i)",
|
||||||
|
cell->node_info->node_name,
|
||||||
|
cell->node_info->node_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (runtime_options.force == false)
|
||||||
|
{
|
||||||
|
log_hint(_("use -F/--force to proceed in any case"));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
log_detail(_("F/--force specified, would proceed anyway"));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_detail(_("F/--force specified, proceeding anyway"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
char *msg = _("all sibling nodes are reachable via SSH");
|
||||||
|
|
||||||
|
if (runtime_options.dry_run == true)
|
||||||
|
{
|
||||||
|
log_info("%s", msg);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_verbose(LOG_INFO, "%s", msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
sibling_nodes_follow(t_node_info *local_node_record, NodeInfoList *sibling_nodes, SiblingNodeStats *sibling_nodes_stats)
|
||||||
|
{
|
||||||
|
int failed_follow_count = 0;
|
||||||
|
char host[MAXLEN] = "";
|
||||||
|
NodeInfoListCell *cell = NULL;
|
||||||
|
PQExpBufferData remote_command_str;
|
||||||
|
PQExpBufferData command_output;
|
||||||
|
|
||||||
|
log_notice(_("executing STANDBY FOLLOW on %i of %i siblings"),
|
||||||
|
sibling_nodes->node_count - sibling_nodes_stats->unreachable_sibling_node_count,
|
||||||
|
sibling_nodes->node_count);
|
||||||
|
|
||||||
|
for (cell = sibling_nodes->head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
bool success = false;
|
||||||
|
|
||||||
|
/* skip nodes previously determined as unreachable */
|
||||||
|
if (cell->node_info->reachable == false)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
initPQExpBuffer(&remote_command_str);
|
||||||
|
make_remote_repmgr_path(&remote_command_str, cell->node_info);
|
||||||
|
|
||||||
|
if (cell->node_info->type == WITNESS)
|
||||||
|
{
|
||||||
|
PGconn *witness_conn = NULL;
|
||||||
|
|
||||||
|
/* TODO: create "repmgr witness resync" or similar */
|
||||||
|
appendPQExpBuffer(&remote_command_str,
|
||||||
|
"witness register -d \\'%s\\' --force 2>/dev/null && echo \"1\" || echo \"0\"",
|
||||||
|
local_node_record->conninfo);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Notify the witness repmgrd about the new primary, as at this point it will be assuming
|
||||||
|
* a failover situation is in place. It will detect the new primary at some point, this
|
||||||
|
* just speeds up the process.
|
||||||
|
*
|
||||||
|
* In the unlikely event repmgrd is not running or not in use, this will have no effect.
|
||||||
|
*/
|
||||||
|
witness_conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||||
|
|
||||||
|
if (PQstatus(witness_conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
notify_follow_primary(witness_conn, local_node_record->node_id);
|
||||||
|
}
|
||||||
|
PQfinish(witness_conn);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&remote_command_str,
|
||||||
|
"standby follow 2>/dev/null && echo \"1\" || echo \"0\"");
|
||||||
|
}
|
||||||
|
get_conninfo_value(cell->node_info->conninfo, "host", host);
|
||||||
|
log_debug("executing:\n %s", remote_command_str.data);
|
||||||
|
|
||||||
|
initPQExpBuffer(&command_output);
|
||||||
|
|
||||||
|
success = remote_command(host,
|
||||||
|
runtime_options.remote_user,
|
||||||
|
remote_command_str.data,
|
||||||
|
config_file_options.ssh_options,
|
||||||
|
&command_output);
|
||||||
|
|
||||||
|
termPQExpBuffer(&remote_command_str);
|
||||||
|
|
||||||
|
if (success == false || command_output.data[0] == '0')
|
||||||
|
{
|
||||||
|
if (cell->node_info->type == WITNESS)
|
||||||
|
{
|
||||||
|
log_warning(_("WITNESS REGISTER failed on node \"%s\""),
|
||||||
|
cell->node_info->node_name);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_warning(_("STANDBY FOLLOW failed on node \"%s\""),
|
||||||
|
cell->node_info->node_name);
|
||||||
|
}
|
||||||
|
failed_follow_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
termPQExpBuffer(&command_output);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (failed_follow_count == 0)
|
||||||
|
{
|
||||||
|
log_info(_("STANDBY FOLLOW successfully executed on all reachable sibling nodes"));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log_warning(_("execution of STANDBY FOLLOW failed on %i sibling nodes"),
|
||||||
|
failed_follow_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* TODO: double-check all expected nodes are in pg_stat_replication
|
||||||
|
* and entries in repmgr.nodes match
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static NodeStatus
|
static NodeStatus
|
||||||
parse_node_status_is_shutdown_cleanly(const char *node_status_output, XLogRecPtr *checkPoint)
|
parse_node_status_is_shutdown_cleanly(const char *node_status_output, XLogRecPtr *checkPoint)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user