mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-28 17:36:30 +00:00
Issue #829 was opened indicating, with a reproducible example, that repmgr was not able to run `node rejoin` with a minimally privileged user. The main obstacle is that pg_rewind is not able to execute the rewind operation if the user has REPLICATION privileges, but the user repmgr uses requires REPLICATION. This is a typical catch-22. The solution provided here adds a --superuser option, similar to what other commands have, to `node rejoin`. AI-assisted development notes: The approach was designed and directed by Martín Marqués, who also reviewed and refined the output. Code was written by Claude (AI), with some additions from Martín in the documentation. Fixes #829 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Martín Marqués <martin.marques@enterprisedb.com>
3741 lines
91 KiB
C
3741 lines
91 KiB
C
/*
|
|
* repmgr-action-node.c
|
|
*
|
|
* Implements actions available for any kind of node
|
|
*
|
|
* Copyright (c) EnterpriseDB Corporation, 2010-2021
|
|
*
|
|
* This program is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include <sys/stat.h>
|
|
#include <dirent.h>
|
|
|
|
#include "repmgr.h"
|
|
#include "controldata.h"
|
|
#include "dirutil.h"
|
|
#include "dbutils.h"
|
|
#include "compat.h"
|
|
|
|
#include "repmgr-client-global.h"
|
|
#include "repmgr-action-node.h"
|
|
#include "repmgr-action-standby.h"
|
|
|
|
/*
 * Forward declarations of file-local (static) helpers.
 *
 * Grouping, in order: general utilities, the --optformat error exit,
 * private "_do_node_*" sub-actions, and the individual "node check" tests.
 * Each do_node_check_*() test that takes a CheckStatusList returns a
 * CheckStatus; do_node_check() passes NULL for the list when a single
 * check is requested and a real list when building the overview.
 * NOTE(review): definitions not visible in this part of the file
 * presumably follow further down - confirm.
 */
static bool copy_file(const char *src_file, const char *dest_file);
|
|
static void format_archive_dir(PQExpBufferData *archive_dir);
|
|
static t_server_action parse_server_action(const char *action);
|
|
static const char *output_repmgrd_status(CheckStatus status);
|
|
|
|
static void exit_optformat_error(const char *error, int errcode);
|
|
|
|
static void _do_node_service_list_actions(t_server_action action);
|
|
static void _do_node_status_is_shutdown_cleanly(void);
|
|
static void _do_node_archive_config(void);
|
|
static void _do_node_restore_config(void);
|
|
|
|
static void do_node_check_replication_connection(void);
|
|
static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_repmgrd(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
|
|
static CheckStatus do_node_check_db_connection(PGconn *conn, OutputMode mode);
|
|
|
|
|
|
/*
|
|
* NODE STATUS
|
|
*
|
|
* Can only be run on the local node, as it needs to be able to
|
|
* read the data directory.
|
|
*
|
|
* Parameters:
|
|
* --is-shutdown-cleanly (for internal use only)
|
|
* --csv
|
|
*/
|
|
|
|
void
|
|
do_node_status(void)
|
|
{
|
|
PGconn *conn = NULL;
|
|
|
|
t_node_info node_info = T_NODE_INFO_INITIALIZER;
|
|
char cluster_size[MAXLEN];
|
|
PQExpBufferData output;
|
|
|
|
KeyValueList node_status = {NULL, NULL};
|
|
KeyValueListCell *cell = NULL;
|
|
NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
|
|
|
|
ItemList warnings = {NULL, NULL};
|
|
RecoveryType recovery_type = RECTYPE_UNKNOWN;
|
|
ReplInfo replication_info;
|
|
t_recovery_conf recovery_conf = T_RECOVERY_CONF_INITIALIZER;
|
|
|
|
char data_dir[MAXPGPATH] = "";
|
|
char server_version_str[MAXVERSIONSTR] = "";
|
|
|
|
/*
|
|
* A database connection is *not* required for this check
|
|
*/
|
|
if (runtime_options.is_shutdown_cleanly == true)
|
|
{
|
|
return _do_node_status_is_shutdown_cleanly();
|
|
}
|
|
|
|
init_replication_info(&replication_info);
|
|
|
|
|
|
/* config file required, so we should have "conninfo" and "data_directory" */
|
|
conn = establish_db_connection(config_file_options.conninfo, true);
|
|
strncpy(data_dir, config_file_options.data_directory, MAXPGPATH);
|
|
|
|
(void)get_server_version(conn, server_version_str);
|
|
|
|
/* check node exists */
|
|
|
|
if (get_node_record_with_upstream(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
|
|
{
|
|
log_error(_("no record found for node %i"), config_file_options.node_id);
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
if (get_cluster_size(conn, cluster_size) == false)
|
|
strncpy(cluster_size, _("unknown"), MAXLEN);
|
|
|
|
recovery_type = get_recovery_type(conn);
|
|
|
|
get_node_replication_stats(conn, &node_info);
|
|
|
|
key_value_list_set(&node_status,
|
|
"PostgreSQL version",
|
|
server_version_str);
|
|
|
|
key_value_list_set(&node_status,
|
|
"Total data size",
|
|
cluster_size);
|
|
|
|
key_value_list_set(&node_status,
|
|
"Conninfo",
|
|
node_info.conninfo);
|
|
|
|
if (runtime_options.verbose == true)
|
|
{
|
|
uint64 local_system_identifier = get_system_identifier(config_file_options.data_directory);
|
|
|
|
if (local_system_identifier == UNKNOWN_SYSTEM_IDENTIFIER)
|
|
{
|
|
key_value_list_set(&node_status,
|
|
"System identifier",
|
|
"unknown");
|
|
item_list_append_format(&warnings,
|
|
_("unable to retrieve system identifier from pg_control"));
|
|
}
|
|
else
|
|
{
|
|
key_value_list_set_format(&node_status,
|
|
"System identifier",
|
|
"%lu", local_system_identifier);
|
|
}
|
|
}
|
|
|
|
key_value_list_set(&node_status,
|
|
"Role",
|
|
get_node_type_string(node_info.type));
|
|
|
|
switch (node_info.type)
|
|
{
|
|
case PRIMARY:
|
|
if (recovery_type == RECTYPE_STANDBY)
|
|
{
|
|
item_list_append(&warnings,
|
|
_("- node is registered as primary but running as standby"));
|
|
}
|
|
break;
|
|
case STANDBY:
|
|
if (recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
item_list_append(&warnings,
|
|
_("- node is registered as standby but running as primary"));
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (guc_set(conn, "archive_mode", "=", "off"))
|
|
{
|
|
key_value_list_set(&node_status,
|
|
"WAL archiving",
|
|
"off");
|
|
|
|
key_value_list_set(&node_status,
|
|
"Archive command",
|
|
"(none)");
|
|
}
|
|
else
|
|
{
|
|
/* "archive_mode" is not "off", i.e. one of "on", "always" */
|
|
bool enabled = true;
|
|
PQExpBufferData archiving_status;
|
|
char archive_command[MAXLEN] = "";
|
|
|
|
initPQExpBuffer(&archiving_status);
|
|
|
|
/*
|
|
* if the node is a standby, and "archive_mode" is "on", archiving will
|
|
* actually be disabled.
|
|
*/
|
|
if (recovery_type == RECTYPE_STANDBY)
|
|
{
|
|
if (guc_set(conn, "archive_mode", "=", "on"))
|
|
enabled = false;
|
|
}
|
|
|
|
if (enabled == true)
|
|
{
|
|
appendPQExpBufferStr(&archiving_status, "enabled");
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&archiving_status, "disabled");
|
|
}
|
|
|
|
if (enabled == false && recovery_type == RECTYPE_STANDBY)
|
|
{
|
|
if (PQserverVersion(conn) >= 90500)
|
|
{
|
|
appendPQExpBufferStr(&archiving_status,
|
|
" (on standbys \"archive_mode\" must be set to \"always\" to be effective)");
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&archiving_status,
|
|
" (\"archive_mode\" has no effect on standbys)");
|
|
}
|
|
}
|
|
|
|
key_value_list_set(&node_status,
|
|
"WAL archiving",
|
|
archiving_status.data);
|
|
|
|
termPQExpBuffer(&archiving_status);
|
|
|
|
get_pg_setting(conn, "archive_command", archive_command);
|
|
|
|
key_value_list_set(&node_status,
|
|
"Archive command",
|
|
archive_command);
|
|
}
|
|
|
|
{
|
|
int ready_files;
|
|
|
|
ready_files = get_ready_archive_files(conn, data_dir);
|
|
|
|
if (ready_files == ARCHIVE_STATUS_DIR_ERROR)
|
|
{
|
|
item_list_append_format(&warnings,
|
|
"- unable to check archive_status directory\n");
|
|
}
|
|
else
|
|
{
|
|
if (runtime_options.output_mode == OM_CSV)
|
|
{
|
|
key_value_list_set_format(&node_status,
|
|
"WALs pending archiving",
|
|
"%i",
|
|
ready_files);
|
|
}
|
|
else
|
|
{
|
|
key_value_list_set_format(&node_status,
|
|
"WALs pending archiving",
|
|
"%i pending files",
|
|
ready_files);
|
|
}
|
|
}
|
|
|
|
if (guc_set(conn, "archive_mode", "=", "off"))
|
|
{
|
|
key_value_list_set_output_mode(&node_status, "WALs pending archiving", OM_CSV);
|
|
}
|
|
|
|
}
|
|
|
|
|
|
if (node_info.max_wal_senders >= 0)
|
|
{
|
|
/* In CSV mode, raw values supplied as well */
|
|
key_value_list_set_format(&node_status,
|
|
"Replication connections",
|
|
"%i (of maximal %i)",
|
|
node_info.attached_wal_receivers,
|
|
node_info.max_wal_senders);
|
|
}
|
|
else if (node_info.max_wal_senders == 0)
|
|
{
|
|
key_value_list_set_format(&node_status,
|
|
"Replication connections",
|
|
"disabled");
|
|
}
|
|
|
|
/* check for attached nodes */
|
|
{
|
|
NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
|
NodeInfoListCell *node_cell = NULL;
|
|
ItemList missing_nodes = {NULL, NULL};
|
|
int missing_nodes_count = 0;
|
|
int expected_nodes_count = 0;
|
|
|
|
get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes);
|
|
|
|
/* if a witness node is present, we'll need to remove this from the total */
|
|
expected_nodes_count = downstream_nodes.node_count;
|
|
|
|
for (node_cell = downstream_nodes.head; node_cell; node_cell = node_cell->next)
|
|
{
|
|
/* skip witness server */
|
|
if (node_cell->node_info->type == WITNESS)
|
|
{
|
|
expected_nodes_count --;
|
|
continue;
|
|
}
|
|
|
|
if (is_downstream_node_attached(conn, node_cell->node_info->node_name, NULL) != NODE_ATTACHED)
|
|
{
|
|
missing_nodes_count++;
|
|
item_list_append_format(&missing_nodes,
|
|
"%s (ID: %i)",
|
|
node_cell->node_info->node_name,
|
|
node_cell->node_info->node_id);
|
|
}
|
|
}
|
|
|
|
if (missing_nodes_count)
|
|
{
|
|
ItemListCell *missing_cell = NULL;
|
|
|
|
item_list_append_format(&warnings,
|
|
_("- %i of %i downstream nodes not attached:"),
|
|
missing_nodes_count,
|
|
expected_nodes_count);
|
|
|
|
for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
|
|
{
|
|
item_list_append_format(&warnings,
|
|
" - %s\n", missing_cell->string);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (node_info.max_replication_slots == 0)
|
|
{
|
|
key_value_list_set(&node_status,
|
|
"Replication slots",
|
|
"disabled");
|
|
}
|
|
else
|
|
{
|
|
PQExpBufferData slotinfo;
|
|
|
|
/*
|
|
* check for missing replication slots - we do this regardless of
|
|
* what "max_replication_slots" is set to, in case the downstream
|
|
* node was configured with "use_replication_slots=true" and is
|
|
* expecting a replication slot to be available
|
|
*/
|
|
get_downstream_nodes_with_missing_slot(conn,
|
|
config_file_options.node_id,
|
|
&missing_slots);
|
|
|
|
if (missing_slots.node_count > 0)
|
|
{
|
|
NodeInfoListCell *missing_slot_cell = NULL;
|
|
|
|
item_list_append_format(&warnings,
|
|
_("- replication slots missing for following %i node(s):"),
|
|
missing_slots.node_count);
|
|
|
|
for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
|
|
{
|
|
item_list_append_format(&warnings,
|
|
_(" - %s (ID: %i, slot name: \"%s\")"),
|
|
missing_slot_cell->node_info->node_name,
|
|
missing_slot_cell->node_info->node_id,
|
|
missing_slot_cell->node_info->slot_name);
|
|
}
|
|
}
|
|
|
|
initPQExpBuffer(&slotinfo);
|
|
|
|
appendPQExpBuffer(&slotinfo,
|
|
"%i physical (of maximal %i; %i missing)",
|
|
node_info.active_replication_slots + node_info.inactive_replication_slots,
|
|
node_info.max_replication_slots,
|
|
missing_slots.node_count);
|
|
|
|
if (node_info.inactive_replication_slots > 0)
|
|
{
|
|
KeyValueList inactive_replication_slots = {NULL, NULL};
|
|
|
|
(void) get_inactive_replication_slots(conn, &inactive_replication_slots);
|
|
|
|
appendPQExpBuffer(&slotinfo,
|
|
"; %i inactive",
|
|
node_info.inactive_replication_slots);
|
|
|
|
item_list_append_format(&warnings,
|
|
_("- node has %i inactive physical replication slots"),
|
|
node_info.inactive_replication_slots);
|
|
|
|
for (cell = inactive_replication_slots.head; cell; cell = cell->next)
|
|
{
|
|
item_list_append_format(&warnings,
|
|
" - %s", cell->key);
|
|
}
|
|
|
|
key_value_list_free(&inactive_replication_slots);
|
|
}
|
|
|
|
key_value_list_set(&node_status,
|
|
"Replication slots",
|
|
slotinfo.data);
|
|
|
|
termPQExpBuffer(&slotinfo);
|
|
}
|
|
|
|
|
|
if (node_info.type == STANDBY)
|
|
{
|
|
key_value_list_set_format(&node_status,
|
|
"Upstream node",
|
|
"%s (ID: %i)",
|
|
node_info.upstream_node_name,
|
|
node_info.upstream_node_id);
|
|
|
|
get_replication_info(conn, node_info.type, &replication_info);
|
|
|
|
key_value_list_set_format(&node_status,
|
|
"Replication lag",
|
|
"%i seconds",
|
|
replication_info.replication_lag_time);
|
|
|
|
key_value_list_set_format(&node_status,
|
|
"Last received LSN",
|
|
"%X/%X", format_lsn(replication_info.last_wal_receive_lsn));
|
|
|
|
key_value_list_set_format(&node_status,
|
|
"Last replayed LSN",
|
|
"%X/%X", format_lsn(replication_info.last_wal_replay_lsn));
|
|
}
|
|
else
|
|
{
|
|
key_value_list_set(&node_status,
|
|
"Upstream node",
|
|
"(none)");
|
|
key_value_list_set_output_mode(&node_status,
|
|
"Upstream node",
|
|
OM_CSV);
|
|
|
|
key_value_list_set(&node_status,
|
|
"Replication lag",
|
|
"n/a");
|
|
|
|
key_value_list_set(&node_status,
|
|
"Last received LSN",
|
|
"(none)");
|
|
|
|
key_value_list_set_output_mode(&node_status,
|
|
"Last received LSN",
|
|
OM_CSV);
|
|
|
|
key_value_list_set(&node_status,
|
|
"Last replayed LSN",
|
|
"(none)");
|
|
|
|
key_value_list_set_output_mode(&node_status,
|
|
"Last replayed LSN",
|
|
OM_CSV);
|
|
}
|
|
|
|
|
|
parse_recovery_conf(data_dir, &recovery_conf);
|
|
|
|
/* format output */
|
|
initPQExpBuffer(&output);
|
|
|
|
if (runtime_options.output_mode == OM_CSV)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
"\"Node name\",\"%s\"\n",
|
|
node_info.node_name);
|
|
|
|
appendPQExpBuffer(&output,
|
|
"\"Node ID\",\"%i\"\n",
|
|
node_info.node_id);
|
|
|
|
for (cell = node_status.head; cell; cell = cell->next)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
"\"%s\",\"%s\"\n",
|
|
cell->key, cell->value);
|
|
}
|
|
|
|
/* we'll add the raw data as well */
|
|
appendPQExpBuffer(&output,
|
|
"\"max_wal_senders\",%i\n",
|
|
node_info.max_wal_senders);
|
|
|
|
appendPQExpBuffer(&output,
|
|
"\"occupied_wal_senders\",%i\n",
|
|
node_info.attached_wal_receivers);
|
|
|
|
appendPQExpBuffer(&output,
|
|
"\"max_replication_slots\",%i\n",
|
|
node_info.max_replication_slots);
|
|
|
|
appendPQExpBuffer(&output,
|
|
"\"active_replication_slots\",%i\n",
|
|
node_info.active_replication_slots);
|
|
|
|
/* output inactive slot information */
|
|
appendPQExpBuffer(&output,
|
|
"\"inactive_replication_slots\",%i",
|
|
node_info.inactive_replication_slots);
|
|
|
|
if (node_info.inactive_replication_slots)
|
|
{
|
|
KeyValueList inactive_replication_slots = {NULL, NULL};
|
|
(void) get_inactive_replication_slots(conn, &inactive_replication_slots);
|
|
for (cell = inactive_replication_slots.head; cell; cell = cell->next)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
",\"%s\"", cell->key);
|
|
}
|
|
|
|
key_value_list_free(&inactive_replication_slots);
|
|
}
|
|
|
|
/* output missing slot information */
|
|
|
|
appendPQExpBufferChar(&output, '\n');
|
|
appendPQExpBuffer(&output,
|
|
"\"missing_replication_slots\",%i",
|
|
missing_slots.node_count);
|
|
|
|
if (missing_slots.node_count > 0)
|
|
{
|
|
NodeInfoListCell *missing_slot_cell = NULL;
|
|
|
|
for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
",\"%s\"", missing_slot_cell->node_info->slot_name);
|
|
}
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
"Node \"%s\":\n",
|
|
node_info.node_name);
|
|
|
|
for (cell = node_status.head; cell; cell = cell->next)
|
|
{
|
|
if (cell->output_mode == OM_NOT_SET)
|
|
appendPQExpBuffer(&output,
|
|
"\t%s: %s\n",
|
|
cell->key, cell->value);
|
|
}
|
|
}
|
|
|
|
puts(output.data);
|
|
|
|
termPQExpBuffer(&output);
|
|
|
|
if (warnings.head != NULL && runtime_options.terse == false && runtime_options.output_mode == OM_TEXT)
|
|
{
|
|
log_warning(_("following issue(s) were detected:"));
|
|
print_item_list(&warnings);
|
|
log_hint(_("execute \"repmgr node check\" for more details"));
|
|
}
|
|
|
|
clear_node_info_list(&missing_slots);
|
|
key_value_list_free(&node_status);
|
|
item_list_free(&warnings);
|
|
PQfinish(conn);
|
|
|
|
/*
|
|
* If warnings were noted, even if they're not displayed (e.g. in --csv node),
|
|
* that means something's not right so we need to emit a non-zero exit code.
|
|
*/
|
|
if (warnings.head != NULL)
|
|
{
|
|
exit(ERR_NODE_STATUS);
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
/*
 * Returns information about the running state of the node.
 * For internal use during "standby switchover".
 *
 * Returns "longopt" output:
 *
 * --state=(RUNNING|SHUTDOWN|UNCLEAN_SHUTDOWN|UNKNOWN)
 * --last-checkpoint-lsn=...
 */
|
|
|
|
static void
|
|
_do_node_status_is_shutdown_cleanly(void)
|
|
{
|
|
PGPing ping_status;
|
|
PQExpBufferData output;
|
|
|
|
DBState db_state;
|
|
XLogRecPtr checkPoint = InvalidXLogRecPtr;
|
|
|
|
NodeStatus node_status = NODE_STATUS_UNKNOWN;
|
|
|
|
initPQExpBuffer(&output);
|
|
|
|
appendPQExpBufferStr(&output,
|
|
"--state=");
|
|
|
|
/* sanity-check we're dealing with a PostgreSQL directory */
|
|
if (is_pg_dir(config_file_options.data_directory) == false)
|
|
{
|
|
appendPQExpBufferStr(&output, "UNKNOWN");
|
|
printf("%s\n", output.data);
|
|
termPQExpBuffer(&output);
|
|
return;
|
|
}
|
|
|
|
ping_status = PQping(config_file_options.conninfo);
|
|
|
|
switch (ping_status)
|
|
{
|
|
case PQPING_OK:
|
|
node_status = NODE_STATUS_UP;
|
|
break;
|
|
case PQPING_REJECT:
|
|
node_status = NODE_STATUS_UP;
|
|
break;
|
|
case PQPING_NO_ATTEMPT:
|
|
case PQPING_NO_RESPONSE:
|
|
/* status not yet clear */
|
|
break;
|
|
}
|
|
|
|
/* check what pg_control says */
|
|
|
|
if (get_db_state(config_file_options.data_directory, &db_state) == false)
|
|
{
|
|
/*
|
|
* Unable to retrieve the database state from pg_control
|
|
*/
|
|
node_status = NODE_STATUS_UNKNOWN;
|
|
log_verbose(LOG_DEBUG, "unable to determine db state");
|
|
goto return_state;
|
|
}
|
|
|
|
log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
|
|
|
|
if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
|
|
{
|
|
if (node_status != NODE_STATUS_UP)
|
|
{
|
|
node_status = NODE_STATUS_UNCLEAN_SHUTDOWN;
|
|
}
|
|
/* server is still responding but shutting down */
|
|
else if (db_state == DB_SHUTDOWNING)
|
|
{
|
|
node_status = NODE_STATUS_SHUTTING_DOWN;
|
|
}
|
|
}
|
|
|
|
checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
|
|
|
|
if (checkPoint == InvalidXLogRecPtr)
|
|
{
|
|
/* unable to read pg_control, don't know what's happening */
|
|
node_status = NODE_STATUS_UNKNOWN;
|
|
}
|
|
else if (node_status == NODE_STATUS_UNKNOWN)
|
|
{
|
|
/*
|
|
* if still "UNKNOWN" at this point, then the node must be cleanly shut
|
|
* down
|
|
*/
|
|
node_status = NODE_STATUS_DOWN;
|
|
}
|
|
|
|
|
|
return_state:
|
|
|
|
log_verbose(LOG_DEBUG, "node status determined as: %s",
|
|
print_node_status(node_status));
|
|
|
|
appendPQExpBuffer(&output,
|
|
"%s", print_node_status(node_status));
|
|
|
|
if (node_status == NODE_STATUS_DOWN)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
" --last-checkpoint-lsn=%X/%X",
|
|
format_lsn(checkPoint));
|
|
}
|
|
|
|
printf("%s\n", output.data);
|
|
termPQExpBuffer(&output);
|
|
return;
|
|
}
|
|
|
|
static void
|
|
exit_optformat_error(const char *error, int errcode)
|
|
{
|
|
PQExpBufferData output;
|
|
|
|
Assert(runtime_options.output_mode == OM_OPTFORMAT);
|
|
|
|
initPQExpBuffer(&output);
|
|
|
|
appendPQExpBuffer(&output,
|
|
"--error=%s",
|
|
error);
|
|
|
|
printf("%s\n", output.data);
|
|
|
|
termPQExpBuffer(&output);
|
|
|
|
exit(errcode);
|
|
}
|
|
|
|
/*
|
|
* Configuration file required
|
|
*/
|
|
void
|
|
do_node_check(void)
|
|
{
|
|
PGconn *conn = NULL;
|
|
PQExpBufferData output;
|
|
|
|
t_node_info node_info = T_NODE_INFO_INITIALIZER;
|
|
|
|
CheckStatus return_code;
|
|
CheckStatusList status_list = {NULL, NULL};
|
|
CheckStatusListCell *cell = NULL;
|
|
|
|
bool issue_detected = false;
|
|
bool exit_on_connection_error = true;
|
|
|
|
/* for internal use */
|
|
if (runtime_options.has_passfile == true)
|
|
{
|
|
return_code = has_passfile() ? 0 : 1;
|
|
|
|
exit(return_code);
|
|
}
|
|
|
|
/* for use by "standby switchover" */
|
|
if (runtime_options.replication_connection == true)
|
|
{
|
|
do_node_check_replication_connection();
|
|
exit(SUCCESS);
|
|
}
|
|
|
|
if (runtime_options.db_connection == true)
|
|
{
|
|
exit_on_connection_error = false;
|
|
}
|
|
|
|
/*
|
|
* If --optformat was provided, we'll assume this is a remote invocation
|
|
* and instead of exiting with an error, we'll return an error string to
|
|
* so the remote invoker will know what's happened.
|
|
*/
|
|
if (runtime_options.output_mode == OM_OPTFORMAT)
|
|
{
|
|
exit_on_connection_error = false;
|
|
}
|
|
|
|
|
|
if (config_file_options.conninfo[0] != '\0')
|
|
{
|
|
t_conninfo_param_list node_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
|
char *errmsg = NULL;
|
|
bool parse_success = false;
|
|
|
|
initialize_conninfo_params(&node_conninfo, false);
|
|
|
|
parse_success = parse_conninfo_string(config_file_options.conninfo,
|
|
&node_conninfo,
|
|
&errmsg, false);
|
|
|
|
if (parse_success == false)
|
|
{
|
|
if (runtime_options.output_mode == OM_OPTFORMAT)
|
|
{
|
|
exit_optformat_error("CONNINFO_PARSE",
|
|
ERR_BAD_CONFIG);
|
|
}
|
|
|
|
log_error(_("unable to parse conninfo string \"%s\" for local node"),
|
|
config_file_options.conninfo);
|
|
log_detail("%s", errmsg);
|
|
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/*
|
|
* If --superuser option provided, attempt to connect as the specified user
|
|
*/
|
|
|
|
if (runtime_options.superuser[0] != '\0')
|
|
{
|
|
conn = establish_db_connection_with_replacement_param(
|
|
config_file_options.conninfo,
|
|
"user",
|
|
runtime_options.superuser,
|
|
exit_on_connection_error);
|
|
}
|
|
else
|
|
{
|
|
conn = establish_db_connection_by_params(&node_conninfo, exit_on_connection_error);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
conn = establish_db_connection_by_params(&source_conninfo, exit_on_connection_error);
|
|
}
|
|
|
|
|
|
/*
|
|
* --db-connection option provided
|
|
*/
|
|
if (runtime_options.db_connection == true)
|
|
{
|
|
return_code = do_node_check_db_connection(conn, runtime_options.output_mode);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
/*
|
|
* If we've reached here, and the connection is invalid, then --optformat was provided
|
|
*/
|
|
if (PQstatus(conn) != CONNECTION_OK)
|
|
{
|
|
exit_optformat_error("DB_CONNECTION",
|
|
ERR_DB_CONN);
|
|
}
|
|
|
|
if (get_node_record(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
|
|
{
|
|
log_error(_("no record found for node %i"), config_file_options.node_id);
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/* add replication statistics to node record */
|
|
get_node_replication_stats(conn, &node_info);
|
|
|
|
/*
|
|
* handle specific checks ======================
|
|
*/
|
|
if (runtime_options.archive_ready == true)
|
|
{
|
|
return_code = do_node_check_archive_ready(conn,
|
|
runtime_options.output_mode,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.upstream == true)
|
|
{
|
|
return_code = do_node_check_upstream(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.downstream == true)
|
|
{
|
|
return_code = do_node_check_downstream(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.replication_lag == true)
|
|
{
|
|
return_code = do_node_check_replication_lag(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.role == true)
|
|
{
|
|
return_code = do_node_check_role(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.slots == true)
|
|
{
|
|
return_code = do_node_check_slots(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.missing_slots == true)
|
|
{
|
|
return_code = do_node_check_missing_slots(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.data_directory_config == true)
|
|
{
|
|
return_code = do_node_check_data_directory(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.repmgrd == true)
|
|
{
|
|
return_code = do_node_check_repmgrd(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
if (runtime_options.replication_config_owner == true)
|
|
{
|
|
return_code = do_node_check_replication_config_owner(conn,
|
|
runtime_options.output_mode,
|
|
&node_info,
|
|
NULL);
|
|
PQfinish(conn);
|
|
exit(return_code);
|
|
}
|
|
|
|
|
|
if (runtime_options.output_mode == OM_NAGIOS)
|
|
{
|
|
log_error(_("--nagios can only be used with a specific check"));
|
|
log_hint(_("execute \"repmgr node --help\" for details"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/* output general overview */
|
|
|
|
initPQExpBuffer(&output);
|
|
|
|
/* order functions are called is also output order */
|
|
if (do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (do_node_check_upstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (do_node_check_downstream(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (do_node_check_data_directory(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
|
|
issue_detected = true;
|
|
|
|
if (runtime_options.output_mode == OM_CSV)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
"\"Node name\",\"%s\"\n",
|
|
node_info.node_name);
|
|
|
|
appendPQExpBuffer(&output,
|
|
"\"Node ID\",\"%i\"\n",
|
|
node_info.node_id);
|
|
|
|
for (cell = status_list.head; cell; cell = cell->next)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
"\"%s\",\"%s\"",
|
|
cell->item,
|
|
output_check_status(cell->status));
|
|
|
|
if (strlen(cell->details))
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
",\"%s\"",
|
|
cell->details);
|
|
}
|
|
appendPQExpBufferChar(&output, '\n');
|
|
}
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
"Node \"%s\":\n",
|
|
node_info.node_name);
|
|
|
|
for (cell = status_list.head; cell; cell = cell->next)
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
"\t%s: %s",
|
|
cell->item,
|
|
output_check_status(cell->status));
|
|
|
|
if (strlen(cell->details))
|
|
{
|
|
appendPQExpBuffer(&output,
|
|
" (%s)",
|
|
cell->details);
|
|
}
|
|
appendPQExpBufferChar(&output, '\n');
|
|
}
|
|
}
|
|
|
|
|
|
printf("%s", output.data);
|
|
termPQExpBuffer(&output);
|
|
check_status_list_free(&status_list);
|
|
|
|
PQfinish(conn);
|
|
|
|
if (issue_detected == true)
|
|
{
|
|
exit(ERR_NODE_STATUS);
|
|
}
|
|
}
|
|
|
|
|
|
static void
|
|
do_node_check_replication_connection(void)
|
|
{
|
|
PGconn *local_conn = NULL;
|
|
PGconn *repl_conn = NULL;
|
|
t_node_info node_record = T_NODE_INFO_INITIALIZER;
|
|
RecordStatus record_status = RECORD_NOT_FOUND;
|
|
PQExpBufferData output;
|
|
|
|
|
|
initPQExpBuffer(&output);
|
|
appendPQExpBufferStr(&output,
|
|
"--connection=");
|
|
|
|
if (runtime_options.remote_node_id == UNKNOWN_NODE_ID)
|
|
{
|
|
appendPQExpBufferStr(&output, "UNKNOWN");
|
|
printf("%s\n", output.data);
|
|
termPQExpBuffer(&output);
|
|
return;
|
|
}
|
|
|
|
/* retrieve remote node record from local database */
|
|
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
|
|
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
|
{
|
|
appendPQExpBufferStr(&output, "CONNECTION_ERROR");
|
|
printf("%s\n", output.data);
|
|
termPQExpBuffer(&output);
|
|
return;
|
|
}
|
|
|
|
record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record);
|
|
PQfinish(local_conn);
|
|
|
|
if (record_status != RECORD_FOUND)
|
|
{
|
|
appendPQExpBufferStr(&output, "UNKNOWN");
|
|
printf("%s\n", output.data);
|
|
termPQExpBuffer(&output);
|
|
return;
|
|
}
|
|
|
|
repl_conn = establish_replication_connection_from_conninfo(node_record.conninfo,
|
|
node_record.repluser);
|
|
|
|
if (PQstatus(repl_conn) != CONNECTION_OK)
|
|
{
|
|
appendPQExpBufferStr(&output, "BAD");
|
|
printf("%s\n", output.data);
|
|
termPQExpBuffer(&output);
|
|
return;
|
|
}
|
|
|
|
PQfinish(repl_conn);
|
|
|
|
appendPQExpBufferStr(&output, "OK");
|
|
printf("%s\n", output.data);
|
|
termPQExpBuffer(&output);
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
static CheckStatus
|
|
do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
|
|
{
|
|
int ready_archive_files = 0;
|
|
CheckStatus status = CHECK_STATUS_UNKNOWN;
|
|
PQExpBufferData details;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --archive-ready option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory);
|
|
|
|
if (ready_archive_files > config_file_options.archive_ready_critical)
|
|
{
|
|
status = CHECK_STATUS_CRITICAL;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
appendPQExpBuffer(&details,
|
|
"--files=%i --threshold=%i",
|
|
ready_archive_files, config_file_options.archive_ready_critical);
|
|
break;
|
|
case OM_NAGIOS:
|
|
appendPQExpBuffer(&details,
|
|
"%i pending archive ready files | files=%i;%i;%i",
|
|
ready_archive_files,
|
|
ready_archive_files,
|
|
config_file_options.archive_ready_warning,
|
|
config_file_options.archive_ready_critical);
|
|
break;
|
|
case OM_TEXT:
|
|
appendPQExpBuffer(&details,
|
|
"%i pending archive ready files, critical threshold: %i",
|
|
ready_archive_files, config_file_options.archive_ready_critical);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else if (ready_archive_files > config_file_options.archive_ready_warning)
|
|
{
|
|
status = CHECK_STATUS_WARNING;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
appendPQExpBuffer(&details,
|
|
"--files=%i --threshold=%i",
|
|
ready_archive_files, config_file_options.archive_ready_warning);
|
|
break;
|
|
case OM_NAGIOS:
|
|
appendPQExpBuffer(&details,
|
|
"%i pending archive ready files | files=%i;%i;%i",
|
|
ready_archive_files,
|
|
ready_archive_files,
|
|
config_file_options.archive_ready_warning,
|
|
config_file_options.archive_ready_critical);
|
|
|
|
break;
|
|
case OM_TEXT:
|
|
appendPQExpBuffer(&details,
|
|
"%i pending archive ready files (threshold: %i)",
|
|
ready_archive_files, config_file_options.archive_ready_warning);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else if (ready_archive_files < 0)
|
|
{
|
|
status = CHECK_STATUS_UNKNOWN;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
break;
|
|
case OM_NAGIOS:
|
|
case OM_TEXT:
|
|
appendPQExpBufferStr(&details,
|
|
"unable to check archive_status directory");
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
status = CHECK_STATUS_OK;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
appendPQExpBuffer(&details,
|
|
"--files=%i", ready_archive_files);
|
|
break;
|
|
case OM_NAGIOS:
|
|
appendPQExpBuffer(&details,
|
|
"%i pending archive ready files | files=%i;%i;%i",
|
|
ready_archive_files,
|
|
ready_archive_files,
|
|
config_file_options.archive_ready_warning,
|
|
config_file_options.archive_ready_critical);
|
|
break;
|
|
case OM_TEXT:
|
|
appendPQExpBuffer(&details,
|
|
"%i pending archive ready files", ready_archive_files);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
{
|
|
printf("--status=%s %s\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
break;
|
|
case OM_NAGIOS:
|
|
printf("REPMGR_ARCHIVE_READY %s: %s\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
break;
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"WAL archiving",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
termPQExpBuffer(&details);
|
|
return status;
|
|
}
|
|
|
|
|
|
static CheckStatus
|
|
do_node_check_downstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
|
|
NodeInfoListCell *cell = NULL;
|
|
int missing_nodes_count = 0;
|
|
int expected_nodes_count = 0;
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
ItemList missing_nodes = {NULL, NULL};
|
|
ItemList attached_nodes = {NULL, NULL};
|
|
PQExpBufferData details;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --downstream option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes);
|
|
|
|
/* if a witness node is present, we'll need to remove this from the total */
|
|
expected_nodes_count = downstream_nodes.node_count;
|
|
|
|
for (cell = downstream_nodes.head; cell; cell = cell->next)
|
|
{
|
|
/* skip witness server */
|
|
if (cell->node_info->type == WITNESS)
|
|
{
|
|
expected_nodes_count --;
|
|
continue;
|
|
}
|
|
|
|
if (is_downstream_node_attached_quiet(conn, cell->node_info->node_name, NULL) != NODE_ATTACHED)
|
|
{
|
|
missing_nodes_count++;
|
|
item_list_append_format(&missing_nodes,
|
|
"%s (ID: %i)",
|
|
cell->node_info->node_name,
|
|
cell->node_info->node_id);
|
|
}
|
|
else
|
|
{
|
|
item_list_append_format(&attached_nodes,
|
|
"%s (ID: %i)",
|
|
cell->node_info->node_name,
|
|
cell->node_info->node_id);
|
|
}
|
|
}
|
|
|
|
if (node_info->type == WITNESS)
|
|
{
|
|
/* witness is not connecting to any upstream */
|
|
appendPQExpBufferStr(&details,
|
|
_("N/A - node is a witness"));
|
|
}
|
|
else if (missing_nodes_count == 0)
|
|
{
|
|
if (expected_nodes_count == 0)
|
|
appendPQExpBufferStr(&details,
|
|
"this node has no downstream nodes");
|
|
else
|
|
appendPQExpBuffer(&details,
|
|
"%i of %i downstream nodes attached",
|
|
expected_nodes_count - missing_nodes_count,
|
|
expected_nodes_count);
|
|
}
|
|
else
|
|
{
|
|
ItemListCell *missing_cell = NULL;
|
|
bool first = true;
|
|
|
|
status = CHECK_STATUS_CRITICAL;
|
|
|
|
appendPQExpBuffer(&details,
|
|
"%i of %i downstream nodes not attached",
|
|
missing_nodes_count,
|
|
expected_nodes_count);
|
|
|
|
if (mode != OM_NAGIOS)
|
|
{
|
|
appendPQExpBufferStr(&details, "; missing: ");
|
|
|
|
for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
|
|
{
|
|
if (first == false)
|
|
appendPQExpBufferStr(&details,
|
|
", ");
|
|
else
|
|
first = false;
|
|
|
|
if (first == false)
|
|
appendPQExpBufferStr(&details, missing_cell->string);
|
|
}
|
|
}
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_NAGIOS:
|
|
{
|
|
if (missing_nodes_count)
|
|
{
|
|
ItemListCell *missing_cell = NULL;
|
|
bool first = true;
|
|
|
|
appendPQExpBufferStr(&details, " (missing: ");
|
|
|
|
for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
|
|
{
|
|
if (first == false)
|
|
appendPQExpBufferStr(&details, ", ");
|
|
else
|
|
first = false;
|
|
|
|
if (first == false)
|
|
appendPQExpBufferStr(&details, missing_cell->string);
|
|
}
|
|
|
|
appendPQExpBufferChar(&details, ')');
|
|
}
|
|
|
|
printf("REPMGR_DOWNSTREAM_SERVERS %s: %s | attached=%i, missing=%i\n",
|
|
output_check_status(status),
|
|
details.data,
|
|
expected_nodes_count - missing_nodes_count,
|
|
missing_nodes_count);
|
|
}
|
|
break;
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"Downstream servers",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
|
|
}
|
|
termPQExpBuffer(&details);
|
|
clear_node_info_list(&downstream_nodes);
|
|
return status;
|
|
}
|
|
|
|
|
|
static CheckStatus
|
|
do_node_check_upstream(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
PGconn *upstream_conn = NULL;
|
|
t_node_info upstream_node_info = T_NODE_INFO_INITIALIZER;
|
|
PQExpBufferData details;
|
|
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --upstream option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
if (node_info->type == WITNESS)
|
|
{
|
|
/* witness is not connecting to any upstream */
|
|
appendPQExpBufferStr(&details,
|
|
_("N/A - node is a witness"));
|
|
}
|
|
else if (get_node_record(conn, node_info->upstream_node_id, &upstream_node_info) != RECORD_FOUND)
|
|
{
|
|
if (get_recovery_type(conn) == RECTYPE_STANDBY)
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("node \"%s\" (ID: %i) is a standby but no upstream record found"),
|
|
node_info->node_name,
|
|
node_info->node_id);
|
|
status = CHECK_STATUS_CRITICAL;
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
_("N/A - node is primary"));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
upstream_conn = establish_db_connection(upstream_node_info.conninfo, true);
|
|
|
|
/* check our node is connected */
|
|
if (is_downstream_node_attached(upstream_conn, config_file_options.node_name, NULL) != NODE_ATTACHED)
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("node \"%s\" (ID: %i) is not attached to expected upstream node \"%s\" (ID: %i)"),
|
|
node_info->node_name,
|
|
node_info->node_id,
|
|
upstream_node_info.node_name,
|
|
upstream_node_info.node_id);
|
|
status = CHECK_STATUS_CRITICAL;
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("node \"%s\" (ID: %i) is attached to expected upstream node \"%s\" (ID: %i)"),
|
|
node_info->node_name,
|
|
node_info->node_id,
|
|
upstream_node_info.node_name,
|
|
upstream_node_info.node_id);
|
|
}
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_NAGIOS:
|
|
{
|
|
printf("REPMGR_UPSTREAM_SERVER %s: %s\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
break;
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"Upstream connection",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
termPQExpBuffer(&details);
|
|
|
|
return status;
|
|
}
|
|
|
|
|
|
static CheckStatus
|
|
do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
int lag_seconds = 0;
|
|
PQExpBufferData details;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --replication-lag option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
if (node_info->recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
appendPQExpBufferStr(&details,
|
|
"--lag=0");
|
|
break;
|
|
case OM_NAGIOS:
|
|
appendPQExpBuffer(&details,
|
|
"0 seconds | lag=0;%i;%i",
|
|
config_file_options.replication_lag_warning,
|
|
config_file_options.replication_lag_critical);
|
|
break;
|
|
case OM_TEXT:
|
|
if (node_info->type == WITNESS)
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
"N/A - node is witness");
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
"N/A - node is primary");
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
lag_seconds = get_replication_lag_seconds(conn);
|
|
|
|
log_debug("lag seconds: %i", lag_seconds);
|
|
|
|
if (lag_seconds >= config_file_options.replication_lag_critical)
|
|
{
|
|
status = CHECK_STATUS_CRITICAL;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
appendPQExpBuffer(&details,
|
|
"--lag=%i --threshold=%i",
|
|
lag_seconds, config_file_options.replication_lag_critical);
|
|
break;
|
|
case OM_NAGIOS:
|
|
appendPQExpBuffer(&details,
|
|
"%i seconds | lag=%i;%i;%i",
|
|
lag_seconds,
|
|
lag_seconds,
|
|
config_file_options.replication_lag_warning,
|
|
config_file_options.replication_lag_critical);
|
|
break;
|
|
case OM_TEXT:
|
|
appendPQExpBuffer(&details,
|
|
"%i seconds, critical threshold: %i)",
|
|
lag_seconds, config_file_options.replication_lag_critical);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else if (lag_seconds > config_file_options.replication_lag_warning)
|
|
{
|
|
status = CHECK_STATUS_WARNING;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
appendPQExpBuffer(&details,
|
|
"--lag=%i --threshold=%i",
|
|
lag_seconds, config_file_options.replication_lag_warning);
|
|
break;
|
|
case OM_NAGIOS:
|
|
appendPQExpBuffer(&details,
|
|
"%i seconds | lag=%i;%i;%i",
|
|
lag_seconds,
|
|
lag_seconds,
|
|
config_file_options.replication_lag_warning,
|
|
config_file_options.replication_lag_critical);
|
|
break;
|
|
case OM_TEXT:
|
|
appendPQExpBuffer(&details,
|
|
"%i seconds, warning threshold: %i)",
|
|
lag_seconds, config_file_options.replication_lag_warning);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
|
|
{
|
|
status = CHECK_STATUS_UNKNOWN;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
break;
|
|
case OM_NAGIOS:
|
|
case OM_TEXT:
|
|
appendPQExpBufferStr(&details,
|
|
"unable to query replication lag");
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
status = CHECK_STATUS_OK;
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
appendPQExpBuffer(&details,
|
|
"--lag=%i",
|
|
lag_seconds);
|
|
break;
|
|
case OM_NAGIOS:
|
|
appendPQExpBuffer(&details,
|
|
"%i seconds | lag=%i;%i;%i",
|
|
lag_seconds,
|
|
lag_seconds,
|
|
config_file_options.replication_lag_warning,
|
|
config_file_options.replication_lag_critical);
|
|
break;
|
|
case OM_TEXT:
|
|
appendPQExpBuffer(&details,
|
|
"%i seconds",
|
|
lag_seconds);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
printf("--status=%s %s\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
break;
|
|
case OM_NAGIOS:
|
|
printf("REPMGR_REPLICATION_LAG %s: %s\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
break;
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"Replication lag",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
termPQExpBuffer(&details);
|
|
|
|
return status;
|
|
}
|
|
|
|
|
|
static CheckStatus
|
|
do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
PQExpBufferData details;
|
|
RecoveryType recovery_type = get_recovery_type(conn);
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --role option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
switch (node_info->type)
|
|
{
|
|
case PRIMARY:
|
|
if (recovery_type == RECTYPE_STANDBY)
|
|
{
|
|
status = CHECK_STATUS_CRITICAL;
|
|
appendPQExpBufferStr(&details,
|
|
_("node is registered as primary but running as standby"));
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
_("node is primary"));
|
|
}
|
|
break;
|
|
case STANDBY:
|
|
if (recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
status = CHECK_STATUS_CRITICAL;
|
|
appendPQExpBufferStr(&details,
|
|
_("node is registered as standby but running as primary"));
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
_("node is standby"));
|
|
}
|
|
break;
|
|
case WITNESS:
|
|
if (recovery_type == RECTYPE_STANDBY)
|
|
{
|
|
status = CHECK_STATUS_CRITICAL;
|
|
appendPQExpBufferStr(&details,
|
|
_("node is registered as witness but running as standby"));
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
_("node is witness"));
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_NAGIOS:
|
|
printf("REPMGR_SERVER_ROLE %s: %s\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
break;
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"Server role",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
termPQExpBuffer(&details);
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
static CheckStatus
|
|
do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
PQExpBufferData details;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --slots option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
if (node_info->total_replication_slots == 0)
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
_("node has no physical replication slots"));
|
|
}
|
|
else if (node_info->inactive_replication_slots == 0)
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("%i of %i physical replication slots are active"),
|
|
node_info->total_replication_slots,
|
|
node_info->total_replication_slots);
|
|
}
|
|
else if (node_info->inactive_replication_slots > 0)
|
|
{
|
|
status = CHECK_STATUS_CRITICAL;
|
|
|
|
appendPQExpBuffer(&details,
|
|
_("%i of %i physical replication slots are inactive"),
|
|
node_info->inactive_replication_slots,
|
|
node_info->total_replication_slots);
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_NAGIOS:
|
|
printf("REPMGR_INACTIVE_SLOTS %s: %s | slots=%i;%i\n",
|
|
output_check_status(status),
|
|
details.data,
|
|
node_info->total_replication_slots,
|
|
node_info->inactive_replication_slots);
|
|
break;
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"Replication slots",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
termPQExpBuffer(&details);
|
|
return status;
|
|
}
|
|
|
|
|
|
static CheckStatus
|
|
do_node_check_missing_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
PQExpBufferData details;
|
|
NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --missing-slots option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
get_downstream_nodes_with_missing_slot(conn,
|
|
config_file_options.node_id,
|
|
&missing_slots);
|
|
|
|
if (missing_slots.node_count == 0)
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
_("node has no missing physical replication slots"));
|
|
}
|
|
else
|
|
{
|
|
NodeInfoListCell *missing_slot_cell = NULL;
|
|
bool first_element = true;
|
|
|
|
status = CHECK_STATUS_CRITICAL;
|
|
|
|
appendPQExpBuffer(&details,
|
|
_("%i physical replication slots are missing"),
|
|
missing_slots.node_count);
|
|
|
|
if (missing_slots.node_count)
|
|
{
|
|
appendPQExpBufferStr(&details, ": ");
|
|
|
|
for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
|
|
{
|
|
if (first_element == true)
|
|
{
|
|
first_element = false;
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBufferStr(&details, ", ");
|
|
}
|
|
|
|
appendPQExpBufferStr(&details, missing_slot_cell->node_info->slot_name);
|
|
}
|
|
}
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_NAGIOS:
|
|
{
|
|
printf("REPMGR_MISSING_SLOTS %s: %s | missing_slots=%i",
|
|
output_check_status(status),
|
|
details.data,
|
|
missing_slots.node_count);
|
|
|
|
if (missing_slots.node_count)
|
|
{
|
|
NodeInfoListCell *missing_slot_cell = NULL;
|
|
bool first_element = true;
|
|
|
|
printf(";");
|
|
|
|
for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
|
|
{
|
|
if (first_element == true)
|
|
{
|
|
first_element = false;
|
|
}
|
|
else
|
|
{
|
|
printf(",");
|
|
}
|
|
printf("%s", missing_slot_cell->node_info->slot_name);
|
|
}
|
|
}
|
|
printf("\n");
|
|
break;
|
|
}
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"Missing physical replication slots",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
clear_node_info_list(&missing_slots);
|
|
|
|
termPQExpBuffer(&details);
|
|
return status;
|
|
}
|
|
|
|
CheckStatus
|
|
do_node_check_data_directory(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
char actual_data_directory[MAXPGPATH] = "";
|
|
PQExpBufferData details;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --data-directory-config option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
/*
|
|
* Check actual data directory matches that in repmgr.conf; note this requires
|
|
* a superuser connection
|
|
*/
|
|
if (connection_has_pg_monitor_role(conn, "pg_read_all_settings") == true)
|
|
{
|
|
/* we expect to have a database connection */
|
|
if (get_pg_setting(conn, "data_directory", actual_data_directory) == false)
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("unable to determine current \"data_directory\""));
|
|
status = CHECK_STATUS_UNKNOWN;
|
|
}
|
|
|
|
if (strncmp(actual_data_directory, config_file_options.data_directory, MAXPGPATH) != 0)
|
|
{
|
|
if (mode != OM_NAGIOS)
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("configured \"data_directory\" is \"%s\"; "),
|
|
config_file_options.data_directory);
|
|
}
|
|
|
|
appendPQExpBuffer(&details,
|
|
"actual data directory is \"%s\"",
|
|
actual_data_directory);
|
|
|
|
status = CHECK_STATUS_CRITICAL;
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("configured \"data_directory\" is \"%s\""),
|
|
config_file_options.data_directory);
|
|
}
|
|
}
|
|
/*
|
|
* If no superuser connection available, sanity-check that the configuration directory looks
|
|
* like a PostgreSQL directory and hope it's the right one.
|
|
*/
|
|
else
|
|
{
|
|
if (mode == OM_TEXT)
|
|
{
|
|
log_info(_("connection is not a superuser connection, falling back to simple check"));
|
|
|
|
if (PQserverVersion(conn) >= 100000)
|
|
{
|
|
log_hint(_("provide a superuser with -S/--superuser, or add the \"%s\" user to role \"pg_read_all_settings\" or \"pg_monitor\""),
|
|
PQuser(conn));
|
|
}
|
|
}
|
|
|
|
if (is_pg_dir(config_file_options.data_directory) == false)
|
|
{
|
|
if (mode == OM_NAGIOS)
|
|
{
|
|
appendPQExpBufferStr(&details,
|
|
_("configured \"data_directory\" is not a PostgreSQL data directory"));
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("configured \"data_directory\" \"%s\" is not a PostgreSQL data directory"),
|
|
actual_data_directory);
|
|
}
|
|
|
|
status = CHECK_STATUS_CRITICAL;
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
_("configured \"data_directory\" is \"%s\""),
|
|
config_file_options.data_directory);
|
|
}
|
|
}
|
|
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
printf("--configured-data-directory=%s\n",
|
|
output_check_status(status));
|
|
break;
|
|
case OM_NAGIOS:
|
|
printf("REPMGR_DATA_DIRECTORY %s: %s",
|
|
output_check_status(status),
|
|
config_file_options.data_directory);
|
|
|
|
if (status == CHECK_STATUS_CRITICAL)
|
|
{
|
|
printf(" | %s", details.data);
|
|
}
|
|
puts("");
|
|
break;
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"Configured data directory",
|
|
status,
|
|
details.data);
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
termPQExpBuffer(&details);
|
|
|
|
return status;
|
|
}
|
|
|
|
CheckStatus
|
|
do_node_check_repmgrd(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
|
|
if (mode == OM_CSV && list_output == NULL)
|
|
{
|
|
log_error(_("--csv output not provided with --repmgrd option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
status = get_repmgrd_status(conn);
|
|
switch (mode)
|
|
{
|
|
case OM_OPTFORMAT:
|
|
printf("--repmgrd=%s\n",
|
|
output_check_status(status));
|
|
break;
|
|
case OM_NAGIOS:
|
|
printf("REPMGRD %s: %s\n",
|
|
output_check_status(status),
|
|
output_repmgrd_status(status));
|
|
|
|
break;
|
|
case OM_CSV:
|
|
case OM_TEXT:
|
|
if (list_output != NULL)
|
|
{
|
|
check_status_list_set(list_output,
|
|
"repmgrd",
|
|
status,
|
|
output_repmgrd_status(status));
|
|
}
|
|
else
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
output_repmgrd_status(status));
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return status;
|
|
}
|
|
|
|
/*
|
|
* This is not included in the general list output
|
|
*/
|
|
static
|
|
CheckStatus do_node_check_replication_config_owner(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
|
|
{
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
|
|
PQExpBufferData errmsg;
|
|
PQExpBufferData details;
|
|
|
|
if (mode != OM_OPTFORMAT)
|
|
{
|
|
log_error(_("--replication-config-owner option can only be used with --optformat"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&errmsg);
|
|
initPQExpBuffer(&details);
|
|
|
|
if (check_replication_config_owner(PQserverVersion(conn),
|
|
config_file_options.data_directory,
|
|
&errmsg, &details) == false)
|
|
{
|
|
status = CHECK_STATUS_CRITICAL;
|
|
}
|
|
|
|
printf("--replication-config-owner=%s\n",
|
|
output_check_status(status));
|
|
|
|
return status;
|
|
}
|
|
|
|
|
|
/*
|
|
* This is not included in the general list output
|
|
*/
|
|
static CheckStatus
|
|
do_node_check_db_connection(PGconn *conn, OutputMode mode)
|
|
{
|
|
CheckStatus status = CHECK_STATUS_OK;
|
|
PQExpBufferData details;
|
|
|
|
if (mode == OM_CSV)
|
|
{
|
|
log_error(_("--csv output not provided with --db-connection option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/* This check is for configuration diagnostics only */
|
|
if (mode == OM_NAGIOS)
|
|
{
|
|
log_error(_("--nagios output not provided with --db-connection option"));
|
|
PQfinish(conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
initPQExpBuffer(&details);
|
|
|
|
if (PQstatus(conn) != CONNECTION_OK)
|
|
{
|
|
t_conninfo_param_list conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
|
int c;
|
|
|
|
status = CHECK_STATUS_CRITICAL;
|
|
initialize_conninfo_params(&conninfo, false);
|
|
conn_to_param_list(conn, &conninfo);
|
|
|
|
appendPQExpBufferStr(&details,
|
|
"connection parameters used:");
|
|
for (c = 0; c < conninfo.size && conninfo.keywords[c] != NULL; c++)
|
|
{
|
|
if (conninfo.values[c] != NULL && conninfo.values[c][0] != '\0')
|
|
{
|
|
appendPQExpBuffer(&details,
|
|
" %s=%s",
|
|
conninfo.keywords[c], conninfo.values[c]);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
if (mode == OM_OPTFORMAT)
|
|
{
|
|
printf("--db-connection=%s\n",
|
|
output_check_status(status));
|
|
}
|
|
else if (mode == OM_TEXT)
|
|
{
|
|
printf("%s (%s)\n",
|
|
output_check_status(status),
|
|
details.data);
|
|
}
|
|
termPQExpBuffer(&details);
|
|
|
|
return status;
|
|
}
|
|
|
|
|
|
void
|
|
do_node_service(void)
|
|
{
|
|
t_server_action action = ACTION_UNKNOWN;
|
|
char data_dir[MAXPGPATH] = "";
|
|
char command[MAXLEN] = "";
|
|
PQExpBufferData output;
|
|
|
|
action = parse_server_action(runtime_options.action);
|
|
|
|
if (action == ACTION_UNKNOWN)
|
|
{
|
|
log_error(_("unknown value \"%s\" provided for parameter --action"),
|
|
runtime_options.action);
|
|
log_hint(_("valid values are \"start\", \"stop\", \"restart\", \"reload\" and \"promote\""));
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
if (runtime_options.list_actions == true)
|
|
{
|
|
return _do_node_service_list_actions(action);
|
|
}
|
|
|
|
|
|
if (data_dir_required_for_action(action))
|
|
{
|
|
get_node_config_directory(data_dir);
|
|
|
|
if (data_dir[0] == '\0')
|
|
{
|
|
log_error(_("unable to determine data directory for action"));
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
}
|
|
|
|
|
|
if ((action == ACTION_STOP || action == ACTION_RESTART) && runtime_options.checkpoint == true)
|
|
{
|
|
PGconn *conn = NULL;
|
|
|
|
if (config_file_options.conninfo[0] != '\0')
|
|
{
|
|
/*
|
|
* If --superuser option provided, attempt to connect as the specified user
|
|
*/
|
|
if (runtime_options.superuser[0] != '\0')
|
|
{
|
|
conn = establish_db_connection_with_replacement_param(
|
|
config_file_options.conninfo,
|
|
"user",
|
|
runtime_options.superuser,
|
|
true);
|
|
}
|
|
else
|
|
{
|
|
conn = establish_db_connection(config_file_options.conninfo, true);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
conn = establish_db_connection_by_params(&source_conninfo, true);
|
|
}
|
|
|
|
if (can_execute_checkpoint(conn) == false)
|
|
{
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_warning(_("a CHECKPOINT would be issued here but no authorized connection is available"));
|
|
}
|
|
else
|
|
{
|
|
log_warning(_("an authorized connection is required to issue a CHECKPOINT"));
|
|
}
|
|
|
|
if (PQserverVersion(conn) >= 150000)
|
|
{
|
|
log_hint(_("provide a superuser with -S/--superuser or grant pg_checkpoint role to repmgr user"));
|
|
}
|
|
else
|
|
{
|
|
log_hint(_("provide a superuser with -S/--superuser"));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_info(_("a CHECKPOINT would be issued here"));
|
|
}
|
|
else
|
|
{
|
|
|
|
log_notice(_("issuing CHECKPOINT on node \"%s\" (ID: %i) "),
|
|
config_file_options.node_name,
|
|
config_file_options.node_id);
|
|
|
|
checkpoint(conn);
|
|
}
|
|
}
|
|
|
|
PQfinish(conn);
|
|
}
|
|
|
|
get_server_action(action, command, data_dir);
|
|
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_info(_("would execute server command \"%s\""), command);
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* log level is "DETAIL" here as this command is intended to be executed
|
|
* by another repmgr process (e.g. during standby switchover); that repmgr
|
|
* should emit a "NOTICE" about the intent of the command.
|
|
*/
|
|
log_detail(_("executing server command \"%s\""), command);
|
|
|
|
initPQExpBuffer(&output);
|
|
|
|
if (local_command(command, &output) == false)
|
|
{
|
|
termPQExpBuffer(&output);
|
|
exit(ERR_LOCAL_COMMAND);
|
|
}
|
|
|
|
termPQExpBuffer(&output);
|
|
}
|
|
|
|
|
|
static void
|
|
_do_node_service_list_actions(t_server_action action)
|
|
{
|
|
char command[MAXLEN] = "";
|
|
|
|
char data_dir[MAXPGPATH] = "";
|
|
|
|
bool data_dir_required = false;
|
|
|
|
/* do we need to provide a data directory for any of the actions? */
|
|
if (data_dir_required_for_action(ACTION_START))
|
|
data_dir_required = true;
|
|
|
|
if (data_dir_required_for_action(ACTION_STOP))
|
|
data_dir_required = true;
|
|
|
|
if (data_dir_required_for_action(ACTION_RESTART))
|
|
data_dir_required = true;
|
|
|
|
if (data_dir_required_for_action(ACTION_RELOAD))
|
|
data_dir_required = true;
|
|
|
|
if (data_dir_required_for_action(ACTION_PROMOTE))
|
|
data_dir_required = true;
|
|
|
|
if (data_dir_required == true)
|
|
{
|
|
get_node_config_directory(data_dir);
|
|
}
|
|
|
|
/* show command for specific action only */
|
|
if (action != ACTION_NONE)
|
|
{
|
|
get_server_action(action, command, data_dir);
|
|
printf("%s\n", command);
|
|
return;
|
|
}
|
|
|
|
puts(_("Following commands would be executed for each action:"));
|
|
puts("");
|
|
|
|
get_server_action(ACTION_START, command, data_dir);
|
|
printf(" start: \"%s\"\n", command);
|
|
|
|
get_server_action(ACTION_STOP, command, data_dir);
|
|
printf(" stop: \"%s\"\n", command);
|
|
|
|
get_server_action(ACTION_RESTART, command, data_dir);
|
|
printf(" restart: \"%s\"\n", command);
|
|
|
|
get_server_action(ACTION_RELOAD, command, data_dir);
|
|
printf(" reload: \"%s\"\n", command);
|
|
|
|
get_server_action(ACTION_PROMOTE, command, data_dir);
|
|
printf(" promote: \"%s\"\n", command);
|
|
|
|
puts("");
|
|
|
|
}
|
|
|
|
|
|
static t_server_action
|
|
parse_server_action(const char *action_name)
|
|
{
|
|
if (action_name[0] == '\0')
|
|
return ACTION_NONE;
|
|
|
|
if (strcasecmp(action_name, "start") == 0)
|
|
return ACTION_START;
|
|
|
|
if (strcasecmp(action_name, "stop") == 0)
|
|
return ACTION_STOP;
|
|
|
|
if (strcasecmp(action_name, "restart") == 0)
|
|
return ACTION_RESTART;
|
|
|
|
if (strcasecmp(action_name, "reload") == 0)
|
|
return ACTION_RELOAD;
|
|
|
|
if (strcasecmp(action_name, "promote") == 0)
|
|
return ACTION_PROMOTE;
|
|
|
|
return ACTION_UNKNOWN;
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
* Rejoin a dormant (shut down) node to the replication cluster; this
|
|
* is typically a former primary which needs to be demoted to a standby.
|
|
*
|
|
* Note that "repmgr node rejoin" is also executed by
|
|
* "repmgr standby switchover" after promoting the new primary.
|
|
*
|
|
* Parameters:
|
|
* --dry-run
|
|
* --force-rewind[=VALUE]
|
|
* --config-files
|
|
* --config-archive-dir
|
|
* -W/--no-wait
|
|
*/
|
|
void
|
|
do_node_rejoin(void)
|
|
{
|
|
PGconn *upstream_conn = NULL;
|
|
RecoveryType primary_recovery_type = RECTYPE_UNKNOWN;
|
|
PGconn *primary_conn = NULL;
|
|
|
|
DBState db_state;
|
|
PGPing status;
|
|
bool is_shutdown = true;
|
|
int server_version_num = UNKNOWN_SERVER_VERSION_NUM;
|
|
bool hide_standby_signal = false;
|
|
|
|
KeyValueListCell *cell = NULL;
|
|
PQExpBufferData command;
|
|
PQExpBufferData command_output;
|
|
PQExpBufferData follow_output;
|
|
struct stat statbuf;
|
|
t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
|
|
t_node_info local_node_record = T_NODE_INFO_INITIALIZER;
|
|
|
|
bool success = true;
|
|
int follow_error_code = SUCCESS;
|
|
|
|
/* check node is not actually running */
|
|
status = PQping(config_file_options.conninfo);
|
|
|
|
switch (status)
|
|
{
|
|
case PQPING_NO_ATTEMPT:
|
|
log_error(_("unable to determine status of server"));
|
|
exit(ERR_BAD_CONFIG);
|
|
case PQPING_OK:
|
|
is_shutdown = false;
|
|
break;
|
|
case PQPING_REJECT:
|
|
is_shutdown = false;
|
|
break;
|
|
case PQPING_NO_RESPONSE:
|
|
/* status not yet clear */
|
|
break;
|
|
}
|
|
|
|
if (get_db_state(config_file_options.data_directory, &db_state) == false)
|
|
{
|
|
log_error(_("unable to determine database state from pg_control"));
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
if (is_shutdown == false)
|
|
{
|
|
log_error(_("database is still running in state \"%s\""),
|
|
describe_db_state(db_state));
|
|
log_hint(_("\"repmgr node rejoin\" cannot be executed on a running node"));
|
|
exit(ERR_REJOIN_FAIL);
|
|
}
|
|
|
|
/*
|
|
* Server version number required to determine whether pg_rewind will run
|
|
* crash recovery (Pg 13 and later).
|
|
*/
|
|
server_version_num = get_pg_version(config_file_options.data_directory, NULL);
|
|
|
|
if (server_version_num == UNKNOWN_SERVER_VERSION_NUM)
|
|
{
|
|
/* This is very unlikely to happen */
|
|
log_error(_("unable to determine database version"));
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
log_verbose(LOG_DEBUG, "server version number is: %i", server_version_num);
|
|
|
|
/* check if cleanly shut down */
|
|
if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
|
|
{
|
|
if (db_state == DB_SHUTDOWNING)
|
|
{
|
|
log_error(_("database is still shutting down"));
|
|
}
|
|
else if (server_version_num >= 130000 && runtime_options.force_rewind_used == true)
|
|
{
|
|
log_warning(_("database is not shut down cleanly"));
|
|
log_detail(_("--force-rewind provided, pg_rewind will automatically perform recovery"));
|
|
|
|
/*
|
|
* If pg_rewind is executed, the first change it will make
|
|
* is to start the server in single user mode, which will fail
|
|
* in the presence of "standby.signal", so we'll "hide" it
|
|
* (actually delete and recreate).
|
|
*/
|
|
hide_standby_signal = true;
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* If the database was not shut down cleanly, it *might* rejoin correctly
|
|
* after starting up and recovering, but better to ensure the database
|
|
* can recover before trying anything else.
|
|
*/
|
|
log_error(_("database is not shut down cleanly"));
|
|
|
|
if (server_version_num >= 130000)
|
|
{
|
|
log_hint(_("provide --force-rewind to run recovery"));
|
|
}
|
|
else
|
|
{
|
|
if (runtime_options.force_rewind_used == true)
|
|
{
|
|
log_detail(_("pg_rewind will not be able to run"));
|
|
}
|
|
log_hint(_("database should be restarted then shut down cleanly after crash recovery completes"));
|
|
}
|
|
|
|
exit(ERR_REJOIN_FAIL);
|
|
}
|
|
}
|
|
|
|
/* check provided upstream connection */
|
|
upstream_conn = establish_db_connection_by_params(&source_conninfo, true);
|
|
|
|
if (get_primary_node_record(upstream_conn, &primary_node_record) == false)
|
|
{
|
|
log_error(_("unable to retrieve primary node record"));
|
|
log_hint(_("check the provided database connection string is for a \"repmgr\" database"));
|
|
PQfinish(upstream_conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/*
|
|
* Emit a notice about the identity of the rejoin target
|
|
*/
|
|
log_notice(_("rejoin target is node \"%s\" (ID: %i)"),
|
|
primary_node_record.node_name,
|
|
primary_node_record.node_id);
|
|
|
|
/* connect to registered primary and check it's not in recovery */
|
|
primary_conn = establish_db_connection(primary_node_record.conninfo, false);
|
|
|
|
if (PQstatus(primary_conn) != CONNECTION_OK)
|
|
{
|
|
RecoveryType upstream_recovery_type = get_recovery_type(upstream_conn);
|
|
|
|
log_error(_("unable to connect to current registered primary \"%s\" (ID: %i)"),
|
|
primary_node_record.node_name,
|
|
primary_node_record.node_id);
|
|
log_detail(_("registered primary node conninfo is: \"%s\""),
|
|
primary_node_record.conninfo);
|
|
/*
|
|
* Catch case where provided upstream is not in recovery, but is also
|
|
* not registered as primary
|
|
*/
|
|
|
|
if (upstream_recovery_type == RECTYPE_PRIMARY)
|
|
{
|
|
log_warning(_("provided upstream connection string is for a server which is not in recovery, but not registered as primary"));
|
|
log_hint(_("fix repmgr metadata configuration before continuing"));
|
|
}
|
|
|
|
PQfinish(upstream_conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
PQfinish(upstream_conn);
|
|
|
|
primary_recovery_type = get_recovery_type(primary_conn);
|
|
|
|
if (primary_recovery_type != RECTYPE_PRIMARY)
|
|
{
|
|
log_error(_("primary server is registered as node \"%s\" (ID: %i), but server is not a primary"),
|
|
primary_node_record.node_name,
|
|
primary_node_record.node_id);
|
|
/* TODO: hint about checking cluster */
|
|
PQfinish(primary_conn);
|
|
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/*
|
|
* Fetch the local node record - we'll need this later, and it acts as an
|
|
* additional sanity-check that the node is known to the primary.
|
|
*/
|
|
if (get_node_record(primary_conn, config_file_options.node_id, &local_node_record) != RECORD_FOUND)
|
|
{
|
|
log_error(_("unable to retrieve node record for the local node"));
|
|
log_hint(_("check the local node is registered with the current primary \"%s\" (ID: %i)"),
|
|
primary_node_record.node_name,
|
|
primary_node_record.node_id);
|
|
|
|
PQfinish(primary_conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/*
|
|
* Sanity-check replication slot availability
|
|
*/
|
|
if (config_file_options.use_replication_slots)
|
|
{
|
|
bool slots_available = check_replication_slots_available(primary_node_record.node_id,
|
|
primary_conn);
|
|
if (slots_available == false)
|
|
{
|
|
PQfinish(primary_conn);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* sanity-check that it will actually be possible to stream from the new upstream
|
|
*/
|
|
{
|
|
bool can_rejoin;
|
|
TimeLineID tli = get_min_recovery_end_timeline(config_file_options.data_directory);
|
|
XLogRecPtr min_recovery_location = get_min_recovery_location(config_file_options.data_directory);
|
|
|
|
/*
|
|
* It's possible this was a former primary, so the minRecoveryPoint*
|
|
* fields may be empty.
|
|
*/
|
|
|
|
if (min_recovery_location == InvalidXLogRecPtr)
|
|
min_recovery_location = get_latest_checkpoint_location(config_file_options.data_directory);
|
|
if (tli == 0)
|
|
tli = get_timeline(config_file_options.data_directory);
|
|
|
|
can_rejoin = check_node_can_attach(tli,
|
|
min_recovery_location,
|
|
primary_conn,
|
|
&primary_node_record,
|
|
true);
|
|
|
|
if (can_rejoin == false)
|
|
{
|
|
PQfinish(primary_conn);
|
|
exit(ERR_REJOIN_FAIL);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* --force-rewind specified - check prerequisites, and attempt to execute
|
|
* (if --dry-run provided, just output the command which would be executed)
|
|
*/
|
|
|
|
if (runtime_options.force_rewind_used == true)
|
|
{
|
|
PQExpBufferData msg;
|
|
PQExpBufferData filebuf;
|
|
int ret;
|
|
|
|
/*
|
|
* Check that pg_rewind can be used
|
|
*/
|
|
|
|
initPQExpBuffer(&msg);
|
|
|
|
if (can_use_pg_rewind(primary_conn, config_file_options.data_directory, &msg) == false)
|
|
{
|
|
log_error(_("--force-rewind specified but pg_rewind cannot be used"));
|
|
log_detail("%s", msg.data);
|
|
termPQExpBuffer(&msg);
|
|
PQfinish(primary_conn);
|
|
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
appendPQExpBufferStr(&msg,
|
|
_("prerequisites for using pg_rewind are met"));
|
|
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_info("%s", msg.data);
|
|
}
|
|
else
|
|
{
|
|
log_verbose(LOG_INFO, "%s", msg.data);
|
|
}
|
|
termPQExpBuffer(&msg);
|
|
|
|
/*
|
|
* Archive requested configuration files.
|
|
*
|
|
* In --dry-run mode this acts as a check that the files can be archived, though
|
|
* errors will only be logged; any copied files will be deleted and --dry-run
|
|
* execution will continue.
|
|
*/
|
|
_do_node_archive_config();
|
|
|
|
/* execute pg_rewind */
|
|
initPQExpBuffer(&command);
|
|
|
|
if (runtime_options.force_rewind_path[0] != '\0')
|
|
{
|
|
appendPQExpBuffer(&command,
|
|
"%s -D ",
|
|
runtime_options.force_rewind_path);
|
|
}
|
|
else
|
|
{
|
|
make_pg_path(&command, "pg_rewind");
|
|
appendPQExpBufferStr(&command,
|
|
" -D ");
|
|
}
|
|
|
|
appendShellString(&command,
|
|
config_file_options.data_directory);
|
|
|
|
if (runtime_options.superuser[0] != '\0')
|
|
{
|
|
t_conninfo_param_list rewind_conninfo = T_CONNINFO_PARAM_LIST_INITIALIZER;
|
|
char *rewind_conninfo_str = NULL;
|
|
|
|
initialize_conninfo_params(&rewind_conninfo, false);
|
|
parse_conninfo_string(primary_node_record.conninfo, &rewind_conninfo, NULL, false);
|
|
param_set(&rewind_conninfo, "user", runtime_options.superuser);
|
|
rewind_conninfo_str = param_list_to_string(&rewind_conninfo);
|
|
|
|
appendPQExpBuffer(&command,
|
|
" --source-server='%s'",
|
|
rewind_conninfo_str);
|
|
|
|
pfree(rewind_conninfo_str);
|
|
free_conninfo_params(&rewind_conninfo);
|
|
}
|
|
else
|
|
{
|
|
appendPQExpBuffer(&command,
|
|
" --source-server='%s'",
|
|
primary_node_record.conninfo);
|
|
}
|
|
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_info(_("pg_rewind would now be executed"));
|
|
log_detail(_("pg_rewind command is:\n %s"),
|
|
command.data);
|
|
}
|
|
else
|
|
{
|
|
log_notice(_("executing pg_rewind"));
|
|
log_detail(_("pg_rewind command is \"%s\""),
|
|
command.data);
|
|
|
|
/*
|
|
* In Pg13 and later, pg_rewind will attempt to start up a server which
|
|
* was not cleanly shut down in single user mode. This will fail if
|
|
* "standby.signal" is present. We'll remove it and restore it after
|
|
* pg_rewind runs.
|
|
*/
|
|
if (hide_standby_signal == true)
|
|
{
|
|
char standby_signal_file_path[MAXPGPATH] = "";
|
|
|
|
log_notice(_("temporarily removing \"standby.signal\""));
|
|
log_detail(_("this is required so pg_rewind can fix the unclean shutdown"));
|
|
|
|
make_standby_signal_path(config_file_options.data_directory,
|
|
standby_signal_file_path);
|
|
|
|
if (unlink(standby_signal_file_path) < 0 && errno != ENOENT)
|
|
{
|
|
log_error(_("unable to remove \"standby.signal\" file in data directory \"%s\""),
|
|
standby_signal_file_path);
|
|
log_detail("%s", strerror(errno));
|
|
exit(ERR_REJOIN_FAIL);
|
|
}
|
|
}
|
|
|
|
initPQExpBuffer(&command_output);
|
|
|
|
ret = local_command(command.data,
|
|
&command_output);
|
|
|
|
termPQExpBuffer(&command);
|
|
|
|
if (hide_standby_signal == true)
|
|
{
|
|
/*
|
|
* Restore standby.signal if we previously removed it, regardless
|
|
* of whether the pg_rewind operation failed.
|
|
*/
|
|
log_notice(_("recreating \"standby.signal\""));
|
|
write_standby_signal(config_file_options.data_directory);
|
|
}
|
|
|
|
if (ret == false)
|
|
{
|
|
log_error(_("pg_rewind execution failed"));
|
|
log_detail("%s", command_output.data);
|
|
|
|
termPQExpBuffer(&command_output);
|
|
|
|
exit(ERR_REJOIN_FAIL);
|
|
}
|
|
|
|
termPQExpBuffer(&command_output);
|
|
|
|
/* Restore any previously archived config files */
|
|
_do_node_restore_config();
|
|
|
|
initPQExpBuffer(&filebuf);
|
|
|
|
/* remove any recovery.done file copied in by pg_rewind */
|
|
appendPQExpBuffer(&filebuf,
|
|
"%s/recovery.done",
|
|
config_file_options.data_directory);
|
|
|
|
if (stat(filebuf.data, &statbuf) == 0)
|
|
{
|
|
log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
|
|
|
|
if (unlink(filebuf.data) == -1)
|
|
{
|
|
log_warning(_("unable to delete \"%s\""),
|
|
filebuf.data);
|
|
log_detail("%s", strerror(errno));
|
|
}
|
|
}
|
|
termPQExpBuffer(&filebuf);
|
|
|
|
/*
|
|
* Delete any replication slots copied in by pg_rewind.
|
|
*
|
|
* TODO:
|
|
* - from PostgreSQL 11, this will be handled by pg_rewind, so
|
|
* we can skip this step from that version; see commit
|
|
* 266b6acb312fc440c1c1a2036aa9da94916beac6
|
|
* - possibly delete contents of various other directories
|
|
* as per the above commit for pre-PostgreSQL 11
|
|
*/
|
|
{
|
|
PQExpBufferData slotdir_path;
|
|
DIR *slotdir;
|
|
struct dirent *slotdir_ent;
|
|
|
|
initPQExpBuffer(&slotdir_path);
|
|
|
|
appendPQExpBuffer(&slotdir_path,
|
|
"%s/pg_replslot",
|
|
config_file_options.data_directory);
|
|
|
|
slotdir = opendir(slotdir_path.data);
|
|
|
|
if (slotdir == NULL)
|
|
{
|
|
log_warning(_("unable to open replication slot directory \"%s\""),
|
|
slotdir_path.data);
|
|
log_detail("%s", strerror(errno));
|
|
}
|
|
else
|
|
{
|
|
while ((slotdir_ent = readdir(slotdir)) != NULL) {
|
|
struct stat local_statbuf;
|
|
PQExpBufferData slotdir_ent_path;
|
|
|
|
if (strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0)
|
|
continue;
|
|
|
|
initPQExpBuffer(&slotdir_ent_path);
|
|
|
|
appendPQExpBuffer(&slotdir_ent_path,
|
|
"%s/%s",
|
|
slotdir_path.data,
|
|
slotdir_ent->d_name);
|
|
|
|
if (stat(slotdir_ent_path.data, &local_statbuf) == 0 && !S_ISDIR(local_statbuf.st_mode))
|
|
{
|
|
termPQExpBuffer(&slotdir_ent_path);
|
|
continue;
|
|
}
|
|
|
|
log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
|
|
if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
|
|
{
|
|
log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
|
|
log_detail("%s", strerror(errno));
|
|
log_hint(_("directory may need to be manually removed"));
|
|
}
|
|
|
|
termPQExpBuffer(&slotdir_ent_path);
|
|
}
|
|
|
|
closedir(slotdir);
|
|
}
|
|
termPQExpBuffer(&slotdir_path);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_info(_("prerequisites for executing NODE REJOIN are met"));
|
|
exit(SUCCESS);
|
|
}
|
|
|
|
initPQExpBuffer(&follow_output);
|
|
|
|
/*
|
|
* do_standby_follow_internal() can handle situations where the follow
|
|
* target is not the primary, so requires database handles to both
|
|
* (even if they point to the same node). For the time being,
|
|
* "node rejoin" will only attach a standby to the primary.
|
|
*/
|
|
success = do_standby_follow_internal(primary_conn,
|
|
primary_conn,
|
|
&primary_node_record,
|
|
&follow_output,
|
|
ERR_REJOIN_FAIL,
|
|
&follow_error_code);
|
|
|
|
if (success == false)
|
|
{
|
|
log_error(_("NODE REJOIN failed"));
|
|
|
|
if (strlen(follow_output.data))
|
|
log_detail("%s", follow_output.data);
|
|
|
|
create_event_notification(primary_conn,
|
|
&config_file_options,
|
|
config_file_options.node_id,
|
|
"node_rejoin",
|
|
success,
|
|
follow_output.data);
|
|
|
|
PQfinish(primary_conn);
|
|
|
|
termPQExpBuffer(&follow_output);
|
|
exit(follow_error_code);
|
|
}
|
|
|
|
/*
|
|
* Actively check that node actually started and connected to primary,
|
|
* if not exit with ERR_REJOIN_FAIL.
|
|
*
|
|
* This check can be overridden with -W/--no-wait, in which case a one-time
|
|
* check will be carried out.
|
|
*/
|
|
if (runtime_options.no_wait == false)
|
|
{
|
|
standy_join_status join_success = check_standby_join(primary_conn,
|
|
&primary_node_record,
|
|
&local_node_record);
|
|
|
|
create_event_notification(primary_conn,
|
|
&config_file_options,
|
|
config_file_options.node_id,
|
|
"node_rejoin",
|
|
join_success == JOIN_SUCCESS ? true : false,
|
|
follow_output.data);
|
|
|
|
if (join_success != JOIN_SUCCESS)
|
|
{
|
|
termPQExpBuffer(&follow_output);
|
|
log_error(_("NODE REJOIN failed"));
|
|
|
|
if (join_success == JOIN_FAIL_NO_PING) {
|
|
log_detail(_("local node \"%s\" did not become available start after %i seconds"),
|
|
config_file_options.node_name,
|
|
config_file_options.node_rejoin_timeout);
|
|
}
|
|
else {
|
|
log_detail(_("no active record for local node \"%s\" found in node \"%s\"'s \"pg_stat_replication\" table"),
|
|
config_file_options.node_name,
|
|
primary_node_record.node_name);
|
|
}
|
|
log_hint(_("check the PostgreSQL log on the local node"));
|
|
|
|
exit(ERR_REJOIN_FAIL);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* -W/--no-wait provided - check once */
|
|
NodeAttached node_attached = is_downstream_node_attached(primary_conn, config_file_options.node_name, NULL);
|
|
if (node_attached == NODE_ATTACHED)
|
|
success = true;
|
|
}
|
|
|
|
/*
|
|
* Handle replication slots:
|
|
* - if a slot for the new upstream exists, delete that
|
|
* - warn about any other inactive replication slots
|
|
*/
|
|
if (runtime_options.force_rewind_used == false && config_file_options.use_replication_slots)
|
|
{
|
|
PGconn *local_conn = NULL;
|
|
local_conn = establish_db_connection(config_file_options.conninfo, false);
|
|
|
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
|
{
|
|
log_warning(_("unable to connect to local node to check replication slot status"));
|
|
log_hint(_("execute \"repmgr node check\" to check inactive slots and drop manually if necessary"));
|
|
}
|
|
else
|
|
{
|
|
KeyValueList inactive_replication_slots = {NULL, NULL};
|
|
int inactive_count = 0;
|
|
PQExpBufferData slotinfo;
|
|
|
|
drop_replication_slot_if_exists(local_conn,
|
|
config_file_options.node_id,
|
|
primary_node_record.slot_name);
|
|
|
|
(void) get_inactive_replication_slots(local_conn, &inactive_replication_slots);
|
|
|
|
initPQExpBuffer(&slotinfo);
|
|
for (cell = inactive_replication_slots.head; cell; cell = cell->next)
|
|
{
|
|
appendPQExpBuffer(&slotinfo,
|
|
" - %s (%s)", cell->key, cell->value);
|
|
inactive_count++;
|
|
}
|
|
|
|
if (inactive_count > 0)
|
|
{
|
|
log_warning(_("%i inactive replication slots detected"), inactive_count);
|
|
log_detail(_("inactive replication slots:\n%s"), slotinfo.data);
|
|
log_hint(_("these replication slots may need to be removed manually"));
|
|
}
|
|
|
|
termPQExpBuffer(&slotinfo);
|
|
|
|
PQfinish(local_conn);
|
|
}
|
|
}
|
|
|
|
if (success == true)
|
|
{
|
|
log_notice(_("NODE REJOIN successful"));
|
|
log_detail("%s", follow_output.data);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* if we reach here, no record found in upstream node's pg_stat_replication
|
|
*/
|
|
log_notice(_("NODE REJOIN has completed but node is not yet reattached to upstream"));
|
|
log_hint(_("you will need to manually check the node's replication status"));
|
|
}
|
|
termPQExpBuffer(&follow_output);
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
/*
|
|
* Currently for testing purposes only, not documented;
|
|
* use at own risk!
|
|
*/
|
|
|
|
void
|
|
do_node_control(void)
|
|
{
|
|
PGconn *conn = NULL;
|
|
pid_t wal_receiver_pid = UNKNOWN_PID;
|
|
conn = establish_db_connection(config_file_options.conninfo, true);
|
|
|
|
if (runtime_options.disable_wal_receiver == true)
|
|
{
|
|
wal_receiver_pid = disable_wal_receiver(conn);
|
|
|
|
PQfinish(conn);
|
|
|
|
if (wal_receiver_pid == UNKNOWN_PID)
|
|
exit(ERR_BAD_CONFIG);
|
|
|
|
exit(SUCCESS);
|
|
}
|
|
|
|
if (runtime_options.enable_wal_receiver == true)
|
|
{
|
|
wal_receiver_pid = enable_wal_receiver(conn, true);
|
|
|
|
PQfinish(conn);
|
|
|
|
if (wal_receiver_pid == UNKNOWN_PID)
|
|
exit(ERR_BAD_CONFIG);
|
|
|
|
exit(SUCCESS);
|
|
}
|
|
|
|
log_error(_("no option provided"));
|
|
|
|
PQfinish(conn);
|
|
}
|
|
|
|
|
|
/*
|
|
* For "internal" use by `node rejoin` on the local node when
|
|
* called by "standby switchover" from the remote node.
|
|
*
|
|
* This archives any configuration files in the data directory, which may be
|
|
* overwritten by pg_rewind.
|
|
*
|
|
* Requires configuration file, optionally --config-archive-dir
|
|
*/
|
|
static void
|
|
_do_node_archive_config(void)
|
|
{
|
|
PQExpBufferData archive_dir;
|
|
struct stat statbuf;
|
|
struct dirent *arcdir_ent;
|
|
DIR *arcdir;
|
|
|
|
KeyValueList config_files = {NULL, NULL};
|
|
KeyValueListCell *cell = NULL;
|
|
int copied_count = 0;
|
|
|
|
initPQExpBuffer(&archive_dir);
|
|
format_archive_dir(&archive_dir);
|
|
|
|
/* sanity-check directory path */
|
|
if (stat(archive_dir.data, &statbuf) == -1)
|
|
{
|
|
if (errno != ENOENT)
|
|
{
|
|
log_error(_("error encountered when checking archive directory \"%s\""),
|
|
archive_dir.data);
|
|
log_detail("%s", strerror(errno));
|
|
termPQExpBuffer(&archive_dir);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
/* attempt to create and open the directory */
|
|
if (mkdir(archive_dir.data, S_IRWXU) != 0 && errno != EEXIST)
|
|
{
|
|
log_error(_("unable to create temporary archive directory \"%s\""),
|
|
archive_dir.data);
|
|
log_detail("%s", strerror(errno));
|
|
termPQExpBuffer(&archive_dir);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_verbose(LOG_INFO, "temporary archive directory \"%s\" created", archive_dir.data);
|
|
}
|
|
}
|
|
else if (!S_ISDIR(statbuf.st_mode))
|
|
{
|
|
log_error(_("\"%s\" exists but is not a directory"),
|
|
archive_dir.data);
|
|
termPQExpBuffer(&archive_dir);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
arcdir = opendir(archive_dir.data);
|
|
|
|
/* always attempt to open the directory */
|
|
if (arcdir == NULL)
|
|
{
|
|
log_error(_("unable to open archive directory \"%s\""),
|
|
archive_dir.data);
|
|
log_detail("%s", strerror(errno));
|
|
termPQExpBuffer(&archive_dir);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
if (runtime_options.dry_run == false)
|
|
{
|
|
|
|
/*
|
|
* attempt to remove any existing files in the directory
|
|
* TODO: collate problem files into list
|
|
*/
|
|
while ((arcdir_ent = readdir(arcdir)) != NULL)
|
|
{
|
|
PQExpBufferData arcdir_ent_path;
|
|
|
|
initPQExpBuffer(&arcdir_ent_path);
|
|
|
|
appendPQExpBuffer(&arcdir_ent_path,
|
|
"%s/%s",
|
|
archive_dir.data,
|
|
arcdir_ent->d_name);
|
|
|
|
if (stat(arcdir_ent_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
|
|
{
|
|
termPQExpBuffer(&arcdir_ent_path);
|
|
continue;
|
|
}
|
|
|
|
if (unlink(arcdir_ent_path.data) == -1)
|
|
{
|
|
log_error(_("unable to delete file in temporary archive directory"));
|
|
log_detail(_("file is: \"%s\""), arcdir_ent_path.data);
|
|
log_detail("%s", strerror(errno));
|
|
closedir(arcdir);
|
|
termPQExpBuffer(&arcdir_ent_path);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
termPQExpBuffer(&arcdir_ent_path);
|
|
}
|
|
}
|
|
|
|
closedir(arcdir);
|
|
|
|
|
|
/*
|
|
* extract list of config files from --config-files
|
|
*/
|
|
{
|
|
int i = 0;
|
|
int j = 0;
|
|
int config_file_len = strlen(runtime_options.config_files);
|
|
|
|
char filenamebuf[MAXPGPATH] = "";
|
|
PQExpBufferData pathbuf;
|
|
|
|
for (j = 0; j < config_file_len; j++)
|
|
{
|
|
if (runtime_options.config_files[j] == ',')
|
|
{
|
|
int filename_len = j - i;
|
|
|
|
if (filename_len >= MAXPGPATH)
|
|
filename_len = MAXPGPATH - 1;
|
|
|
|
strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
|
|
|
|
filenamebuf[filename_len] = '\0';
|
|
|
|
initPQExpBuffer(&pathbuf);
|
|
|
|
appendPQExpBuffer(&pathbuf,
|
|
"%s/%s",
|
|
config_file_options.data_directory,
|
|
filenamebuf);
|
|
|
|
key_value_list_set(&config_files,
|
|
filenamebuf,
|
|
pathbuf.data);
|
|
termPQExpBuffer(&pathbuf);
|
|
i = j + 1;
|
|
}
|
|
}
|
|
|
|
if (i < config_file_len)
|
|
{
|
|
int filename_len = config_file_len - i;
|
|
|
|
strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
|
|
|
|
filenamebuf[filename_len] = '\0';
|
|
|
|
initPQExpBuffer(&pathbuf);
|
|
appendPQExpBuffer(&pathbuf,
|
|
"%s/%s",
|
|
config_file_options.data_directory,
|
|
filenamebuf);
|
|
|
|
key_value_list_set(&config_files,
|
|
filenamebuf,
|
|
pathbuf.data);
|
|
termPQExpBuffer(&pathbuf);
|
|
}
|
|
}
|
|
|
|
|
|
for (cell = config_files.head; cell; cell = cell->next)
|
|
{
|
|
PQExpBufferData dest_file;
|
|
|
|
initPQExpBuffer(&dest_file);
|
|
|
|
appendPQExpBuffer(&dest_file,
|
|
"%s/%s",
|
|
archive_dir.data,
|
|
cell->key);
|
|
|
|
if (stat(cell->value, &statbuf) == -1)
|
|
{
|
|
log_warning(_("specified file \"%s\" not found, skipping"),
|
|
cell->value);
|
|
}
|
|
else
|
|
{
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_info("file \"%s\" would be copied to \"%s\"",
|
|
cell->key, dest_file.data);
|
|
copied_count++;
|
|
}
|
|
else
|
|
{
|
|
log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
|
|
cell->key, dest_file.data);
|
|
copy_file(cell->value, dest_file.data);
|
|
copied_count++;
|
|
}
|
|
}
|
|
|
|
termPQExpBuffer(&dest_file);
|
|
}
|
|
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
log_verbose(LOG_INFO, _("%i files would have been copied to \"%s\""),
|
|
copied_count, archive_dir.data);
|
|
}
|
|
else
|
|
{
|
|
log_verbose(LOG_INFO, _("%i files copied to \"%s\""),
|
|
copied_count, archive_dir.data);
|
|
}
|
|
|
|
if (runtime_options.dry_run == true)
|
|
{
|
|
/*
|
|
* Delete directory in --dry-run mode - it should be empty unless it's been
|
|
* interfered with for some reason, in which case manual intervention is
|
|
* required
|
|
*/
|
|
if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
|
|
{
|
|
log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
|
|
log_detail("%s", strerror(errno));
|
|
log_hint(_("directory may need to be manually removed"));
|
|
}
|
|
else
|
|
{
|
|
log_verbose(LOG_INFO, "temporary archive directory \"%s\" deleted", archive_dir.data);
|
|
}
|
|
}
|
|
|
|
termPQExpBuffer(&archive_dir);
|
|
}
|
|
|
|
|
|
/*
|
|
* Intended mainly for "internal" use by `standby switchover`, which
|
|
* calls this on the target server to restore any configuration files
|
|
* to the data directory, which may have been overwritten by an operation
|
|
* like pg_rewind
|
|
*
|
|
* Not designed to be called if the instance is running, but does
|
|
* not currently check.
|
|
*
|
|
* Requires -D/--pgdata, optionally --config-archive-dir
|
|
*
|
|
* Removes --config-archive-dir after successful copy
|
|
*/
|
|
|
|
static void
|
|
_do_node_restore_config(void)
|
|
{
|
|
PQExpBufferData archive_dir;
|
|
|
|
DIR *arcdir;
|
|
struct dirent *arcdir_ent;
|
|
int copied_count = 0;
|
|
bool copy_ok = true;
|
|
|
|
initPQExpBuffer(&archive_dir);
|
|
|
|
format_archive_dir(&archive_dir);
|
|
|
|
arcdir = opendir(archive_dir.data);
|
|
|
|
if (arcdir == NULL)
|
|
{
|
|
log_error(_("unable to open archive directory \"%s\""),
|
|
archive_dir.data);
|
|
log_detail("%s", strerror(errno));
|
|
termPQExpBuffer(&archive_dir);
|
|
exit(ERR_BAD_CONFIG);
|
|
}
|
|
|
|
while ((arcdir_ent = readdir(arcdir)) != NULL)
|
|
{
|
|
struct stat statbuf;
|
|
PQExpBufferData src_file_path;
|
|
PQExpBufferData dest_file_path;
|
|
|
|
initPQExpBuffer(&src_file_path);
|
|
|
|
appendPQExpBuffer(&src_file_path,
|
|
"%s/%s",
|
|
archive_dir.data,
|
|
arcdir_ent->d_name);
|
|
|
|
/* skip non-files */
|
|
if (stat(src_file_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
|
|
{
|
|
termPQExpBuffer(&src_file_path);
|
|
continue;
|
|
}
|
|
|
|
initPQExpBuffer(&dest_file_path);
|
|
|
|
appendPQExpBuffer(&dest_file_path,
|
|
"%s/%s",
|
|
config_file_options.data_directory,
|
|
arcdir_ent->d_name);
|
|
|
|
log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
|
|
src_file_path.data, dest_file_path.data);
|
|
|
|
if (copy_file(src_file_path.data, dest_file_path.data) == false)
|
|
{
|
|
copy_ok = false;
|
|
log_warning(_("unable to copy \"%s\" to \"%s\""),
|
|
arcdir_ent->d_name, runtime_options.data_dir);
|
|
}
|
|
else
|
|
{
|
|
unlink(src_file_path.data);
|
|
copied_count++;
|
|
}
|
|
|
|
termPQExpBuffer(&dest_file_path);
|
|
termPQExpBuffer(&src_file_path);
|
|
}
|
|
|
|
closedir(arcdir);
|
|
|
|
log_notice(_("%i files copied to %s"),
|
|
copied_count,
|
|
config_file_options.data_directory);
|
|
|
|
if (copy_ok == false)
|
|
{
|
|
log_warning(_("unable to copy all files from \"%s\""), archive_dir.data);
|
|
}
|
|
else
|
|
{
|
|
/*
|
|
* Finally, delete directory - it should be empty unless it's been
|
|
* interfered with for some reason, in which case manual intervention is
|
|
* required
|
|
*/
|
|
if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
|
|
{
|
|
log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
|
|
log_detail("%s", strerror(errno));
|
|
log_hint(_("directory may need to be manually removed"));
|
|
}
|
|
else
|
|
{
|
|
log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data);
|
|
}
|
|
}
|
|
|
|
termPQExpBuffer(&archive_dir);
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
static void
|
|
format_archive_dir(PQExpBufferData *archive_dir)
|
|
{
|
|
appendPQExpBuffer(archive_dir,
|
|
"%s/repmgr-config-archive-%s",
|
|
runtime_options.config_archive_dir,
|
|
config_file_options.node_name);
|
|
|
|
log_verbose(LOG_DEBUG, "using archive directory \"%s\"", archive_dir->data);
|
|
}
|
|
|
|
|
|
/*
 * Copy the contents of "src_file" to "dest_file", creating or truncating
 * the destination. The destination's permissions are set to be
 * readable/writable by the owner only (on a best-effort basis).
 *
 * Returns true if the entire file was copied, false if either file could
 * not be opened or an error occurred while reading or writing. In the
 * latter case a partially written destination file may be left behind.
 */
static bool
copy_file(const char *src_file, const char *dest_file)
{
	FILE	   *ptr_old;
	FILE	   *ptr_new;
	char		buf[8192];
	size_t		bytes_read;
	bool		copy_ok = true;

	ptr_old = fopen(src_file, "rb");

	if (ptr_old == NULL)
		return false;

	ptr_new = fopen(dest_file, "wb");

	if (ptr_new == NULL)
	{
		fclose(ptr_old);
		return false;
	}

	/* restrict access before any content is written */
	chmod(dest_file, S_IRUSR | S_IWUSR);

	while ((bytes_read = fread(buf, 1, sizeof(buf), ptr_old)) > 0)
	{
		if (fwrite(buf, 1, bytes_read, ptr_new) != bytes_read)
		{
			copy_ok = false;
			break;
		}
	}

	/*
	 * fread() returns 0 on both end-of-file and error; only the former
	 * means the source was read completely.
	 */
	if (ferror(ptr_old))
		copy_ok = false;

	/* buffered data is flushed on close, so write errors may surface here */
	if (fclose(ptr_new) != 0)
		copy_ok = false;

	fclose(ptr_old);

	return copy_ok;
}
|
|
|
|
|
|
static const char *
|
|
output_repmgrd_status(CheckStatus status)
|
|
{
|
|
switch (status)
|
|
{
|
|
case CHECK_STATUS_OK:
|
|
return "repmgrd running";
|
|
case CHECK_STATUS_WARNING:
|
|
return "repmgrd running but paused";
|
|
case CHECK_STATUS_CRITICAL:
|
|
return "repmgrd not running";
|
|
case CHECK_STATUS_UNKNOWN:
|
|
return "repmgrd status unknown";
|
|
}
|
|
|
|
return "UNKNOWN";
|
|
}
|
|
|
|
|
|
void
|
|
do_node_help(void)
|
|
{
|
|
print_help_header();
|
|
|
|
printf(_("Usage:\n"));
|
|
printf(_(" %s [OPTIONS] node status\n"), progname());
|
|
printf(_(" %s [OPTIONS] node check\n"), progname());
|
|
printf(_(" %s [OPTIONS] node rejoin\n"), progname());
|
|
printf(_(" %s [OPTIONS] node service\n"), progname());
|
|
puts("");
|
|
|
|
printf(_("NODE STATUS\n"));
|
|
puts("");
|
|
printf(_(" \"node status\" displays an overview of a node's basic information and replication status.\n"));
|
|
puts("");
|
|
printf(_(" Configuration file required, runs on local node only.\n"));
|
|
puts("");
|
|
printf(_(" --csv emit output as CSV\n"));
|
|
puts("");
|
|
|
|
printf(_("NODE CHECK\n"));
|
|
puts("");
|
|
printf(_(" \"node check\" performs some health checks on a node from a replication perspective.\n"));
|
|
puts("");
|
|
printf(_(" Configuration file required, runs on local node only.\n"));
|
|
puts("");
|
|
printf(_(" Connection options:\n"));
|
|
printf(_(" -S, --superuser=USERNAME superuser to use, if repmgr user is not superuser\n"));
|
|
puts("");
|
|
printf(_(" Output options:\n"));
|
|
printf(_(" --csv emit output as CSV (not available for individual check output)\n"));
|
|
printf(_(" --nagios emit output in Nagios format (individual check output only)\n"));
|
|
puts("");
|
|
printf(_(" Following options check an individual status:\n"));
|
|
printf(_(" --archive-ready number of WAL files ready for archiving\n"));
|
|
printf(_(" --downstream whether all downstream nodes are connected\n"));
|
|
printf(_(" --upstream whether the node is connected to its upstream\n"));
|
|
printf(_(" --replication-lag replication lag in seconds (standbys only)\n"));
|
|
printf(_(" --role check node has expected role\n"));
|
|
printf(_(" --slots check for inactive replication slots\n"));
|
|
printf(_(" --missing-slots check for missing replication slots\n"));
|
|
printf(_(" --repmgrd check if repmgrd is running\n"));
|
|
printf(_(" --data-directory-config check repmgr's data directory configuration\n"));
|
|
|
|
puts("");
|
|
|
|
printf(_("NODE REJOIN\n"));
|
|
puts("");
|
|
printf(_(" \"node rejoin\" enables a dormant (stopped) node to be rejoined to the replication cluster.\n"));
|
|
puts("");
|
|
printf(_(" Configuration file required, runs on local node only.\n"));
|
|
puts("");
|
|
printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \
|
|
" (including usability of \"pg_rewind\" if requested)\n"));
|
|
printf(_(" --force-rewind[=VALUE] execute \"pg_rewind\" if necessary\n"));
|
|
printf(_(" (PostgreSQL 9.4 - provide full \"pg_rewind\" path)\n"));
|
|
|
|
printf(_(" --config-files comma-separated list of configuration files to retain\n" \
|
|
" after executing \"pg_rewind\"\n"));
|
|
printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \
|
|
" (default: /tmp)\n"));
|
|
printf(_(" -W, --no-wait don't wait for the node to rejoin cluster\n"));
|
|
printf(_(" -S, --superuser=USERNAME superuser to use for pg_rewind if repmgr user is not superuser\n"));
|
|
puts("");
|
|
|
|
printf(_("NODE SERVICE\n"));
|
|
puts("");
|
|
printf(_(" \"node service\" executes a system service command to stop/start/restart/reload a node\n" \
|
|
" or optionally display which command would be executed\n"));
|
|
puts("");
|
|
printf(_(" Configuration file required, runs on local node only.\n"));
|
|
puts("");
|
|
printf(_(" --dry-run show what action would be performed, but don't execute it\n"));
|
|
printf(_(" --action action to perform (one of \"start\", \"stop\", \"restart\" or \"reload\")\n"));
|
|
printf(_(" --list-actions show what command would be performed for each action\n"));
|
|
printf(_(" --checkpoint issue a CHECKPOINT before stopping or restarting the node\n"));
|
|
printf(_(" -S, --superuser=USERNAME superuser to use, if repmgr user is not superuser\n"));
|
|
|
|
puts("");
|
|
|
|
printf(_("%s home page: <%s>\n"), "repmgr", REPMGR_URL);
|
|
}
|