Files
repmgr/repmgr-action-node.c
Ian Barwick 3ccf1cf182 Enable pg_rewind to be used with PostgreSQL 9.3/9.4
pg_rewind is not part of the core distribution for those, but we
provided support in repmgr 3.3 so should extend it to repmgr 4.

Note that there is no check in place whether the pg_rewind binary
exists, so it's up to the user to ensure it's present.

Addresses GitHub #413.
2018-04-02 20:54:29 +09:00

2494 lines
58 KiB
C

/*
* repmgr-action-node.c
*
* Implements actions available for any kind of node
*
* Copyright (c) 2ndQuadrant, 2010-2018
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/stat.h>
#include <dirent.h>
#include "repmgr.h"
#include "controldata.h"
#include "dirutil.h"
#include "dbutils.h"
#include "compat.h"
#include "repmgr-client-global.h"
#include "repmgr-action-node.h"
#include "repmgr-action-standby.h"
static bool copy_file(const char *src_file, const char *dest_file);
static void format_archive_dir(PQExpBufferData *archive_dir);
static t_server_action parse_server_action(const char *action);
static void _do_node_service_list_actions(t_server_action action);
static void _do_node_status_is_shutdown_cleanly(void);
static void _do_node_archive_config(void);
static void _do_node_restore_config(void);
static void do_node_check_replication_connection(void);
static CheckStatus do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
static CheckStatus do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output);
static CheckStatus do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
static CheckStatus do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
static CheckStatus do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output);
/*
* NODE STATUS
*
* Can only be run on the local node, as it needs to be able to
* read the data directory.
*
* Parameters:
* --is-shutdown-cleanly (for internal use only)
* --csv
*/
void
do_node_status(void)
{
PGconn *conn = NULL;
t_node_info node_info = T_NODE_INFO_INITIALIZER;
char server_version[MAXLEN];
char cluster_size[MAXLEN];
PQExpBufferData output;
KeyValueList node_status = {NULL, NULL};
KeyValueListCell *cell = NULL;
ItemList warnings = {NULL, NULL};
RecoveryType recovery_type = RECTYPE_UNKNOWN;
ReplInfo replication_info = T_REPLINFO_INTIALIZER;
t_recovery_conf recovery_conf = T_RECOVERY_CONF_INITIALIZER;
char data_dir[MAXPGPATH] = "";
if (runtime_options.is_shutdown_cleanly == true)
{
return _do_node_status_is_shutdown_cleanly();
}
/* config file required, so we should have "conninfo" and "data_directory" */
conn = establish_db_connection(config_file_options.conninfo, true);
strncpy(data_dir, config_file_options.data_directory, MAXPGPATH);
server_version_num = get_server_version(conn, NULL);
/* Check node exists and is really a standby */
if (get_node_record_with_upstream(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
{
log_error(_("no record found for node %i"), config_file_options.node_id);
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
(void) get_server_version(conn, server_version);
if (get_cluster_size(conn, cluster_size) == false)
strncpy(cluster_size, _("unknown"), MAXLEN);
recovery_type = get_recovery_type(conn);
get_node_replication_stats(conn, server_version_num, &node_info);
key_value_list_set(
&node_status,
"PostgreSQL version",
server_version);
key_value_list_set(
&node_status,
"Total data size",
cluster_size);
key_value_list_set(
&node_status,
"Conninfo",
node_info.conninfo);
if (runtime_options.verbose == true)
{
uint64 local_system_identifier = UNKNOWN_SYSTEM_IDENTIFIER;
local_system_identifier = get_system_identifier(config_file_options.data_directory);
key_value_list_set_format(
&node_status,
"System identifier",
"%lu", local_system_identifier);
}
key_value_list_set(
&node_status,
"Role",
get_node_type_string(node_info.type));
switch (node_info.type)
{
case PRIMARY:
if (recovery_type == RECTYPE_STANDBY)
{
item_list_append(
&warnings,
_("- node is registered as primary but running as standby"));
}
break;
case STANDBY:
if (recovery_type == RECTYPE_PRIMARY)
{
item_list_append(
&warnings,
_("- node is registered as standby but running as primary"));
}
break;
case BDR:
default:
break;
}
if (guc_set(conn, "archive_mode", "=", "off"))
{
key_value_list_set(
&node_status,
"WAL archiving",
"off");
key_value_list_set(
&node_status,
"Archive command",
"(none)");
}
else
{
bool enabled = true;
PQExpBufferData archiving_status;
char archive_command[MAXLEN] = "";
initPQExpBuffer(&archiving_status);
if (recovery_type == RECTYPE_STANDBY)
{
if (guc_set(conn, "archive_mode", "=", "on"))
enabled = false;
}
if (enabled == true)
{
appendPQExpBuffer(&archiving_status, "enabled");
}
else
{
appendPQExpBuffer(&archiving_status, "disabled");
}
if (enabled == false && recovery_type == RECTYPE_STANDBY)
{
appendPQExpBuffer(&archiving_status, " (on standbys \"archive_mode\" must be set to \"always\" to be effective)");
}
key_value_list_set(
&node_status,
"WAL archiving",
archiving_status.data);
termPQExpBuffer(&archiving_status);
get_pg_setting(conn, "archive_command", archive_command);
key_value_list_set(
&node_status,
"Archive command",
archive_command);
}
{
int ready_files;
ready_files = get_ready_archive_files(conn, data_dir);
if (runtime_options.output_mode == OM_CSV)
{
key_value_list_set_format(
&node_status,
"WALs pending archiving",
"%i",
ready_files);
}
else
{
key_value_list_set_format(
&node_status,
"WALs pending archiving",
"%i pending files",
ready_files);
}
if (guc_set(conn, "archive_mode", "=", "off"))
{
key_value_list_set_output_mode(&node_status, "WALs pending archiving", OM_CSV);
}
}
if (node_info.max_wal_senders >= 0)
{
/* In CSV mode, raw values supplied as well */
key_value_list_set_format(&node_status,
"Replication connections",
"%i (of maximal %i)",
node_info.attached_wal_receivers,
node_info.max_wal_senders);
}
else if (node_info.max_wal_senders == 0)
{
key_value_list_set_format(&node_status,
"Replication connections",
"disabled");
}
if (server_version_num < 90400)
{
key_value_list_set(&node_status,
"Replication slots",
"not available");
}
else if (node_info.max_replication_slots > 0)
{
PQExpBufferData slotinfo;
initPQExpBuffer(&slotinfo);
appendPQExpBuffer(&slotinfo,
"%i (of maximal %i)",
node_info.active_replication_slots + node_info.inactive_replication_slots,
node_info.max_replication_slots);
if (node_info.inactive_replication_slots > 0)
{
appendPQExpBuffer(&slotinfo,
"; %i inactive",
node_info.inactive_replication_slots);
item_list_append_format(&warnings,
_("- node has %i inactive replication slots"),
node_info.inactive_replication_slots);
}
key_value_list_set(&node_status,
"Replication slots",
slotinfo.data);
termPQExpBuffer(&slotinfo);
}
else if (node_info.max_replication_slots == 0)
{
key_value_list_set(&node_status,
"Replication slots",
"disabled");
}
/*
* check for missing replication slots - we do this regardless of
* what "max_replication_slots" is set to, in case the downstream
* node was configured with "use_replication_slots=true" and is
* expecting a replication slot to be available
*/
{
NodeInfoList missing_slots = T_NODE_INFO_LIST_INITIALIZER;
get_downstream_nodes_with_missing_slot(conn,
config_file_options.node_id,
&missing_slots);
if (missing_slots.node_count > 0)
{
NodeInfoListCell *missing_slot_cell = NULL;
item_list_append_format(&warnings,
_("- replication slots missing for following %i node(s):"),
missing_slots.node_count);
for (missing_slot_cell = missing_slots.head; missing_slot_cell; missing_slot_cell = missing_slot_cell->next)
{
item_list_append_format(&warnings,
_(" - %s (ID: %i, slot name: \"%s\")"),
missing_slot_cell->node_info->node_name,
missing_slot_cell->node_info->node_id,
missing_slot_cell->node_info->slot_name);
}
}
}
if (node_info.type == STANDBY)
{
key_value_list_set_format(&node_status,
"Upstream node",
"%s (ID: %i)",
node_info.upstream_node_name,
node_info.upstream_node_id);
get_replication_info(conn, &replication_info);
key_value_list_set_format(&node_status,
"Replication lag",
"%i seconds",
replication_info.replication_lag_time);
key_value_list_set_format(&node_status,
"Last received LSN",
"%X/%X", format_lsn(replication_info.last_wal_receive_lsn));
key_value_list_set_format(&node_status,
"Last replayed LSN",
"%X/%X", format_lsn(replication_info.last_wal_replay_lsn));
}
else
{
key_value_list_set(&node_status,
"Upstream node",
"(none)");
key_value_list_set_output_mode(&node_status,
"Upstream node",
OM_CSV);
key_value_list_set(&node_status,
"Replication lag",
"n/a");
key_value_list_set(&node_status,
"Last received LSN",
"(none)");
key_value_list_set_output_mode(&node_status,
"Last received LSN",
OM_CSV);
key_value_list_set(&node_status,
"Last replayed LSN",
"(none)");
key_value_list_set_output_mode(&node_status,
"Last replayed LSN",
OM_CSV);
}
parse_recovery_conf(data_dir, &recovery_conf);
/* format output */
initPQExpBuffer(&output);
if (runtime_options.output_mode == OM_CSV)
{
appendPQExpBuffer(&output,
"\"Node name\",\"%s\"\n",
node_info.node_name);
appendPQExpBuffer(&output,
"\"Node ID\",\"%i\"\n",
node_info.node_id);
for (cell = node_status.head; cell; cell = cell->next)
{
appendPQExpBuffer(&output,
"\"%s\",\"%s\"\n",
cell->key, cell->value);
}
/* we'll add the raw data as well */
appendPQExpBuffer(&output,
"\"max_wal_senders\",%i\n",
node_info.max_wal_senders);
appendPQExpBuffer(&output,
"\"occupied_wal_senders\",%i\n",
node_info.attached_wal_receivers);
appendPQExpBuffer(&output,
"\"max_replication_slots\",%i\n",
node_info.max_replication_slots);
appendPQExpBuffer(&output,
"\"active_replication_slots\",%i\n",
node_info.active_replication_slots);
appendPQExpBuffer(&output,
"\"inactive_replaction_slots\",%i\n",
node_info.inactive_replication_slots);
}
else
{
appendPQExpBuffer(&output,
"Node \"%s\":\n",
node_info.node_name);
for (cell = node_status.head; cell; cell = cell->next)
{
if (cell->output_mode == OM_NOT_SET)
appendPQExpBuffer(&output,
"\t%s: %s\n",
cell->key, cell->value);
}
}
puts(output.data);
termPQExpBuffer(&output);
if (warnings.head != NULL && runtime_options.terse == false)
{
log_warning(_("following issue(s) were detected:"));
print_item_list(&warnings);
/* add this when functionality implemented */
/* log_hint(_("execute \"repmgr node check\" for more details")); */
}
key_value_list_free(&node_status);
item_list_free(&warnings);
PQfinish(conn);
}
/*
* Returns information about the running state of the node.
* For internal use during "standby switchover".
*
* Returns "longopt" output:
*
* --status=(RUNNING|SHUTDOWN|UNCLEAN_SHUTDOWN|UNKNOWN)
* --last-checkpoint=...
*/
static
void
_do_node_status_is_shutdown_cleanly(void)
{
PGPing ping_status;
PQExpBufferData output;
DBState db_state;
XLogRecPtr checkPoint = InvalidXLogRecPtr;
NodeStatus node_status = NODE_STATUS_UNKNOWN;
initPQExpBuffer(&output);
appendPQExpBuffer(&output,
"--state=");
/* sanity-check we're dealing with a PostgreSQL directory */
if (is_pg_dir(config_file_options.data_directory) == false)
{
appendPQExpBuffer(&output, "UNKNOWN");
printf("%s\n", output.data);
termPQExpBuffer(&output);
return;
}
ping_status = PQping(config_file_options.conninfo);
switch (ping_status)
{
case PQPING_OK:
node_status = NODE_STATUS_UP;
break;
case PQPING_REJECT:
node_status = NODE_STATUS_UP;
break;
case PQPING_NO_ATTEMPT:
case PQPING_NO_RESPONSE:
/* status not yet clear */
break;
}
/* check what pg_controldata says */
db_state = get_db_state(config_file_options.data_directory);
log_verbose(LOG_DEBUG, "db state now: %s", describe_db_state(db_state));
if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
{
if (node_status != NODE_STATUS_UP)
{
node_status = NODE_STATUS_UNCLEAN_SHUTDOWN;
}
/* server is still responding but shutting down */
else if (db_state == DB_SHUTDOWNING)
{
node_status = NODE_STATUS_SHUTTING_DOWN;
}
}
checkPoint = get_latest_checkpoint_location(config_file_options.data_directory);
/* unable to read pg_control, don't know what's happening */
if (checkPoint == InvalidXLogRecPtr)
{
node_status = NODE_STATUS_UNKNOWN;
}
/*
* if still "UNKNOWN" at this point, then the node must be cleanly shut
* down
*/
else if (node_status == NODE_STATUS_UNKNOWN)
{
node_status = NODE_STATUS_DOWN;
}
log_verbose(LOG_DEBUG, "node status determined as: %s", print_node_status(node_status));
switch (node_status)
{
case NODE_STATUS_UP:
appendPQExpBuffer(&output, "RUNNING");
break;
case NODE_STATUS_SHUTTING_DOWN:
appendPQExpBuffer(&output, "SHUTTING_DOWN");
break;
case NODE_STATUS_DOWN:
appendPQExpBuffer(&output,
"SHUTDOWN --last-checkpoint-lsn=%X/%X",
format_lsn(checkPoint));
break;
case NODE_STATUS_UNCLEAN_SHUTDOWN:
appendPQExpBuffer(&output, "UNCLEAN_SHUTDOWN");
break;
case NODE_STATUS_UNKNOWN:
appendPQExpBuffer(&output, "UNKNOWN");
break;
}
printf("%s\n", output.data);
termPQExpBuffer(&output);
return;
}
/*
* Configuration file required
*/
void
do_node_check(void)
{
PGconn *conn = NULL;
PQExpBufferData output;
t_node_info node_info = T_NODE_INFO_INITIALIZER;
CheckStatus return_code;
CheckStatusList status_list = {NULL, NULL};
CheckStatusListCell *cell = NULL;
/* internal */
if (runtime_options.has_passfile == true)
{
return_code = has_passfile() ? 0 : 1;
exit(return_code);
}
if (runtime_options.replication_connection == true)
{
do_node_check_replication_connection();
exit(SUCCESS);
}
if (strlen(config_file_options.conninfo))
conn = establish_db_connection(config_file_options.conninfo, true);
else
conn = establish_db_connection_by_params(&source_conninfo, true);
if (get_node_record(conn, config_file_options.node_id, &node_info) != RECORD_FOUND)
{
log_error(_("no record found for node %i"), config_file_options.node_id);
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
server_version_num = get_server_version(conn, NULL);
/* add replication statistics to node record */
get_node_replication_stats(conn, server_version_num, &node_info);
/*
* handle specific checks ======================
*/
if (runtime_options.archive_ready == true)
{
return_code = do_node_check_archive_ready(conn,
runtime_options.output_mode,
NULL);
PQfinish(conn);
exit(return_code);
}
if (runtime_options.downstream == true)
{
return_code = do_node_check_downstream(conn,
runtime_options.output_mode,
NULL);
PQfinish(conn);
exit(return_code);
}
if (runtime_options.replication_lag == true)
{
return_code = do_node_check_replication_lag(conn,
runtime_options.output_mode,
&node_info,
NULL);
PQfinish(conn);
exit(return_code);
}
if (runtime_options.role == true)
{
return_code = do_node_check_role(conn,
runtime_options.output_mode,
&node_info,
NULL);
PQfinish(conn);
exit(return_code);
}
if (runtime_options.slots == true)
{
return_code = do_node_check_slots(conn,
runtime_options.output_mode,
&node_info,
NULL);
PQfinish(conn);
exit(return_code);
}
if (runtime_options.output_mode == OM_NAGIOS)
{
log_error(_("--nagios can only be used with a specific check"));
log_hint(_("execute \"repmgr node --help\" for details"));
PQfinish(conn);
return;
}
/* output general overview */
initPQExpBuffer(&output);
/* order functions are called is also output order */
(void) do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list);
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list);
(void) do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list);
(void) do_node_check_downstream(conn, runtime_options.output_mode, &status_list);
(void) do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list);
if (runtime_options.output_mode == OM_CSV)
{
/* TODO */
}
else
{
appendPQExpBuffer(
&output,
"Node \"%s\":\n",
node_info.node_name);
for (cell = status_list.head; cell; cell = cell->next)
{
appendPQExpBuffer(
&output,
"\t%s: %s",
cell->item,
output_check_status(cell->status));
if (strlen(cell->details))
{
appendPQExpBuffer(
&output,
" (%s)",
cell->details);
}
appendPQExpBuffer(&output, "\n");
}
}
printf("%s", output.data);
termPQExpBuffer(&output);
check_status_list_free(&status_list);
PQfinish(conn);
}
static CheckStatus
do_node_check_role(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
{
CheckStatus status = CHECK_STATUS_OK;
PQExpBufferData details;
RecoveryType recovery_type = get_recovery_type(conn);
if (mode == OM_CSV)
{
log_error(_("--csv output not provided with --role option"));
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
initPQExpBuffer(&details);
switch (node_info->type)
{
case PRIMARY:
if (recovery_type == RECTYPE_STANDBY)
{
status = CHECK_STATUS_CRITICAL;
appendPQExpBuffer(&details,
_("node is registered as primary but running as standby"));
}
else
{
appendPQExpBuffer(&details,
_("node is primary"));
}
break;
case STANDBY:
if (recovery_type == RECTYPE_PRIMARY)
{
status = CHECK_STATUS_CRITICAL;
appendPQExpBuffer(&details,
_("node is registered as standby but running as primary"));
}
else
{
appendPQExpBuffer(&details,
_("node is standby"));
}
break;
case BDR:
{
PQExpBufferData output;
initPQExpBuffer(&output);
if (is_bdr_db(conn, &output) == false)
{
status = CHECK_STATUS_CRITICAL;
appendPQExpBuffer(&details,
"%s", output.data);
}
termPQExpBuffer(&output);
if (status == CHECK_STATUS_OK)
{
if (is_active_bdr_node(conn, node_info->node_name) == false)
{
status = CHECK_STATUS_CRITICAL;
appendPQExpBuffer(&details,
_("node is not an active BDR node"));
}
else
{
appendPQExpBuffer(&details,
_("node is an active BDR node"));
}
}
}
default:
break;
}
switch (mode)
{
case OM_NAGIOS:
printf("REPMGR_SERVER_ROLE %s: %s\n",
output_check_status(status),
details.data);
break;
case OM_TEXT:
if (list_output != NULL)
{
check_status_list_set(list_output,
"Server role",
status,
details.data);
}
else
{
printf("%s (%s)\n",
output_check_status(status),
details.data);
}
default:
break;
}
termPQExpBuffer(&details);
return status;
}
static CheckStatus
do_node_check_slots(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
{
CheckStatus status = CHECK_STATUS_OK;
PQExpBufferData details;
initPQExpBuffer(&details);
if (server_version_num < 90400)
{
appendPQExpBuffer(&details,
_("replication slots not available for this PostgreSQL version"));
}
else if (node_info->total_replication_slots == 0)
{
appendPQExpBuffer(&details,
_("node has no replication slots"));
}
else if (node_info->inactive_replication_slots == 0)
{
appendPQExpBuffer(&details,
_("%i of %i replication slots are active"),
node_info->total_replication_slots,
node_info->total_replication_slots);
}
else if (node_info->inactive_replication_slots > 0)
{
status = CHECK_STATUS_CRITICAL;
appendPQExpBuffer(&details,
_("%i of %i replication slots are inactive"),
node_info->inactive_replication_slots,
node_info->total_replication_slots);
}
switch (mode)
{
case OM_NAGIOS:
printf("REPMGR_INACTIVE_SLOTS %s: %s | slots=%i;%i\n",
output_check_status(status),
details.data,
node_info->total_replication_slots,
node_info->inactive_replication_slots);
break;
case OM_TEXT:
if (list_output != NULL)
{
check_status_list_set(list_output,
"Replication slots",
status,
details.data);
}
else
{
printf("%s (%s)\n",
output_check_status(status),
details.data);
}
default:
break;
}
termPQExpBuffer(&details);
return status;
}
static void
do_node_check_replication_connection(void)
{
PGconn *local_conn = NULL;
PGconn *repl_conn = NULL;
t_node_info node_record = T_NODE_INFO_INITIALIZER;
RecordStatus record_status = RECORD_NOT_FOUND;
t_conninfo_param_list remote_conninfo;
PQExpBufferData output;
initPQExpBuffer(&output);
appendPQExpBuffer(&output,
"--connection=");
if (runtime_options.remote_node_id == UNKNOWN_NODE_ID)
{
appendPQExpBuffer(&output, "UNKNOWN");
printf("%s\n", output.data);
termPQExpBuffer(&output);
return;
}
local_conn = establish_db_connection(config_file_options.conninfo, true);
record_status = get_node_record(local_conn, runtime_options.remote_node_id, &node_record);
PQfinish(local_conn);
if (record_status != RECORD_FOUND)
{
appendPQExpBuffer(&output, "UNKNOWN");
printf("%s\n", output.data);
termPQExpBuffer(&output);
return;
}
initialize_conninfo_params(&remote_conninfo, false);
parse_conninfo_string(node_record.conninfo, &remote_conninfo, NULL, false);
param_set(&remote_conninfo, "replication", "1");
param_set(&remote_conninfo, "user", node_record.repluser);
repl_conn = establish_db_connection_by_params(&remote_conninfo, false);
if (PQstatus(repl_conn) != CONNECTION_OK)
{
appendPQExpBuffer(&output, "BAD");
printf("%s\n", output.data);
termPQExpBuffer(&output);
return;
}
PQfinish(repl_conn);
appendPQExpBuffer(&output, "OK");
printf("%s\n", output.data);
termPQExpBuffer(&output);
return;
}
static CheckStatus
do_node_check_archive_ready(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
{
int ready_archive_files = 0;
CheckStatus status = CHECK_STATUS_UNKNOWN;
PQExpBufferData details;
if (mode == OM_CSV)
{
log_error(_("--csv output not provided with --archive-ready option"));
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
initPQExpBuffer(&details);
ready_archive_files = get_ready_archive_files(conn, config_file_options.data_directory);
if (ready_archive_files > config_file_options.archive_ready_critical)
{
status = CHECK_STATUS_CRITICAL;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(&details,
"--files=%i --threshold=%i",
ready_archive_files, config_file_options.archive_ready_critical);
break;
case OM_NAGIOS:
appendPQExpBuffer(&details,
"%i pending archive ready files | files=%i;%i;%i",
ready_archive_files,
ready_archive_files,
config_file_options.archive_ready_warning,
config_file_options.archive_ready_critical);
break;
case OM_TEXT:
appendPQExpBuffer(&details,
"%i pending archive ready files, critical threshold: %i",
ready_archive_files, config_file_options.archive_ready_critical);
break;
default:
break;
}
}
else if (ready_archive_files > config_file_options.archive_ready_warning)
{
status = CHECK_STATUS_WARNING;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(&details,
"--files=%i --threshold=%i",
ready_archive_files, config_file_options.archive_ready_warning);
break;
case OM_NAGIOS:
appendPQExpBuffer(&details,
"%i pending archive ready files | files=%i;%i;%i",
ready_archive_files,
ready_archive_files,
config_file_options.archive_ready_warning,
config_file_options.archive_ready_critical);
break;
case OM_TEXT:
appendPQExpBuffer(&details,
"%i pending archive ready files (threshold: %i)",
ready_archive_files, config_file_options.archive_ready_warning);
break;
default:
break;
}
}
else if (ready_archive_files < 0)
{
status = CHECK_STATUS_UNKNOWN;
switch (mode)
{
case OM_OPTFORMAT:
break;
case OM_NAGIOS:
case OM_TEXT:
appendPQExpBuffer(
&details,
"unable to check archive_status directory");
break;
default:
break;
}
}
else
{
status = CHECK_STATUS_OK;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(&details,
"--files=%i", ready_archive_files);
break;
case OM_NAGIOS:
appendPQExpBuffer(&details,
"%i pending archive ready files | files=%i;%i;%i",
ready_archive_files,
ready_archive_files,
config_file_options.archive_ready_warning,
config_file_options.archive_ready_critical);
break;
case OM_TEXT:
appendPQExpBuffer(&details,
"%i pending archive ready files", ready_archive_files);
break;
default:
break;
}
}
switch (mode)
{
case OM_OPTFORMAT:
{
printf("--status=%s %s\n",
output_check_status(status),
details.data);
}
break;
case OM_NAGIOS:
printf("REPMGR_ARCHIVE_READY %s: %s\n",
output_check_status(status),
details.data);
break;
case OM_TEXT:
if (list_output != NULL)
{
check_status_list_set(list_output,
"WAL archiving",
status,
details.data);
}
else
{
printf("%s (%s)\n",
output_check_status(status),
details.data);
}
default:
break;
}
termPQExpBuffer(&details);
return status;
}
static CheckStatus
do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_info, CheckStatusList *list_output)
{
CheckStatus status = CHECK_STATUS_OK;
int lag_seconds = 0;
PQExpBufferData details;
if (mode == OM_CSV)
{
log_error(_("--csv output not provided with --replication-lag option"));
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
initPQExpBuffer(&details);
if (node_info->recovery_type == RECTYPE_PRIMARY)
{
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(
&details,
"--lag=0");
break;
case OM_NAGIOS:
appendPQExpBuffer(
&details,
"0 seconds | lag=0;%i;%i",
config_file_options.replication_lag_warning,
config_file_options.replication_lag_critical);
break;
case OM_TEXT:
appendPQExpBuffer(
&details,
"N/A - node is primary");
break;
default:
break;
}
}
else
{
lag_seconds = get_replication_lag_seconds(conn);
log_debug("lag seconds: %i", lag_seconds);
if (lag_seconds >= config_file_options.replication_lag_critical)
{
status = CHECK_STATUS_CRITICAL;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(&details,
"--lag=%i --threshold=%i",
lag_seconds, config_file_options.replication_lag_critical);
break;
case OM_NAGIOS:
appendPQExpBuffer(&details,
"%i seconds | lag=%i;%i;%i",
lag_seconds,
lag_seconds,
config_file_options.replication_lag_warning,
config_file_options.replication_lag_critical);
break;
case OM_TEXT:
appendPQExpBuffer(&details,
"%i seconds, critical threshold: %i)",
lag_seconds, config_file_options.replication_lag_critical);
break;
default:
break;
}
}
else if (lag_seconds > config_file_options.replication_lag_warning)
{
status = CHECK_STATUS_WARNING;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(&details,
"--lag=%i --threshold=%i",
lag_seconds, config_file_options.replication_lag_warning);
break;
case OM_NAGIOS:
appendPQExpBuffer(&details,
"%i seconds | lag=%i;%i;%i",
lag_seconds,
lag_seconds,
config_file_options.replication_lag_warning,
config_file_options.replication_lag_critical);
break;
case OM_TEXT:
appendPQExpBuffer(&details,
"%i seconds, warning threshold: %i)",
lag_seconds, config_file_options.replication_lag_warning);
break;
default:
break;
}
}
else if (lag_seconds < 0)
{
status = CHECK_STATUS_UNKNOWN;
switch (mode)
{
case OM_OPTFORMAT:
break;
case OM_NAGIOS:
case OM_TEXT:
appendPQExpBuffer(
&details,
"unable to query replication lag");
break;
default:
break;
}
}
else
{
status = CHECK_STATUS_OK;
switch (mode)
{
case OM_OPTFORMAT:
appendPQExpBuffer(&details,
"--lag=%i",
lag_seconds);
break;
case OM_NAGIOS:
appendPQExpBuffer(&details,
"%i seconds | lag=%i;%i;%i",
lag_seconds,
lag_seconds,
config_file_options.replication_lag_warning,
config_file_options.replication_lag_critical);
break;
case OM_TEXT:
appendPQExpBuffer(&details,
"%i seconds",
lag_seconds);
break;
default:
break;
}
}
}
switch (mode)
{
case OM_OPTFORMAT:
{
printf("--status=%s %s\n",
output_check_status(status),
details.data);
}
break;
case OM_NAGIOS:
printf("REPMGR_REPLICATION_LAG %s: %s\n",
output_check_status(status),
details.data);
break;
case OM_TEXT:
if (list_output != NULL)
{
check_status_list_set(list_output,
"Replication lag",
status,
details.data);
}
else
{
printf("%s (%s)\n",
output_check_status(status),
details.data);
}
default:
break;
}
termPQExpBuffer(&details);
return status;
}
/* TODO: ensure only runs on streaming replication nodes */
static CheckStatus
do_node_check_downstream(PGconn *conn, OutputMode mode, CheckStatusList *list_output)
{
NodeInfoList downstream_nodes = T_NODE_INFO_LIST_INITIALIZER;
NodeInfoListCell *cell = NULL;
int missing_nodes_count = 0;
CheckStatus status = CHECK_STATUS_OK;
ItemList missing_nodes = {NULL, NULL};
ItemList attached_nodes = {NULL, NULL};
PQExpBufferData details;
initPQExpBuffer(&details);
get_downstream_node_records(conn, config_file_options.node_id, &downstream_nodes);
for (cell = downstream_nodes.head; cell; cell = cell->next)
{
if (is_downstream_node_attached(conn, cell->node_info->node_name) == false)
{
missing_nodes_count++;
item_list_append_format(&missing_nodes,
"%s (ID: %i)",
cell->node_info->node_name,
cell->node_info->node_id);
}
else
{
item_list_append_format(&attached_nodes,
"%s (ID: %i)",
cell->node_info->node_name,
cell->node_info->node_id);
}
}
if (missing_nodes_count == 0)
{
if (downstream_nodes.node_count == 0)
appendPQExpBuffer(
&details,
"this node has no downstream nodes");
else
appendPQExpBuffer(
&details,
"%i of %i downstream nodes attached",
downstream_nodes.node_count,
downstream_nodes.node_count);
}
else
{
ItemListCell *missing_cell = NULL;
bool first = true;
status = CHECK_STATUS_CRITICAL;
appendPQExpBuffer(
&details,
"%i of %i downstream nodes not attached",
missing_nodes_count,
downstream_nodes.node_count);
if (mode != OM_NAGIOS)
{
appendPQExpBuffer(
&details, "; missing: ");
for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
{
if (first == false)
appendPQExpBuffer(
&details,
", ");
else
first = false;
if (first == false)
appendPQExpBuffer(
&details,
"%s", missing_cell->string);
}
}
}
switch (mode)
{
case OM_NAGIOS:
{
printf("REPMGR_DOWNSTREAM_SERVERS %s: %s | ",
output_check_status(status),
details.data);
if (missing_nodes_count)
{
ItemListCell *missing_cell = NULL;
bool first = true;
printf("missing: ");
for (missing_cell = missing_nodes.head; missing_cell; missing_cell = missing_cell->next)
{
if (first == false)
printf(", ");
else
first = false;
if (first == false)
printf("%s", missing_cell->string);
}
}
if (downstream_nodes.node_count - missing_nodes_count)
{
ItemListCell *attached_cell = NULL;
bool first = true;
if (missing_nodes_count)
printf("; ");
printf("attached: ");
for (attached_cell = attached_nodes.head; attached_cell; attached_cell = attached_cell->next)
{
if (first == false)
printf(", ");
else
first = false;
if (first == false)
printf("%s", attached_cell->string);
}
}
printf("\n");
}
break;
case OM_TEXT:
if (list_output != NULL)
{
check_status_list_set(list_output,
"Downstream servers",
status,
details.data);
}
else
{
printf("%s (%s)\n",
output_check_status(status),
details.data);
}
default:
break;
}
termPQExpBuffer(&details);
clear_node_info_list(&downstream_nodes);
return status;
}
void
do_node_service(void)
{
t_server_action action = ACTION_UNKNOWN;
char data_dir[MAXPGPATH] = "";
char command[MAXLEN] = "";
PQExpBufferData output;
action = parse_server_action(runtime_options.action);
if (action == ACTION_UNKNOWN)
{
log_error(_("unknown value \"%s\" provided for parameter --action"),
runtime_options.action);
log_hint(_("valid values are \"start\", \"stop\", \"restart\", \"reload\" and \"promote\""));
exit(ERR_BAD_CONFIG);
}
if (runtime_options.list_actions == true)
{
return _do_node_service_list_actions(action);
}
if (data_dir_required_for_action(action))
{
get_node_data_directory(data_dir);
if (data_dir[0] == '\0')
{
log_error(_("unable to determine data directory for action"));
exit(ERR_BAD_CONFIG);
}
}
if ((action == ACTION_STOP || action == ACTION_RESTART) && runtime_options.checkpoint == true)
{
if (runtime_options.dry_run == true)
{
log_info(_("a CHECKPOINT would be issued here"));
}
else
{
PGconn *conn = NULL;
if (strlen(config_file_options.conninfo))
conn = establish_db_connection(config_file_options.conninfo, true);
else
conn = establish_db_connection_by_params(&source_conninfo, true);
log_notice(_("issuing CHECKPOINT"));
/* check superuser conn! */
checkpoint(conn);
PQfinish(conn);
}
}
get_server_action(action, command, data_dir);
if (runtime_options.dry_run == true)
{
log_info(_("would execute server command \"%s\""), command);
return;
}
/*
* log level is "DETAIL" here as this command is intended to be executed
* by another repmgr process (e.g. during standby switchover); that repmgr
* should emit a "NOTICE" about the intent of the command.
*/
log_detail(_("executing server command \"%s\""), command);
initPQExpBuffer(&output);
if (local_command(command, &output) == false)
{
termPQExpBuffer(&output);
exit(ERR_LOCAL_COMMAND);
}
termPQExpBuffer(&output);
}
static void
_do_node_service_list_actions(t_server_action action)
{
char command[MAXLEN] = "";
char data_dir[MAXPGPATH] = "";
bool data_dir_required = false;
/* do we need to provide a data directory for any of the actions? */
if (data_dir_required_for_action(ACTION_START))
data_dir_required = true;
if (data_dir_required_for_action(ACTION_STOP))
data_dir_required = true;
if (data_dir_required_for_action(ACTION_RESTART))
data_dir_required = true;
if (data_dir_required_for_action(ACTION_RELOAD))
data_dir_required = true;
if (data_dir_required_for_action(ACTION_PROMOTE))
data_dir_required = true;
if (data_dir_required == true)
{
get_node_data_directory(data_dir);
}
/* show command for specific action only */
if (action != ACTION_NONE)
{
get_server_action(action, command, data_dir);
printf("%s\n", command);
return;
}
puts(_("Following commands would be executed for each action:"));
puts("");
get_server_action(ACTION_START, command, data_dir);
printf(" start: \"%s\"\n", command);
get_server_action(ACTION_STOP, command, data_dir);
printf(" stop: \"%s\"\n", command);
get_server_action(ACTION_RESTART, command, data_dir);
printf(" restart: \"%s\"\n", command);
get_server_action(ACTION_RELOAD, command, data_dir);
printf(" reload: \"%s\"\n", command);
get_server_action(ACTION_PROMOTE, command, data_dir);
printf(" promote: \"%s\"\n", command);
puts("");
}
static t_server_action
parse_server_action(const char *action_name)
{
if (action_name[0] == '\0')
return ACTION_NONE;
if (strcasecmp(action_name, "start") == 0)
return ACTION_START;
if (strcasecmp(action_name, "stop") == 0)
return ACTION_STOP;
if (strcasecmp(action_name, "restart") == 0)
return ACTION_RESTART;
if (strcasecmp(action_name, "reload") == 0)
return ACTION_RELOAD;
if (strcasecmp(action_name, "promote") == 0)
return ACTION_PROMOTE;
return ACTION_UNKNOWN;
}
/*
* Rejoin a dormant (shut down) node to the replication cluster; this
* is typically a former primary which needs to be demoted to a standby.
*
* Note that "repmgr node rejoin" is also executed by
* "repmgr standby switchover" after promoting the new primary.
*/
void
do_node_rejoin(void)
{
PGconn *upstream_conn = NULL;
RecoveryType upstream_recovery_type = RECTYPE_UNKNOWN;
DBState db_state;
PGPing status;
bool is_shutdown = true;
PQExpBufferData command;
PQExpBufferData command_output;
PQExpBufferData follow_output;
struct stat statbuf;
t_node_info primary_node_record = T_NODE_INFO_INITIALIZER;
bool success = true;
int server_version_num = UNKNOWN_SERVER_VERSION_NUM;
int follow_error_code = SUCCESS;
/* check node is not actually running */
status = PQping(config_file_options.conninfo);
switch (status)
{
case PQPING_NO_ATTEMPT:
log_error(_("unable to determine status of server"));
exit(ERR_BAD_CONFIG);
case PQPING_OK:
is_shutdown = false;
break;
case PQPING_REJECT:
is_shutdown = false;
break;
case PQPING_NO_RESPONSE:
/* status not yet clear */
break;
}
db_state = get_db_state(config_file_options.data_directory);
if (is_shutdown == false)
{
log_error(_("database is still running in state \"%s\""),
describe_db_state(db_state));
log_hint(_("\"repmgr node rejoin\" cannot be executed on a running node"));
exit(ERR_BAD_CONFIG);
}
/* check if cleanly shut down */
if (db_state != DB_SHUTDOWNED && db_state != DB_SHUTDOWNED_IN_RECOVERY)
{
if (db_state == DB_SHUTDOWNING)
{
log_error(_("database is still shutting down"));
}
else
{
log_error(_("database is not shut down cleanly"));
if (runtime_options.force_rewind_used == true)
{
log_detail(_("pg_rewind will not be able to run"));
}
log_hint(_("database should be restarted then shut down cleanly after crash recovery completes"));
exit(ERR_BAD_CONFIG);
}
}
/* check provided upstream connection */
upstream_conn = establish_db_connection_by_params(&source_conninfo, true);
/* sanity checks for 9.3 */
server_version_num = get_server_version(upstream_conn, NULL);
if (server_version_num < 90400)
check_93_config();
if (get_primary_node_record(upstream_conn, &primary_node_record) == false)
{
log_error(_("unable to retrieve primary node record"));
log_hint(_("check the provided database connection string is for a \"repmgr\" database"));
PQfinish(upstream_conn);
exit(ERR_BAD_CONFIG);
}
PQfinish(upstream_conn);
/* connect to registered primary and check it's not in recovery */
upstream_conn = establish_db_connection(primary_node_record.conninfo, true);
upstream_recovery_type = get_recovery_type(upstream_conn);
if (upstream_recovery_type != RECTYPE_PRIMARY)
{
log_error(_("primary server is registered node \"%s\" (ID: %i), but server is not a primary"),
primary_node_record.node_name,
primary_node_record.node_id);
/* TODO: hint about checking cluster */
PQfinish(upstream_conn);
exit(ERR_BAD_CONFIG);
}
/*
* If --force-rewind specified, check pg_rewind can be used, and
* pre-emptively fetch the list of configuration files which should be
* archived
*/
if (runtime_options.force_rewind_used == true)
{
PQExpBufferData reason;
PQExpBufferData msg;
initPQExpBuffer(&reason);
if (can_use_pg_rewind(upstream_conn, config_file_options.data_directory, &reason) == false)
{
log_error(_("--force-rewind specified but pg_rewind cannot be used"));
log_detail("%s", reason.data);
termPQExpBuffer(&reason);
PQfinish(upstream_conn);
exit(ERR_BAD_CONFIG);
}
termPQExpBuffer(&reason);
initPQExpBuffer(&msg);
appendPQExpBuffer(&msg,
_("prerequisites for using pg_rewind are met"));
if (runtime_options.dry_run == true)
{
log_info("%s", msg.data);
}
else
{
log_verbose(LOG_INFO, "%s", msg.data);
}
termPQExpBuffer(&msg);
}
/*
* Forcibly rewind node if requested (this is mainly for use when this
* action is being executed by "repmgr standby switchover")
*/
if (runtime_options.force_rewind_used == true && runtime_options.dry_run == false)
{
int ret;
PQExpBufferData filebuf;
_do_node_archive_config();
/* execute pg_rewind */
initPQExpBuffer(&command);
if (runtime_options.force_rewind_path[0] != '\0')
{
appendPQExpBuffer(&command,
"%s -D ",
runtime_options.force_rewind_path);
}
else
{
appendPQExpBuffer(&command,
"%s -D ",
make_pg_path("pg_rewind"));
}
appendShellString(&command,
config_file_options.data_directory);
appendPQExpBuffer(&command,
" --source-server='%s'",
primary_node_record.conninfo);
if (runtime_options.dry_run == true)
{
log_info(_("pg_rewind would now be executed"));
log_detail(_("pg_rewind command is:\n %s"),
command.data);
PQfinish(upstream_conn);
exit(SUCCESS);
}
log_notice(_("executing pg_rewind"));
log_debug("pg_rewind command is:\n %s",
command.data);
initPQExpBuffer(&command_output);
ret = local_command(
command.data,
&command_output);
termPQExpBuffer(&command);
if (ret == false)
{
log_error(_("unable to execute pg_rewind"));
log_detail("%s", command_output.data);
termPQExpBuffer(&command_output);
exit(ERR_BAD_CONFIG);
}
termPQExpBuffer(&command_output);
/* Restore any previously archived config files */
_do_node_restore_config();
initPQExpBuffer(&filebuf);
/* remove any recovery.done file copied in by pg_rewind */
appendPQExpBuffer(&filebuf,
"%s/recovery.done",
config_file_options.data_directory);
if (stat(filebuf.data, &statbuf) == 0)
{
log_verbose(LOG_INFO, _("deleting \"recovery.done\""));
if (unlink(filebuf.data) == -1)
{
log_warning(_("unable to delete \"%s\""),
filebuf.data);
log_detail("%s", strerror(errno));
}
}
termPQExpBuffer(&filebuf);
/* delete any replication slots copied in by pg_rewind */
{
PQExpBufferData slotdir_path;
DIR *slotdir;
struct dirent *slotdir_ent;
initPQExpBuffer(&slotdir_path);
appendPQExpBuffer(&slotdir_path,
"%s/pg_replslot",
config_file_options.data_directory);
slotdir = opendir(slotdir_path.data);
if (slotdir == NULL)
{
log_warning(_("unable to open replication slot directory \"%s\""),
slotdir_path.data);
log_detail("%s", strerror(errno));
}
else
{
while ((slotdir_ent = readdir(slotdir)) != NULL) {
struct stat statbuf;
PQExpBufferData slotdir_ent_path;
if(strcmp(slotdir_ent->d_name, ".") == 0 || strcmp(slotdir_ent->d_name, "..") == 0)
continue;
initPQExpBuffer(&slotdir_ent_path);
appendPQExpBuffer(&slotdir_ent_path,
"%s/%s",
slotdir_path.data,
slotdir_ent->d_name);
if (stat(slotdir_ent_path.data, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
{
termPQExpBuffer(&slotdir_ent_path);
continue;
}
log_debug("deleting slot directory \"%s\"", slotdir_ent_path.data);
if (rmdir_recursive(slotdir_ent_path.data) != 0 && errno != EEXIST)
{
log_warning(_("unable to delete replication slot directory \"%s\""), slotdir_ent_path.data);
log_detail("%s", strerror(errno));
log_hint(_("directory may need to be manually removed"));
}
termPQExpBuffer(&slotdir_ent_path);
}
}
termPQExpBuffer(&slotdir_path);
}
}
if (runtime_options.dry_run == true)
{
log_info(_("prerequisites for executing NODE REJOIN are met"));
exit(SUCCESS);
}
initPQExpBuffer(&follow_output);
success = do_standby_follow_internal(upstream_conn,
&primary_node_record,
&follow_output,
&follow_error_code);
if (success == false)
{
log_notice(_("NODE REJOIN failed"));
log_detail("%s", follow_output.data);
create_event_notification(upstream_conn,
&config_file_options,
config_file_options.node_id,
"node_rejoin",
success,
follow_output.data);
PQfinish(upstream_conn);
termPQExpBuffer(&follow_output);
exit(follow_error_code);
}
/*
* XXX add checks that node actually started and connected to primary,
* if not exit with ERR_REJOIN_FAIL
*/
create_event_notification(upstream_conn,
&config_file_options,
config_file_options.node_id,
"node_rejoin",
success,
follow_output.data);
PQfinish(upstream_conn);
log_notice(_("NODE REJOIN successful"));
log_detail("%s", follow_output.data);
termPQExpBuffer(&follow_output);
return;
}
/*
* For "internal" use by `node rejoin` on the local node when
* called by "standby switchover" from the remote node.
*
* This archives any configuration files in the data directory, which may be
* overwritten by pg_rewind.
*
* Requires configuration file, optionally --config-archive-dir
*/
static void
_do_node_archive_config(void)
{
PQExpBufferData archive_dir;
struct stat statbuf;
struct dirent *arcdir_ent;
DIR *arcdir;
KeyValueList config_files = {NULL, NULL};
KeyValueListCell *cell = NULL;
int copied_count = 0;
initPQExpBuffer(&archive_dir);
format_archive_dir(&archive_dir);
/* sanity-check directory path */
if (stat(archive_dir.data, &statbuf) == -1)
{
if (errno != ENOENT)
{
log_error(_("error encountered when checking archive directory \"%s\""),
archive_dir.data);
log_detail("%s", strerror(errno));
termPQExpBuffer(&archive_dir);
exit(ERR_BAD_CONFIG);
}
/* attempt to create and open the directory */
if (mkdir(archive_dir.data, S_IRWXU) != 0 && errno != EEXIST)
{
log_error(_("unable to create temporary archive directory \"%s\""),
archive_dir.data);
log_detail("%s", strerror(errno));
termPQExpBuffer(&archive_dir);
exit(ERR_BAD_CONFIG);
}
}
else if (!S_ISDIR(statbuf.st_mode))
{
log_error(_("\"%s\" exists but is not a directory"),
archive_dir.data);
termPQExpBuffer(&archive_dir);
exit(ERR_BAD_CONFIG);
}
arcdir = opendir(archive_dir.data);
if (arcdir == NULL)
{
log_error(_("unable to open archive directory \"%s\""),
archive_dir.data);
log_detail("%s", strerror(errno));
termPQExpBuffer(&archive_dir);
exit(ERR_BAD_CONFIG);
}
if (runtime_options.dry_run == false)
{
/*
* attempt to remove any existing files in the directory TODO: collate
* problem files into list
*/
while ((arcdir_ent = readdir(arcdir)) != NULL)
{
PQExpBufferData arcdir_ent_path;
initPQExpBuffer(&arcdir_ent_path);
appendPQExpBuffer(&arcdir_ent_path,
"%s/%s",
archive_dir.data,
arcdir_ent->d_name);
if (stat(arcdir_ent_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
{
termPQExpBuffer(&arcdir_ent_path);
continue;
}
if (unlink(arcdir_ent_path.data) == -1)
{
log_error(_("unable to delete file in temporary archive directory"));
log_detail(_("file is: \"%s\""), arcdir_ent_path.data);
log_detail("%s", strerror(errno));
closedir(arcdir);
termPQExpBuffer(&arcdir_ent_path);
exit(ERR_BAD_CONFIG);
}
termPQExpBuffer(&arcdir_ent_path);
}
closedir(arcdir);
}
/*
* extract list of config files from --config-files
*/
{
int i = 0;
int j = 0;
int config_file_len = strlen(runtime_options.config_files);
char filenamebuf[MAXPGPATH] = "";
PQExpBufferData pathbuf;
for (j = 0; j < config_file_len; j++)
{
if (runtime_options.config_files[j] == ',')
{
int filename_len = j - i;
if (filename_len > MAXPGPATH)
filename_len = MAXPGPATH - 1;
strncpy(filenamebuf, runtime_options.config_files + i, filename_len);
filenamebuf[filename_len] = '\0';
initPQExpBuffer(&pathbuf);
appendPQExpBuffer(&pathbuf,
"%s/%s",
config_file_options.data_directory,
filenamebuf);
key_value_list_set(&config_files,
filenamebuf,
pathbuf.data);
termPQExpBuffer(&pathbuf);
i = j + 1;
}
}
if (i < config_file_len)
{
strncpy(filenamebuf, runtime_options.config_files + i, config_file_len - i);
initPQExpBuffer(&pathbuf);
appendPQExpBuffer(&pathbuf,
"%s/%s",
config_file_options.data_directory,
filenamebuf);
key_value_list_set(&config_files,
filenamebuf,
pathbuf.data);
termPQExpBuffer(&pathbuf);
}
}
for (cell = config_files.head; cell; cell = cell->next)
{
PQExpBufferData dest_file;
initPQExpBuffer(&dest_file);
appendPQExpBuffer(&dest_file,
"%s/%s",
archive_dir.data,
cell->key);
if (stat(cell->value, &statbuf) == -1)
{
log_warning(_("specified file \"%s\" not found, skipping"),
cell->value);
}
else
{
if (runtime_options.dry_run == true)
{
log_info("file \"%s\" would be copied to \"%s\"",
cell->key, dest_file.data);
copied_count++;
}
else
{
log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
cell->key, dest_file.data);
copy_file(cell->value, dest_file.data);
copied_count++;
}
}
termPQExpBuffer(&dest_file);
}
if (runtime_options.dry_run == true)
{
log_verbose(LOG_INFO, _("%i files would have been copied to \"%s\""),
copied_count, archive_dir.data);
}
else
{
log_verbose(LOG_INFO, _("%i files copied to \"%s\""),
copied_count, archive_dir.data);
}
if (runtime_options.dry_run == true)
{
/*
* Delete directory in --dry-run mode - it should be empty unless it's been
* interfered with for some reason, in which case manual intervention is
* required
*/
if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
{
log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
log_detail("%s", strerror(errno));
log_hint(_("directory may need to be manually removed"));
}
else
{
log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data);
}
}
termPQExpBuffer(&archive_dir);
}
/*
* Intended mainly for "internal" use by `standby switchover`, which
* calls this on the target server to restore any configuration files
* to the data directory, which may have been overwritten by an operation
* like pg_rewind
*
* Not designed to be called if the instance is running, but does
* not currently check.
*
* Requires -D/--pgdata, optionally --config-archive-dir
*
* Removes --config-archive-dir after successful copy
*/
static void
_do_node_restore_config(void)
{
PQExpBufferData archive_dir;
DIR *arcdir;
struct dirent *arcdir_ent;
int copied_count = 0;
bool copy_ok = true;
initPQExpBuffer(&archive_dir);
format_archive_dir(&archive_dir);
arcdir = opendir(archive_dir.data);
if (arcdir == NULL)
{
log_error(_("unable to open archive directory \"%s\""),
archive_dir.data);
log_detail("%s", strerror(errno));
termPQExpBuffer(&archive_dir);
exit(ERR_BAD_CONFIG);
}
while ((arcdir_ent = readdir(arcdir)) != NULL)
{
struct stat statbuf;
PQExpBufferData src_file_path;
PQExpBufferData dest_file_path;
initPQExpBuffer(&src_file_path);
appendPQExpBuffer(&src_file_path,
"%s/%s",
archive_dir.data,
arcdir_ent->d_name);
/* skip non-files */
if (stat(src_file_path.data, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
{
termPQExpBuffer(&src_file_path);
continue;
}
initPQExpBuffer(&dest_file_path);
appendPQExpBuffer(&dest_file_path,
"%s/%s",
config_file_options.data_directory,
arcdir_ent->d_name);
log_verbose(LOG_DEBUG, "copying \"%s\" to \"%s\"",
src_file_path.data, dest_file_path.data);
if (copy_file(src_file_path.data, dest_file_path.data) == false)
{
copy_ok = false;
log_warning(_("unable to copy \"%s\" to \"%s\""),
arcdir_ent->d_name, runtime_options.data_dir);
}
else
{
unlink(src_file_path.data);
copied_count++;
}
termPQExpBuffer(&dest_file_path);
termPQExpBuffer(&src_file_path);
}
closedir(arcdir);
log_notice(_("%i files copied to %s"),
copied_count,
config_file_options.data_directory);
if (copy_ok == false)
{
log_warning(_("unable to copy all files from \"%s\""), archive_dir.data);
}
else
{
/*
* Finally, delete directory - it should be empty unless it's been
* interfered with for some reason, in which case manual intervention is
* required
*/
if (rmdir(archive_dir.data) != 0 && errno != EEXIST)
{
log_warning(_("unable to delete directory \"%s\""), archive_dir.data);
log_detail("%s", strerror(errno));
log_hint(_("directory may need to be manually removed"));
}
else
{
log_verbose(LOG_INFO, "directory \"%s\" deleted", archive_dir.data);
}
}
termPQExpBuffer(&archive_dir);
return;
}
static void
format_archive_dir(PQExpBufferData *archive_dir)
{
appendPQExpBuffer(archive_dir,
"%s/repmgr-config-archive-%s",
runtime_options.config_archive_dir,
config_file_options.node_name);
log_verbose(LOG_DEBUG, "using archive directory \"%s\"", archive_dir->data);
}
static bool
copy_file(const char *src_file, const char *dest_file)
{
FILE *ptr_old,
*ptr_new;
int a = 0;
ptr_old = fopen(src_file, "r");
ptr_new = fopen(dest_file, "w");
if (ptr_old == NULL)
return false;
if (ptr_new == NULL)
{
fclose(ptr_old);
return false;
}
chmod(dest_file, S_IRUSR | S_IWUSR);
while (1)
{
a = fgetc(ptr_old);
if (!feof(ptr_old))
{
fputc(a, ptr_new);
}
else
{
break;
}
}
fclose(ptr_new);
fclose(ptr_old);
return true;
}
void
do_node_help(void)
{
print_help_header();
printf(_("Usage:\n"));
printf(_(" %s [OPTIONS] node status\n"), progname());
printf(_(" %s [OPTIONS] node check\n"), progname());
printf(_(" %s [OPTIONS] node rejoin\n"), progname());
printf(_(" %s [OPTIONS] node service\n"), progname());
puts("");
printf(_("NODE STATUS\n"));
puts("");
printf(_(" \"node status\" displays an overview of a node's basic information and replication status.\n"));
puts("");
printf(_(" Configuration file required, runs on local node only.\n"));
puts("");
printf(_(" --csv emit output as CSV\n"));
puts("");
printf(_("NODE CHECK\n"));
puts("");
printf(_(" \"node check\" performs some health checks on a node from a replication perspective.\n"));
puts("");
printf(_(" Configuration file required, runs on local node only.\n"));
puts("");
printf(_(" --csv emit output as CSV\n"));
printf(_(" --nagios emit output in Nagios format (individual status output only)\n"));
puts("");
printf(_(" Following options check an individual status:\n"));
printf(_(" --archive-ready number of WAL files ready for archiving\n"));
printf(_(" --downstream whether all downstream nodes are connected\n"));
printf(_(" --replication-lag replication lag in seconds (standbys only)\n"));
printf(_(" --role check node has expected role\n"));
printf(_(" --slots check for inactive replication slots\n"));
puts("");
printf(_("NODE REJOIN\n"));
puts("");
printf(_(" \"node rejoin\" enables a dormant (stopped) node to be rejoined to the replication cluster.\n"));
puts("");
printf(_(" Configuration file required, runs on local node only.\n"));
puts("");
printf(_(" --dry-run check that the prerequisites are met for rejoining the node\n" \
" (including usability of \"pg_rewind\" if requested)\n"));
printf(_(" --force-rewind[=VALUE] execute \"pg_rewind\" if necessary\n"));
printf(_(" (9.3 and 9.4 - provide full \"pg_rewind\" path)\n"));
printf(_(" --config-files comma-separated list of configuration files to retain\n" \
" after executing \"pg_rewind\"\n"));
printf(_(" --config-archive-dir directory to temporarily store retained configuration files\n" \
" (default: /tmp)\n"));
puts("");
printf(_("NODE SERVICE\n"));
puts("");
printf(_(" \"node service\" executes a system service command to stop/start/restart/reload a node\n" \
" or optionally display which command would be executed\n"));
puts("");
printf(_(" Configuration file required, runs on local node only.\n"));
puts("");
printf(_(" --dry-run show what action would be performed, but don't execute it\n"));
printf(_(" --action action to perform (one of \"start\", \"stop\", \"restart\" or \"reload\")\n"));
printf(_(" --list-actions show what command would be performed for each action\n"));
puts("");
}