repmgr/sysutils.c

/*
 * sysutils.c
 *
 * Functions which need to be executed on the local system.
 *
 * Copyright (c) EnterpriseDB Corporation, 2010-2021
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <signal.h>

#include "repmgr.h"

static bool _local_command(const char *command, PQExpBufferData *outputbuf, bool simple, int *return_value);


/*
 * Execute a command locally. "outputbuf" should either be an
 * initialised PQExpPuffer, or NULL
 */
bool
local_command(const char *command, PQExpBufferData *outputbuf)
{
	return _local_command(command, outputbuf, false, NULL);
}

bool
local_command_return_value(const char *command, PQExpBufferData *outputbuf, int *return_value)
{
	return _local_command(command, outputbuf, false, return_value);
}


bool
local_command_simple(const char *command, PQExpBufferData *outputbuf)
{
	return _local_command(command, outputbuf, true, NULL);
}


static bool
_local_command(const char *command, PQExpBufferData *outputbuf, bool simple, int *return_value)
{
	FILE	   *fp = NULL;
	char		output[MAXLEN];
	int			retval = 0;
	bool		success;
	char		tmpfile_path[MAXPGPATH];
	const char *tmpdir = getenv("TMPDIR");
	int			fd;
	PQExpBufferData command_final;

	if (!tmpdir)
		tmpdir = "/tmp";

	maxpath_snprintf(tmpfile_path, "%s/repmgr_command.XXXXXX",
					 tmpdir);

	fd = mkstemp(tmpfile_path);

	if (fd < 1)
	{
		log_error(_("unable to open temporary file"));
		return false;
	}

	initPQExpBuffer(&command_final);
	appendPQExpBufferStr(&command_final, command);

	appendPQExpBuffer(&command_final, " 2>%s", tmpfile_path);

	log_verbose(LOG_DEBUG, "executing:\n  %s", command_final.data);

	if (outputbuf == NULL)
	{
		retval = system(command_final.data);
		termPQExpBuffer(&command_final);

		if (return_value != NULL)
			*return_value = WEXITSTATUS(retval);

		close(fd);

		return (retval == 0) ? true : false;
	}

	fp = popen(command_final.data, "r");

	if (fp == NULL)
	{
		log_error(_("unable to execute local command:\n%s"), command_final.data);
		termPQExpBuffer(&command_final);
		close(fd);
		return false;
	}

	termPQExpBuffer(&command_final);

	while (fgets(output, MAXLEN, fp) != NULL)
	{
		appendPQExpBufferStr(outputbuf, output);

		if (!feof(fp) && simple == false)
		{
			break;
		}
	}

	retval = pclose(fp);

	/* 141 = SIGPIPE */
	success = (WEXITSTATUS(retval) == 0 || WEXITSTATUS(retval) == 141) ? true : false;

	log_verbose(LOG_DEBUG, "result of command was %i (%i)", WEXITSTATUS(retval), retval);

	/*
	 * Append any captured STDERR output
	 */

	fp = fopen(tmpfile_path, "r");

	/*
	 * Not critical if we can't open the file
	 */
	if (fp != NULL)
	{
		while (fgets(output, MAXLEN, fp) != NULL)
		{
			appendPQExpBufferStr(outputbuf, output);
		}

		fclose(fp);
	}

	unlink(tmpfile_path);

	if (return_value != NULL)
		*return_value = WEXITSTATUS(retval);

	if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
		log_verbose(LOG_DEBUG, "local_command(): output returned was:\n%s", outputbuf->data);
	else
		log_verbose(LOG_DEBUG, "local_command(): no output returned");


	return success;
}


/*
 * Execute a command via ssh on the remote host.
 *
 * TODO: implement SSH calls using libssh2.
 */
bool
remote_command(const char *host, const char *user, const char *command, const char *ssh_options, PQExpBufferData *outputbuf)
{
	FILE	   *fp;
	PQExpBufferData ssh_command;

	char		output[MAXLEN] = "";


	initPQExpBuffer(&ssh_command);

	make_remote_command(host, user, command, ssh_options, &ssh_command);

	log_debug("remote_command():\n  %s", ssh_command.data);

	fp = popen(ssh_command.data, "r");

	if (fp == NULL)
	{
		log_error(_("unable to execute remote command:\n  %s"), ssh_command.data);
		termPQExpBuffer(&ssh_command);
		return false;
	}

	termPQExpBuffer(&ssh_command);

	if (outputbuf != NULL)
	{
		/* TODO: better error handling */
		while (fgets(output, MAXLEN, fp) != NULL)
		{
			appendPQExpBufferStr(outputbuf, output);
		}
	}
	else
	{
		while (fgets(output, MAXLEN, fp) != NULL)
		{
			if (!feof(fp))
			{
				break;
			}
		}
	}

	pclose(fp);

	if (outputbuf != NULL)
	{
		if (outputbuf->data != NULL && outputbuf->data[0] != '\0')
			log_verbose(LOG_DEBUG, "remote_command(): output returned was:\n%s", outputbuf->data);
		else
			log_verbose(LOG_DEBUG, "remote_command(): no output returned");
	}

	return true;
}


void
make_remote_command(const char *host, const char *user, const char *command, const char *ssh_options, PQExpBufferData *ssh_command)
{
	PQExpBufferData ssh_host;

	initPQExpBuffer(&ssh_host);

	if (*user != '\0')
	{
		appendPQExpBuffer(&ssh_host, "%s@", user);
	}

	appendPQExpBufferStr(&ssh_host, host);


	appendPQExpBuffer(ssh_command,
					  "ssh -o Batchmode=yes %s %s %s",
					  ssh_options,
					  ssh_host.data,
					  command);

	termPQExpBuffer(&ssh_host);

}


pid_t
disable_wal_receiver(PGconn *conn)
{
	char buf[MAXLEN];
	int wal_retrieve_retry_interval, new_wal_retrieve_retry_interval;
	pid_t wal_receiver_pid = UNKNOWN_PID;
	int kill_ret;
	int i, j;
	int max_retries = 2;

	if (is_superuser_connection(conn, NULL) == false)
	{
		log_error(_("superuser connection required"));
		return UNKNOWN_PID;
	}

	if (get_recovery_type(conn) == RECTYPE_PRIMARY)
	{
		log_error(_("node is not in recovery"));
		log_detail(_("wal receiver can only run on standby nodes"));
		return UNKNOWN_PID;
	}

	wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);

	if (wal_receiver_pid == UNKNOWN_PID)
	{
		log_warning(_("unable to retrieve wal receiver PID"));
		return UNKNOWN_PID;
	}

	get_pg_setting(conn, "wal_retrieve_retry_interval", buf);

	/* TODO: potentially handle atoi error, though unlikely at this point */
	wal_retrieve_retry_interval = atoi(buf);

	new_wal_retrieve_retry_interval = wal_retrieve_retry_interval + WALRECEIVER_DISABLE_TIMEOUT_VALUE;

	if (wal_retrieve_retry_interval < WALRECEIVER_DISABLE_TIMEOUT_VALUE)
	{
		bool success;

		log_notice(_("setting \"wal_retrieve_retry_interval\" to %i milliseconds"),
				   new_wal_retrieve_retry_interval);
		alter_system_int(conn, "wal_retrieve_retry_interval", new_wal_retrieve_retry_interval);

		success = pg_reload_conf(conn);

		if (success == false)
		{
			log_warning(_("unable to reload configuration"));
			return UNKNOWN_PID;
		}
	}

	/*
	 * If, at this point, the WAL receiver is not running, we don't need to (and indeed can't)
	 * kill it.
	 */
	if (wal_receiver_pid == 0)
	{
		log_warning(_("wal receiver not running"));
		return UNKNOWN_PID;
	}


	/* why 5? */
	log_info(_("sleeping 5 seconds"));
	sleep(5);

	/* see comment below as to why we need a loop here */
	for (i = 0; i < max_retries; i++)
	{
		log_notice(_("killing WAL receiver with PID %i"), (int)wal_receiver_pid);

		kill((int)wal_receiver_pid, SIGTERM);

		for (j = 0; j < 30; j++)
		{
			kill_ret = kill(wal_receiver_pid, 0);

			if (kill_ret != 0)
			{
				log_info(_("WAL receiver with pid %i killed"), (int)wal_receiver_pid);
				break;
			}
			sleep(1);
		}

		/*
		 * Wait briefly to check that the WAL receiver has indeed gone away -
		 * for reasons as yet unclear, after a server start/restart, immediately
		 * after the first time a WAL receiver is killed, a new one is started
		 * straight away, so we'll need to kill that too.
		 */
		sleep(1);
		wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);
		if (wal_receiver_pid == UNKNOWN_PID || wal_receiver_pid == 0)
			break;
	}

	return wal_receiver_pid;
}

pid_t
enable_wal_receiver(PGconn *conn, bool wait_startup)
{
	char buf[MAXLEN];
	int wal_retrieve_retry_interval;
	pid_t wal_receiver_pid = UNKNOWN_PID;

	/* make timeout configurable */
	int i, timeout = 30;

	if (PQstatus(conn) != CONNECTION_OK)
	{
		log_error(_("database connection not available"));
		return UNKNOWN_PID;
	}

	if (is_superuser_connection(conn, NULL) == false)
	{
		log_error(_("superuser connection required"));
		return UNKNOWN_PID;
	}

	if (get_recovery_type(conn) == RECTYPE_PRIMARY)
	{
		log_error(_("node is not in recovery"));
		log_detail(_("wal receiver can only run on standby nodes"));
		return UNKNOWN_PID;
	}

	if (get_pg_setting(conn, "wal_retrieve_retry_interval", buf) == false)
	{
		log_error(_("unable to retrieve \"wal_retrieve_retry_interval\""));
		return UNKNOWN_PID;
	}

	/* TODO: potentially handle atoi error, though unlikely at this point */
	wal_retrieve_retry_interval = atoi(buf);

	if (wal_retrieve_retry_interval > WALRECEIVER_DISABLE_TIMEOUT_VALUE)
	{
		int new_wal_retrieve_retry_interval = wal_retrieve_retry_interval - WALRECEIVER_DISABLE_TIMEOUT_VALUE;
		bool success;

		log_notice(_("setting \"wal_retrieve_retry_interval\" to %i ms"),
				   new_wal_retrieve_retry_interval);

		success = alter_system_int(conn,
								   "wal_retrieve_retry_interval",
								   new_wal_retrieve_retry_interval);

		if (success == false)
		{
			log_warning(_("unable to change \"wal_retrieve_retry_interval\""));
			return UNKNOWN_PID;
		}

		success = pg_reload_conf(conn);

		if (success == false)
		{
			log_warning(_("unable to reload configuration"));
			return UNKNOWN_PID;
		}
	}
	else
	{
		/* TODO: add threshold sanity check */
		log_info(_("\"wal_retrieve_retry_interval\" is %i, not changing"),
				 wal_retrieve_retry_interval);
	}

	if (wait_startup == false)
		return UNKNOWN_PID;

	for (i = 0; i < timeout; i++)
	{
		wal_receiver_pid = (pid_t)get_wal_receiver_pid(conn);

		if (wal_receiver_pid > 0)
			break;

		log_info(_("sleeping %i of maximum %i seconds waiting for WAL receiver to start up"),
				 i + 1, timeout)
		sleep(1);
	}

	if (wal_receiver_pid == UNKNOWN_PID)
	{
		log_warning(_("unable to retrieve WAL receiver PID"));
		return UNKNOWN_PID;
	}
	else if (wal_receiver_pid == 0)
	{
		log_error(_("WAL receiver did not start up after %i seconds"), timeout);
		return UNKNOWN_PID;
	}

	log_info(_("WAL receiver started up with PID %i"), (int)wal_receiver_pid);

	return wal_receiver_pid;
}