/*
 * repmgrd.c - Replication manager daemon
 *
 * Copyright (C) 2ndQuadrant, 2010-2016
 *
 * This module connects to the nodes of a replication cluster and monitors
 * how far behind the master they are.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <signal.h>

#include <sys/types.h>
#include <sys/stat.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "repmgr.h"
#include "config.h"
#include "log.h"
#include "strutil.h"
#include "version.h"

/* Required PostgreSQL headers */
#include "access/xlogdefs.h"
#include "pqexpbuffer.h"

/* Local info */
t_configuration_options local_options = T_CONFIGURATION_OPTIONS_INITIALIZER;
PGconn *my_local_conn = NULL;

/* Master info */
t_configuration_options master_options = T_CONFIGURATION_OPTIONS_INITIALIZER;

PGconn *master_conn = NULL;

char *config_file = "";
bool verbose = false;
bool monitoring_history = false;
t_node_info node_info;

bool failover_done = false;

char *pid_file = NULL;

static void help(void);
static void usage(void);
static void check_cluster_configuration(PGconn *conn);
static void check_node_configuration(void);

static void standby_monitor(void);
static void witness_monitor(void);
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
static bool set_local_node_status(void);

static void update_shared_memory(char *last_wal_standby_applied);
static void update_registration(void);
static void do_master_failover(void);
static bool do_upstream_standby_failover(t_node_info upstream_node);

static t_node_info get_node_info(PGconn *conn, char *cluster, int node_id);
static XLogRecPtr lsn_to_xlogrecptr(char *lsn, bool *format_ok);

/*
 * Flag to mark SIGHUP. Whenever the main loop comes around it
 * will reread the configuration file.
 */
static volatile sig_atomic_t got_SIGHUP = false;

static void handle_sighup(SIGNAL_ARGS);
static void handle_sigint(SIGNAL_ARGS);

static void terminate(int retval);

#ifndef WIN32
static void setup_event_handlers(void);
#endif

static void do_daemonize(void);
static void check_and_create_pid_file(const char *pid_file);

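/*
 * Close the local and master connections. Note that when repmgrd runs on
 * the master itself, master_conn and my_local_conn point to the same
 * connection, so it must only be closed once - hence the inequality
 * check before the second PQfinish() below.
 */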
static void
close_connections()
{
    if (master_conn != NULL && PQisBusy(master_conn) == 1)
        cancel_query(master_conn, local_options.master_response_timeout);

    if (my_local_conn != NULL)
        PQfinish(my_local_conn);

    if (master_conn != NULL && master_conn != my_local_conn)
        PQfinish(master_conn);

    master_conn = NULL;
    my_local_conn = NULL;
}


int
main(int argc, char **argv)
{
    static struct option long_options[] =
    {
        {"config-file", required_argument, NULL, 'f'},
        {"verbose", no_argument, NULL, 'v'},
        {"monitoring-history", no_argument, NULL, 'm'},
        {"daemonize", no_argument, NULL, 'd'},
        {"pid-file", required_argument, NULL, 'p'},
        {"help", no_argument, NULL, '?'},
        {"version", no_argument, NULL, 'V'},
        {NULL, 0, NULL, 0}
    };

    int optindex;
    int c;
    bool daemonize = false;
    bool startup_event_logged = false;

    FILE *fd;

    int server_version_num = 0;

    set_progname(argv[0]);

    /* Disallow running as root to prevent directory ownership problems */
    if (geteuid() == 0)
    {
        fprintf(stderr,
                _("%s: cannot be run as root\n"
                  "Please log in (using, e.g., \"su\") as the "
                  "(unprivileged) user that owns "
                  "the data directory.\n"
                ),
                progname());
        exit(1);
    }

    while ((c = getopt_long(argc, argv, "?Vf:vmdp:", long_options, &optindex)) != -1)
    {
        switch (c)
        {
            case 'f':
                config_file = optarg;
                break;
            case 'v':
                verbose = true;
                break;
            case 'm':
                monitoring_history = true;
                break;
            case 'd':
                daemonize = true;
                break;
            case 'p':
                pid_file = optarg;
                break;
            case '?':
                help();
                exit(SUCCESS);
            case 'V':
                printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
                exit(SUCCESS);
            default:
                usage();
                exit(ERR_BAD_CONFIG);
        }
    }

    /*
     * Parse the configuration file, if provided. If no configuration file
     * was provided, or one was but was incomplete, parse_config() will
     * abort anyway, with an appropriate message.
     *
     * XXX it might be desirable to create an event record for this, in
     * which case we'll need to refactor parse_config() not to abort,
     * and return the error message.
     */
    load_config(config_file, verbose, &local_options, argv[0]);

    if (daemonize)
    {
        do_daemonize();
    }

    if (pid_file)
    {
        check_and_create_pid_file(pid_file);
    }

#ifndef WIN32
    setup_event_handlers();
#endif

    fd = freopen("/dev/null", "r", stdin);
    if (fd == NULL)
    {
        fprintf(stderr, "error reopening stdin to '/dev/null': %s",
                strerror(errno));
    }

    fd = freopen("/dev/null", "w", stdout);
    if (fd == NULL)
    {
        fprintf(stderr, "error reopening stdout to '/dev/null': %s",
                strerror(errno));
    }
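    /*
     * From this point on anything written to stdin/stdout is discarded;
     * stderr is left attached so errors remain visible until the logging
     * system takes over (and is itself redirected below when syslog is
     * in use).
     */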

    logger_init(&local_options, progname());
    if (verbose)
        logger_set_verbose();

    if (log_type == REPMGR_SYSLOG)
    {
        fd = freopen("/dev/null", "w", stderr);

        if (fd == NULL)
        {
            fprintf(stderr, "error reopening stderr to '/dev/null': %s",
                    strerror(errno));
        }
    }

    /* Initialise the repmgr schema name */
    /* XXX check this handles quoting properly */
    maxlen_snprintf(repmgr_schema, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX,
                    local_options.cluster_name);

    log_info(_("connecting to database '%s'\n"),
             local_options.conninfo);
    my_local_conn = establish_db_connection(local_options.conninfo, true);

    /* Verify that server is a supported version */
    log_info(_("connected to database, checking its state\n"));

    server_version_num = get_server_version(my_local_conn, NULL);

    if (server_version_num < MIN_SUPPORTED_VERSION_NUM)
    {
        if (server_version_num > 0)
        {
            log_err(_("%s requires PostgreSQL %s or later\n"),
                    progname(),
                    MIN_SUPPORTED_VERSION);
        }
        else
        {
            log_err(_("unable to determine PostgreSQL server version\n"));
        }

        terminate(ERR_BAD_CONFIG);
    }

    /* Retrieve record for this node from the local database */
    node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);

    /*
     * No node record found - exit gracefully
     *
     * Note: it's highly unlikely this situation will occur when starting
     * repmgrd on a witness, unless someone goes to the trouble of
     * deleting the node record from the previously copied table.
     */
    if (node_info.node_id == NODE_NOT_FOUND)
    {
        log_err(_("no metadata record found for this node - terminating\n"));
        log_hint(_("check that 'repmgr (master|standby) register' was executed for this node\n"));
        terminate(ERR_BAD_CONFIG);
    }

    log_debug("node id is %i, upstream is %i\n", node_info.node_id, node_info.upstream_node_id);

    /*
     * MAIN LOOP
     *
     * This loop runs at startup and then cycles once per failover.
     * Prerequisites:
     *   - my_local_conn must already hold an active connection
     *   - no master connection yet established
     */
    do
    {
        /* Timer for repl_nodes synchronisation interval */
        int sync_repl_nodes_elapsed = 0;

        /*
         * Set my server mode, establish a connection to master and start
         * monitoring
         */
        switch (node_info.type)
        {
            case MASTER:
                master_options.node = local_options.node;
                strncpy(master_options.conninfo, local_options.conninfo,
                        MAXLEN);
                master_conn = my_local_conn;

                check_cluster_configuration(my_local_conn);
                check_node_configuration();

                if (reload_config(&local_options))
                {
                    PQfinish(my_local_conn);
                    my_local_conn = establish_db_connection(local_options.conninfo, true);
                    master_conn = my_local_conn;
                    update_registration();
                }

                /* Log startup event */
                if (startup_event_logged == false)
                {
                    create_event_record(master_conn,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_start",
                                        true,
                                        NULL);
                    startup_event_logged = true;
                }

                log_info(_("starting continuous master connection check\n"));

                /*
                 * Check that master is still alive.
                 * XXX We should also check that the
                 * standby servers are sending info
                 */

                /*
                 * Every local_options.monitor_interval_secs seconds, do
                 * master checks
                 */
                do
                {
                    if (check_connection(&master_conn, "master", NULL))
                    {
                        sleep(local_options.monitor_interval_secs);
                    }
                    else
                    {
                        /*
                         * XXX we could log something more verbose here
                         */
                        terminate(1);
                    }

                    if (got_SIGHUP)
                    {
                        /*
                         * If the configuration file was successfully
                         * reloaded, we may need to re-establish
                         * my_local_conn
                         */
                        if (reload_config(&local_options))
                        {
                            PQfinish(my_local_conn);
                            my_local_conn = establish_db_connection(local_options.conninfo, true);
                            master_conn = my_local_conn;

                            if (*local_options.logfile)
                            {
                                FILE *fd;

                                fd = freopen(local_options.logfile, "a", stderr);
                                if (fd == NULL)
                                {
                                    fprintf(stderr, "error reopening stderr to '%s': %s",
                                            local_options.logfile, strerror(errno));
                                }
                            }

                            update_registration();
                        }
                        got_SIGHUP = false;
                    }
                } while (!failover_done);
                break;

            case WITNESS:
            case STANDBY:

                /* We need the node id of the master server as well as a connection to it */
                log_info(_("connecting to master node of cluster '%s'\n"),
                         local_options.cluster_name);

                master_conn = get_master_connection(my_local_conn,
                                                    local_options.cluster_name,
                                                    &master_options.node, NULL);

                if (master_conn == NULL)
                {
                    PQExpBufferData errmsg;
                    initPQExpBuffer(&errmsg);

                    appendPQExpBuffer(&errmsg,
                                      _("unable to connect to master node"));

                    log_err("%s\n", errmsg.data);

                    create_event_record(NULL,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_shutdown",
                                        false,
                                        errmsg.data);

                    terminate(ERR_BAD_CONFIG);
                }

                check_cluster_configuration(my_local_conn);
                check_node_configuration();

                if (reload_config(&local_options))
                {
                    PQfinish(my_local_conn);
                    my_local_conn = establish_db_connection(local_options.conninfo, true);
                    update_registration();
                }

                /* Log startup event */
                if (startup_event_logged == false)
                {
                    create_event_record(master_conn,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_start",
                                        true,
                                        NULL);
                    startup_event_logged = true;
                }

                /*
                 * Every local_options.monitor_interval_secs seconds, do
                 * checks
                 */
                if (node_info.type == WITNESS)
                {
                    log_info(_("starting continuous witness node monitoring\n"));
                }
                else if (node_info.type == STANDBY)
                {
                    log_info(_("starting continuous standby node monitoring\n"));
                }

                do
                {
                    log_verbose(LOG_DEBUG, "standby check loop...\n");

                    if (node_info.type == WITNESS)
                    {
                        witness_monitor();
                    }
                    else if (node_info.type == STANDBY)
                    {
                        standby_monitor();
                    }

                    sleep(local_options.monitor_interval_secs);

                    /*
                     * On a witness node, regularly resync the repl_nodes table
                     * to keep up with any changes on the primary
                     *
                     * TODO: only resync the table if changes actually detected
                     */
                    if (node_info.type == WITNESS)
                    {
                        sync_repl_nodes_elapsed += local_options.monitor_interval_secs;
                        log_debug(_("seconds since last node record sync: %i (sync interval: %i)\n"), sync_repl_nodes_elapsed, local_options.witness_repl_nodes_sync_interval_secs);
                        if (sync_repl_nodes_elapsed >= local_options.witness_repl_nodes_sync_interval_secs)
                        {
                            log_debug(_("resyncing repl_nodes table\n"));
                            witness_copy_node_records(master_conn, my_local_conn, local_options.cluster_name);
                            sync_repl_nodes_elapsed = 0;
                        }
                    }

                    if (got_SIGHUP)
                    {
                        /*
                         * If the configuration file was successfully
                         * reloaded, we may need to re-establish
                         * my_local_conn
                         */
                        if (reload_config(&local_options))
                        {
                            PQfinish(my_local_conn);
                            my_local_conn = establish_db_connection(local_options.conninfo, true);
                            update_registration();
                        }
                        got_SIGHUP = false;
                    }

                    if (failover_done)
                    {
                        log_debug(_("standby check loop will terminate\n"));
                    }
                } while (!failover_done);
                break;

            default:
                log_err(_("unrecognized mode for node %d\n"),
                        local_options.node);
        }

        failover_done = false;

    } while (true);

    /* close the connection to the database and cleanup */
    close_connections();

    /* Shut down logging system */
    logger_shutdown();

    return 0;
}


/*
 * witness_monitor()
 *
 * Monitors witness server; attempt to find and connect to new master
 * if existing master connection is lost
 */
static void
witness_monitor(void)
{
    char monitor_witness_timestamp[MAXLEN];
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];
    bool connection_ok;

    /*
     * Check if master is available; if not, assume failover situation
     * and try to determine new master. There may be a delay between detection
     * of a missing master and promotion of a standby by that standby's
     * repmgrd, so we'll loop for a while before giving up.
     */
    connection_ok = check_connection(&master_conn, "master", NULL);

    if (connection_ok == false)
    {
        int connection_retries;

        log_debug(_("old master node ID: %i\n"), master_options.node);

        /* We need to wait a while for the new master to be promoted */
        log_info(
            _("waiting %i seconds for a new master to be promoted...\n"),
            local_options.master_response_timeout
            );

        sleep(local_options.master_response_timeout);

        /* Attempt to find the new master */
        for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
        {
            log_info(
                _("attempt %i of %i to determine new master...\n"),
                connection_retries + 1,
                local_options.reconnect_attempts
                );
            master_conn = get_master_connection(my_local_conn,
                                                local_options.cluster_name, &master_options.node, NULL);

            if (PQstatus(master_conn) != CONNECTION_OK)
            {
                log_warning(
                    _("unable to determine a valid master server; waiting %i seconds to retry...\n"),
                    local_options.reconnect_interval
                    );
                PQfinish(master_conn);
                sleep(local_options.reconnect_interval);
            }
            else
            {
                log_debug(_("new master found with node ID: %i\n"), master_options.node);
                connection_ok = true;

                /*
                 * Update the repl_nodes table from the new master to reflect the changed
                 * node configuration
                 *
                 * XXX it would be neat to be able to handle this with e.g. table-based
                 * logical replication
                 */
                witness_copy_node_records(master_conn, my_local_conn, local_options.cluster_name);

                break;
            }
        }

        if (connection_ok == false)
        {
            PQExpBufferData errmsg;
            initPQExpBuffer(&errmsg);

            appendPQExpBuffer(&errmsg,
                              _("unable to determine a valid master node, terminating..."));

            log_err("%s\n", errmsg.data);

            create_event_record(NULL,
                                &local_options,
                                local_options.node,
                                "repmgrd_shutdown",
                                false,
                                errmsg.data);

            terminate(ERR_DB_CON);
        }
    }

    /* Fast path for the case where no history is requested */
    if (!monitoring_history)
        return;

    /*
     * Cancel any query that is still being executed, so we can insert the
     * current record
     */
    if (!cancel_query(master_conn, local_options.master_response_timeout))
        return;
    if (wait_connection_availability(master_conn,
                                     local_options.master_response_timeout) != 1)
        return;

    /* Get local xlog info */
    sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP");

    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);
        /* if there is any error just let it be and retry in next loop */
        return;
    }

    strcpy(monitor_witness_timestamp, PQgetvalue(res, 0, 0));
    PQclear(res);

    /*
     * Build the SQL to execute on master
     */
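    /*
     * A witness does not replicate, so only the master's current WAL
     * location and the observation timestamp are meaningful here; the
     * standby-side fields are recorded as NULL and both lag values as
     * zero.
     */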
    sqlquery_snprintf(sqlquery,
                      "INSERT INTO %s.repl_monitor "
                      " (primary_node, standby_node, "
                      "  last_monitor_time, last_apply_time, "
                      "  last_wal_primary_location, last_wal_standby_location, "
                      "  replication_lag, apply_lag )"
                      " VALUES(%d, %d, "
                      "  '%s'::TIMESTAMP WITH TIME ZONE, NULL, "
                      "  pg_catalog.pg_current_xlog_location(), NULL, "
                      "  0, 0) ",
                      get_repmgr_schema_quoted(my_local_conn),
                      master_options.node,
                      local_options.node,
                      monitor_witness_timestamp);

    /*
     * Execute the query asynchronously, but don't check for a result. We will
     * check the result next time we pause for a monitor step.
     */
    if (PQsendQuery(master_conn, sqlquery) == 0)
        log_warning(_("query could not be sent to master: %s\n"),
                    PQerrorMessage(master_conn));
}


/*
 * standby_monitor()
 *
 * Monitor standby server and handle failover situation. Also insert
 * monitoring information if configured.
 */
static void
standby_monitor(void)
{
    PGresult *res;
    char monitor_standby_timestamp[MAXLEN];
    char last_wal_primary_location[MAXLEN];
    char last_xlog_receive_location[MAXLEN];
    char last_xlog_replay_location[MAXLEN];
    char last_xact_replay_timestamp[MAXLEN];
    bool last_xlog_receive_location_gte_replayed;
    char sqlquery[QUERY_STR_LEN];

    XLogRecPtr lsn_master_current_xlog_location;
    XLogRecPtr lsn_last_xlog_receive_location;
    XLogRecPtr lsn_last_xlog_replay_location;

    long long unsigned int replication_lag;
    long long unsigned int apply_lag;

    int connection_retries,
        ret;
    bool did_retry = false;

    PGconn *upstream_conn;
    char upstream_conninfo[MAXCONNINFO];
    int upstream_node_id;
    t_node_info upstream_node;

    int active_master_id;
    const char *upstream_node_type = NULL;

    bool receiving_streamed_wal = true;

    /*
     * Verify that the local node is still available - if not there's
     * no point in doing much else anyway
     */
    if (!check_connection(&my_local_conn, "standby", NULL))
    {
        PQExpBufferData errmsg;

        set_local_node_status();

        initPQExpBuffer(&errmsg);

        appendPQExpBuffer(&errmsg,
                          _("failed to connect to local node, node marked as failed!"));

        log_err("%s\n", errmsg.data);

        goto continue_monitoring_standby;
    }

    upstream_conn = get_upstream_connection(my_local_conn,
                                            local_options.cluster_name,
                                            local_options.node,
                                            &upstream_node_id,
                                            upstream_conninfo);

    upstream_node_type = (upstream_node_id == master_options.node)
        ? "master"
        : "upstream";

    /*
     * Check that the upstream node is still available.
     * If not, initiate the failover process.
     *
     * This takes up to local_options.reconnect_attempts *
     * local_options.reconnect_interval seconds.
     */
    check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);

    if (PQstatus(upstream_conn) != CONNECTION_OK)
    {
        PQfinish(upstream_conn);
        upstream_conn = NULL;

        if (local_options.failover == MANUAL_FAILOVER)
        {
            log_err(_("unable to reconnect to %s, now checking if another node has been promoted\n"), upstream_node_type);

            for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
            {
                master_conn = get_master_connection(my_local_conn,
                                                    local_options.cluster_name, &master_options.node, NULL);
                if (PQstatus(master_conn) == CONNECTION_OK)
                {
                    /*
                     * Connected - we can continue the process, so break
                     * out of the loop
                     */
                    log_err(_("connected to node %d, continuing monitoring.\n"),
                            master_options.node);
                    break;
                }
                else
                {
                    log_err(
                        _("no new master found, waiting %i seconds before retry...\n"),
                        local_options.retry_promote_interval_secs
                        );

                    sleep(local_options.retry_promote_interval_secs);
                }
            }

            if (PQstatus(master_conn) != CONNECTION_OK)
            {
                PQExpBufferData errmsg;
                initPQExpBuffer(&errmsg);

                appendPQExpBuffer(&errmsg,
                                  _("unable to reconnect to master after %i attempts, terminating..."),
                                  local_options.reconnect_attempts);

                log_err("%s\n", errmsg.data);

                create_event_record(NULL,
                                    &local_options,
                                    local_options.node,
                                    "repmgrd_shutdown",
                                    false,
                                    errmsg.data);

                terminate(ERR_DB_CON);
            }
        }
        else if (local_options.failover == AUTOMATIC_FAILOVER)
        {
            /*
             * When we return from this function we will have a new master
             * and a new master_conn
             *
             * Failover is handled differently depending on whether
             * the failed node is the master or a cascading standby
             */
            upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);

            if (upstream_node.type == MASTER)
            {
                log_debug(_("failure detected on master node (%i); attempting to promote a standby\n"),
                          node_info.upstream_node_id);
                do_master_failover();
            }
            else
            {
                log_debug(_("failure detected on upstream node %i; attempting to reconnect to new upstream node\n"),
                          node_info.upstream_node_id);

                if (!do_upstream_standby_failover(upstream_node))
                {
                    PQExpBufferData errmsg;
                    initPQExpBuffer(&errmsg);

                    appendPQExpBuffer(&errmsg,
                                      _("unable to reconnect to new upstream node, terminating..."));

                    log_err("%s\n", errmsg.data);

                    create_event_record(master_conn,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_shutdown",
                                        false,
                                        errmsg.data);

                    terminate(ERR_DB_CON);
                }
            }
            return;
        }
    }

    PQfinish(upstream_conn);

continue_monitoring_standby:
    /* Check if we are still a standby - we could have been promoted */
    do
    {
        ret = is_standby(my_local_conn);

        switch (ret)
        {
            case 0:
                /*
                 * This situation can occur if `pg_ctl promote` was manually executed
                 * on the node. If the original master is still running after this
                 * node has been promoted, we're in a "two brain" situation which
                 * will require manual resolution as there's no way of determining
                 * which master is the correct one.
                 *
                 * We should log a message so the user knows of the situation at hand.
                 *
                 * XXX check if the original master is still active and display a
                 * warning
                 */
                log_err(_("it seems this server was promoted manually (not by repmgr), so you might be in the presence of a split-brain.\n"));
                log_err(_("check your cluster and manually fix any anomaly.\n"));
                terminate(1);
                break;

            case -1:
                log_err(_("standby node has disappeared, trying to reconnect...\n"));
                did_retry = true;

                if (!check_connection(&my_local_conn, "standby", NULL))
                {
                    set_local_node_status();
                    /*
                     * Let's continue checking, and if the postgres server on the
                     * standby comes back up, we will activate it again
                     */
                }

                break;
        }
    } while (ret == -1);

    if (did_retry)
    {
        /*
         * There's a possible situation where the standby went down for some reason
         * (maintenance for example) and is now up and maybe connected once again to
         * the stream. If we set the local standby node as failed and it's now running
         * and receiving replication data, we should activate it again.
         */
        set_local_node_status();
        log_info(_("standby connection recovered!\n"));
    }

    /* Fast path for the case where no history is requested */
    if (!monitoring_history)
        return;

    /*
     * If the original master has gone away we'll need to get the new one
     * from the upstream node to write monitoring information
     */
    upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);

    sqlquery_snprintf(sqlquery,
                      "SELECT id "
                      "  FROM %s.repl_nodes "
                      " WHERE type = 'master' "
                      "   AND active IS TRUE ",
                      get_repmgr_schema_quoted(my_local_conn));

    res = PQexec(my_local_conn, sqlquery);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("standby_monitor() - query error: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);

        /* Not a fatal error, just means no monitoring records will be written */
        return;
    }

    if (PQntuples(res) == 0)
    {
        log_err(_("standby_monitor(): no active master found\n"));
        PQclear(res);
        return;
    }

    active_master_id = atoi(PQgetvalue(res, 0, 0));
    PQclear(res);

    if (active_master_id != master_options.node)
    {
        log_notice(_("connecting to active master (node %i)...\n"), active_master_id);
        if (master_conn != NULL)
        {
            PQfinish(master_conn);
        }
        master_conn = get_master_connection(my_local_conn,
                                            local_options.cluster_name,
                                            &master_options.node, NULL);
    }
    if (PQstatus(master_conn) != CONNECTION_OK)
        PQreset(master_conn);

    /*
     * Cancel any query that is still being executed, so we can insert the
     * current record
     */
    if (!cancel_query(master_conn, local_options.master_response_timeout))
        return;
    if (wait_connection_availability(master_conn, local_options.master_response_timeout) != 1)
        return;

    /* Get local xlog info */
    sqlquery_snprintf(sqlquery,
                      "SELECT CURRENT_TIMESTAMP, "
                      "pg_catalog.pg_last_xlog_receive_location(), "
                      "pg_catalog.pg_last_xlog_replay_location(), "
                      "pg_catalog.pg_last_xact_replay_timestamp(), "
                      "pg_catalog.pg_last_xlog_receive_location() >= pg_catalog.pg_last_xlog_replay_location()");

    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);
        /* if there is any error just let it be and retry in next loop */
        return;
    }

    strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
    strncpy(last_xlog_receive_location, PQgetvalue(res, 0, 1), MAXLEN);
    strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
    strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);

    last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
        ? true
        : false;

    /*
     * If pg_last_xlog_receive_location() is NULL, this means we're in archive
     * recovery and will need to calculate lag based on
     * pg_last_xlog_replay_location()
     */
    if (PQgetisnull(res, 0, 1))
    {
        receiving_streamed_wal = false;
    }

    PQclear(res);

    /*
     * In the unusual event of a standby becoming disconnected from the primary
     * while this repmgrd remains connected to the primary, subtracting
     * "last_xlog_replay_location" from "last_xlog_receive_location" and coercing to
     * (long long unsigned int) would result in a meaningless, very large
     * value which will overflow a BIGINT column and spew error messages into the
     * PostgreSQL log. In the absence of a better strategy, skip attempting
     * to insert a monitoring record.
     */
    if (receiving_streamed_wal == true && last_xlog_receive_location_gte_replayed == false)
    {
        log_verbose(LOG_WARNING,
                    "replayed WAL newer than received WAL - is this standby connected to its upstream?\n");
    }

    /*
     * Get master xlog position
     *
     * TODO: investigate whether pg_current_xlog_insert_location() would be a better
     * choice; see: https://github.com/2ndQuadrant/repmgr/issues/189
     */
    sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()");

    res = PQexec(master_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(master_conn));
        PQclear(res);
        return;
    }

    strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
    PQclear(res);

    lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_primary_location, NULL);
    lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
    lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
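    /*
     * XLogRecPtr values are plain byte positions in the WAL stream, so the
     * differences computed below are byte counts: apply_lag is the amount
     * of received-but-unreplayed WAL, replication_lag the amount of WAL
     * the master has written but this standby has not yet received.
     */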
    /* Calculate apply lag */
    if (last_xlog_receive_location_gte_replayed == false)
    {
        /*
         * We're not receiving streaming WAL - in this case the receive location
         * equals the last replayed location
         */
        apply_lag = 0;
        strncpy(last_xlog_receive_location, last_xlog_replay_location, MAXLEN);
    }
    else
    {
        apply_lag = (long long unsigned int) lsn_last_xlog_receive_location - lsn_last_xlog_replay_location;
    }

    /* Calculate replication lag */
    if (lsn_master_current_xlog_location >= lsn_last_xlog_receive_location)
    {
        replication_lag = (long long unsigned int) (lsn_master_current_xlog_location - lsn_last_xlog_receive_location);
    }
    else
    {
        /* This should never happen, but in case it does set lag to zero */
        log_warning("master xlog location (%s) appears less than standby receive location (%s)\n",
                    last_wal_primary_location,
                    last_xlog_receive_location);
        replication_lag = 0;
    }

    /*
     * Build the SQL to execute on master
     */
    sqlquery_snprintf(sqlquery,
                      "INSERT INTO %s.repl_monitor "
                      " (primary_node, "
                      "  standby_node, "
                      "  last_monitor_time, "
                      "  last_apply_time, "
                      "  last_wal_primary_location, "
                      "  last_wal_standby_location, "
                      "  replication_lag, "
                      "  apply_lag ) "
                      " VALUES(%d, "
                      "  %d, "
                      "  '%s'::TIMESTAMP WITH TIME ZONE, "
                      "  '%s'::TIMESTAMP WITH TIME ZONE, "
                      "  '%s', "
                      "  '%s', "
                      "  %llu, "
                      "  %llu) ",
                      get_repmgr_schema_quoted(master_conn),
                      master_options.node,
                      local_options.node,
                      monitor_standby_timestamp,
                      last_xact_replay_timestamp,
                      last_wal_primary_location,
                      last_xlog_receive_location,
                      replication_lag,
                      apply_lag);

    /*
     * Execute the query asynchronously, but don't check for a result. We will
     * check the result next time we pause for a monitor step.
     */
    log_verbose(LOG_DEBUG, "standby_monitor() %s\n", sqlquery);

    if (PQsendQuery(master_conn, sqlquery) == 0)
        log_warning(_("query could not be sent to master: %s\n"),
                    PQerrorMessage(master_conn));
}


/*
 * do_master_failover()
 *
 * Handles failover to new cluster master
 */
static void
do_master_failover(void)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];

    int total_nodes = 0;
    int visible_nodes = 0;
    int ready_nodes = 0;

    bool candidate_found = false;

    int i;
    int r;

    XLogRecPtr xlog_recptr;
    bool lsn_format_ok;

    char last_xlog_replay_location[MAXLEN];

    PGconn *node_conn = NULL;

    /*
     * We'll retrieve info about up to FAILOVER_NODES_MAX_CHECK (50) nodes,
     * which seems to be large enough for most scenarios
     */
    t_node_info nodes[FAILOVER_NODES_MAX_CHECK];

    /* Store details of the failed node here */
    t_node_info failed_master = T_NODE_INFO_INITIALIZER;

    /* Store details of the best candidate for promotion to master here */
    t_node_info best_candidate = T_NODE_INFO_INITIALIZER;

    /* get a list of standby nodes, including myself */
    sqlquery_snprintf(sqlquery,
                      "SELECT id, conninfo, type, upstream_node_id "
                      "  FROM %s.repl_nodes "
                      " WHERE cluster = '%s' "
                      "   AND active IS TRUE "
                      "   AND priority > 0 "
                      " ORDER BY priority DESC, id "
                      " LIMIT %i ",
                      get_repmgr_schema_quoted(my_local_conn),
                      local_options.cluster_name,
                      FAILOVER_NODES_MAX_CHECK);

    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("unable to retrieve node records: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);
        terminate(ERR_DB_QUERY);
    }

    /*
     * total nodes that are registered
     */
    total_nodes = PQntuples(res);
    log_debug(_("%d active nodes registered\n"), total_nodes);

    /*
     * Build an array with the nodes and indicate which ones are visible and
     * ready
     */
    for (i = 0; i < total_nodes; i++)
    {
        nodes[i].node_id = atoi(PQgetvalue(res, i, 0));

        strncpy(nodes[i].conninfo_str, PQgetvalue(res, i, 1), MAXCONNINFO);

        nodes[i].type = parse_node_type(PQgetvalue(res, i, 2));

        /* Copy details of the failed node */
        /* XXX only node_id is actually used later */
        if (nodes[i].type == MASTER)
        {
            failed_master.node_id = nodes[i].node_id;
            failed_master.xlog_location = nodes[i].xlog_location;
            failed_master.is_ready = nodes[i].is_ready;
        }

        nodes[i].upstream_node_id = atoi(PQgetvalue(res, i, 3));

        /*
         * Initialize to false so that if we can't reach this node we know
         * that later
         */
        nodes[i].is_visible = false;
        nodes[i].is_ready = false;

        nodes[i].xlog_location = InvalidXLogRecPtr;

        log_debug(_("node=%d conninfo=\"%s\" type=%s\n"),
                  nodes[i].node_id, nodes[i].conninfo_str,
                  PQgetvalue(res, i, 2));

        node_conn = establish_db_connection(nodes[i].conninfo_str, false);

        /* if we can't see the node just skip it */
        if (PQstatus(node_conn) != CONNECTION_OK)
        {
            if (node_conn != NULL)
                PQfinish(node_conn);

            continue;
        }

        visible_nodes++;
        nodes[i].is_visible = true;

        PQfinish(node_conn);
    }
    PQclear(res);

    log_debug(_("total nodes counted: registered=%d, visible=%d\n"),
              total_nodes, visible_nodes);

    /*
     * Am I in the group that should stay alive? If I can see fewer than
     * half of total_nodes then I should do nothing
     */
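    /*
     * For example, with five registered nodes a node must be able to see
     * at least three (itself included) to take part in the failover; a
     * node on the minority side of a network partition bows out here
     * rather than risk a split-brain promotion.
     */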
    if (visible_nodes < (total_nodes / 2.0))
    {
        log_err(_("unable to reach most of the nodes.\n"
                  "Let the other standby servers decide which one will be the master.\n"
                  "Manual action will be needed to re-add this node to the cluster.\n"));
        terminate(ERR_FAILOVER_FAIL);
    }

    /* Query all available nodes to determine readiness and LSN */
    for (i = 0; i < total_nodes; i++)
    {
        log_debug("checking node %i...\n", nodes[i].node_id);

        /* if the node is not visible, skip it */
        if (!nodes[i].is_visible)
            continue;

        /* if the node is a witness node, skip it */
        if (nodes[i].type == WITNESS)
            continue;

        /* if the node does not have the same upstream node, skip it */
        if (nodes[i].upstream_node_id != node_info.upstream_node_id)
            continue;

        node_conn = establish_db_connection(nodes[i].conninfo_str, false);

        /*
         * XXX This shouldn't happen; if it does it indicates a more serious
         * problem, perhaps a network outage - either way it's better for a
         * human to react
         */
        if (PQstatus(node_conn) != CONNECTION_OK)
        {
            log_err(_("it seems new problems are arising, manual intervention is needed\n"));
            terminate(ERR_FAILOVER_FAIL);
        }

        sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
        res = PQexec(node_conn, sqlquery);
        if (PQresultStatus(res) != PGRES_TUPLES_OK)
        {
            log_info(_("unable to retrieve node's last standby location: %s\n"),
                     PQerrorMessage(node_conn));

            log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
            PQclear(res);
            PQfinish(node_conn);
            terminate(ERR_FAILOVER_FAIL);
        }

        xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);

        log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));

        PQclear(res);
        PQfinish(node_conn);

        /* If position is 0/0, error */
        /* XXX do we need to terminate ourselves if the queried node has a problem? */
        if (xlog_recptr == InvalidXLogRecPtr)
        {
            log_err(_("InvalidXLogRecPtr detected on standby node %i\n"), nodes[i].node_id);
            terminate(ERR_FAILOVER_FAIL);
        }

        nodes[i].xlog_location = xlog_recptr;
    }

    /* Finally, get info about this node and update shared memory */
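    /*
     * Publishing the local receive location makes it available to the
     * other nodes' repmgrd instances: the repmgr_get_last_standby_location()
     * calls below read the value each peer has published, presumably via
     * the shared memory segment set up by the repmgr_funcs shared library.
     */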
    sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s.\nReporting an invalid value so this node "
                  "is not considered as the new master, and exiting.\n"),
                PQerrorMessage(my_local_conn));
        PQclear(res);
        sprintf(last_xlog_replay_location, "'%X/%X'", 0, 0);
        update_shared_memory(last_xlog_replay_location);
        terminate(ERR_DB_QUERY);
    }
    /* write last location in shared memory */
    update_shared_memory(PQgetvalue(res, 0, 0));
    PQclear(res);

    /* Wait for each node to come up and report a valid LSN */
    for (i = 0; i < total_nodes; i++)
    {
        /*
         * ensure the witness server is marked as ready, and skip the
         * LSN check
         */
        if (nodes[i].type == WITNESS)
        {
            if (!nodes[i].is_ready)
            {
                nodes[i].is_ready = true;
                ready_nodes++;
            }
            continue;
        }

        /* if the node is not visible, skip it */
        if (!nodes[i].is_visible)
            continue;

        /* if the node does not have the same upstream node, skip it */
        if (nodes[i].upstream_node_id != node_info.upstream_node_id)
            continue;

        node_conn = establish_db_connection(nodes[i].conninfo_str, false);

        /*
         * XXX This shouldn't happen; if it does it indicates a more
         * serious problem, perhaps a network outage - either way it's
         * better for a human to react
         */
        if (PQstatus(node_conn) != CONNECTION_OK)
        {
            /* XXX */
            log_info(_("at this point some acceptable race conditions could "
                       "occur; assume the node is restarting and continue "
                       "the failover procedure\n"));
            continue;
        }

        while (!nodes[i].is_ready)
        {
            sqlquery_snprintf(sqlquery,
                              "SELECT %s.repmgr_get_last_standby_location()",
                              get_repmgr_schema_quoted(node_conn));
            res = PQexec(node_conn, sqlquery);
            if (PQresultStatus(res) != PGRES_TUPLES_OK)
            {
                log_err(_("PQexec failed: %s.\nReporting an invalid value so this "
                          "node is not considered as the new master, and exiting.\n"),
                        PQerrorMessage(node_conn));
                PQclear(res);
                PQfinish(node_conn);
                terminate(ERR_DB_QUERY);
            }

            xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);

            /*
             * If the position is reported as "invalid", check for a format
             * error or an empty string; otherwise the position is 0/0 and
             * we need to continue looping until a valid LSN is reported
             */
            if (xlog_recptr == InvalidXLogRecPtr)
            {
                if (lsn_format_ok == false)
                {
                    /* Unable to parse value returned by `repmgr_get_last_standby_location()` */
                    if (*PQgetvalue(res, 0, 0) == '\0')
                    {
                        log_crit(
                            _("unable to obtain LSN from node %i"), nodes[i].node_id
                            );
                        log_hint(
                            _("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n")
                            );

                        PQclear(res);
                        PQfinish(node_conn);
                        exit(ERR_BAD_CONFIG);
                    }

                    /*
                     * Very unlikely to happen; in the absence of any better
                     * strategy keep checking
                     */
                    log_warning(_("unable to parse LSN \"%s\"\n"),
                                PQgetvalue(res, 0, 0));
                }
                else
                {
                    log_debug(
                        _("invalid LSN returned from node %i: '%s'\n"),
                        nodes[i].node_id,
                        PQgetvalue(res, 0, 0)
                        );
                }

                PQclear(res);

                /* If position is 0/0, keep checking */
                /* XXX we should add a timeout here to prevent infinite looping
                 * if the other node's repmgrd is not up
                 */
                continue;
            }

            if (nodes[i].xlog_location < xlog_recptr)
            {
                nodes[i].xlog_location = xlog_recptr;
            }

            log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));
            PQclear(res);

            ready_nodes++;
            nodes[i].is_ready = true;
        }

        PQfinish(node_conn);
    }

    /*
     * determine which node is the best candidate to promote to master
     */
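    /*
     * Nodes arrive here ordered by priority (highest first, per the
     * ORDER BY in the query above), so the first ready node seeds the
     * choice and is only displaced by a node with a strictly higher WAL
     * receive location - i.e. priority breaks ties, but a less-lagged
     * standby always wins.
     */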
    for (i = 0; i < total_nodes; i++)
    {
        /* witness server can never be a candidate */
        if (nodes[i].type == WITNESS)
            continue;

        if (!nodes[i].is_ready || !nodes[i].is_visible)
            continue;

        if (!candidate_found)
        {
            /*
             * If no candidate has been found so far, the first visible and ready
             * node becomes the best candidate by default
             */
            best_candidate.node_id = nodes[i].node_id;
            best_candidate.xlog_location = nodes[i].xlog_location;
            best_candidate.is_ready = nodes[i].is_ready;
            strncpy(best_candidate.conninfo_str, nodes[i].conninfo_str, MAXCONNINFO);
            candidate_found = true;
        }

        /*
         * Nodes are retrieved ordered by priority, so if the current best
         * candidate's WAL location is lower than the next node's, assign
         * the next node as the new best candidate.
         */
        if (best_candidate.xlog_location < nodes[i].xlog_location)
        {
            best_candidate.node_id = nodes[i].node_id;
            best_candidate.xlog_location = nodes[i].xlog_location;
            best_candidate.is_ready = nodes[i].is_ready;
            strncpy(best_candidate.conninfo_str, nodes[i].conninfo_str, MAXCONNINFO);
        }
    }

    /* Terminate if no candidate found */
    if (!candidate_found)
    {
        log_err(_("no suitable candidate for promotion found; terminating.\n"));
        terminate(ERR_FAILOVER_FAIL);
    }

    log_debug("best candidate node id is %i\n", best_candidate.node_id);

    /* if the local node is the best candidate, promote it */
    if (best_candidate.node_id == local_options.node)
    {
        PQExpBufferData event_details;

        /* Close the connection to this server */
        PQfinish(my_local_conn);
        my_local_conn = NULL;

        initPQExpBuffer(&event_details);
        /* wait */
        sleep(5);

        log_notice(_("this node is the best candidate to be the new master, promoting...\n"));

        log_debug("promote command is: \"%s\"\n",
                  local_options.promote_command);

        if (log_type == REPMGR_STDERR && *local_options.logfile)
        {
            fflush(stderr);
        }

        r = system(local_options.promote_command);
        if (r != 0)
        {
            /*
             * Check whether the primary reappeared, which will have caused
             * the promote command to fail
             */
            my_local_conn = establish_db_connection(local_options.conninfo, false);

            if (my_local_conn != NULL)
            {
                int master_node_id;

                master_conn = get_master_connection(my_local_conn,
                                                    local_options.cluster_name,
                                                    &master_node_id, NULL);

                if (master_conn != NULL && master_node_id == failed_master.node_id)
                {
                    log_notice(_("original master reappeared before this standby was promoted - no action taken\n"));

                    PQfinish(master_conn);
                    master_conn = NULL;

                    /* no failover occurred but we'll want to restart connections */
                    failover_done = true;
                    return;
                }
            }

            log_err(_("promote command failed; check it and try executing it manually.\n"));

            terminate(ERR_DB_QUERY);
        }

        /* and reconnect to the local database */
        my_local_conn = establish_db_connection(local_options.conninfo, true);

        /* update internal record for this node */
        node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);

        appendPQExpBuffer(&event_details,
                          _("node %i promoted to master; old master %i marked as failed"),
                          node_info.node_id,
                          failed_master.node_id);

        /* my_local_conn is now the master */
        create_event_record(my_local_conn,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_promote",
                            true,
                            event_details.data);

    }
    /* local node is not the promotion candidate - find the new master */
    else
    {
        PGconn *new_master_conn;
        PQExpBufferData event_details;
        int master_node_id;

        initPQExpBuffer(&event_details);

        /* wait */
        sleep(10);

        /*
         * Check whether the primary reappeared while we were waiting, so we
         * don't end up following the promotion candidate
         */
        master_conn = get_master_connection(my_local_conn,
                                            local_options.cluster_name,
                                            &master_node_id, NULL);

        if (master_conn != NULL && master_node_id == failed_master.node_id)
        {
            log_notice(_("original master reappeared - no action taken\n"));

            PQfinish(master_conn);
            /* no failover occurred but we'll want to restart connections */
            failover_done = true;
            return;
        }

        /* Close the connection to this server */
        PQfinish(my_local_conn);
        my_local_conn = NULL;

        /* XXX double-check the promotion candidate did become the new primary */

        log_notice(_("node %d is the best candidate for new master, attempting to follow...\n"),
                   best_candidate.node_id);

        /*
         * The new master may take some time to be promoted. The follow
         * command should take care of that.
         */
        if (log_type == REPMGR_STDERR && *local_options.logfile)
        {
            fflush(stderr);
        }

        log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);

        r = system(local_options.follow_command);
        if (r != 0)
        {
            appendPQExpBuffer(&event_details,
                              _("Unable to execute follow command:\n %s"),
                              local_options.follow_command);

            log_err("%s\n", event_details.data);

            /* It won't be possible to write to the event notification
             * table but we should be able to generate an external notification
             * if required.
             */
            create_event_record(NULL,
                                &local_options,
                                node_info.node_id,
                                "repmgrd_failover_follow",
                                false,
                                event_details.data);

            terminate(ERR_BAD_CONFIG);
        }

        /* and reconnect to the local database */
        my_local_conn = establish_db_connection(local_options.conninfo, true);

        /* update internal record for this node */
        new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);

        node_info = get_node_info(new_master_conn, local_options.cluster_name, local_options.node);
        appendPQExpBuffer(&event_details,
                          _("node %i now following new upstream node %i"),
                          node_info.node_id,
                          best_candidate.node_id);

        log_notice("%s\n", event_details.data);

        create_event_record(new_master_conn,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_follow",
                            true,
                            event_details.data);

        PQfinish(new_master_conn);
        termPQExpBuffer(&event_details);
    }

    /* to force it to re-calculate mode and master node */
    // ^ ZZZ check that behaviour ^
    failover_done = true;
}


/*
 * do_upstream_standby_failover()
 *
 * Attach a cascaded standby to a new upstream server.
 *
 * Currently we will try to attach to the failed upstream's upstream.
 * It might be worth providing a selection of reconnection strategies
 * as different behaviour might be desirable in different situations;
 * or maybe the option not to reconnect might be required?
 *
 * XXX check this handles replication slots gracefully
 */
static bool
do_upstream_standby_failover(t_node_info upstream_node)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];
    int upstream_node_id = node_info.upstream_node_id;
    int r;
    PQExpBufferData event_details;

    log_debug(_("do_upstream_standby_failover(): performing failover for node %i\n"),
              node_info.node_id);

    /*
     * Verify that we can still talk to the cluster master even though
     * the node's upstream is not available
     */
    if (!check_connection(&master_conn, "master", NULL))
    {
        log_err(_("do_upstream_standby_failover(): unable to connect to last known master node\n"));
        return false;
    }
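    /*
     * Walk up the repl_nodes upstream chain until an active node is found:
     * each pass looks up the current candidate's record on the master and,
     * if that node is inactive, moves on to its own upstream_node_id,
     * sleeping reconnect_interval seconds between passes. Reaching an
     * inactive master means there is nothing left to reattach to.
     */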
    while (1)
    {
        sqlquery_snprintf(sqlquery,
                          "SELECT id, active, upstream_node_id, type, conninfo "
                          "  FROM %s.repl_nodes "
                          " WHERE id = %i ",
                          get_repmgr_schema_quoted(master_conn),
                          upstream_node_id);

        res = PQexec(master_conn, sqlquery);

        if (PQresultStatus(res) != PGRES_TUPLES_OK)
        {
            log_err(_("unable to query cluster master: %s\n"), PQerrorMessage(master_conn));
            PQclear(res);
            return false;
        }

        if (PQntuples(res) == 0)
        {
            log_err(_("no node with id %i found\n"), upstream_node_id);
            PQclear(res);
            return false;
        }

        /* upstream node is inactive */
        if (strcmp(PQgetvalue(res, 0, 1), "f") == 0)
        {
            /*
             * Upstream node is an inactive master, meaning there are no direct
             * upstream nodes available to reattach to.
             *
             * XXX For now we'll simply terminate, however it would make sense to
             * provide an option to either try and find the current master and/or
             * a strategy to connect to a different upstream node
             */
            if (strcmp(PQgetvalue(res, 0, 4), "master") == 0)
            {
                log_err(_("unable to find active master node\n"));
                PQclear(res);
                return false;
            }

            upstream_node_id = atoi(PQgetvalue(res, 0, 2));
        }
        else
        {
            upstream_node_id = atoi(PQgetvalue(res, 0, 0));

            log_notice(_("found active upstream node with id %i\n"), upstream_node_id);
            PQclear(res);
            break;
        }

        PQclear(res);
        sleep(local_options.reconnect_interval);
    }

    /* Close the connection to this server */
    PQfinish(my_local_conn);
    my_local_conn = NULL;

    initPQExpBuffer(&event_details);

    /* Follow new upstream */
    r = system(local_options.follow_command);
    if (r != 0)
    {
        appendPQExpBuffer(&event_details,
                          _("Unable to execute follow command:\n %s"),
                          local_options.follow_command);

        log_err("%s\n", event_details.data);

        /* It won't be possible to write to the event notification
         * table but we should be able to generate an external notification
         * if required.
         */
        create_event_record(NULL,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_follow",
                            false,
                            event_details.data);
        terminate(ERR_BAD_CONFIG);
    }

    if (update_node_record_set_upstream(master_conn, local_options.cluster_name, node_info.node_id, upstream_node_id) == false)
    {
        appendPQExpBuffer(&event_details,
                          _("Unable to set node %i's new upstream ID to %i"),
                          node_info.node_id,
                          upstream_node_id);
        create_event_record(NULL,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_follow",
                            false,
                            event_details.data);
        terminate(ERR_BAD_CONFIG);
    }

    appendPQExpBuffer(&event_details,
                      _("node %i is now following upstream node %i"),
                      node_info.node_id,
                      upstream_node_id);

    create_event_record(NULL,
                        &local_options,
                        node_info.node_id,
                        "repmgrd_failover_follow",
                        true,
                        event_details.data);

    my_local_conn = establish_db_connection(local_options.conninfo, true);

    return true;
}


static bool
check_connection(PGconn **conn, const char *type, const char *conninfo)
{
    int connection_retries;

    /*
     * Check if the node is still available; if after
     * local_options.reconnect_attempts * local_options.reconnect_interval
     * seconds of retries we cannot reconnect, return false
     */
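    /*
     * For example, with reconnect_attempts=6 and reconnect_interval=10 a
     * lost connection is retried for up to a minute before this function
     * gives up and the caller treats the node as failed.
     */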
    for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
    {
        if (*conn == NULL)
        {
            if (conninfo == NULL)
            {
                log_err("INTERNAL ERROR: *conn == NULL && conninfo == NULL\n");
                terminate(ERR_INTERNAL);
            }
            *conn = establish_db_connection(conninfo, false);
        }
        if (!is_pgup(*conn, local_options.master_response_timeout))
        {
            log_warning(_("connection to %s has been lost, trying to recover... %i seconds before failover decision\n"),
                        type,
                        (local_options.reconnect_interval * (local_options.reconnect_attempts - connection_retries)));
            /* wait local_options.reconnect_interval seconds between retries */
            sleep(local_options.reconnect_interval);
        }
        else
        {
            if (connection_retries > 0)
            {
                log_info(_("connection to %s has been restored.\n"), type);
            }
            return true;
        }
    }

    if (!is_pgup(*conn, local_options.master_response_timeout))
    {
        log_err(_("unable to reconnect to %s (timeout %i seconds)...\n"),
                type,
                local_options.master_response_timeout
                );

        return false;
    }

    return true;
}


/*
 * set_local_node_status()
 *
 * If failure of the local node is detected, attempt to connect
 * to the current master server (as stored in the global variable
 * `master_conn`) and update its record to failed.
 */
static bool
set_local_node_status(void)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];
    int active_master_node_id = NODE_NOT_FOUND;
    char master_conninfo[MAXLEN];

    if (!check_connection(&master_conn, "master", NULL))
    {
        log_err(_("set_local_node_status(): unable to connect to last known master node\n"));
        return false;
    }

    /*
     * Check that the node `master_conn` is connected to is still the
     * master - it's just about conceivable that it might have become a
     * standby of a new master in the intervening period
     */
    sqlquery_snprintf(sqlquery,
                      "SELECT id, conninfo "
                      "  FROM %s.repl_nodes "
                      " WHERE type = 'master' "
                      "   AND active IS TRUE ",
                      get_repmgr_schema_quoted(master_conn));

    res = PQexec(master_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("unable to obtain record for active master: %s\n"),
                PQerrorMessage(master_conn));
        PQclear(res);

        return false;
    }

    if (!PQntuples(res))
    {
        log_err(_("no active master record found\n"));
        PQclear(res);
        return false;
    }

    active_master_node_id = atoi(PQgetvalue(res, 0, 0));
    strncpy(master_conninfo, PQgetvalue(res, 0, 1), MAXLEN);
    PQclear(res);

    if (active_master_node_id != master_options.node)
    {
        log_notice(_("current active master is %i; attempting to connect\n"),
                   active_master_node_id);
        PQfinish(master_conn);
        master_conn = establish_db_connection(master_conninfo, false);

        if (PQstatus(master_conn) != CONNECTION_OK)
        {
            log_err(_("unable to connect to active master\n"));
            return false;
        }

        log_notice(_("connection to new master was successful\n"));
    }

    /*
     * Attempt to set the node's active record to the correct value:
     * active if the local node reports itself as a standby, inactive
     * otherwise
     */
    if (!update_node_record_status(master_conn,
                                   local_options.cluster_name,
                                   node_info.node_id,
                                   "standby",
                                   node_info.upstream_node_id,
                                   is_standby(my_local_conn) == 1))
    {
        log_err(_("unable to set local node %i as inactive on master: %s\n"),
                node_info.node_id,
                PQerrorMessage(master_conn));

        return false;
    }

    log_notice(_("marking this node (%i) as inactive on master\n"), node_info.node_id);
    return true;
}


static void
check_cluster_configuration(PGconn *conn)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];

    log_info(_("checking cluster configuration with schema '%s'\n"), get_repmgr_schema());

    sqlquery_snprintf(sqlquery,
                      "SELECT oid FROM pg_catalog.pg_class "
                      " WHERE oid = '%s.repl_nodes'::regclass ",
                      get_repmgr_schema_quoted(master_conn));

    res = PQexec(conn, sqlquery);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(conn));
        PQclear(res);
        terminate(ERR_DB_QUERY);
    }

    /*
     * If there aren't any results then we have not configured a master node
     * yet in repmgr, or the connection string is pointing to the wrong
     * database.
     *
     * XXX if we are the master, should we try to create the tables needed?
     */
    if (PQntuples(res) == 0)
    {
        log_err(_("the replication cluster is not configured\n"));
        PQclear(res);
        terminate(ERR_BAD_CONFIG);
    }
    PQclear(res);
}
|
|
|
|
|
|
static void
check_node_configuration(void)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	/*
	 * Check if this node has an entry in `repl_nodes`
	 */
	log_info(_("checking node %d in cluster '%s'\n"),
			 local_options.node, local_options.cluster_name);

	sqlquery_snprintf(sqlquery,
					  "SELECT COUNT(*) "
					  "  FROM %s.repl_nodes "
					  " WHERE id = %d "
					  "   AND cluster = '%s' ",
					  get_repmgr_schema_quoted(my_local_conn),
					  local_options.node,
					  local_options.cluster_name);

	res = PQexec(my_local_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn));
		PQclear(res);
		terminate(ERR_BAD_CONFIG);
	}

	/*
	 * COUNT(*) always returns exactly one row, so examine the count value
	 * itself. If it is zero, this node has not yet been configured in
	 * repmgr; in that case insert the node into the cluster, unless it is
	 * a witness.
	 */
	if (atoi(PQgetvalue(res, 0, 0)) == 0)
	{
		PQclear(res);

		if (node_info.type == WITNESS)
		{
			log_err(_("the witness is not configured\n"));
			terminate(ERR_BAD_CONFIG);
		}

		/* Adding the node */
		log_info(_("adding node %d to cluster '%s'\n"),
				 local_options.node, local_options.cluster_name);

		/* XXX use create_node_record() */
		sqlquery_snprintf(sqlquery,
						  "INSERT INTO %s.repl_nodes"
						  "  (id, cluster, name, conninfo, priority, witness) "
						  " VALUES (%d, '%s', '%s', '%s', 0, FALSE) ",
						  get_repmgr_schema_quoted(master_conn),
						  local_options.node,
						  local_options.cluster_name,
						  local_options.node_name,
						  local_options.conninfo);

		/* check the command's result status, not merely for a NULL result */
		res = PQexec(master_conn, sqlquery);
		if (PQresultStatus(res) != PGRES_COMMAND_OK)
		{
			log_err(_("unable to insert node details, %s\n"),
					PQerrorMessage(master_conn));
			PQclear(res);
			terminate(ERR_BAD_CONFIG);
		}
		PQclear(res);
	}
	else
	{
		PQclear(res);
	}
}

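/*
 * For illustration (example values only): for a node with id 2, cluster
 * 'test' and name 'node2', the INSERT built above would read:
 *
 *     INSERT INTO repmgr_test.repl_nodes
 *       (id, cluster, name, conninfo, priority, witness)
 *     VALUES (2, 'test', 'node2', 'host=node2 dbname=repmgr', 0, FALSE);
 *
 * i.e. self-registration always uses priority 0 and witness = FALSE.
 */

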
/*
 * lsn_to_xlogrecptr()
 *
 * Convert an LSN represented as a string to an XLogRecPtr;
 * optionally set a flag to indicate whether the provided string
 * could be parsed
 */
static XLogRecPtr
lsn_to_xlogrecptr(char *lsn, bool *format_ok)
{
	uint32		xlogid;
	uint32		xrecoff;

	if (sscanf(lsn, "%X/%X", &xlogid, &xrecoff) != 2)
	{
		if (format_ok != NULL)
			*format_ok = false;
		log_err(_("incorrect log location format: %s\n"), lsn);
		return 0;
	}

	if (format_ok != NULL)
		*format_ok = true;

	/*
	 * From PostgreSQL 9.3, an LSN formatted as "X/X" consists of the high
	 * and low 32 bits of the 64-bit XLogRecPtr value
	 */
	return (((XLogRecPtr) xlogid << 32) + xrecoff);
}

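/*
 * Usage sketch (illustrative values): pg_last_xlog_replay_location()
 * might report "16/B374D848"; lsn_to_xlogrecptr() turns that into
 * (0x16 << 32) + 0xB374D848 = 0x16B374D848, so two locations can be
 * compared numerically:
 *
 *     bool format_ok;
 *     XLogRecPtr applied  = lsn_to_xlogrecptr("16/B374D848", &format_ok);
 *     XLogRecPtr received = lsn_to_xlogrecptr("16/B374D8E0", &format_ok);
 *
 *     if (format_ok && received > applied)
 *         ;	// standby still has WAL to replay
 */

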
static void
usage(void)
{
	log_err(_("%s: replication manager daemon\n"), progname());
	log_err(_("Try \"%s --help\" for more information.\n"), progname());
}


static void
help(void)
{
	printf(_("%s: replication management daemon for PostgreSQL\n"), progname());
	printf(_("\n"));
	printf(_("Usage:\n"));
	printf(_(" %s [OPTIONS]\n"), progname());
	printf(_("\n"));
	printf(_("Options:\n"));
	printf(_("  -?, --help                show this help, then exit\n"));
	printf(_("  -V, --version             output version information, then exit\n"));
	printf(_("  -v, --verbose             output verbose activity information\n"));
	printf(_("  -m, --monitoring-history  track each standby's replication lag in repl_monitor\n"));
	printf(_("  -f, --config-file=PATH    path to the configuration file\n"));
	printf(_("  -d, --daemonize           detach process from foreground\n"));
	printf(_("  -p, --pid-file=PATH       write a PID file\n"));
	printf(_("\n"));
	printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname());
}


#ifndef WIN32
static void
handle_sigint(SIGNAL_ARGS)
{
	terminate(0);
}

/* SIGHUP: set flag to re-read config file at next convenient time */
static void
handle_sighup(SIGNAL_ARGS)
{
	got_SIGHUP = true;
}

static void
setup_event_handlers(void)
{
	pqsignal(SIGHUP, handle_sighup);
	pqsignal(SIGINT, handle_sigint);
	pqsignal(SIGTERM, handle_sigint);
}
#endif

static void
terminate(int retval)
{
	close_connections();
	logger_shutdown();

	if (pid_file)
	{
		unlink(pid_file);
	}

	log_info(_("%s terminating...\n"), progname());

	exit(retval);
}


static void
update_shared_memory(char *last_xlog_replay_location)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	/* use the bounded variant rather than plain sprintf() */
	sqlquery_snprintf(sqlquery,
					  "SELECT %s.repmgr_update_standby_location('%s')",
					  get_repmgr_schema_quoted(my_local_conn),
					  last_xlog_replay_location);

	/* If an error occurs, just log it and continue */
	res = PQexec(my_local_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_warning(_("cannot update this standby's shared memory: %s\n"),
					PQerrorMessage(my_local_conn));
		/* XXX is this enough reason to terminate this repmgrd? */
	}
	else if (strcmp(PQgetvalue(res, 0, 0), "f") == 0)
	{
		/* this surely is more than enough reason to exit */
		log_crit(_("cannot update this standby's shared memory; check that shared_preload_libraries includes repmgr_funcs\n"));
		exit(ERR_BAD_CONFIG);
	}

	PQclear(res);
}

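/*
 * The query built above expands to, for example (schema name illustrative):
 *
 *     SELECT repmgr_test.repmgr_update_standby_location('0/3000060')
 *
 * repmgr_update_standby_location() is provided by the repmgr_funcs shared
 * library, which must be preloaded via shared_preload_libraries; it returns
 * FALSE if the shared memory segment is unavailable, which is the "f"
 * result handled above.
 */
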
static void
update_registration(void)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	sqlquery_snprintf(sqlquery,
					  "UPDATE %s.repl_nodes "
					  "   SET conninfo = '%s', "
					  "       priority = %d "
					  " WHERE id = %d ",
					  get_repmgr_schema_quoted(master_conn),
					  local_options.conninfo,
					  local_options.priority,
					  local_options.node);

	res = PQexec(master_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		PQExpBufferData errmsg;
		initPQExpBuffer(&errmsg);

		appendPQExpBuffer(&errmsg,
						  _("unable to update registration: %s"),
						  PQerrorMessage(master_conn));

		log_err("%s\n", errmsg.data);

		create_event_record(master_conn,
							&local_options,
							local_options.node,
							"repmgrd_shutdown",
							false,
							errmsg.data);
		terminate(ERR_DB_CON);
	}
	PQclear(res);
}

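/*
 * Detach from the controlling terminal using the classic double-fork
 * sequence: the first fork lets the original parent exit so the child is
 * not a process group leader, setsid() then creates a new session, and a
 * second fork ensures the daemon can never reacquire a controlling
 * terminal. Finally, chdir() to the configuration file's directory.
 */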
static void
do_daemonize()
{
	char	   *ptr,
				path[MAXLEN];
	pid_t		pid = fork();
	int			ret;

	switch (pid)
	{
		case -1:
			log_err("Error in fork(): %s\n", strerror(errno));
			exit(ERR_SYS_FAILURE);
			break;

		case 0:					/* child process */
			pid = setsid();
			if (pid == (pid_t) -1)
			{
				log_err("Error in setsid(): %s\n", strerror(errno));
				exit(ERR_SYS_FAILURE);
			}

			/* ensure that we are no longer able to open a terminal */
			pid = fork();

			if (pid == -1)		/* error case */
			{
				log_err("Error in fork(): %s\n", strerror(errno));
				exit(ERR_SYS_FAILURE);
			}

			if (pid != 0)		/* parent process */
			{
				exit(0);
			}

			/* the grandchild continues as the daemon */

			memset(path, 0, MAXLEN);

			/* extract the directory portion of the configuration file path */
			for (ptr = config_file + strlen(config_file); ptr > config_file; --ptr)
			{
				if (*ptr == '/')
				{
					strncpy(path, config_file, ptr - config_file);
					break;
				}
			}

			if (*path == '\0')
			{
				*path = '/';
			}

			ret = chdir(path);
			if (ret != 0)
			{
				log_err("Error changing directory to '%s': %s\n", path,
						strerror(errno));
			}

			break;

		default:				/* parent process */
			exit(0);
	}
}

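/*
 * Create a PID file, first checking for an existing one. kill(pid, 0)
 * sends no signal but reports, via its return value, whether a process
 * with that PID exists, so it serves as a liveness probe for a previous
 * repmgrd instance. Note this is only a heuristic: the PID may have been
 * recycled by an unrelated process.
 */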
static void
check_and_create_pid_file(const char *pid_file)
{
	struct stat st;
	FILE	   *fd;
	char		buff[MAXLEN];
	pid_t		pid;
	size_t		nread;

	if (stat(pid_file, &st) != -1)
	{
		memset(buff, 0, MAXLEN);

		fd = fopen(pid_file, "r");

		if (fd == NULL)
		{
			log_err("PID file %s exists but could not be opened for reading. "
					"If repmgrd is no longer alive remove the file and restart repmgrd.\n",
					pid_file);
			exit(ERR_BAD_CONFIG);
		}

		/* read up to MAXLEN - 1 bytes: element size 1, count MAXLEN - 1 */
		nread = fread(buff, 1, MAXLEN - 1, fd);

		if (nread == 0 && ferror(fd))
		{
			log_err("Error reading PID file '%s', giving up...\n", pid_file);
			exit(ERR_BAD_CONFIG);
		}

		fclose(fd);

		pid = atoi(buff);

		if (pid != 0)
		{
			if (kill(pid, 0) != -1)
			{
				log_err("PID file %s exists and seems to contain a valid PID. "
						"If repmgrd is no longer alive remove the file and restart repmgrd.\n",
						pid_file);
				exit(ERR_BAD_CONFIG);
			}
		}
	}

	fd = fopen(pid_file, "w");
	if (fd == NULL)
	{
		log_err("Could not open PID file %s!\n", pid_file);
		exit(ERR_BAD_CONFIG);
	}

	fprintf(fd, "%d", getpid());
	fclose(fd);
}


static t_node_info
get_node_info(PGconn *conn, char *cluster, int node_id)
{
	int			res;

	t_node_info node_info = T_NODE_INFO_INITIALIZER;

	res = get_node_record(conn, cluster, node_id, &node_info);

	if (res == -1)
	{
		PQExpBufferData errmsg;
		initPQExpBuffer(&errmsg);

		appendPQExpBuffer(&errmsg,
						  _("unable to retrieve record for node %i: %s"),
						  node_id,
						  PQerrorMessage(conn));

		log_err("%s\n", errmsg.data);

		create_event_record(NULL,
							&local_options,
							local_options.node,
							"repmgrd_shutdown",
							false,
							errmsg.data);

		PQfinish(conn);
		conn = NULL;

		terminate(ERR_DB_QUERY);
	}

	if (res == 0)
	{
		log_warning(_("no record found for node %i\n"), node_id);
	}

	return node_info;
}