/*
 * repmgrd.c - Replication manager daemon
 *
 * Copyright (C) 2ndQuadrant, 2010-2016
 *
 * This module connects to the nodes of a replication cluster and monitors
 * how far behind the master they are.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <signal.h>

#include <sys/types.h>
#include <sys/stat.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "repmgr.h"
#include "config.h"
#include "log.h"
#include "strutil.h"
#include "version.h"

/* Required PostgreSQL headers */
#include "access/xlogdefs.h"
#include "pqexpbuffer.h"

/* Local info */
t_configuration_options local_options = T_CONFIGURATION_OPTIONS_INITIALIZER;
PGconn *my_local_conn = NULL;

/* Master info */
t_configuration_options master_options = T_CONFIGURATION_OPTIONS_INITIALIZER;

PGconn *master_conn = NULL;

char *config_file = "";
bool verbose = false;
bool monitoring_history = false;
t_node_info node_info;

bool failover_done = false;

char *pid_file = NULL;

static void help(void);
static void usage(void);
static void check_cluster_configuration(PGconn *conn);
static void check_node_configuration(void);

static void standby_monitor(void);
static void witness_monitor(void);
static bool check_connection(PGconn **conn, const char *type, const char *conninfo);
static bool set_local_node_status(void);

static void update_shared_memory(char *last_wal_standby_applied);
static void update_registration(void);
static void do_master_failover(void);
static bool do_upstream_standby_failover(t_node_info upstream_node);

static t_node_info get_node_info(PGconn *conn, char *cluster, int node_id);
static XLogRecPtr lsn_to_xlogrecptr(char *lsn, bool *format_ok);

/*
 * Flag to mark SIGHUP. Whenever the main loop comes around it
 * will reread the configuration file.
 */
static volatile sig_atomic_t got_SIGHUP = false;

static void handle_sighup(SIGNAL_ARGS);
static void handle_sigint(SIGNAL_ARGS);

static void terminate(int retval);

#ifndef WIN32
static void setup_event_handlers(void);
#endif

static void do_daemonize(void);
static void check_and_create_pid_file(const char *pid_file);

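/*
 * Close the local and master connections. Note that when repmgrd runs on
 * the master itself, master_conn and my_local_conn point to the same
 * connection, so it must only be closed once - hence the inequality
 * check before the second PQfinish() below.
 */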
static void
close_connections()
{
    if (master_conn != NULL && PQisBusy(master_conn) == 1)
        cancel_query(master_conn, local_options.master_response_timeout);

    if (my_local_conn != NULL)
        PQfinish(my_local_conn);

    if (master_conn != NULL && master_conn != my_local_conn)
        PQfinish(master_conn);

    master_conn = NULL;
    my_local_conn = NULL;
}


int
main(int argc, char **argv)
{
    static struct option long_options[] =
    {
        {"config-file", required_argument, NULL, 'f'},
        {"verbose", no_argument, NULL, 'v'},
        {"monitoring-history", no_argument, NULL, 'm'},
        {"daemonize", no_argument, NULL, 'd'},
        {"pid-file", required_argument, NULL, 'p'},
        {"help", no_argument, NULL, '?'},
        {"version", no_argument, NULL, 'V'},
        {NULL, 0, NULL, 0}
    };

    int optindex;
    int c;
    bool daemonize = false;
    bool startup_event_logged = false;

    FILE *fd;

    int server_version_num = 0;

    set_progname(argv[0]);

    /* Disallow running as root to prevent directory ownership problems */
    if (geteuid() == 0)
    {
        fprintf(stderr,
                _("%s: cannot be run as root\n"
                  "Please log in (using, e.g., \"su\") as the "
                  "(unprivileged) user that owns "
                  "the data directory.\n"
                ),
                progname());
        exit(1);
    }

    while ((c = getopt_long(argc, argv, "?Vf:vmdp:", long_options, &optindex)) != -1)
    {
        switch (c)
        {
            case 'f':
                config_file = optarg;
                break;
            case 'v':
                verbose = true;
                break;
            case 'm':
                monitoring_history = true;
                break;
            case 'd':
                daemonize = true;
                break;
            case 'p':
                pid_file = optarg;
                break;
            case '?':
                help();
                exit(SUCCESS);
            case 'V':
                printf("%s %s (PostgreSQL %s)\n", progname(), REPMGR_VERSION, PG_VERSION);
                exit(SUCCESS);
            default:
                usage();
                exit(ERR_BAD_CONFIG);
        }
    }

    /*
     * Parse the configuration file, if provided. If no configuration file
     * was provided, or one was but was incomplete, parse_config() will
     * abort anyway, with an appropriate message.
     *
     * XXX it might be desirable to create an event record for this, in
     * which case we'll need to refactor parse_config() not to abort,
     * and return the error message.
     */
    load_config(config_file, verbose, &local_options, argv[0]);

    if (daemonize)
    {
        do_daemonize();
    }

    if (pid_file)
    {
        check_and_create_pid_file(pid_file);
    }

#ifndef WIN32
    setup_event_handlers();
#endif

    fd = freopen("/dev/null", "r", stdin);
    if (fd == NULL)
    {
        fprintf(stderr, "error reopening stdin to '/dev/null': %s",
                strerror(errno));
    }

    fd = freopen("/dev/null", "w", stdout);
    if (fd == NULL)
    {
        fprintf(stderr, "error reopening stdout to '/dev/null': %s",
                strerror(errno));
    }
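    /*
     * From this point on anything written to stdin/stdout is discarded;
     * stderr is left attached so errors remain visible until the logging
     * system takes over (and is itself redirected below when syslog is
     * in use).
     */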

    logger_init(&local_options, progname());
    if (verbose)
        logger_set_verbose();

    if (log_type == REPMGR_SYSLOG)
    {
        fd = freopen("/dev/null", "w", stderr);

        if (fd == NULL)
        {
            fprintf(stderr, "error reopening stderr to '/dev/null': %s",
                    strerror(errno));
        }
    }

    /* Initialise the repmgr schema name */
    /* XXX check this handles quoting properly */
    maxlen_snprintf(repmgr_schema, "%s%s", DEFAULT_REPMGR_SCHEMA_PREFIX,
                    local_options.cluster_name);

    log_info(_("connecting to database '%s'\n"),
             local_options.conninfo);
    my_local_conn = establish_db_connection(local_options.conninfo, true);

    /* Verify that server is a supported version */
    log_info(_("connected to database, checking its state\n"));

    server_version_num = get_server_version(my_local_conn, NULL);

    if (server_version_num < MIN_SUPPORTED_VERSION_NUM)
    {
        if (server_version_num > 0)
        {
            log_err(_("%s requires PostgreSQL %s or later\n"),
                    progname(),
                    MIN_SUPPORTED_VERSION);
        }
        else
        {
            log_err(_("unable to determine PostgreSQL server version\n"));
        }

        terminate(ERR_BAD_CONFIG);
    }

    /* Retrieve record for this node from the local database */
    node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);

    /*
     * No node record found - exit gracefully
     *
     * Note: it's highly unlikely this situation will occur when starting
     * repmgrd on a witness, unless someone goes to the trouble of
     * deleting the node record from the previously copied table.
     */
    if (node_info.node_id == NODE_NOT_FOUND)
    {
        log_err(_("no metadata record found for this node - terminating\n"));
        log_hint(_("check that 'repmgr (master|standby) register' was executed for this node\n"));
        terminate(ERR_BAD_CONFIG);
    }

    log_debug("node id is %i, upstream is %i\n", node_info.node_id, node_info.upstream_node_id);

    /*
     * MAIN LOOP
     *
     * This loop runs at startup and then cycles once per failover.
     * Prerequisites:
     *   - my_local_conn must already hold an active connection
     *   - no master connection yet established
     */
    do
    {
        /* Timer for repl_nodes synchronisation interval */
        int sync_repl_nodes_elapsed = 0;

        /*
         * Set my server mode, establish a connection to master and start
         * monitoring
         */
        switch (node_info.type)
        {
            case MASTER:
                master_options.node = local_options.node;
                strncpy(master_options.conninfo, local_options.conninfo,
                        MAXLEN);
                master_conn = my_local_conn;

                check_cluster_configuration(my_local_conn);
                check_node_configuration();

                if (reload_config(&local_options))
                {
                    PQfinish(my_local_conn);
                    my_local_conn = establish_db_connection(local_options.conninfo, true);
                    master_conn = my_local_conn;
                    update_registration();
                }

                /* Log startup event */
                if (startup_event_logged == false)
                {
                    create_event_record(master_conn,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_start",
                                        true,
                                        NULL);
                    startup_event_logged = true;
                }

                log_info(_("starting continuous master connection check\n"));

                /*
                 * Check that master is still alive.
                 * XXX We should also check that the
                 * standby servers are sending info
                 */

                /*
                 * Every local_options.monitor_interval_secs seconds, do
                 * master checks
                 */
                do
                {
                    if (check_connection(&master_conn, "master", NULL))
                    {
                        sleep(local_options.monitor_interval_secs);
                    }
                    else
                    {
                        /*
                         * XXX we could log something more verbose here
                         */
                        terminate(1);
                    }

                    if (got_SIGHUP)
                    {
                        /*
                         * If the configuration file was successfully
                         * reloaded, we may need to re-establish
                         * my_local_conn
                         */
                        if (reload_config(&local_options))
                        {
                            PQfinish(my_local_conn);
                            my_local_conn = establish_db_connection(local_options.conninfo, true);
                            master_conn = my_local_conn;

                            if (*local_options.logfile)
                            {
                                FILE *fd;

                                fd = freopen(local_options.logfile, "a", stderr);
                                if (fd == NULL)
                                {
                                    fprintf(stderr, "error reopening stderr to '%s': %s",
                                            local_options.logfile, strerror(errno));
                                }
                            }

                            update_registration();
                        }
                        got_SIGHUP = false;
                    }
                } while (!failover_done);
                break;

            case WITNESS:
            case STANDBY:

                /* We need the node id of the master server as well as a connection to it */
                log_info(_("connecting to master node of cluster '%s'\n"),
                         local_options.cluster_name);

                master_conn = get_master_connection(my_local_conn,
                                                    local_options.cluster_name,
                                                    &master_options.node, NULL);

                if (master_conn == NULL)
                {
                    PQExpBufferData errmsg;
                    initPQExpBuffer(&errmsg);

                    appendPQExpBuffer(&errmsg,
                                      _("unable to connect to master node"));

                    log_err("%s\n", errmsg.data);

                    create_event_record(NULL,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_shutdown",
                                        false,
                                        errmsg.data);

                    terminate(ERR_BAD_CONFIG);
                }

                check_cluster_configuration(my_local_conn);
                check_node_configuration();

                if (reload_config(&local_options))
                {
                    PQfinish(my_local_conn);
                    my_local_conn = establish_db_connection(local_options.conninfo, true);
                    update_registration();
                }

                /* Log startup event */
                if (startup_event_logged == false)
                {
                    create_event_record(master_conn,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_start",
                                        true,
                                        NULL);
                    startup_event_logged = true;
                }

                /*
                 * Every local_options.monitor_interval_secs seconds, do
                 * checks
                 */
                if (node_info.type == WITNESS)
                {
                    log_info(_("starting continuous witness node monitoring\n"));
                }
                else if (node_info.type == STANDBY)
                {
                    log_info(_("starting continuous standby node monitoring\n"));
                }

                do
                {
                    log_verbose(LOG_DEBUG, "standby check loop...\n");

                    if (node_info.type == WITNESS)
                    {
                        witness_monitor();
                    }
                    else if (node_info.type == STANDBY)
                    {
                        standby_monitor();
                    }

                    sleep(local_options.monitor_interval_secs);

                    /*
                     * On a witness node, regularly resync the repl_nodes table
                     * to keep up with any changes on the primary
                     *
                     * TODO: only resync the table if changes actually detected
                     */
                    if (node_info.type == WITNESS)
                    {
                        sync_repl_nodes_elapsed += local_options.monitor_interval_secs;
                        log_debug(_("seconds since last node record sync: %i (sync interval: %i)\n"), sync_repl_nodes_elapsed, local_options.witness_repl_nodes_sync_interval_secs);
                        if (sync_repl_nodes_elapsed >= local_options.witness_repl_nodes_sync_interval_secs)
                        {
                            log_debug(_("resyncing repl_nodes table\n"));
                            witness_copy_node_records(master_conn, my_local_conn, local_options.cluster_name);
                            sync_repl_nodes_elapsed = 0;
                        }
                    }

                    if (got_SIGHUP)
                    {
                        /*
                         * If the configuration file was successfully
                         * reloaded, we may need to re-establish
                         * my_local_conn
                         */
                        if (reload_config(&local_options))
                        {
                            PQfinish(my_local_conn);
                            my_local_conn = establish_db_connection(local_options.conninfo, true);
                            update_registration();
                        }
                        got_SIGHUP = false;
                    }

                    if (failover_done)
                    {
                        log_debug(_("standby check loop will terminate\n"));
                    }
                } while (!failover_done);
                break;

            default:
                log_err(_("unrecognized mode for node %d\n"),
                        local_options.node);
        }

        failover_done = false;

    } while (true);

    /* close the connection to the database and cleanup */
    close_connections();

    /* Shut down logging system */
    logger_shutdown();

    return 0;
}


/*
 * witness_monitor()
 *
 * Monitors witness server; attempt to find and connect to new master
 * if existing master connection is lost
 */
static void
witness_monitor(void)
{
    char monitor_witness_timestamp[MAXLEN];
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];
    bool connection_ok;

    /*
     * Check if master is available; if not, assume failover situation
     * and try to determine new master. There may be a delay between detection
     * of a missing master and promotion of a standby by that standby's
     * repmgrd, so we'll loop for a while before giving up.
     */
    connection_ok = check_connection(&master_conn, "master", NULL);

    if (connection_ok == false)
    {
        int connection_retries;

        log_debug(_("old master node ID: %i\n"), master_options.node);

        /* We need to wait a while for the new master to be promoted */
        log_info(
            _("waiting %i seconds for a new master to be promoted...\n"),
            local_options.master_response_timeout
            );

        sleep(local_options.master_response_timeout);

        /* Attempt to find the new master */
        for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
        {
            log_info(
                _("attempt %i of %i to determine new master...\n"),
                connection_retries + 1,
                local_options.reconnect_attempts
                );
            master_conn = get_master_connection(my_local_conn,
                                                local_options.cluster_name, &master_options.node, NULL);

            if (PQstatus(master_conn) != CONNECTION_OK)
            {
                log_warning(
                    _("unable to determine a valid master server; waiting %i seconds to retry...\n"),
                    local_options.reconnect_interval
                    );
                PQfinish(master_conn);
                sleep(local_options.reconnect_interval);
            }
            else
            {
                log_debug(_("new master found with node ID: %i\n"), master_options.node);
                connection_ok = true;

                /*
                 * Update the repl_nodes table from the new master to reflect the changed
                 * node configuration
                 *
                 * XXX it would be neat to be able to handle this with e.g. table-based
                 * logical replication
                 */
                witness_copy_node_records(master_conn, my_local_conn, local_options.cluster_name);

                break;
            }
        }

        if (connection_ok == false)
        {
            PQExpBufferData errmsg;
            initPQExpBuffer(&errmsg);

            appendPQExpBuffer(&errmsg,
                              _("unable to determine a valid master node, terminating..."));

            log_err("%s\n", errmsg.data);

            create_event_record(NULL,
                                &local_options,
                                local_options.node,
                                "repmgrd_shutdown",
                                false,
                                errmsg.data);

            terminate(ERR_DB_CON);
        }
    }

    /* Fast path for the case where no history is requested */
    if (!monitoring_history)
        return;

    /*
     * Cancel any query that is still being executed, so we can insert the
     * current record
     */
    if (!cancel_query(master_conn, local_options.master_response_timeout))
        return;
    if (wait_connection_availability(master_conn,
                                     local_options.master_response_timeout) != 1)
        return;

    /* Get local xlog info */
    sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP");

    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);
        /* if there is any error just let it be and retry in next loop */
        return;
    }

    strcpy(monitor_witness_timestamp, PQgetvalue(res, 0, 0));
    PQclear(res);

    /*
     * Build the SQL to execute on master
     */
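    /*
     * A witness does not replicate, so only the master's current WAL
     * location and the observation timestamp are meaningful here; the
     * standby-side fields are recorded as NULL and both lag values as
     * zero.
     */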
    sqlquery_snprintf(sqlquery,
                      "INSERT INTO %s.repl_monitor "
                      " (primary_node, standby_node, "
                      "  last_monitor_time, last_apply_time, "
                      "  last_wal_primary_location, last_wal_standby_location, "
                      "  replication_lag, apply_lag )"
                      " VALUES(%d, %d, "
                      "  '%s'::TIMESTAMP WITH TIME ZONE, NULL, "
                      "  pg_catalog.pg_current_xlog_location(), NULL, "
                      "  0, 0) ",
                      get_repmgr_schema_quoted(my_local_conn),
                      master_options.node,
                      local_options.node,
                      monitor_witness_timestamp);

    /*
     * Execute the query asynchronously, but don't check for a result. We will
     * check the result next time we pause for a monitor step.
     */
    if (PQsendQuery(master_conn, sqlquery) == 0)
        log_warning(_("query could not be sent to master: %s\n"),
                    PQerrorMessage(master_conn));
}


/*
 * standby_monitor()
 *
 * Monitor standby server and handle failover situation. Also insert
 * monitoring information if configured.
 */
static void
standby_monitor(void)
{
    PGresult *res;
    char monitor_standby_timestamp[MAXLEN];
    char last_wal_primary_location[MAXLEN];
    char last_xlog_receive_location[MAXLEN];
    char last_xlog_replay_location[MAXLEN];
    char last_xact_replay_timestamp[MAXLEN];
    bool last_xlog_receive_location_gte_replayed;
    char sqlquery[QUERY_STR_LEN];

    XLogRecPtr lsn_master_current_xlog_location;
    XLogRecPtr lsn_last_xlog_receive_location;
    XLogRecPtr lsn_last_xlog_replay_location;

    long long unsigned int replication_lag;
    long long unsigned int apply_lag;

    int connection_retries,
        ret;
    bool did_retry = false;

    PGconn *upstream_conn;
    char upstream_conninfo[MAXCONNINFO];
    int upstream_node_id;
    t_node_info upstream_node;

    int active_master_id;
    const char *upstream_node_type = NULL;

    bool receiving_streamed_wal = true;

    /*
     * Verify that the local node is still available - if not there's
     * no point in doing much else anyway
     */
    if (!check_connection(&my_local_conn, "standby", NULL))
    {
        PQExpBufferData errmsg;

        set_local_node_status();

        initPQExpBuffer(&errmsg);

        appendPQExpBuffer(&errmsg,
                          _("failed to connect to local node, node marked as failed!"));

        log_err("%s\n", errmsg.data);

        goto continue_monitoring_standby;
    }

    upstream_conn = get_upstream_connection(my_local_conn,
                                            local_options.cluster_name,
                                            local_options.node,
                                            &upstream_node_id,
                                            upstream_conninfo);

    upstream_node_type = (upstream_node_id == master_options.node)
        ? "master"
        : "upstream";

    /*
     * Check that the upstream node is still available.
     * If not, initiate the failover process.
     *
     * This takes up to local_options.reconnect_attempts *
     * local_options.reconnect_interval seconds.
     */
    check_connection(&upstream_conn, upstream_node_type, upstream_conninfo);

    if (PQstatus(upstream_conn) != CONNECTION_OK)
    {
        PQfinish(upstream_conn);
        upstream_conn = NULL;

        if (local_options.failover == MANUAL_FAILOVER)
        {
            log_err(_("unable to reconnect to %s, now checking if another node has been promoted\n"), upstream_node_type);

            for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
            {
                master_conn = get_master_connection(my_local_conn,
                                                    local_options.cluster_name, &master_options.node, NULL);
                if (PQstatus(master_conn) == CONNECTION_OK)
                {
                    /*
                     * Connected - we can continue the process, so break
                     * out of the loop
                     */
                    log_err(_("connected to node %d, continuing monitoring.\n"),
                            master_options.node);
                    break;
                }
                else
                {
                    log_err(
                        _("no new master found, waiting %i seconds before retry...\n"),
                        local_options.retry_promote_interval_secs
                        );

                    sleep(local_options.retry_promote_interval_secs);
                }
            }

            if (PQstatus(master_conn) != CONNECTION_OK)
            {
                PQExpBufferData errmsg;
                initPQExpBuffer(&errmsg);

                appendPQExpBuffer(&errmsg,
                                  _("unable to reconnect to master after %i attempts, terminating..."),
                                  local_options.reconnect_attempts);

                log_err("%s\n", errmsg.data);

                create_event_record(NULL,
                                    &local_options,
                                    local_options.node,
                                    "repmgrd_shutdown",
                                    false,
                                    errmsg.data);

                terminate(ERR_DB_CON);
            }
        }
        else if (local_options.failover == AUTOMATIC_FAILOVER)
        {
            /*
             * When we return from this function we will have a new master
             * and a new master_conn
             *
             * Failover is handled differently depending on whether
             * the failed node is the master or a cascading standby
             */
            upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);

            if (upstream_node.type == MASTER)
            {
                log_debug(_("failure detected on master node (%i); attempting to promote a standby\n"),
                          node_info.upstream_node_id);
                do_master_failover();
            }
            else
            {
                log_debug(_("failure detected on upstream node %i; attempting to reconnect to new upstream node\n"),
                          node_info.upstream_node_id);

                if (!do_upstream_standby_failover(upstream_node))
                {
                    PQExpBufferData errmsg;
                    initPQExpBuffer(&errmsg);

                    appendPQExpBuffer(&errmsg,
                                      _("unable to reconnect to new upstream node, terminating..."));

                    log_err("%s\n", errmsg.data);

                    create_event_record(master_conn,
                                        &local_options,
                                        local_options.node,
                                        "repmgrd_shutdown",
                                        false,
                                        errmsg.data);

                    terminate(ERR_DB_CON);
                }
            }
            return;
        }
    }

    PQfinish(upstream_conn);

continue_monitoring_standby:
    /* Check if we are still a standby - we could have been promoted */
    do
    {
        ret = is_standby(my_local_conn);

        switch (ret)
        {
            case 0:
                /*
                 * This situation can occur if `pg_ctl promote` was manually executed
                 * on the node. If the original master is still running after this
                 * node has been promoted, we're in a "two brain" situation which
                 * will require manual resolution as there's no way of determining
                 * which master is the correct one.
                 *
                 * We should log a message so the user knows of the situation at hand.
                 *
                 * XXX check if the original master is still active and display a
                 * warning
                 */
                log_err(_("it seems this server was promoted manually (not by repmgr), so you might be in the presence of a split-brain.\n"));
                log_err(_("check your cluster and manually fix any anomaly.\n"));
                terminate(1);
                break;

            case -1:
                log_err(_("standby node has disappeared, trying to reconnect...\n"));
                did_retry = true;

                if (!check_connection(&my_local_conn, "standby", NULL))
                {
                    set_local_node_status();
                    /*
                     * Let's continue checking, and if the postgres server on the
                     * standby comes back up, we will activate it again
                     */
                }

                break;
        }
    } while (ret == -1);

    if (did_retry)
    {
        /*
         * There's a possible situation where the standby went down for some reason
         * (maintenance for example) and is now up and maybe connected once again to
         * the stream. If we set the local standby node as failed and it's now running
         * and receiving replication data, we should activate it again.
         */
        set_local_node_status();
        log_info(_("standby connection recovered!\n"));
    }

    /* Fast path for the case where no history is requested */
    if (!monitoring_history)
        return;

    /*
     * If the original master has gone away we'll need to get the new one
     * from the upstream node to write monitoring information
     */
    upstream_node = get_node_info(my_local_conn, local_options.cluster_name, upstream_node_id);

    sqlquery_snprintf(sqlquery,
                      "SELECT id "
                      "  FROM %s.repl_nodes "
                      " WHERE type = 'master' "
                      "   AND active IS TRUE ",
                      get_repmgr_schema_quoted(my_local_conn));

    res = PQexec(my_local_conn, sqlquery);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("standby_monitor() - query error: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);

        /* Not a fatal error, just means no monitoring records will be written */
        return;
    }

    if (PQntuples(res) == 0)
    {
        log_err(_("standby_monitor(): no active master found\n"));
        PQclear(res);
        return;
    }

    active_master_id = atoi(PQgetvalue(res, 0, 0));
    PQclear(res);

    if (active_master_id != master_options.node)
    {
        log_notice(_("connecting to active master (node %i)...\n"), active_master_id);
        if (master_conn != NULL)
        {
            PQfinish(master_conn);
        }
        master_conn = get_master_connection(my_local_conn,
                                            local_options.cluster_name,
                                            &master_options.node, NULL);
    }
    if (PQstatus(master_conn) != CONNECTION_OK)
        PQreset(master_conn);

    /*
     * Cancel any query that is still being executed, so we can insert the
     * current record
     */
    if (!cancel_query(master_conn, local_options.master_response_timeout))
        return;
    if (wait_connection_availability(master_conn, local_options.master_response_timeout) != 1)
        return;

    /* Get local xlog info */
    sqlquery_snprintf(sqlquery,
                      "SELECT CURRENT_TIMESTAMP, "
                      "pg_catalog.pg_last_xlog_receive_location(), "
                      "pg_catalog.pg_last_xlog_replay_location(), "
                      "pg_catalog.pg_last_xact_replay_timestamp(), "
                      "pg_catalog.pg_last_xlog_receive_location() >= pg_catalog.pg_last_xlog_replay_location()");

    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);
        /* if there is any error just let it be and retry in next loop */
        return;
    }

    strncpy(monitor_standby_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
    strncpy(last_xlog_receive_location, PQgetvalue(res, 0, 1), MAXLEN);
    strncpy(last_xlog_replay_location, PQgetvalue(res, 0, 2), MAXLEN);
    strncpy(last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);

    last_xlog_receive_location_gte_replayed = (strcmp(PQgetvalue(res, 0, 4), "t") == 0)
        ? true
        : false;

    /*
     * If pg_last_xlog_receive_location() is NULL, this means we're in archive
     * recovery and will need to calculate lag based on
     * pg_last_xlog_replay_location()
     */
    if (PQgetisnull(res, 0, 1))
    {
        receiving_streamed_wal = false;
    }

    PQclear(res);

    /*
     * In the unusual event of a standby becoming disconnected from the primary
     * while this repmgrd remains connected to the primary, subtracting
     * "last_xlog_replay_location" from "last_xlog_receive_location" and coercing to
     * (long long unsigned int) would result in a meaningless, very large
     * value which will overflow a BIGINT column and spew error messages into the
     * PostgreSQL log. In the absence of a better strategy, skip attempting
     * to insert a monitoring record.
     */
    if (receiving_streamed_wal == true && last_xlog_receive_location_gte_replayed == false)
    {
        log_verbose(LOG_WARNING,
                    "replayed WAL newer than received WAL - is this standby connected to its upstream?\n");
    }

    /*
     * Get master xlog position
     *
     * TODO: investigate whether pg_current_xlog_insert_location() would be a better
     * choice; see: https://github.com/2ndQuadrant/repmgr/issues/189
     */
    sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_current_xlog_location()");

    res = PQexec(master_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(master_conn));
        PQclear(res);
        return;
    }

    strncpy(last_wal_primary_location, PQgetvalue(res, 0, 0), MAXLEN);
    PQclear(res);

    lsn_master_current_xlog_location = lsn_to_xlogrecptr(last_wal_primary_location, NULL);
    lsn_last_xlog_replay_location = lsn_to_xlogrecptr(last_xlog_replay_location, NULL);
    lsn_last_xlog_receive_location = lsn_to_xlogrecptr(last_xlog_receive_location, NULL);
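    /*
     * XLogRecPtr values are plain byte positions in the WAL stream, so the
     * differences computed below are byte counts: apply_lag is the amount
     * of received-but-unreplayed WAL, replication_lag the amount of WAL
     * the master has written but this standby has not yet received.
     */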
    /* Calculate apply lag */
    if (last_xlog_receive_location_gte_replayed == false)
    {
        /*
         * We're not receiving streaming WAL - in this case the receive location
         * equals the last replayed location
         */
        apply_lag = 0;
        strncpy(last_xlog_receive_location, last_xlog_replay_location, MAXLEN);
    }
    else
    {
        apply_lag = (long long unsigned int) lsn_last_xlog_receive_location - lsn_last_xlog_replay_location;
    }

    /* Calculate replication lag */
    if (lsn_master_current_xlog_location >= lsn_last_xlog_receive_location)
    {
        replication_lag = (long long unsigned int) (lsn_master_current_xlog_location - lsn_last_xlog_receive_location);
    }
    else
    {
        /* This should never happen, but in case it does set lag to zero */
        log_warning("master xlog location (%s) appears less than standby receive location (%s)\n",
                    last_wal_primary_location,
                    last_xlog_receive_location);
        replication_lag = 0;
    }

    /*
     * Build the SQL to execute on master
     */
    sqlquery_snprintf(sqlquery,
                      "INSERT INTO %s.repl_monitor "
                      " (primary_node, "
                      "  standby_node, "
                      "  last_monitor_time, "
                      "  last_apply_time, "
                      "  last_wal_primary_location, "
                      "  last_wal_standby_location, "
                      "  replication_lag, "
                      "  apply_lag ) "
                      " VALUES(%d, "
                      "  %d, "
                      "  '%s'::TIMESTAMP WITH TIME ZONE, "
                      "  '%s'::TIMESTAMP WITH TIME ZONE, "
                      "  '%s', "
                      "  '%s', "
                      "  %llu, "
                      "  %llu) ",
                      get_repmgr_schema_quoted(master_conn),
                      master_options.node,
                      local_options.node,
                      monitor_standby_timestamp,
                      last_xact_replay_timestamp,
                      last_wal_primary_location,
                      last_xlog_receive_location,
                      replication_lag,
                      apply_lag);

    /*
     * Execute the query asynchronously, but don't check for a result. We will
     * check the result next time we pause for a monitor step.
     */
    log_verbose(LOG_DEBUG, "standby_monitor() %s\n", sqlquery);

    if (PQsendQuery(master_conn, sqlquery) == 0)
        log_warning(_("query could not be sent to master: %s\n"),
                    PQerrorMessage(master_conn));
}


/*
 * do_master_failover()
 *
 * Handles failover to new cluster master
 */
static void
do_master_failover(void)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];

    int total_nodes = 0;
    int visible_nodes = 0;
    int ready_nodes = 0;

    bool candidate_found = false;

    int i;
    int r;

    XLogRecPtr xlog_recptr;
    bool lsn_format_ok;

    char last_xlog_replay_location[MAXLEN];

    PGconn *node_conn = NULL;

    /*
     * We'll retrieve info about up to FAILOVER_NODES_MAX_CHECK (50) nodes,
     * which seems to be large enough for most scenarios
     */
    t_node_info nodes[FAILOVER_NODES_MAX_CHECK];

    /* Store details of the failed node here */
    t_node_info failed_master = T_NODE_INFO_INITIALIZER;

    /* Store details of the best candidate for promotion to master here */
    t_node_info best_candidate = T_NODE_INFO_INITIALIZER;

    /* get a list of standby nodes, including myself */
    sqlquery_snprintf(sqlquery,
                      "SELECT id, conninfo, type, upstream_node_id "
                      "  FROM %s.repl_nodes "
                      " WHERE cluster = '%s' "
                      "   AND active IS TRUE "
                      "   AND priority > 0 "
                      " ORDER BY priority DESC, id "
                      " LIMIT %i ",
                      get_repmgr_schema_quoted(my_local_conn),
                      local_options.cluster_name,
                      FAILOVER_NODES_MAX_CHECK);

    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("unable to retrieve node records: %s\n"), PQerrorMessage(my_local_conn));
        PQclear(res);
        terminate(ERR_DB_QUERY);
    }

    /*
     * total nodes that are registered
     */
    total_nodes = PQntuples(res);
    log_debug(_("%d active nodes registered\n"), total_nodes);

    /*
     * Build an array with the nodes and indicate which ones are visible and
     * ready
     */
    for (i = 0; i < total_nodes; i++)
    {
        nodes[i].node_id = atoi(PQgetvalue(res, i, 0));

        strncpy(nodes[i].conninfo_str, PQgetvalue(res, i, 1), MAXCONNINFO);

        nodes[i].type = parse_node_type(PQgetvalue(res, i, 2));

        /* Copy details of the failed node */
        /* XXX only node_id is actually used later */
        if (nodes[i].type == MASTER)
        {
            failed_master.node_id = nodes[i].node_id;
            failed_master.xlog_location = nodes[i].xlog_location;
            failed_master.is_ready = nodes[i].is_ready;
        }

        nodes[i].upstream_node_id = atoi(PQgetvalue(res, i, 3));

        /*
         * Initialize to false so that if we can't reach this node we know
         * that later
         */
        nodes[i].is_visible = false;
        nodes[i].is_ready = false;

        nodes[i].xlog_location = InvalidXLogRecPtr;

        log_debug(_("node=%d conninfo=\"%s\" type=%s\n"),
                  nodes[i].node_id, nodes[i].conninfo_str,
                  PQgetvalue(res, i, 2));

        node_conn = establish_db_connection(nodes[i].conninfo_str, false);

        /* if we can't see the node just skip it */
        if (PQstatus(node_conn) != CONNECTION_OK)
        {
            if (node_conn != NULL)
                PQfinish(node_conn);

            continue;
        }

        visible_nodes++;
        nodes[i].is_visible = true;

        PQfinish(node_conn);
    }
    PQclear(res);

    log_debug(_("total nodes counted: registered=%d, visible=%d\n"),
              total_nodes, visible_nodes);

    /*
     * Am I in the group that should stay alive? If I can see fewer than
     * half of total_nodes then I should do nothing
     */
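    /*
     * For example, with five registered nodes a node must be able to see
     * at least three (itself included) to take part in the failover; a
     * node on the minority side of a network partition bows out here
     * rather than risk a split-brain promotion.
     */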
    if (visible_nodes < (total_nodes / 2.0))
    {
        log_err(_("unable to reach most of the nodes.\n"
                  "Let the other standby servers decide which one will be the master.\n"
                  "Manual action will be needed to re-add this node to the cluster.\n"));
        terminate(ERR_FAILOVER_FAIL);
    }

    /* Query all available nodes to determine readiness and LSN */
    for (i = 0; i < total_nodes; i++)
    {
        log_debug("checking node %i...\n", nodes[i].node_id);

        /* if the node is not visible, skip it */
        if (!nodes[i].is_visible)
            continue;

        /* if the node is a witness node, skip it */
        if (nodes[i].type == WITNESS)
            continue;

        /* if the node does not have the same upstream node, skip it */
        if (nodes[i].upstream_node_id != node_info.upstream_node_id)
            continue;

        node_conn = establish_db_connection(nodes[i].conninfo_str, false);

        /*
         * XXX This shouldn't happen; if it does it indicates a more serious
         * problem, perhaps a network outage - either way it's better for a
         * human to react
         */
        if (PQstatus(node_conn) != CONNECTION_OK)
        {
            log_err(_("it seems new problems are arising, manual intervention is needed\n"));
            terminate(ERR_FAILOVER_FAIL);
        }

        sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
        res = PQexec(node_conn, sqlquery);
        if (PQresultStatus(res) != PGRES_TUPLES_OK)
        {
            log_info(_("unable to retrieve node's last standby location: %s\n"),
                     PQerrorMessage(node_conn));

            log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
            PQclear(res);
            PQfinish(node_conn);
            terminate(ERR_FAILOVER_FAIL);
        }

        xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);

        log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));

        PQclear(res);
        PQfinish(node_conn);

        /* If position is 0/0, error */
        /* XXX do we need to terminate ourselves if the queried node has a problem? */
        if (xlog_recptr == InvalidXLogRecPtr)
        {
            log_err(_("InvalidXLogRecPtr detected on standby node %i\n"), nodes[i].node_id);
            terminate(ERR_FAILOVER_FAIL);
        }

        nodes[i].xlog_location = xlog_recptr;
    }

    /* Finally, get info about this node and update shared memory */
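    /*
     * Publishing the local receive location makes it available to the
     * other nodes' repmgrd instances: the repmgr_get_last_standby_location()
     * calls below read the value each peer has published, presumably via
     * the shared memory segment set up by the repmgr_funcs shared library.
     */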
    sqlquery_snprintf(sqlquery, "SELECT pg_catalog.pg_last_xlog_receive_location()");
    res = PQexec(my_local_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s.\nReporting an invalid value so this node "
                  "is not considered as the new master, and exiting.\n"),
                PQerrorMessage(my_local_conn));
        PQclear(res);
        sprintf(last_xlog_replay_location, "'%X/%X'", 0, 0);
        update_shared_memory(last_xlog_replay_location);
        terminate(ERR_DB_QUERY);
    }
    /* write last location in shared memory */
    update_shared_memory(PQgetvalue(res, 0, 0));
    PQclear(res);

    /* Wait for each node to come up and report a valid LSN */
    for (i = 0; i < total_nodes; i++)
    {
        /*
         * ensure the witness server is marked as ready, and skip the
         * LSN check
         */
        if (nodes[i].type == WITNESS)
        {
            if (!nodes[i].is_ready)
            {
                nodes[i].is_ready = true;
                ready_nodes++;
            }
            continue;
        }

        /* if the node is not visible, skip it */
        if (!nodes[i].is_visible)
            continue;

        /* if the node does not have the same upstream node, skip it */
        if (nodes[i].upstream_node_id != node_info.upstream_node_id)
            continue;

        node_conn = establish_db_connection(nodes[i].conninfo_str, false);

        /*
         * XXX This shouldn't happen; if it does it indicates a more
         * serious problem, perhaps a network outage - either way it's
         * better for a human to react
         */
        if (PQstatus(node_conn) != CONNECTION_OK)
        {
            /* XXX */
            log_info(_("at this point some acceptable race conditions could "
                       "occur; assume the node is restarting and continue "
                       "the failover procedure\n"));
            continue;
        }

        while (!nodes[i].is_ready)
        {
            sqlquery_snprintf(sqlquery,
                              "SELECT %s.repmgr_get_last_standby_location()",
                              get_repmgr_schema_quoted(node_conn));
            res = PQexec(node_conn, sqlquery);
            if (PQresultStatus(res) != PGRES_TUPLES_OK)
            {
                log_err(_("PQexec failed: %s.\nReporting an invalid value so this "
                          "node is not considered as the new master, and exiting.\n"),
                        PQerrorMessage(node_conn));
                PQclear(res);
                PQfinish(node_conn);
                terminate(ERR_DB_QUERY);
            }

            xlog_recptr = lsn_to_xlogrecptr(PQgetvalue(res, 0, 0), &lsn_format_ok);

            /*
             * If the position is reported as "invalid", check for a format
             * error or an empty string; otherwise the position is 0/0 and
             * we need to continue looping until a valid LSN is reported
             */
            if (xlog_recptr == InvalidXLogRecPtr)
            {
                if (lsn_format_ok == false)
                {
                    /* Unable to parse value returned by `repmgr_get_last_standby_location()` */
                    if (*PQgetvalue(res, 0, 0) == '\0')
                    {
                        log_crit(
                            _("unable to obtain LSN from node %i"), nodes[i].node_id
                            );
                        log_hint(
                            _("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n")
                            );

                        PQclear(res);
                        PQfinish(node_conn);
                        exit(ERR_BAD_CONFIG);
                    }

                    /*
                     * Very unlikely to happen; in the absence of any better
                     * strategy keep checking
                     */
                    log_warning(_("unable to parse LSN \"%s\"\n"),
                                PQgetvalue(res, 0, 0));
                }
                else
                {
                    log_debug(
                        _("invalid LSN returned from node %i: '%s'\n"),
                        nodes[i].node_id,
                        PQgetvalue(res, 0, 0)
                        );
                }

                PQclear(res);

                /* If position is 0/0, keep checking */
                /* XXX we should add a timeout here to prevent infinite looping
                 * if the other node's repmgrd is not up
                 */
                continue;
            }

            if (nodes[i].xlog_location < xlog_recptr)
            {
                nodes[i].xlog_location = xlog_recptr;
            }

            log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, PQgetvalue(res, 0, 0));
            PQclear(res);

            ready_nodes++;
            nodes[i].is_ready = true;
        }

        PQfinish(node_conn);
    }

    /*
     * determine which node is the best candidate to promote to master
     */
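    /*
     * Nodes arrive here ordered by priority (highest first, per the
     * ORDER BY in the query above), so the first ready node seeds the
     * choice and is only displaced by a node with a strictly higher WAL
     * receive location - i.e. priority breaks ties, but a less-lagged
     * standby always wins.
     */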
    for (i = 0; i < total_nodes; i++)
    {
        /* witness server can never be a candidate */
        if (nodes[i].type == WITNESS)
            continue;

        if (!nodes[i].is_ready || !nodes[i].is_visible)
            continue;

        if (!candidate_found)
        {
            /*
             * If no candidate has been found so far, the first visible and ready
             * node becomes the best candidate by default
             */
            best_candidate.node_id = nodes[i].node_id;
            best_candidate.xlog_location = nodes[i].xlog_location;
            best_candidate.is_ready = nodes[i].is_ready;
            strncpy(best_candidate.conninfo_str, nodes[i].conninfo_str, MAXCONNINFO);
            candidate_found = true;
        }

        /*
         * Nodes are retrieved ordered by priority, so if the current best
         * candidate's WAL location is lower than the next node's, assign
         * the next node as the new best candidate.
         */
        if (best_candidate.xlog_location < nodes[i].xlog_location)
        {
            best_candidate.node_id = nodes[i].node_id;
            best_candidate.xlog_location = nodes[i].xlog_location;
            best_candidate.is_ready = nodes[i].is_ready;
            strncpy(best_candidate.conninfo_str, nodes[i].conninfo_str, MAXCONNINFO);
        }
    }

    /* Terminate if no candidate found */
    if (!candidate_found)
    {
        log_err(_("no suitable candidate for promotion found; terminating.\n"));
        terminate(ERR_FAILOVER_FAIL);
    }

    log_debug("best candidate node id is %i\n", best_candidate.node_id);

    /* if the local node is the best candidate, promote it */
    if (best_candidate.node_id == local_options.node)
    {
        PQExpBufferData event_details;

        /* Close the connection to this server */
        PQfinish(my_local_conn);
        my_local_conn = NULL;

        initPQExpBuffer(&event_details);
        /* wait */
        sleep(5);

        log_notice(_("this node is the best candidate to be the new master, promoting...\n"));

        log_debug("promote command is: \"%s\"\n",
                  local_options.promote_command);

        if (log_type == REPMGR_STDERR && *local_options.logfile)
        {
            fflush(stderr);
        }

        r = system(local_options.promote_command);
        if (r != 0)
        {
            /*
             * Check whether the primary reappeared, which will have caused
             * the promote command to fail
             */
            my_local_conn = establish_db_connection(local_options.conninfo, false);

            if (my_local_conn != NULL)
            {
                int master_node_id;

                master_conn = get_master_connection(my_local_conn,
                                                    local_options.cluster_name,
                                                    &master_node_id, NULL);

                if (master_conn != NULL && master_node_id == failed_master.node_id)
                {
                    log_notice(_("original master reappeared before this standby was promoted - no action taken\n"));

                    PQfinish(master_conn);
                    master_conn = NULL;

                    /* no failover occurred but we'll want to restart connections */
                    failover_done = true;
                    return;
                }
            }

            log_err(_("promote command failed; check it and try executing it manually.\n"));

            terminate(ERR_DB_QUERY);
        }

        /* and reconnect to the local database */
        my_local_conn = establish_db_connection(local_options.conninfo, true);

        /* update internal record for this node */
        node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);

        appendPQExpBuffer(&event_details,
                          _("node %i promoted to master; old master %i marked as failed"),
                          node_info.node_id,
                          failed_master.node_id);

        /* my_local_conn is now the master */
        create_event_record(my_local_conn,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_promote",
                            true,
                            event_details.data);

    }
    /* local node is not the promotion candidate - find the new master */
    else
    {
        PGconn *new_master_conn;
        PQExpBufferData event_details;
        int master_node_id;

        initPQExpBuffer(&event_details);

        /* wait */
        sleep(10);

        /*
         * Check whether the primary reappeared while we were waiting, so we
         * don't end up following the promotion candidate
         */
        master_conn = get_master_connection(my_local_conn,
                                            local_options.cluster_name,
                                            &master_node_id, NULL);

        if (master_conn != NULL && master_node_id == failed_master.node_id)
        {
            log_notice(_("original master reappeared - no action taken\n"));

            PQfinish(master_conn);
            /* no failover occurred but we'll want to restart connections */
            failover_done = true;
            return;
        }

        /* Close the connection to this server */
        PQfinish(my_local_conn);
        my_local_conn = NULL;

        /* XXX double-check the promotion candidate did become the new primary */

        log_notice(_("node %d is the best candidate for new master, attempting to follow...\n"),
                   best_candidate.node_id);

        /*
         * The new master may take some time to be promoted. The follow
         * command should take care of that.
         */
        if (log_type == REPMGR_STDERR && *local_options.logfile)
        {
            fflush(stderr);
        }

        log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);

        r = system(local_options.follow_command);
        if (r != 0)
        {
            appendPQExpBuffer(&event_details,
                              _("Unable to execute follow command:\n %s"),
                              local_options.follow_command);

            log_err("%s\n", event_details.data);

            /* It won't be possible to write to the event notification
             * table but we should be able to generate an external notification
             * if required.
             */
            create_event_record(NULL,
                                &local_options,
                                node_info.node_id,
                                "repmgrd_failover_follow",
                                false,
                                event_details.data);

            terminate(ERR_BAD_CONFIG);
        }

        /* and reconnect to the local database */
        my_local_conn = establish_db_connection(local_options.conninfo, true);

        /* update internal record for this node */
        new_master_conn = establish_db_connection(best_candidate.conninfo_str, true);

        node_info = get_node_info(new_master_conn, local_options.cluster_name, local_options.node);
        appendPQExpBuffer(&event_details,
                          _("node %i now following new upstream node %i"),
                          node_info.node_id,
                          best_candidate.node_id);

        log_notice("%s\n", event_details.data);

        create_event_record(new_master_conn,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_follow",
                            true,
                            event_details.data);

        PQfinish(new_master_conn);
        termPQExpBuffer(&event_details);
    }

    /* to force it to re-calculate mode and master node */
    // ^ ZZZ check that behaviour ^
    failover_done = true;
}


/*
 * do_upstream_standby_failover()
 *
 * Attach a cascaded standby to a new upstream server.
 *
 * Currently we will try to attach to the failed upstream's upstream.
 * It might be worth providing a selection of reconnection strategies
 * as different behaviour might be desirable in different situations;
 * or maybe the option not to reconnect might be required?
 *
 * XXX check this handles replication slots gracefully
 */
static bool
do_upstream_standby_failover(t_node_info upstream_node)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];
    int upstream_node_id = node_info.upstream_node_id;
    int r;
    PQExpBufferData event_details;

    log_debug(_("do_upstream_standby_failover(): performing failover for node %i\n"),
              node_info.node_id);

    /*
     * Verify that we can still talk to the cluster master even though
     * the node's upstream is not available
     */
    if (!check_connection(&master_conn, "master", NULL))
    {
        log_err(_("do_upstream_standby_failover(): unable to connect to last known master node\n"));
        return false;
    }
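    /*
     * Walk up the repl_nodes upstream chain until an active node is found:
     * each pass looks up the current candidate's record on the master and,
     * if that node is inactive, moves on to its own upstream_node_id,
     * sleeping reconnect_interval seconds between passes. Reaching an
     * inactive master means there is nothing left to reattach to.
     */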
    while (1)
    {
        sqlquery_snprintf(sqlquery,
                          "SELECT id, active, upstream_node_id, type, conninfo "
                          "  FROM %s.repl_nodes "
                          " WHERE id = %i ",
                          get_repmgr_schema_quoted(master_conn),
                          upstream_node_id);

        res = PQexec(master_conn, sqlquery);

        if (PQresultStatus(res) != PGRES_TUPLES_OK)
        {
            log_err(_("unable to query cluster master: %s\n"), PQerrorMessage(master_conn));
            PQclear(res);
            return false;
        }

        if (PQntuples(res) == 0)
        {
            log_err(_("no node with id %i found\n"), upstream_node_id);
            PQclear(res);
            return false;
        }

        /* upstream node is inactive */
        if (strcmp(PQgetvalue(res, 0, 1), "f") == 0)
        {
            /*
             * Upstream node is an inactive master, meaning there are no direct
             * upstream nodes available to reattach to.
             *
             * XXX For now we'll simply terminate, however it would make sense to
             * provide an option to either try and find the current master and/or
             * a strategy to connect to a different upstream node
             */
            if (strcmp(PQgetvalue(res, 0, 4), "master") == 0)
            {
                log_err(_("unable to find active master node\n"));
                PQclear(res);
                return false;
            }

            upstream_node_id = atoi(PQgetvalue(res, 0, 2));
        }
        else
        {
            upstream_node_id = atoi(PQgetvalue(res, 0, 0));

            log_notice(_("found active upstream node with id %i\n"), upstream_node_id);
            PQclear(res);
            break;
        }

        PQclear(res);
        sleep(local_options.reconnect_interval);
    }

    /* Close the connection to this server */
    PQfinish(my_local_conn);
    my_local_conn = NULL;

    initPQExpBuffer(&event_details);

    /* Follow new upstream */
    r = system(local_options.follow_command);
    if (r != 0)
    {
        appendPQExpBuffer(&event_details,
                          _("Unable to execute follow command:\n %s"),
                          local_options.follow_command);

        log_err("%s\n", event_details.data);

        /* It won't be possible to write to the event notification
         * table but we should be able to generate an external notification
         * if required.
         */
        create_event_record(NULL,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_follow",
                            false,
                            event_details.data);
        terminate(ERR_BAD_CONFIG);
    }

    if (update_node_record_set_upstream(master_conn, local_options.cluster_name, node_info.node_id, upstream_node_id) == false)
    {
        appendPQExpBuffer(&event_details,
                          _("Unable to set node %i's new upstream ID to %i"),
                          node_info.node_id,
                          upstream_node_id);
        create_event_record(NULL,
                            &local_options,
                            node_info.node_id,
                            "repmgrd_failover_follow",
                            false,
                            event_details.data);
        terminate(ERR_BAD_CONFIG);
    }

    appendPQExpBuffer(&event_details,
                      _("node %i is now following upstream node %i"),
                      node_info.node_id,
                      upstream_node_id);

    create_event_record(NULL,
                        &local_options,
                        node_info.node_id,
                        "repmgrd_failover_follow",
                        true,
                        event_details.data);

    my_local_conn = establish_db_connection(local_options.conninfo, true);

    return true;
}


static bool
check_connection(PGconn **conn, const char *type, const char *conninfo)
{
    int connection_retries;

    /*
     * Check if the node is still available; if after
     * local_options.reconnect_attempts * local_options.reconnect_interval
     * seconds of retries we cannot reconnect, return false
     */
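    /*
     * For example, with reconnect_attempts=6 and reconnect_interval=10 a
     * lost connection is retried for up to a minute before this function
     * gives up and the caller treats the node as failed.
     */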
    for (connection_retries = 0; connection_retries < local_options.reconnect_attempts; connection_retries++)
    {
        if (*conn == NULL)
        {
            if (conninfo == NULL)
            {
                log_err("INTERNAL ERROR: *conn == NULL && conninfo == NULL\n");
                terminate(ERR_INTERNAL);
            }
            *conn = establish_db_connection(conninfo, false);
        }
        if (!is_pgup(*conn, local_options.master_response_timeout))
        {
            log_warning(_("connection to %s has been lost, trying to recover... %i seconds before failover decision\n"),
                        type,
                        (local_options.reconnect_interval * (local_options.reconnect_attempts - connection_retries)));
            /* wait local_options.reconnect_interval seconds between retries */
            sleep(local_options.reconnect_interval);
        }
        else
        {
            if (connection_retries > 0)
            {
                log_info(_("connection to %s has been restored.\n"), type);
            }
            return true;
        }
    }

    if (!is_pgup(*conn, local_options.master_response_timeout))
    {
        log_err(_("unable to reconnect to %s (timeout %i seconds)...\n"),
                type,
                local_options.master_response_timeout
                );

        return false;
    }

    return true;
}


/*
 * set_local_node_status()
 *
 * If failure of the local node is detected, attempt to connect
 * to the current master server (as stored in the global variable
 * `master_conn`) and update its record to failed.
 */
static bool
set_local_node_status(void)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];
    int active_master_node_id = NODE_NOT_FOUND;
    char master_conninfo[MAXLEN];

    if (!check_connection(&master_conn, "master", NULL))
    {
        log_err(_("set_local_node_status(): unable to connect to last known master node\n"));
        return false;
    }

    /*
     * Check that the node `master_conn` is connected to is still the
     * master - it's just about conceivable that it might have become a
     * standby of a new master in the intervening period
     */
    sqlquery_snprintf(sqlquery,
                      "SELECT id, conninfo "
                      "  FROM %s.repl_nodes "
                      " WHERE type = 'master' "
                      "   AND active IS TRUE ",
                      get_repmgr_schema_quoted(master_conn));

    res = PQexec(master_conn, sqlquery);
    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("unable to obtain record for active master: %s\n"),
                PQerrorMessage(master_conn));
        PQclear(res);

        return false;
    }

    if (!PQntuples(res))
    {
        log_err(_("no active master record found\n"));
        PQclear(res);
        return false;
    }

    active_master_node_id = atoi(PQgetvalue(res, 0, 0));
    strncpy(master_conninfo, PQgetvalue(res, 0, 1), MAXLEN);
    PQclear(res);

    if (active_master_node_id != master_options.node)
    {
        log_notice(_("current active master is %i; attempting to connect\n"),
                   active_master_node_id);
        PQfinish(master_conn);
        master_conn = establish_db_connection(master_conninfo, false);

        if (PQstatus(master_conn) != CONNECTION_OK)
        {
            log_err(_("unable to connect to active master\n"));
            return false;
        }

        log_notice(_("connection to new master was successful\n"));
    }

    /*
     * Attempt to set the node's active record to the correct value:
     * active if the local node reports itself as a standby, inactive
     * otherwise
     */
    if (!update_node_record_status(master_conn,
                                   local_options.cluster_name,
                                   node_info.node_id,
                                   "standby",
                                   node_info.upstream_node_id,
                                   is_standby(my_local_conn) == 1))
    {
        log_err(_("unable to set local node %i as inactive on master: %s\n"),
                node_info.node_id,
                PQerrorMessage(master_conn));

        return false;
    }

    log_notice(_("marking this node (%i) as inactive on master\n"), node_info.node_id);
    return true;
}


static void
check_cluster_configuration(PGconn *conn)
{
    PGresult *res;
    char sqlquery[QUERY_STR_LEN];

    log_info(_("checking cluster configuration with schema '%s'\n"), get_repmgr_schema());

    sqlquery_snprintf(sqlquery,
                      "SELECT oid FROM pg_catalog.pg_class "
                      " WHERE oid = '%s.repl_nodes'::regclass ",
                      get_repmgr_schema_quoted(master_conn));

    res = PQexec(conn, sqlquery);

    if (PQresultStatus(res) != PGRES_TUPLES_OK)
    {
        log_err(_("PQexec failed: %s\n"), PQerrorMessage(conn));
        PQclear(res);
        terminate(ERR_DB_QUERY);
    }

    /*
     * If there aren't any results then we have not configured a master node
     * yet in repmgr, or the connection string is pointing to the wrong
     * database.
     *
     * XXX if we are the master, should we try to create the tables needed?
     */
    if (PQntuples(res) == 0)
    {
        log_err(_("the replication cluster is not configured\n"));
        PQclear(res);
        terminate(ERR_BAD_CONFIG);
    }
    PQclear(res);
}
|
|
|
|
|
|
static void
check_node_configuration(void)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	/*
	 * Check if this node has an entry in `repl_nodes`
	 */
	log_info(_("checking node %d in cluster '%s'\n"),
			 local_options.node, local_options.cluster_name);

	sqlquery_snprintf(sqlquery,
					  "SELECT COUNT(*) "
					  "  FROM %s.repl_nodes "
					  " WHERE id = %d "
					  "   AND cluster = '%s' ",
					  get_repmgr_schema_quoted(my_local_conn),
					  local_options.node,
					  local_options.cluster_name);

	res = PQexec(my_local_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_err(_("PQexec failed: %s\n"), PQerrorMessage(my_local_conn));
		PQclear(res);
		terminate(ERR_BAD_CONFIG);
	}

	/*
	 * COUNT(*) always returns exactly one row, so examine the count value
	 * itself. If it is zero, this node has not yet been configured in
	 * repmgr; in that case insert the node into the cluster, unless it is
	 * a witness.
	 */
	if (atoi(PQgetvalue(res, 0, 0)) == 0)
	{
		PQclear(res);

		if (node_info.type == WITNESS)
		{
			log_err(_("the witness is not configured\n"));
			terminate(ERR_BAD_CONFIG);
		}

		/* Adding the node */
		log_info(_("adding node %d to cluster '%s'\n"),
				 local_options.node, local_options.cluster_name);

		/* XXX use create_node_record() */
		sqlquery_snprintf(sqlquery,
						  "INSERT INTO %s.repl_nodes"
						  "  (id, cluster, name, conninfo, priority, witness) "
						  " VALUES (%d, '%s', '%s', '%s', 0, FALSE) ",
						  get_repmgr_schema_quoted(master_conn),
						  local_options.node,
						  local_options.cluster_name,
						  local_options.node_name,
						  local_options.conninfo);

		/* check the command's result status, not merely for a NULL result */
		res = PQexec(master_conn, sqlquery);
		if (PQresultStatus(res) != PGRES_COMMAND_OK)
		{
			log_err(_("unable to insert node details, %s\n"),
					PQerrorMessage(master_conn));
			PQclear(res);
			terminate(ERR_BAD_CONFIG);
		}
		PQclear(res);
	}
	else
	{
		PQclear(res);
	}
}

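/*
 * For illustration (example values only): for a node with id 2, cluster
 * 'test' and name 'node2', the INSERT built above would read:
 *
 *     INSERT INTO repmgr_test.repl_nodes
 *       (id, cluster, name, conninfo, priority, witness)
 *     VALUES (2, 'test', 'node2', 'host=node2 dbname=repmgr', 0, FALSE);
 *
 * i.e. self-registration always uses priority 0 and witness = FALSE.
 */

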
/*
 * lsn_to_xlogrecptr()
 *
 * Convert an LSN represented as a string to an XLogRecPtr;
 * optionally set a flag to indicate whether the provided string
 * could be parsed
 */
static XLogRecPtr
lsn_to_xlogrecptr(char *lsn, bool *format_ok)
{
	uint32		xlogid;
	uint32		xrecoff;

	if (sscanf(lsn, "%X/%X", &xlogid, &xrecoff) != 2)
	{
		if (format_ok != NULL)
			*format_ok = false;
		log_err(_("incorrect log location format: %s\n"), lsn);
		return 0;
	}

	if (format_ok != NULL)
		*format_ok = true;

	/*
	 * From PostgreSQL 9.3, an LSN formatted as "X/X" consists of the high
	 * and low 32 bits of the 64-bit XLogRecPtr value
	 */
	return (((XLogRecPtr) xlogid << 32) + xrecoff);
}

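/*
 * Usage sketch (illustrative values): pg_last_xlog_replay_location()
 * might report "16/B374D848"; lsn_to_xlogrecptr() turns that into
 * (0x16 << 32) + 0xB374D848 = 0x16B374D848, so two locations can be
 * compared numerically:
 *
 *     bool format_ok;
 *     XLogRecPtr applied  = lsn_to_xlogrecptr("16/B374D848", &format_ok);
 *     XLogRecPtr received = lsn_to_xlogrecptr("16/B374D8E0", &format_ok);
 *
 *     if (format_ok && received > applied)
 *         ;	// standby still has WAL to replay
 */

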
static void
usage(void)
{
	log_err(_("%s: replication manager daemon\n"), progname());
	log_err(_("Try \"%s --help\" for more information.\n"), progname());
}


static void
help(void)
{
	printf(_("%s: replication management daemon for PostgreSQL\n"), progname());
	printf(_("\n"));
	printf(_("Usage:\n"));
	printf(_(" %s [OPTIONS]\n"), progname());
	printf(_("\n"));
	printf(_("Options:\n"));
	printf(_("  -?, --help                show this help, then exit\n"));
	printf(_("  -V, --version             output version information, then exit\n"));
	printf(_("  -v, --verbose             output verbose activity information\n"));
	printf(_("  -m, --monitoring-history  track each standby's replication lag in repl_monitor\n"));
	printf(_("  -f, --config-file=PATH    path to the configuration file\n"));
	printf(_("  -d, --daemonize           detach process from foreground\n"));
	printf(_("  -p, --pid-file=PATH       write a PID file\n"));
	printf(_("\n"));
	printf(_("%s monitors a cluster of servers and optionally performs failover.\n"), progname());
}


#ifndef WIN32
static void
handle_sigint(SIGNAL_ARGS)
{
	terminate(0);
}

/* SIGHUP: set flag to re-read config file at next convenient time */
static void
handle_sighup(SIGNAL_ARGS)
{
	got_SIGHUP = true;
}

static void
setup_event_handlers(void)
{
	pqsignal(SIGHUP, handle_sighup);
	pqsignal(SIGINT, handle_sigint);
	pqsignal(SIGTERM, handle_sigint);
}
#endif

static void
terminate(int retval)
{
	close_connections();
	logger_shutdown();

	if (pid_file)
	{
		unlink(pid_file);
	}

	log_info(_("%s terminating...\n"), progname());

	exit(retval);
}


static void
update_shared_memory(char *last_xlog_replay_location)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	/* use the bounded variant rather than plain sprintf() */
	sqlquery_snprintf(sqlquery,
					  "SELECT %s.repmgr_update_standby_location('%s')",
					  get_repmgr_schema_quoted(my_local_conn),
					  last_xlog_replay_location);

	/* If an error occurs, just log it and continue */
	res = PQexec(my_local_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_warning(_("cannot update this standby's shared memory: %s\n"),
					PQerrorMessage(my_local_conn));
		/* XXX is this enough reason to terminate this repmgrd? */
	}
	else if (strcmp(PQgetvalue(res, 0, 0), "f") == 0)
	{
		/* this surely is more than enough reason to exit */
		log_crit(_("cannot update this standby's shared memory; check that shared_preload_libraries includes repmgr_funcs\n"));
		exit(ERR_BAD_CONFIG);
	}

	PQclear(res);
}

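/*
 * The query built above expands to, for example (schema name illustrative):
 *
 *     SELECT repmgr_test.repmgr_update_standby_location('0/3000060')
 *
 * repmgr_update_standby_location() is provided by the repmgr_funcs shared
 * library, which must be preloaded via shared_preload_libraries; it returns
 * FALSE if the shared memory segment is unavailable, which is the "f"
 * result handled above.
 */
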
static void
update_registration(void)
{
	PGresult   *res;
	char		sqlquery[QUERY_STR_LEN];

	sqlquery_snprintf(sqlquery,
					  "UPDATE %s.repl_nodes "
					  "   SET conninfo = '%s', "
					  "       priority = %d "
					  " WHERE id = %d ",
					  get_repmgr_schema_quoted(master_conn),
					  local_options.conninfo,
					  local_options.priority,
					  local_options.node);

	res = PQexec(master_conn, sqlquery);
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		PQExpBufferData errmsg;
		initPQExpBuffer(&errmsg);

		appendPQExpBuffer(&errmsg,
						  _("unable to update registration: %s"),
						  PQerrorMessage(master_conn));

		log_err("%s\n", errmsg.data);

		create_event_record(master_conn,
							&local_options,
							local_options.node,
							"repmgrd_shutdown",
							false,
							errmsg.data);
		terminate(ERR_DB_CON);
	}
	PQclear(res);
}

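/*
 * Detach from the controlling terminal using the classic double-fork
 * sequence: the first fork lets the original parent exit so the child is
 * not a process group leader, setsid() then creates a new session, and a
 * second fork ensures the daemon can never reacquire a controlling
 * terminal. Finally, chdir() to the configuration file's directory.
 */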
static void
do_daemonize()
{
	char	   *ptr,
				path[MAXLEN];
	pid_t		pid = fork();
	int			ret;

	switch (pid)
	{
		case -1:
			log_err("Error in fork(): %s\n", strerror(errno));
			exit(ERR_SYS_FAILURE);
			break;

		case 0:					/* child process */
			pid = setsid();
			if (pid == (pid_t) -1)
			{
				log_err("Error in setsid(): %s\n", strerror(errno));
				exit(ERR_SYS_FAILURE);
			}

			/* ensure that we are no longer able to open a terminal */
			pid = fork();

			if (pid == -1)		/* error case */
			{
				log_err("Error in fork(): %s\n", strerror(errno));
				exit(ERR_SYS_FAILURE);
			}

			if (pid != 0)		/* parent process */
			{
				exit(0);
			}

			/* the grandchild continues as the daemon */

			memset(path, 0, MAXLEN);

			/* extract the directory portion of the configuration file path */
			for (ptr = config_file + strlen(config_file); ptr > config_file; --ptr)
			{
				if (*ptr == '/')
				{
					strncpy(path, config_file, ptr - config_file);
					break;
				}
			}

			if (*path == '\0')
			{
				*path = '/';
			}

			ret = chdir(path);
			if (ret != 0)
			{
				log_err("Error changing directory to '%s': %s\n", path,
						strerror(errno));
			}

			break;

		default:				/* parent process */
			exit(0);
	}
}

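/*
 * Create a PID file, first checking for an existing one. kill(pid, 0)
 * sends no signal but reports, via its return value, whether a process
 * with that PID exists, so it serves as a liveness probe for a previous
 * repmgrd instance. Note this is only a heuristic: the PID may have been
 * recycled by an unrelated process.
 */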
static void
check_and_create_pid_file(const char *pid_file)
{
	struct stat st;
	FILE	   *fd;
	char		buff[MAXLEN];
	pid_t		pid;
	size_t		nread;

	if (stat(pid_file, &st) != -1)
	{
		memset(buff, 0, MAXLEN);

		fd = fopen(pid_file, "r");

		if (fd == NULL)
		{
			log_err("PID file %s exists but could not be opened for reading. "
					"If repmgrd is no longer alive remove the file and restart repmgrd.\n",
					pid_file);
			exit(ERR_BAD_CONFIG);
		}

		/* read up to MAXLEN - 1 bytes: element size 1, count MAXLEN - 1 */
		nread = fread(buff, 1, MAXLEN - 1, fd);

		if (nread == 0 && ferror(fd))
		{
			log_err("Error reading PID file '%s', giving up...\n", pid_file);
			exit(ERR_BAD_CONFIG);
		}

		fclose(fd);

		pid = atoi(buff);

		if (pid != 0)
		{
			if (kill(pid, 0) != -1)
			{
				log_err("PID file %s exists and seems to contain a valid PID. "
						"If repmgrd is no longer alive remove the file and restart repmgrd.\n",
						pid_file);
				exit(ERR_BAD_CONFIG);
			}
		}
	}

	fd = fopen(pid_file, "w");
	if (fd == NULL)
	{
		log_err("Could not open PID file %s!\n", pid_file);
		exit(ERR_BAD_CONFIG);
	}

	fprintf(fd, "%d", getpid());
	fclose(fd);
}


static t_node_info
get_node_info(PGconn *conn, char *cluster, int node_id)
{
	int			res;

	t_node_info node_info = T_NODE_INFO_INITIALIZER;

	res = get_node_record(conn, cluster, node_id, &node_info);

	if (res == -1)
	{
		PQExpBufferData errmsg;
		initPQExpBuffer(&errmsg);

		appendPQExpBuffer(&errmsg,
						  _("unable to retrieve record for node %i: %s"),
						  node_id,
						  PQerrorMessage(conn));

		log_err("%s\n", errmsg.data);

		create_event_record(NULL,
							&local_options,
							local_options.node,
							"repmgrd_shutdown",
							false,
							errmsg.data);

		PQfinish(conn);
		conn = NULL;

		terminate(ERR_DB_QUERY);
	}

	if (res == 0)
	{
		log_warning(_("no record found for node %i\n"), node_id);
	}

	return node_info;
}