Add initial event logging code

This commit is contained in:
Ian Barwick
2015-03-13 16:43:28 +09:00
parent 886a9fd036
commit 0307c51d4b
4 changed files with 410 additions and 19 deletions

View File

@@ -1101,3 +1101,55 @@ delete_node_record(PGconn *conn, int node, char *action)
return true;
}
bool
create_event_record(PGconn *conn, int node_id, char *event, bool successful, char *details)
{
char sqlquery[QUERY_STR_LEN];
PGresult *res;
int n_node_id = htonl(node_id);
char *t_successful = successful ? "TRUE" : "FALSE";
const char *values[4] = { (char *)&n_node_id,
event,
t_successful,
details
};
int lengths[4] = { sizeof(n_node_id),
0,
0,
0
};
int binary[4] = {1, 0, 0, 0};
sqlquery_snprintf(sqlquery,
" INSERT INTO %s.repl_events ( "
" node_id, "
" event, "
" successful, "
" details "
" ) "
" VALUES ($1, $2, $3, $4) ",
get_repmgr_schema_quoted(conn));
res = PQexecParams(conn,
sqlquery,
4,
NULL,
values,
lengths,
binary,
0);
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_warning(_("Cannot insert event details, %s\n"),
PQerrorMessage(conn));
PQclear(res);
return false;
}
return true;
}

View File

@@ -64,5 +64,7 @@ bool set_config_bool(PGconn *conn, const char *config_param, bool state);
bool copy_configuration(PGconn *masterconn, PGconn *witnessconn, char *cluster_name);
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name);
bool delete_node_record(PGconn *conn, int node, char *action);
bool create_event_record(PGconn *conn, int node_id, char *event, bool successful, char *details);
#endif

286
repmgr.c
View File

@@ -79,6 +79,7 @@ static int check_server_version(PGconn *conn, char *server_type, bool exit_on_er
static bool check_upstream_config(PGconn *conn, int server_version_num, bool exit_on_error);
static char *make_pg_path(char *file);
static bool log_event(PGconn *standby_conn, bool success, char *details);
static void do_master_register(void);
static void do_standby_register(void);
@@ -776,6 +777,19 @@ do_master_register(void)
}
/* Log the event */
record_created = create_event_record(conn,
options.node,
"master_register",
true,
NULL);
if(record_created == false)
{
PQfinish(conn);
exit(ERR_DB_QUERY);
}
PQfinish(conn);
log_notice(_("master node correctly registered for cluster %s with id %d (conninfo: %s)\n"),
@@ -869,6 +883,20 @@ do_standby_register(void)
exit(ERR_BAD_CONFIG);
}
/* Log the event */
record_created = create_event_record(master_conn,
options.node,
"standby_register",
true,
NULL);
if(record_created == false)
{
PQfinish(master_conn);
PQfinish(conn);
exit(ERR_DB_QUERY);
}
PQfinish(master_conn);
PQfinish(conn);
@@ -919,6 +947,9 @@ do_standby_clone(void)
char *first_wal_segment = NULL;
char *last_wal_segment = NULL;
bool record_created = FALSE;
PQExpBufferData event_details;
/*
* If dest_dir (-D/--pgdata) was provided, this will become the new data
* directory (otherwise repmgr will default to the same directory as on the
@@ -1464,11 +1495,59 @@ stop_backup:
log_notice(_("for example : /etc/init.d/postgresql start\n"));
}
/* Log the event */
initPQExpBuffer(&event_details);
/* Add details about relevant runtime options used */
appendPQExpBuffer(&event_details,
_("Cloned from host '%s', port %s"),
runtime_options.host,
runtime_options.masterport);
appendPQExpBuffer(&event_details,
_("; backup method: %s"),
runtime_options.rsync_only ? "rsync" : "pg_basebackup");
appendPQExpBuffer(&event_details,
_("; --force: %s"),
runtime_options.force ? "Y" : "N");
record_created = log_event(upstream_conn,
true,
event_details.data);
if(record_created == false)
{
PQfinish(upstream_conn);
exit(ERR_DB_QUERY);
}
PQfinish(upstream_conn);
exit(retval);
}
static bool
log_event(PGconn *standby_conn, bool success, char *details)
{
PGconn *primary_conn;
bool retval;
primary_conn = get_primary_connection(standby_conn,
options.cluster_name,
NULL, NULL);
retval = create_event_record(primary_conn,
options.node,
"standby_clone",
success,
details);
PQfinish(primary_conn);
return retval;
}
static void
do_standby_promote(void)
{
@@ -1487,6 +1566,7 @@ do_standby_promote(void)
promote_check_interval = 2;
bool promote_sucess = false;
bool success;
bool record_created;
/* We need to connect to check configuration */
log_info(_("connecting to standby database\n"));
@@ -1568,6 +1648,17 @@ do_standby_promote(void)
if (promote_sucess == false)
{
PQExpBufferData details;
initPQExpBuffer(&details);
appendPQExpBuffer(&details,
"Node %i could not be promoted to master",
options.node);
record_created = create_event_record(old_master_conn,
options.node,
"standby_promote",
false,
details.data);
/* XXX exit with error? */
log_err(_(retval == 1 ?
"STANDBY PROMOTE failed, this is still a standby node.\n" :
@@ -1575,12 +1666,28 @@ do_standby_promote(void)
}
else
{
PQExpBufferData details;
initPQExpBuffer(&details);
appendPQExpBuffer(&details,
"Node %i was successfully promoted to master",
options.node);
log_notice(_("STANDBY PROMOTE successful. You should REINDEX any hash indexes you have.\n"));
/* Log the event */
record_created = create_event_record(conn,
options.node,
"standby_promote",
true,
details.data);
}
PQfinish(old_master_conn);
PQfinish(conn);
if(record_created == false)
{
exit(ERR_DB_QUERY);
}
return;
}
@@ -1746,10 +1853,21 @@ do_witness_create(void)
retval = is_standby(masterconn);
if (retval)
{
log_err("%s\n", _(retval == 1 ?
PQExpBufferData errmsg;
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
"%s",
_(retval == 1 ?
"provided upstream node is not a master" :
"connection to upstream node lost"));
log_err("%s\n", errmsg.data);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg.data);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
}
@@ -1759,8 +1877,18 @@ do_witness_create(void)
r = test_ssh_connection(runtime_options.host, runtime_options.remote_user);
if (r != 0)
{
log_err(_("unable to connect to remote host '%s' via SSH\n"), runtime_options.host);
PQExpBufferData errmsg;
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
_("unable to connect to remote host '%s' via SSH"),
runtime_options.host);
log_err("%s\n", errmsg.data);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg.data);
PQfinish(masterconn);
exit(ERR_BAD_SSH);
}
@@ -1768,7 +1896,17 @@ do_witness_create(void)
/* Check this directory could be used as a PGDATA dir */
if (!create_pg_dir(runtime_options.dest_dir, runtime_options.force))
{
log_err(_("unable to create witness server data directory (\"%s\")\n"), runtime_options.host);
PQExpBufferData errmsg;
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
_("unable to create witness server data directory (\"%s\")"),
runtime_options.host);
log_err("%s\n", errmsg.data);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg.data);
exit(ERR_BAD_CONFIG);
}
@@ -1792,8 +1930,13 @@ do_witness_create(void)
r = system(script);
if (r != 0)
{
log_err(_("unable to initialize cluster for witness server\n"));
char *errmsg = _("unable to initialize cluster for witness server");
log_err("%s\n", errmsg);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
}
@@ -1803,8 +1946,19 @@ do_witness_create(void)
pg_conf = fopen(buf, "a");
if (pg_conf == NULL)
{
log_err(_("unable to open \"%s\" to add additional configuration items: %s\n"),
buf, strerror(errno));
PQExpBufferData errmsg;
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
_("unable to open \"%s\" to add additional configuration items: %s\n"),
buf,
strerror(errno));
log_err("%s\n", errmsg.data);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg.data);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
@@ -1842,7 +1996,14 @@ do_witness_create(void)
r = system(script);
if (r != 0)
{
log_err(_("unable to start witness server\n"));
char *errmsg = _("unable to start witness server");
log_err("%s\n", errmsg);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
@@ -1860,8 +2021,14 @@ do_witness_create(void)
r = system(script);
if (r != 0)
{
log_err(_("unable to create user for witness server\n"));
char *errmsg = _("unable to create user for witness server");
log_err("%s\n", errmsg);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
}
@@ -1879,7 +2046,14 @@ do_witness_create(void)
r = system(script);
if (r != 0)
{
log_err(_("Unable to create database for witness server\n"));
char *errmsg = _("Unable to create database for witness server");
log_err("%s\n", errmsg);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
@@ -1891,7 +2065,14 @@ do_witness_create(void)
if (success == false)
{
log_err(_("unable to retrieve location of pg_hba.conf\n"));
char *errmsg = _("unable to retrieve location of pg_hba.conf");
log_err("%s\n", errmsg);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg);
exit(ERR_DB_QUERY);
}
@@ -1900,7 +2081,14 @@ do_witness_create(void)
master_hba_file, runtime_options.dest_dir, false, -1);
if (r != 0)
{
log_err(_("unable to copy pg_hba.conf from master\n"));
char *errmsg = _("unable to copy pg_hba.conf from master");
log_err("%s\n", errmsg);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
@@ -1914,7 +2102,14 @@ do_witness_create(void)
r = system(script);
if (r != 0)
{
log_err( _("unable to reload witness server\n"));
char *errmsg = _("unable to reload witness server");
log_err("%s\n", errmsg);
create_event_record(masterconn,
options.node,
"witness_create",
false,
errmsg);
PQfinish(masterconn);
exit(ERR_BAD_CONFIG);
@@ -1935,6 +2130,12 @@ do_witness_create(void)
if(record_created == false)
{
create_event_record(masterconn,
options.node,
"witness_create",
false,
"Unable to create witness node record on master");
PQfinish(masterconn);
exit(ERR_DB_QUERY);
}
@@ -1946,6 +2147,11 @@ do_witness_create(void)
if (!create_schema(witnessconn))
{
create_event_record(masterconn,
options.node,
"witness_create",
false,
_("unable to create schema on witness"));
PQfinish(masterconn);
PQfinish(witnessconn);
exit(ERR_BAD_CONFIG);
@@ -1954,6 +2160,11 @@ do_witness_create(void)
/* copy configuration from master, only repl_nodes is needed */
if (!copy_configuration(masterconn, witnessconn, options.cluster_name))
{
create_event_record(masterconn,
options.node,
"witness_create",
false,
_("Unable to copy configuration from master"));
PQfinish(masterconn);
PQfinish(witnessconn);
exit(ERR_BAD_CONFIG);
@@ -1979,6 +2190,19 @@ do_witness_create(void)
}
}
/* Log the event */
record_created = create_event_record(masterconn,
options.node,
"witness_create",
true,
NULL);
if(record_created == false)
{
PQfinish(masterconn);
exit(ERR_DB_QUERY);
}
PQfinish(masterconn);
PQfinish(witnessconn);
@@ -2411,6 +2635,8 @@ create_schema(PGconn *conn)
char sqlquery[QUERY_STR_LEN];
PGresult *res;
/* create schema */
sqlquery_snprintf(sqlquery, "CREATE SCHEMA %s", get_repmgr_schema_quoted(conn));
log_debug(_("master register: %s\n"), sqlquery);
res = PQexec(conn, sqlquery);
@@ -2423,6 +2649,9 @@ create_schema(PGconn *conn)
}
PQclear(res);
/* create functions */
/*
* to avoid confusion of the time_lag field and provide a consistent UI we
* use these functions for providing the latest update timestamp
@@ -2461,7 +2690,9 @@ create_schema(PGconn *conn)
PQclear(res);
/* ... the tables */
/* Create tables */
/* CREATE TABLE repl_nodes */
sqlquery_snprintf(sqlquery,
"CREATE TABLE %s.repl_nodes ( "
" id INTEGER PRIMARY KEY, "
@@ -2487,6 +2718,8 @@ create_schema(PGconn *conn)
}
PQclear(res);
/* CREATE TABLE repl_monitor */
sqlquery_snprintf(sqlquery,
"CREATE TABLE %s.repl_monitor ( "
" primary_node INTEGER NOT NULL, "
@@ -2509,7 +2742,30 @@ create_schema(PGconn *conn)
}
PQclear(res);
/* a view */
/* CREATE TABLE repl_events */
sqlquery_snprintf(sqlquery,
"CREATE TABLE %s.repl_events ( "
" node_id INTEGER NOT NULL, "
" event TEXT NOT NULL, "
" successful BOOLEAN NOT NULL DEFAULT TRUE, "
" event_timestamp TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT CURRENT_TIMESTAMP, "
" details TEXT NULL "
" ) ",
get_repmgr_schema_quoted(conn));
log_debug(_("master register: %s\n"), sqlquery);
res = PQexec(conn, sqlquery);
if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
{
log_err(_("unable to create table '%s.repl_events': %s\n"),
get_repmgr_schema_quoted(conn), PQerrorMessage(conn));
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
PQclear(res);
/* CREATE VIEW repl_status */
sqlquery_snprintf(sqlquery,
"CREATE VIEW %s.repl_status AS "
" SELECT m.primary_node, m.standby_node, n.name AS standby_name, "

View File

@@ -154,6 +154,7 @@ main(int argc, char **argv)
int optindex;
int c;
bool daemonize = false;
bool startup_event_logged = false;
FILE *fd;
@@ -312,6 +313,17 @@ main(int argc, char **argv)
update_registration();
}
/* Log startup event */
if(startup_event_logged == false)
{
create_event_record(primary_conn,
local_options.node,
"repmgrd_start",
true,
NULL);
startup_event_logged = true;
}
log_info(_("starting continuous primary connection check\n"));
/*
@@ -395,6 +407,16 @@ main(int argc, char **argv)
my_local_conn = establish_db_connection(local_options.conninfo, true);
update_registration();
}
/* Log startup event */
if(startup_event_logged == false)
{
create_event_record(primary_conn,
local_options.node,
"repmgrd_start",
true,
NULL);
startup_event_logged = true;
}
/*
* Every local_options.monitor_interval_secs seconds, do
@@ -1301,6 +1323,9 @@ do_primary_failover(void)
/* if local node is the best candidate, promote it */
if (best_candidate.node_id == local_options.node)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
/* wait */
sleep(5);
@@ -1329,17 +1354,42 @@ do_primary_failover(void)
/* update node information to reflect new status */
if(update_node_record_set_primary(my_local_conn, node_info.node_id, failed_primary.node_id) == false)
{
appendPQExpBuffer(&event_details,
_("unable to update node record for node %i (promoted to master following failure of node %i)"),
node_info.node_id,
failed_primary.node_id);
create_event_record(my_local_conn,
node_info.node_id,
"repmgrd_failover_promote",
false,
event_details.data);
terminate(ERR_DB_QUERY);
}
/* update internal record for this node*/
node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);
appendPQExpBuffer(&event_details,
_("node %i promoted to master; old master %i marked as failed"),
node_info.node_id,
failed_primary.node_id);
create_event_record(my_local_conn,
node_info.node_id,
"repmgrd_failover_promote",
true,
event_details.data);
}
/* local node not promotion candidate - find the new primary */
else
{
PGconn *new_primary_conn;
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
/* wait */
sleep(10);
@@ -1367,10 +1417,19 @@ do_primary_failover(void)
{
if(create_replication_slot(new_primary_conn, node_info.slot_name) == false)
{
log_err(("Unable to create slot '%s' on the primary node: %s\n"),
node_info.slot_name,
PQerrorMessage(new_primary_conn)
);
appendPQExpBuffer(&event_details,
_("Unable to create slot '%s' on the primary node: %s"),
node_info.slot_name,
PQerrorMessage(new_primary_conn));
create_event_record(new_primary_conn,
node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
log_err("%s\n", event_details.data);
PQfinish(new_primary_conn);
terminate(ERR_DB_QUERY);
@@ -1390,13 +1449,35 @@ do_primary_failover(void)
/* update node information to reflect new status */
if(update_node_record_set_upstream(new_primary_conn, node_info.node_id, best_candidate.node_id) == false)
{
appendPQExpBuffer(&event_details,
_("Unable to update node record for node %i (following new upstream node %i)"),
node_info.node_id,
best_candidate.node_id);
create_event_record(new_primary_conn,
node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
terminate(ERR_BAD_CONFIG);
}
/* update internal record for this node*/
node_info = get_node_info(new_primary_conn, local_options.cluster_name, local_options.node);
appendPQExpBuffer(&event_details,
_("Node %i now following new upstream node %i"),
node_info.node_id,
best_candidate.node_id);
create_event_record(new_primary_conn,
node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
PQfinish(new_primary_conn);
destroyPQExpBuffer(&event_details);
}
/* to force it to re-calculate mode and master node */