Improve handling of event logging in rempgrd

Provide the master connection if available, and if not enable
create_event_record() to skip trying to write to the database,
but execute the notification program if defined.
This commit is contained in:
Ian Barwick
2015-03-24 13:40:39 +09:00
parent 2cadb3424d
commit bd19a2c868
2 changed files with 92 additions and 64 deletions

118
dbutils.c
View File

@@ -1105,12 +1105,18 @@ delete_node_record(PGconn *conn, int node, char *action)
/*
* create_event_record()
*
* Insert a record into the events table.
* If `conn` is not NULL, insert a record into the events table.
*
* If configuration parameter `event_notification_command` is set, also
* attempt to execute that command.
*
* Returns true if all operations succeeded, false if one or more failed.
*
* Note this function may be called with `conn` set to NULL in cases where
* the master node is not available and it's therefore not possible to write
* an event record. In this case, if `event_notification_command` is set a user-
* defined notification to be generated; if not, this function will have
* no effect.
*/
bool
@@ -1120,69 +1126,79 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id,
PGresult *res;
char event_timestamp[MAXLEN] = "";
bool success = true;
struct tm ts;
int n_node_id = htonl(node_id);
char *t_successful = successful ? "TRUE" : "FALSE";
if(conn != NULL)
{
int n_node_id = htonl(node_id);
char *t_successful = successful ? "TRUE" : "FALSE";
const char *values[4] = { (char *)&n_node_id,
event,
t_successful,
details
};
const char *values[4] = { (char *)&n_node_id,
event,
t_successful,
details
};
int lengths[4] = { sizeof(n_node_id),
0,
0,
0
};
int binary[4] = {1, 0, 0, 0};
int lengths[4] = { sizeof(n_node_id),
0,
0,
0
};
sqlquery_snprintf(sqlquery,
" INSERT INTO %s.repl_events ( "
" node_id, "
" event, "
" successful, "
" details "
" ) "
" VALUES ($1, $2, $3, $4) "
" RETURNING event_timestamp ",
get_repmgr_schema_quoted(conn));
int binary[4] = {1, 0, 0, 0};
res = PQexecParams(conn,
sqlquery,
4,
NULL,
values,
lengths,
binary,
0);
sqlquery_snprintf(sqlquery,
" INSERT INTO %s.repl_events ( "
" node_id, "
" event, "
" successful, "
" details "
" ) "
" VALUES ($1, $2, $3, $4) "
" RETURNING event_timestamp ",
get_repmgr_schema_quoted(conn));
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
res = PQexecParams(conn,
sqlquery,
4,
NULL,
values,
lengths,
binary,
0);
if (!res || PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_warning(_("Unable to create event record: %s\n"),
PQerrorMessage(conn));
success = false;
}
else
{
/* Store timestamp to send to the notification command */
strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
log_debug(_("Event timestamp is: %s\n"), event_timestamp);
}
PQclear(res);
}
/*
* If no database connection provided, or the query failed, generate a
* current timestamp ourselves. This isn't quite the same
* format as PostgreSQL, but is close enough for diagnostic use.
*/
if(!strlen(event_timestamp))
{
time_t now;
struct tm ts;
log_warning(_("Unable to create event record: %s\n"),
PQerrorMessage(conn));
success = false;
/*
* If the query fails for whatever reason, generate a
* current timestamp ourselves. This isn't quite the same
* format as PostgreSQL, but is close enough for diagnostic use.
*/
time(&now);
ts = *localtime(&now);
strftime(event_timestamp, MAXLEN, "%Y-%m-%d %H:%M:%S%z", &ts);
}
else
{
strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
log_debug(_("Event timestamp is: %s\n"), event_timestamp);
}
PQclear(res);
/* an event notification command was provided - parse and execute it */
if(strlen(options->event_notification_command))

View File

@@ -410,12 +410,15 @@ main(int argc, char **argv)
_("unable to connect to master node '%s'"),
local_options.cluster_name);
create_event_record(my_local_conn,
log_err("%s\n", errmsg.data);
create_event_record(NULL,
&local_options,
local_options.node,
"repmgrd_shutdown",
false,
errmsg.data);
terminate(ERR_BAD_CONFIG);
}
@@ -586,8 +589,10 @@ witness_monitor(void)
appendPQExpBuffer(&errmsg,
_("unable to determine a valid master node, terminating..."));
log_err("%s\n", errmsg.data);
create_event_record(my_local_conn,
create_event_record(NULL,
&local_options,
local_options.node,
"repmgrd_shutdown",
@@ -659,7 +664,7 @@ witness_monitor(void)
/*
* Insert monitor info, this is basically the time and xlog replayed,
* applied on standby and current xlog location in primary.
* Also do the math to see how far are we in bytes for being uptodate
* Also do the math to see how far are we in bytes for being up-to-date
*/
static void
standby_monitor(void)
@@ -705,12 +710,13 @@ standby_monitor(void)
log_err("%s\n", errmsg.data);
create_event_record(my_local_conn,
create_event_record(primary_conn,
&local_options,
local_options.node,
"repmgrd_shutdown",
false,
errmsg.data);
terminate(ERR_DB_CON);
}
@@ -777,9 +783,10 @@ standby_monitor(void)
appendPQExpBuffer(&errmsg,
_("Unable to reconnect to master after %i attempts, terminating..."),
local_options.reconnect_attempts);
log_err("%s\n", errmsg.data);
create_event_record(my_local_conn,
create_event_record(NULL,
&local_options,
local_options.node,
"repmgrd_shutdown",
@@ -819,11 +826,11 @@ standby_monitor(void)
initPQExpBuffer(&errmsg);
appendPQExpBuffer(&errmsg,
_("unable to reconnect to new upstream node, terminating...")
);
_("unable to reconnect to new upstream node, terminating..."));
log_err("%s\n", errmsg.data);
create_event_record(my_local_conn,
create_event_record(primary_conn,
&local_options,
local_options.node,
"repmgrd_shutdown",
@@ -1434,7 +1441,9 @@ do_primary_failover(void)
node_info.node_id,
failed_primary.node_id);
create_event_record(my_local_conn,
log_err("%s\n", event_details.data);
create_event_record(NULL,
&local_options,
node_info.node_id,
"repmgrd_failover_promote",
@@ -1444,7 +1453,7 @@ do_primary_failover(void)
terminate(ERR_DB_QUERY);
}
/* update internal record for this node*/
/* update internal record for this node */
node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);
appendPQExpBuffer(&event_details,
@@ -1452,6 +1461,7 @@ do_primary_failover(void)
node_info.node_id,
failed_primary.node_id);
/* my_local_conn is now the master */
create_event_record(my_local_conn,
&local_options,
node_info.node_id,
@@ -1500,6 +1510,8 @@ do_primary_failover(void)
node_info.slot_name,
PQerrorMessage(new_primary_conn));
log_err("%s\n", event_details.data);
create_event_record(new_primary_conn,
&local_options,
node_info.node_id,
@@ -1507,8 +1519,6 @@ do_primary_failover(void)
false,
event_details.data);
log_err("%s\n", event_details.data);
PQfinish(new_primary_conn);
terminate(ERR_DB_QUERY);
}
@@ -1532,6 +1542,8 @@ do_primary_failover(void)
node_info.node_id,
best_candidate.node_id);
log_err("%s\n", event_details.data);
create_event_record(new_primary_conn,
&local_options,
node_info.node_id,
@@ -1685,7 +1697,7 @@ check_connection(PGconn *conn, const char *type)
int connection_retries;
/*
* Check if the master is still available if after
* Check if the node is still available if after
* local_options.reconnect_attempts * local_options.reconnect_intvl
* seconds of retries we cannot reconnect return false
*/