From bd19a2c8683555072ace554550d0a89e6bf50b25 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Tue, 24 Mar 2015 13:40:39 +0900 Subject: [PATCH] Improve handling of event logging in rempgrd Provide the master connection if available, and if not enable create_event_record() to skip trying to write to the database, but execute the notification program if defined. --- dbutils.c | 118 +++++++++++++++++++++++++++++++----------------------- repmgrd.c | 38 ++++++++++++------ 2 files changed, 92 insertions(+), 64 deletions(-) diff --git a/dbutils.c b/dbutils.c index 814cc1e8..434d0f29 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1105,12 +1105,18 @@ delete_node_record(PGconn *conn, int node, char *action) /* * create_event_record() * - * Insert a record into the events table. + * If `conn` is not NULL, insert a record into the events table. * * If configuration parameter `event_notification_command` is set, also * attempt to execute that command. * * Returns true if all operations succeeded, false if one or more failed. + * + * Note this function may be called with `conn` set to NULL in cases where + * the master node is not available and it's therefore not possible to write + * an event record. In this case, if `event_notification_command` is set a user- + * defined notification to be generated; if not, this function will have + * no effect. */ bool @@ -1120,69 +1126,79 @@ create_event_record(PGconn *conn, t_configuration_options *options, int node_id, PGresult *res; char event_timestamp[MAXLEN] = ""; bool success = true; + struct tm ts; - int n_node_id = htonl(node_id); - char *t_successful = successful ? "TRUE" : "FALSE"; + if(conn != NULL) + { + int n_node_id = htonl(node_id); + char *t_successful = successful ? "TRUE" : "FALSE"; - const char *values[4] = { (char *)&n_node_id, - event, - t_successful, - details - }; + const char *values[4] = { (char *)&n_node_id, + event, + t_successful, + details + }; - int lengths[4] = { sizeof(n_node_id), - 0, - 0, - 0 - }; - int binary[4] = {1, 0, 0, 0}; + int lengths[4] = { sizeof(n_node_id), + 0, + 0, + 0 + }; - sqlquery_snprintf(sqlquery, - " INSERT INTO %s.repl_events ( " - " node_id, " - " event, " - " successful, " - " details " - " ) " - " VALUES ($1, $2, $3, $4) " - " RETURNING event_timestamp ", - get_repmgr_schema_quoted(conn)); + int binary[4] = {1, 0, 0, 0}; - res = PQexecParams(conn, - sqlquery, - 4, - NULL, - values, - lengths, - binary, - 0); + sqlquery_snprintf(sqlquery, + " INSERT INTO %s.repl_events ( " + " node_id, " + " event, " + " successful, " + " details " + " ) " + " VALUES ($1, $2, $3, $4) " + " RETURNING event_timestamp ", + get_repmgr_schema_quoted(conn)); - if (!res || PQresultStatus(res) != PGRES_TUPLES_OK) + res = PQexecParams(conn, + sqlquery, + 4, + NULL, + values, + lengths, + binary, + 0); + + if (!res || PQresultStatus(res) != PGRES_TUPLES_OK) + { + + log_warning(_("Unable to create event record: %s\n"), + PQerrorMessage(conn)); + + success = false; + + } + else + { + /* Store timestamp to send to the notification command */ + strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN); + log_debug(_("Event timestamp is: %s\n"), event_timestamp); + } + + PQclear(res); + } + + /* + * If no database connection provided, or the query failed, generate a + * current timestamp ourselves. This isn't quite the same + * format as PostgreSQL, but is close enough for diagnostic use. + */ + if(!strlen(event_timestamp)) { time_t now; - struct tm ts; - log_warning(_("Unable to create event record: %s\n"), - PQerrorMessage(conn)); - - success = false; - - /* - * If the query fails for whatever reason, generate a - * current timestamp ourselves. This isn't quite the same - * format as PostgreSQL, but is close enough for diagnostic use. - */ time(&now); ts = *localtime(&now); strftime(event_timestamp, MAXLEN, "%Y-%m-%d %H:%M:%S%z", &ts); } - else - { - strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN); - log_debug(_("Event timestamp is: %s\n"), event_timestamp); - } - - PQclear(res); /* an event notification command was provided - parse and execute it */ if(strlen(options->event_notification_command)) diff --git a/repmgrd.c b/repmgrd.c index fa617831..8d5ae13b 100644 --- a/repmgrd.c +++ b/repmgrd.c @@ -410,12 +410,15 @@ main(int argc, char **argv) _("unable to connect to master node '%s'"), local_options.cluster_name); - create_event_record(my_local_conn, + log_err("%s\n", errmsg.data); + + create_event_record(NULL, &local_options, local_options.node, "repmgrd_shutdown", false, errmsg.data); + terminate(ERR_BAD_CONFIG); } @@ -586,8 +589,10 @@ witness_monitor(void) appendPQExpBuffer(&errmsg, _("unable to determine a valid master node, terminating...")); + log_err("%s\n", errmsg.data); - create_event_record(my_local_conn, + + create_event_record(NULL, &local_options, local_options.node, "repmgrd_shutdown", @@ -659,7 +664,7 @@ witness_monitor(void) /* * Insert monitor info, this is basically the time and xlog replayed, * applied on standby and current xlog location in primary. - * Also do the math to see how far are we in bytes for being uptodate + * Also do the math to see how far are we in bytes for being up-to-date */ static void standby_monitor(void) @@ -705,12 +710,13 @@ standby_monitor(void) log_err("%s\n", errmsg.data); - create_event_record(my_local_conn, + create_event_record(primary_conn, &local_options, local_options.node, "repmgrd_shutdown", false, errmsg.data); + terminate(ERR_DB_CON); } @@ -777,9 +783,10 @@ standby_monitor(void) appendPQExpBuffer(&errmsg, _("Unable to reconnect to master after %i attempts, terminating..."), local_options.reconnect_attempts); + log_err("%s\n", errmsg.data); - create_event_record(my_local_conn, + create_event_record(NULL, &local_options, local_options.node, "repmgrd_shutdown", @@ -819,11 +826,11 @@ standby_monitor(void) initPQExpBuffer(&errmsg); appendPQExpBuffer(&errmsg, - _("unable to reconnect to new upstream node, terminating...") - ); + _("unable to reconnect to new upstream node, terminating...")); + log_err("%s\n", errmsg.data); - create_event_record(my_local_conn, + create_event_record(primary_conn, &local_options, local_options.node, "repmgrd_shutdown", @@ -1434,7 +1441,9 @@ do_primary_failover(void) node_info.node_id, failed_primary.node_id); - create_event_record(my_local_conn, + log_err("%s\n", event_details.data); + + create_event_record(NULL, &local_options, node_info.node_id, "repmgrd_failover_promote", @@ -1444,7 +1453,7 @@ do_primary_failover(void) terminate(ERR_DB_QUERY); } - /* update internal record for this node*/ + /* update internal record for this node */ node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node); appendPQExpBuffer(&event_details, @@ -1452,6 +1461,7 @@ do_primary_failover(void) node_info.node_id, failed_primary.node_id); + /* my_local_conn is now the master */ create_event_record(my_local_conn, &local_options, node_info.node_id, @@ -1500,6 +1510,8 @@ do_primary_failover(void) node_info.slot_name, PQerrorMessage(new_primary_conn)); + log_err("%s\n", event_details.data); + create_event_record(new_primary_conn, &local_options, node_info.node_id, @@ -1507,8 +1519,6 @@ do_primary_failover(void) false, event_details.data); - log_err("%s\n", event_details.data); - PQfinish(new_primary_conn); terminate(ERR_DB_QUERY); } @@ -1532,6 +1542,8 @@ do_primary_failover(void) node_info.node_id, best_candidate.node_id); + log_err("%s\n", event_details.data); + create_event_record(new_primary_conn, &local_options, node_info.node_id, @@ -1685,7 +1697,7 @@ check_connection(PGconn *conn, const char *type) int connection_retries; /* - * Check if the master is still available if after + * Check if the node is still available if after * local_options.reconnect_attempts * local_options.reconnect_intvl * seconds of retries we cannot reconnect return false */