Write events of note to a log table

This makes keeping track of events such as failovers
much easier. Note that this is for convenience and is
not a foolproof auditing log.

Sample output:

repmgr_db=# SELECT * from repmgr_test.repl_events ;
 node_id |          event           | successful |        event_timestamp        |                         details
---------+--------------------------+------------+-------------------------------+----------------------------------------------------------
       1 | master_register          | t          | 2015-03-06 14:14:08.196636+09 |
       2 | standby_clone            | t          | 2015-03-06 14:14:17.660768+09 | Backup method: pg_basebackup; --force: N
       2 | standby_register         | t          | 2015-03-06 14:14:18.762222+09 |
       4 | witness_create           | t          | 2015-03-06 14:14:22.072815+09 |
       3 | standby_clone            | t          | 2015-03-06 14:14:23.524673+09 | Backup method: pg_basebackup; --force: N
       3 | standby_register         | t          | 2015-03-06 14:14:24.620161+09 |
       2 | repmgrd_start            | t          | 2015-03-06 14:14:29.639096+09 |
       3 | repmgrd_start            | t          | 2015-03-06 14:14:29.641489+09 |
       4 | repmgrd_start            | t          | 2015-03-06 14:14:29.648002+09 |
       2 | standby_promote          | t          | 2015-03-06 14:15:01.956737+09 | Node 2 was successfully be promoted to master
       2 | repmgrd_failover_promote | t          | 2015-03-06 14:15:01.964771+09 | Node 2 promoted to master; old master 1 marked as failed
       3 | repmgrd_failover_follow  | t          | 2015-03-06 14:15:07.228493+09 | Node 3 now following new upstream node 2
(12 rows)
This commit is contained in:
Ian Barwick
2015-03-06 00:53:29 +09:00
parent 072c2d70ff
commit 491309f4ba
5 changed files with 466 additions and 64 deletions

103
repmgrd.c
View File

@@ -17,7 +17,7 @@
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
@@ -33,6 +33,7 @@
#include <unistd.h>
#include "repmgr.h"
#include "config.h"
#include "log.h"
@@ -41,6 +42,7 @@
/* Required PostgreSQL headers */
#include "access/xlogdefs.h"
#include "pqexpbuffer.h"
/*
* Struct to store node information
@@ -154,6 +156,7 @@ main(int argc, char **argv)
int optindex;
int c;
bool daemonize = false;
bool startup_event_logged = false;
FILE *fd;
@@ -318,6 +321,17 @@ main(int argc, char **argv)
update_registration();
}
/* Log startup event */
if(startup_event_logged == false)
{
create_event_record(primary_conn,
local_options.node,
"repmgrd_start",
true,
NULL);
startup_event_logged = true;
}
log_info(_("%s Starting continuous primary connection check\n"),
progname);
@@ -332,7 +346,7 @@ main(int argc, char **argv)
*/
do
{
log_debug("primary check loop...\n");
log_debug("primary check loop...\n"); // ZZZ
if (check_connection(primary_conn, "master"))
{
/*
@@ -406,6 +420,16 @@ main(int argc, char **argv)
my_local_conn = establish_db_connection(local_options.conninfo, true);
update_registration();
}
/* Log startup event */
if(startup_event_logged == false)
{
create_event_record(primary_conn,
local_options.node,
"repmgrd_start",
true,
NULL);
startup_event_logged = true;
}
/*
* Every local_options.monitor_interval_secs seconds, do
@@ -1316,6 +1340,9 @@ do_primary_failover(void)
/* if local node is the best candidate, promote it */
if (best_candidate.node_id == local_options.node)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
/* wait */
sleep(5);
@@ -1335,6 +1362,12 @@ do_primary_failover(void)
{
log_err(_("%s: promote command failed. You could check and try it manually.\n"),
progname);
/*
* At this point there's no valid primary we can write to,
* so can't add an entry to the event table.
*/
terminate(ERR_DB_QUERY);
}
@@ -1344,16 +1377,42 @@ do_primary_failover(void)
/* update node information to reflect new status */
if(update_node_record_set_primary(my_local_conn, node_info.node_id, failed_primary.node_id) == false)
{
appendPQExpBuffer(&event_details,
_("Unable to update node record for node %i (promoted to master following failure of node %i)"),
node_info.node_id,
failed_primary.node_id);
create_event_record(my_local_conn,
node_info.node_id,
"repmgrd_failover_promote",
false,
event_details.data);
terminate(ERR_DB_QUERY);
}
/* update internal record for this node*/
node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);
appendPQExpBuffer(&event_details,
_("Node %i promoted to master; old master %i marked as failed"),
node_info.node_id,
failed_primary.node_id);
create_event_record(my_local_conn,
node_info.node_id,
"repmgrd_failover_promote",
true,
event_details.data);
}
/* local node not promotion candidate - find the new primary */
else
{
PGconn *new_primary_conn;
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
/* wait */
sleep(10);
@@ -1381,9 +1440,20 @@ do_primary_failover(void)
{
if(create_replication_slot(new_primary_conn, node_info.slot_name) == false)
{
log_err(_("Unable to create slot '%s' on the primary node: %s\n"),
node_info.slot_name,
PQerrorMessage(new_primary_conn));
appendPQExpBuffer(&event_details,
_("Unable to create slot '%s' on the primary node: %s"),
node_info.slot_name,
PQerrorMessage(new_primary_conn));
create_event_record(new_primary_conn,
node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
log_err("%s\n", event_details.data);
PQfinish(new_primary_conn);
terminate(ERR_DB_QUERY);
}
@@ -1403,13 +1473,35 @@ do_primary_failover(void)
/* update node information to reflect new status */
if(update_node_record_set_upstream(new_primary_conn, node_info.node_id, best_candidate.node_id) == false)
{
appendPQExpBuffer(&event_details,
_("Unable to update node record for node %i (following new upstream node %i)"),
node_info.node_id,
best_candidate.node_id);
create_event_record(new_primary_conn,
node_info.node_id,
"repmgrd_failover_follow",
false,
event_details.data);
terminate(ERR_BAD_CONFIG);
}
/* update internal record for this node*/
node_info = get_node_info(new_primary_conn, local_options.cluster_name, local_options.node);
appendPQExpBuffer(&event_details,
_("Node %i now following new upstream node %i"),
node_info.node_id,
best_candidate.node_id);
create_event_record(new_primary_conn,
node_info.node_id,
"repmgrd_failover_follow",
true,
event_details.data);
PQfinish(new_primary_conn);
destroyPQExpBuffer(&event_details);
}
log_debug("failover done\n"); // ZZZ
@@ -1515,6 +1607,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
r = system(local_options.follow_command);
if (r != 0)
{
// ZZZ fail
log_err(_("%s: follow command failed. You could check and try it manually.\n"),
progname);
terminate(ERR_BAD_CONFIG);