mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-24 23:56:29 +00:00
Write events of note to a log table
This makes keeping track of events such as failovers
much easier. Note that this is for convenience and is
not a foolproof auditing log.
Sample output:
repmgr_db=# SELECT * from repmgr_test.repl_events ;
node_id | event | successful | event_timestamp | details
---------+--------------------------+------------+-------------------------------+----------------------------------------------------------
1 | master_register | t | 2015-03-06 14:14:08.196636+09 |
2 | standby_clone | t | 2015-03-06 14:14:17.660768+09 | Backup method: pg_basebackup; --force: N
2 | standby_register | t | 2015-03-06 14:14:18.762222+09 |
4 | witness_create | t | 2015-03-06 14:14:22.072815+09 |
3 | standby_clone | t | 2015-03-06 14:14:23.524673+09 | Backup method: pg_basebackup; --force: N
3 | standby_register | t | 2015-03-06 14:14:24.620161+09 |
2 | repmgrd_start | t | 2015-03-06 14:14:29.639096+09 |
3 | repmgrd_start | t | 2015-03-06 14:14:29.641489+09 |
4 | repmgrd_start | t | 2015-03-06 14:14:29.648002+09 |
2 | standby_promote | t | 2015-03-06 14:15:01.956737+09 | Node 2 was successfully be promoted to master
2 | repmgrd_failover_promote | t | 2015-03-06 14:15:01.964771+09 | Node 2 promoted to master; old master 1 marked as failed
3 | repmgrd_failover_follow | t | 2015-03-06 14:15:07.228493+09 | Node 3 now following new upstream node 2
(12 rows)
This commit is contained in:
103
repmgrd.c
103
repmgrd.c
@@ -17,7 +17,7 @@
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <unistd.h>
|
||||
|
||||
|
||||
|
||||
#include "repmgr.h"
|
||||
#include "config.h"
|
||||
#include "log.h"
|
||||
@@ -41,6 +42,7 @@
|
||||
|
||||
/* Required PostgreSQL headers */
|
||||
#include "access/xlogdefs.h"
|
||||
#include "pqexpbuffer.h"
|
||||
|
||||
/*
|
||||
* Struct to store node information
|
||||
@@ -154,6 +156,7 @@ main(int argc, char **argv)
|
||||
int optindex;
|
||||
int c;
|
||||
bool daemonize = false;
|
||||
bool startup_event_logged = false;
|
||||
|
||||
FILE *fd;
|
||||
|
||||
@@ -318,6 +321,17 @@ main(int argc, char **argv)
|
||||
update_registration();
|
||||
}
|
||||
|
||||
/* Log startup event */
|
||||
if(startup_event_logged == false)
|
||||
{
|
||||
create_event_record(primary_conn,
|
||||
local_options.node,
|
||||
"repmgrd_start",
|
||||
true,
|
||||
NULL);
|
||||
startup_event_logged = true;
|
||||
}
|
||||
|
||||
log_info(_("%s Starting continuous primary connection check\n"),
|
||||
progname);
|
||||
|
||||
@@ -332,7 +346,7 @@ main(int argc, char **argv)
|
||||
*/
|
||||
do
|
||||
{
|
||||
log_debug("primary check loop...\n");
|
||||
log_debug("primary check loop...\n"); // ZZZ
|
||||
if (check_connection(primary_conn, "master"))
|
||||
{
|
||||
/*
|
||||
@@ -406,6 +420,16 @@ main(int argc, char **argv)
|
||||
my_local_conn = establish_db_connection(local_options.conninfo, true);
|
||||
update_registration();
|
||||
}
|
||||
/* Log startup event */
|
||||
if(startup_event_logged == false)
|
||||
{
|
||||
create_event_record(primary_conn,
|
||||
local_options.node,
|
||||
"repmgrd_start",
|
||||
true,
|
||||
NULL);
|
||||
startup_event_logged = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Every local_options.monitor_interval_secs seconds, do
|
||||
@@ -1316,6 +1340,9 @@ do_primary_failover(void)
|
||||
/* if local node is the best candidate, promote it */
|
||||
if (best_candidate.node_id == local_options.node)
|
||||
{
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
/* wait */
|
||||
sleep(5);
|
||||
|
||||
@@ -1335,6 +1362,12 @@ do_primary_failover(void)
|
||||
{
|
||||
log_err(_("%s: promote command failed. You could check and try it manually.\n"),
|
||||
progname);
|
||||
|
||||
/*
|
||||
* At this point there's no valid primary we can write to,
|
||||
* so can't add an entry to the event table.
|
||||
*/
|
||||
|
||||
terminate(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
@@ -1344,16 +1377,42 @@ do_primary_failover(void)
|
||||
/* update node information to reflect new status */
|
||||
if(update_node_record_set_primary(my_local_conn, node_info.node_id, failed_primary.node_id) == false)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to update node record for node %i (promoted to master following failure of node %i)"),
|
||||
node_info.node_id,
|
||||
failed_primary.node_id);
|
||||
|
||||
create_event_record(my_local_conn,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_promote",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
terminate(ERR_DB_QUERY);
|
||||
}
|
||||
|
||||
/* update internal record for this node*/
|
||||
node_info = get_node_info(my_local_conn, local_options.cluster_name, local_options.node);
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Node %i promoted to master; old master %i marked as failed"),
|
||||
node_info.node_id,
|
||||
failed_primary.node_id);
|
||||
|
||||
create_event_record(my_local_conn,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_promote",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
}
|
||||
/* local node not promotion candidate - find the new primary */
|
||||
else
|
||||
{
|
||||
PGconn *new_primary_conn;
|
||||
PQExpBufferData event_details;
|
||||
|
||||
initPQExpBuffer(&event_details);
|
||||
/* wait */
|
||||
sleep(10);
|
||||
|
||||
@@ -1381,9 +1440,20 @@ do_primary_failover(void)
|
||||
{
|
||||
if(create_replication_slot(new_primary_conn, node_info.slot_name) == false)
|
||||
{
|
||||
log_err(_("Unable to create slot '%s' on the primary node: %s\n"),
|
||||
node_info.slot_name,
|
||||
PQerrorMessage(new_primary_conn));
|
||||
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to create slot '%s' on the primary node: %s"),
|
||||
node_info.slot_name,
|
||||
PQerrorMessage(new_primary_conn));
|
||||
|
||||
create_event_record(new_primary_conn,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
log_err("%s\n", event_details.data);
|
||||
|
||||
PQfinish(new_primary_conn);
|
||||
terminate(ERR_DB_QUERY);
|
||||
}
|
||||
@@ -1403,13 +1473,35 @@ do_primary_failover(void)
|
||||
/* update node information to reflect new status */
|
||||
if(update_node_record_set_upstream(new_primary_conn, node_info.node_id, best_candidate.node_id) == false)
|
||||
{
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Unable to update node record for node %i (following new upstream node %i)"),
|
||||
node_info.node_id,
|
||||
best_candidate.node_id);
|
||||
|
||||
create_event_record(new_primary_conn,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
false,
|
||||
event_details.data);
|
||||
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
}
|
||||
|
||||
/* update internal record for this node*/
|
||||
node_info = get_node_info(new_primary_conn, local_options.cluster_name, local_options.node);
|
||||
appendPQExpBuffer(&event_details,
|
||||
_("Node %i now following new upstream node %i"),
|
||||
node_info.node_id,
|
||||
best_candidate.node_id);
|
||||
|
||||
create_event_record(new_primary_conn,
|
||||
node_info.node_id,
|
||||
"repmgrd_failover_follow",
|
||||
true,
|
||||
event_details.data);
|
||||
|
||||
PQfinish(new_primary_conn);
|
||||
destroyPQExpBuffer(&event_details);
|
||||
}
|
||||
|
||||
log_debug("failover done\n"); // ZZZ
|
||||
@@ -1515,6 +1607,7 @@ do_upstream_standby_failover(t_node_info upstream_node)
|
||||
r = system(local_options.follow_command);
|
||||
if (r != 0)
|
||||
{
|
||||
// ZZZ fail
|
||||
log_err(_("%s: follow command failed. You could check and try it manually.\n"),
|
||||
progname);
|
||||
terminate(ERR_BAD_CONFIG);
|
||||
|
||||
Reference in New Issue
Block a user