mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-22 22:56:29 +00:00
repmgrd: improve logging during failover
Ensure relevant decision making information is visible at the default log level (INFO), and also that where log messages are specific to a particular node, that node's ID is noted.
This commit is contained in:
3
HISTORY
3
HISTORY
@@ -1,7 +1,8 @@
|
||||
3.4.0 2019-02-
|
||||
default log level is now INFO
|
||||
default log level is now INFO (Ian)
|
||||
repmgr: fix `standby register --force` when updating existing node record (Ian)
|
||||
repmgrd: set LSN shared memory value at standby startup (Ian)
|
||||
repmgrd: improve logging during failover (Ian)
|
||||
|
||||
3.3.2 2017-06-01
|
||||
Add support for PostgreSQL 10 (Ian)
|
||||
|
||||
56
repmgrd.c
56
repmgrd.c
@@ -1396,7 +1396,7 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
total_active_nodes = PQntuples(res);
|
||||
log_debug(_("%d active nodes registered\n"), total_active_nodes);
|
||||
log_info(_("%d active nodes registered\n"), total_active_nodes);
|
||||
|
||||
/*
|
||||
* Build an array with the nodes and indicate which ones are visible and
|
||||
@@ -1479,8 +1479,8 @@ do_master_failover(void)
|
||||
}
|
||||
PQclear(res);
|
||||
|
||||
log_debug(_("total nodes counted: registered=%d, visible=%d\n"),
|
||||
total_active_nodes, visible_nodes);
|
||||
log_info(_("total nodes counted: registered=%d, visible=%d\n"),
|
||||
total_active_nodes, visible_nodes);
|
||||
|
||||
/*
|
||||
* Am I on the group that should keep alive? If I see less than half of
|
||||
@@ -1497,7 +1497,7 @@ do_master_failover(void)
|
||||
/* Query all available nodes to determine readiness and LSN */
|
||||
for (i = 0; i < total_active_nodes; i++)
|
||||
{
|
||||
log_debug("checking node %i...\n", nodes[i].node_id);
|
||||
log_info("checking node %i...\n", nodes[i].node_id);
|
||||
|
||||
/* if the node is not visible, skip it */
|
||||
if (!nodes[i].is_visible)
|
||||
@@ -1521,6 +1521,7 @@ do_master_failover(void)
|
||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
|
||||
log_detail("%s\n", PQerrorMessage(node_conn));
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
@@ -1528,15 +1529,16 @@ do_master_failover(void)
|
||||
|
||||
if (xlog_recptr == InvalidXLogRecPtr)
|
||||
{
|
||||
log_info(_("unable to retrieve node's last standby location: %s\n"),
|
||||
log_info(_("unable to retrieve last standby location for node %i: %s\n"),
|
||||
nodes[i].node_id,
|
||||
PQerrorMessage(node_conn));
|
||||
|
||||
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||
log_detail(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||
PQfinish(node_conn);
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
log_debug(_("LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
|
||||
log_info(_("current LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
|
||||
|
||||
PQfinish(node_conn);
|
||||
|
||||
@@ -1570,6 +1572,9 @@ do_master_failover(void)
|
||||
}
|
||||
/* write last location in shared memory */
|
||||
update_shared_memory(PQgetvalue(res, 0, 0));
|
||||
|
||||
log_info("local node's LSN is %s\n", PQgetvalue(res, 0, 0));
|
||||
|
||||
PQclear(res);
|
||||
|
||||
/* Wait for each node to come up and report a valid LSN */
|
||||
@@ -1606,6 +1611,9 @@ do_master_failover(void)
|
||||
*/
|
||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||
{
|
||||
log_err(_("connection to node %i has gone away:\n%s\n"),
|
||||
nodes[i].node_id,
|
||||
PQerrorMessage(node_conn));
|
||||
log_info(_("At this point, it could be some race conditions "
|
||||
"that are acceptable, assume the node is restarting "
|
||||
"and starting failover procedure\n"));
|
||||
@@ -1622,6 +1630,9 @@ do_master_failover(void)
|
||||
res = PQexec(node_conn, sqlquery);
|
||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
/*
|
||||
* Note: in repmgr4 we handle this kind of situation much more gracefully.
|
||||
*/
|
||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not "
|
||||
"be considered as new master and exit.\n"),
|
||||
PQerrorMessage(node_conn));
|
||||
@@ -1654,8 +1665,8 @@ do_master_failover(void)
|
||||
*/
|
||||
if (strcmp(location_value, PASSIVE_NODE) == 0)
|
||||
{
|
||||
log_debug("node %i is passive mode\n", nodes[i].node_id);
|
||||
log_info(_("node %i will not be considered for promotion\n"), nodes[i].node_id);
|
||||
log_detail("node %i indicates it is a passive mode\n", nodes[i].node_id);
|
||||
nodes[i].xlog_location = InvalidXLogRecPtr;
|
||||
continue_loop = false;
|
||||
}
|
||||
@@ -1665,7 +1676,8 @@ do_master_failover(void)
|
||||
*/
|
||||
else if (strcmp(location_value, LSN_QUERY_ERROR) == 0)
|
||||
{
|
||||
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"), nodes[i].node_id);
|
||||
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"),
|
||||
nodes[i].node_id);
|
||||
nodes[i].xlog_location = InvalidXLogRecPtr;
|
||||
continue_loop = false;
|
||||
}
|
||||
@@ -1673,12 +1685,8 @@ do_master_failover(void)
|
||||
/* Unable to parse value returned by `repmgr_get_last_standby_location()` */
|
||||
else if (*location_value == '\0')
|
||||
{
|
||||
log_crit(
|
||||
_("unable to obtain LSN from node %i"), nodes[i].node_id
|
||||
);
|
||||
log_hint(
|
||||
_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n")
|
||||
);
|
||||
log_crit(_("unable to obtain LSN from node %i"), nodes[i].node_id);
|
||||
log_hint(_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n"));
|
||||
|
||||
PQfinish(node_conn);
|
||||
/* XXX shouldn't we just ignore this node? */
|
||||
@@ -1690,14 +1698,14 @@ do_master_failover(void)
|
||||
* strategy keep checking
|
||||
*/
|
||||
else {
|
||||
log_warning(_("unable to parse LSN \"%s\"\n"),
|
||||
log_warning(_("unable to parse shared memory LSN \"%s\"\n"),
|
||||
location_value);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_debug(
|
||||
_("invalid LSN returned from node %i: '%s'\n"),
|
||||
_("invalid shared memory LSN returned from node %i: '%s'\n"),
|
||||
nodes[i].node_id,
|
||||
location_value);
|
||||
}
|
||||
@@ -1719,7 +1727,7 @@ do_master_failover(void)
|
||||
nodes[i].xlog_location = xlog_recptr;
|
||||
}
|
||||
|
||||
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
|
||||
log_info(_("shared memory LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
|
||||
|
||||
ready_nodes++;
|
||||
nodes[i].is_ready = true;
|
||||
@@ -1775,7 +1783,7 @@ do_master_failover(void)
|
||||
terminate(ERR_FAILOVER_FAIL);
|
||||
}
|
||||
|
||||
log_debug("best candidate node id is %i\n", best_candidate.node_id);
|
||||
log_info("best candidate node id is %i\n", best_candidate.node_id);
|
||||
|
||||
/* if local node is the best candidate, promote it */
|
||||
if (best_candidate.node_id == local_options.node)
|
||||
@@ -1791,9 +1799,9 @@ do_master_failover(void)
|
||||
sleep(5);
|
||||
|
||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||
|
||||
log_debug("promote command is: \"%s\"\n",
|
||||
local_options.promote_command);
|
||||
log_detail(_("LSN is %X/%X\n"), format_lsn(best_candidate.xlog_location));
|
||||
log_info("promote command is: \"%s\"\n",
|
||||
local_options.promote_command);
|
||||
|
||||
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
||||
{
|
||||
@@ -1849,6 +1857,8 @@ do_master_failover(void)
|
||||
node_info.node_id,
|
||||
failed_master.node_id);
|
||||
|
||||
log_notice("%s", event_details.data);
|
||||
|
||||
/* my_local_conn is now the master */
|
||||
create_event_record(my_local_conn,
|
||||
&local_options,
|
||||
@@ -1909,7 +1919,7 @@ do_master_failover(void)
|
||||
}
|
||||
|
||||
|
||||
log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
||||
log_notice(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
||||
|
||||
r = system(local_options.follow_command);
|
||||
if (r != 0)
|
||||
|
||||
Reference in New Issue
Block a user