mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-27 17:06:29 +00:00
repmgrd: improve logging during failover
Ensure relevant decision making information is visible at the default log level (INFO), and also that where log messages are specific to a particular node, that node's ID is noted.
This commit is contained in:
3
HISTORY
3
HISTORY
@@ -1,7 +1,8 @@
|
|||||||
3.4.0 2019-02-
|
3.4.0 2019-02-
|
||||||
default log level is now INFO
|
default log level is now INFO (Ian)
|
||||||
repmgr: fix `standby register --force` when updating existing node record (Ian)
|
repmgr: fix `standby register --force` when updating existing node record (Ian)
|
||||||
repmgrd: set LSN shared memory value at standby startup (Ian)
|
repmgrd: set LSN shared memory value at standby startup (Ian)
|
||||||
|
repmgrd: improve logging during failover (Ian)
|
||||||
|
|
||||||
3.3.2 2017-06-01
|
3.3.2 2017-06-01
|
||||||
Add support for PostgreSQL 10 (Ian)
|
Add support for PostgreSQL 10 (Ian)
|
||||||
|
|||||||
56
repmgrd.c
56
repmgrd.c
@@ -1396,7 +1396,7 @@ do_master_failover(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
total_active_nodes = PQntuples(res);
|
total_active_nodes = PQntuples(res);
|
||||||
log_debug(_("%d active nodes registered\n"), total_active_nodes);
|
log_info(_("%d active nodes registered\n"), total_active_nodes);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Build an array with the nodes and indicate which ones are visible and
|
* Build an array with the nodes and indicate which ones are visible and
|
||||||
@@ -1479,8 +1479,8 @@ do_master_failover(void)
|
|||||||
}
|
}
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
log_debug(_("total nodes counted: registered=%d, visible=%d\n"),
|
log_info(_("total nodes counted: registered=%d, visible=%d\n"),
|
||||||
total_active_nodes, visible_nodes);
|
total_active_nodes, visible_nodes);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Am I on the group that should keep alive? If I see less than half of
|
* Am I on the group that should keep alive? If I see less than half of
|
||||||
@@ -1497,7 +1497,7 @@ do_master_failover(void)
|
|||||||
/* Query all available nodes to determine readiness and LSN */
|
/* Query all available nodes to determine readiness and LSN */
|
||||||
for (i = 0; i < total_active_nodes; i++)
|
for (i = 0; i < total_active_nodes; i++)
|
||||||
{
|
{
|
||||||
log_debug("checking node %i...\n", nodes[i].node_id);
|
log_info("checking node %i...\n", nodes[i].node_id);
|
||||||
|
|
||||||
/* if the node is not visible, skip it */
|
/* if the node is not visible, skip it */
|
||||||
if (!nodes[i].is_visible)
|
if (!nodes[i].is_visible)
|
||||||
@@ -1521,6 +1521,7 @@ do_master_failover(void)
|
|||||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
|
log_err(_("It seems new problems are arising, manual intervention is needed\n"));
|
||||||
|
log_detail("%s\n", PQerrorMessage(node_conn));
|
||||||
terminate(ERR_FAILOVER_FAIL);
|
terminate(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1528,15 +1529,16 @@ do_master_failover(void)
|
|||||||
|
|
||||||
if (xlog_recptr == InvalidXLogRecPtr)
|
if (xlog_recptr == InvalidXLogRecPtr)
|
||||||
{
|
{
|
||||||
log_info(_("unable to retrieve node's last standby location: %s\n"),
|
log_info(_("unable to retrieve last standby location for node %i: %s\n"),
|
||||||
|
nodes[i].node_id,
|
||||||
PQerrorMessage(node_conn));
|
PQerrorMessage(node_conn));
|
||||||
|
|
||||||
log_debug(_("connection details: %s\n"), nodes[i].conninfo_str);
|
log_detail(_("connection details: %s\n"), nodes[i].conninfo_str);
|
||||||
PQfinish(node_conn);
|
PQfinish(node_conn);
|
||||||
terminate(ERR_FAILOVER_FAIL);
|
terminate(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
log_debug(_("LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
|
log_info(_("current LSN of node %i is: %X/%X\n"), nodes[i].node_id, format_lsn(xlog_recptr));
|
||||||
|
|
||||||
PQfinish(node_conn);
|
PQfinish(node_conn);
|
||||||
|
|
||||||
@@ -1570,6 +1572,9 @@ do_master_failover(void)
|
|||||||
}
|
}
|
||||||
/* write last location in shared memory */
|
/* write last location in shared memory */
|
||||||
update_shared_memory(PQgetvalue(res, 0, 0));
|
update_shared_memory(PQgetvalue(res, 0, 0));
|
||||||
|
|
||||||
|
log_info("local node's LSN is %s\n", PQgetvalue(res, 0, 0));
|
||||||
|
|
||||||
PQclear(res);
|
PQclear(res);
|
||||||
|
|
||||||
/* Wait for each node to come up and report a valid LSN */
|
/* Wait for each node to come up and report a valid LSN */
|
||||||
@@ -1606,6 +1611,9 @@ do_master_failover(void)
|
|||||||
*/
|
*/
|
||||||
if (PQstatus(node_conn) != CONNECTION_OK)
|
if (PQstatus(node_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
|
log_err(_("connection to node %i has gone away:\n%s\n"),
|
||||||
|
nodes[i].node_id,
|
||||||
|
PQerrorMessage(node_conn));
|
||||||
log_info(_("At this point, it could be some race conditions "
|
log_info(_("At this point, it could be some race conditions "
|
||||||
"that are acceptable, assume the node is restarting "
|
"that are acceptable, assume the node is restarting "
|
||||||
"and starting failover procedure\n"));
|
"and starting failover procedure\n"));
|
||||||
@@ -1622,6 +1630,9 @@ do_master_failover(void)
|
|||||||
res = PQexec(node_conn, sqlquery);
|
res = PQexec(node_conn, sqlquery);
|
||||||
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* Note: in repmgr4 we handle this kind of situation much more gracefully.
|
||||||
|
*/
|
||||||
log_err(_("PQexec failed: %s.\nReport an invalid value to not "
|
log_err(_("PQexec failed: %s.\nReport an invalid value to not "
|
||||||
"be considered as new master and exit.\n"),
|
"be considered as new master and exit.\n"),
|
||||||
PQerrorMessage(node_conn));
|
PQerrorMessage(node_conn));
|
||||||
@@ -1654,8 +1665,8 @@ do_master_failover(void)
|
|||||||
*/
|
*/
|
||||||
if (strcmp(location_value, PASSIVE_NODE) == 0)
|
if (strcmp(location_value, PASSIVE_NODE) == 0)
|
||||||
{
|
{
|
||||||
log_debug("node %i is passive mode\n", nodes[i].node_id);
|
|
||||||
log_info(_("node %i will not be considered for promotion\n"), nodes[i].node_id);
|
log_info(_("node %i will not be considered for promotion\n"), nodes[i].node_id);
|
||||||
|
log_detail("node %i indicates it is a passive mode\n", nodes[i].node_id);
|
||||||
nodes[i].xlog_location = InvalidXLogRecPtr;
|
nodes[i].xlog_location = InvalidXLogRecPtr;
|
||||||
continue_loop = false;
|
continue_loop = false;
|
||||||
}
|
}
|
||||||
@@ -1665,7 +1676,8 @@ do_master_failover(void)
|
|||||||
*/
|
*/
|
||||||
else if (strcmp(location_value, LSN_QUERY_ERROR) == 0)
|
else if (strcmp(location_value, LSN_QUERY_ERROR) == 0)
|
||||||
{
|
{
|
||||||
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"), nodes[i].node_id);
|
log_warning(_("node %i is unable to update its shared memory and will not be considered for promotion\n"),
|
||||||
|
nodes[i].node_id);
|
||||||
nodes[i].xlog_location = InvalidXLogRecPtr;
|
nodes[i].xlog_location = InvalidXLogRecPtr;
|
||||||
continue_loop = false;
|
continue_loop = false;
|
||||||
}
|
}
|
||||||
@@ -1673,12 +1685,8 @@ do_master_failover(void)
|
|||||||
/* Unable to parse value returned by `repmgr_get_last_standby_location()` */
|
/* Unable to parse value returned by `repmgr_get_last_standby_location()` */
|
||||||
else if (*location_value == '\0')
|
else if (*location_value == '\0')
|
||||||
{
|
{
|
||||||
log_crit(
|
log_crit(_("unable to obtain LSN from node %i"), nodes[i].node_id);
|
||||||
_("unable to obtain LSN from node %i"), nodes[i].node_id
|
log_hint(_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n"));
|
||||||
);
|
|
||||||
log_hint(
|
|
||||||
_("please check that 'shared_preload_libraries=repmgr_funcs' is set in postgresql.conf\n")
|
|
||||||
);
|
|
||||||
|
|
||||||
PQfinish(node_conn);
|
PQfinish(node_conn);
|
||||||
/* XXX shouldn't we just ignore this node? */
|
/* XXX shouldn't we just ignore this node? */
|
||||||
@@ -1690,14 +1698,14 @@ do_master_failover(void)
|
|||||||
* strategy keep checking
|
* strategy keep checking
|
||||||
*/
|
*/
|
||||||
else {
|
else {
|
||||||
log_warning(_("unable to parse LSN \"%s\"\n"),
|
log_warning(_("unable to parse shared memory LSN \"%s\"\n"),
|
||||||
location_value);
|
location_value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
log_debug(
|
log_debug(
|
||||||
_("invalid LSN returned from node %i: '%s'\n"),
|
_("invalid shared memory LSN returned from node %i: '%s'\n"),
|
||||||
nodes[i].node_id,
|
nodes[i].node_id,
|
||||||
location_value);
|
location_value);
|
||||||
}
|
}
|
||||||
@@ -1719,7 +1727,7 @@ do_master_failover(void)
|
|||||||
nodes[i].xlog_location = xlog_recptr;
|
nodes[i].xlog_location = xlog_recptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
log_debug(_("LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
|
log_info(_("shared memory LSN of node %i is: %s\n"), nodes[i].node_id, location_value);
|
||||||
|
|
||||||
ready_nodes++;
|
ready_nodes++;
|
||||||
nodes[i].is_ready = true;
|
nodes[i].is_ready = true;
|
||||||
@@ -1775,7 +1783,7 @@ do_master_failover(void)
|
|||||||
terminate(ERR_FAILOVER_FAIL);
|
terminate(ERR_FAILOVER_FAIL);
|
||||||
}
|
}
|
||||||
|
|
||||||
log_debug("best candidate node id is %i\n", best_candidate.node_id);
|
log_info("best candidate node id is %i\n", best_candidate.node_id);
|
||||||
|
|
||||||
/* if local node is the best candidate, promote it */
|
/* if local node is the best candidate, promote it */
|
||||||
if (best_candidate.node_id == local_options.node)
|
if (best_candidate.node_id == local_options.node)
|
||||||
@@ -1791,9 +1799,9 @@ do_master_failover(void)
|
|||||||
sleep(5);
|
sleep(5);
|
||||||
|
|
||||||
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
log_notice(_("this node is the best candidate to be the new master, promoting...\n"));
|
||||||
|
log_detail(_("LSN is %X/%X\n"), format_lsn(best_candidate.xlog_location));
|
||||||
log_debug("promote command is: \"%s\"\n",
|
log_info("promote command is: \"%s\"\n",
|
||||||
local_options.promote_command);
|
local_options.promote_command);
|
||||||
|
|
||||||
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
if (log_type == REPMGR_STDERR && *local_options.logfile)
|
||||||
{
|
{
|
||||||
@@ -1849,6 +1857,8 @@ do_master_failover(void)
|
|||||||
node_info.node_id,
|
node_info.node_id,
|
||||||
failed_master.node_id);
|
failed_master.node_id);
|
||||||
|
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
|
||||||
/* my_local_conn is now the master */
|
/* my_local_conn is now the master */
|
||||||
create_event_record(my_local_conn,
|
create_event_record(my_local_conn,
|
||||||
&local_options,
|
&local_options,
|
||||||
@@ -1909,7 +1919,7 @@ do_master_failover(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
log_debug(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
log_notice(_("executing follow command: \"%s\"\n"), local_options.follow_command);
|
||||||
|
|
||||||
r = system(local_options.follow_command);
|
r = system(local_options.follow_command);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
|
|||||||
Reference in New Issue
Block a user