From 87910a544894c44afdd7c23e262b6875fd80c749 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Mon, 29 Apr 2019 13:48:18 +0900 Subject: [PATCH] repmgrd: improve logging of sibling node's upstream info If the sibling node has already been promoted (for whatever reason, e.g. "repmgr standby promote" was executed manually) and has exited recovery, the upstream node ID will normally be reported as "-1", which is correct, but looks confusing in the logs. We now only report the upstream node ID if the sibling node is still in recovery, *or* if it has exited recovery but is still reporting an extant node ID. --- repmgrd-physical.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 29e8c0cb..dedc910e 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -4088,12 +4088,6 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id) continue; } - log_info(_("node \"%s\" (ID: %i) reports its upstream is node %i, last seen %i second(s) ago"), - cell->node_info->node_name, - cell->node_info->node_id, - sibling_replication_info.upstream_node_id, - sibling_replication_info.upstream_last_seen); - /* * Check if node is not in recovery - it may have been promoted * outside of the failover mechanism, in which case we may be able @@ -4108,6 +4102,21 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id) cell->node_info->node_name, cell->node_info->node_id); + /* + * Node is not in recovery, but still reporting an upstream + * node ID; possible it was promoted manually (e.g. with "pg_ctl promote"), + * or (less likely) the node's repmgrd has just switched to primary + * monitoring mode but has not yet unset the upstream node ID in + * shared memory. Either way, log this. 
+ */ + if (sibling_replication_info.upstream_node_id != UNKNOWN_NODE_ID) + { + log_warning(_("node \"%s\" (ID: %i) still reports its upstream is node %i, last seen %i second(s) ago"), + cell->node_info->node_name, + cell->node_info->node_id, + sibling_replication_info.upstream_node_id, + sibling_replication_info.upstream_last_seen); + } can_follow = check_node_can_follow(local_conn, local_node_info.last_wal_receive_lsn, cell->node_info->conn, @@ -4128,6 +4137,14 @@ do_election(NodeInfoList *sibling_nodes, int *new_primary_id) cell->node_info->node_id); continue; } + else + { + log_info(_("node \"%s\" (ID: %i) reports its upstream is node %i, last seen %i second(s) ago"), + cell->node_info->node_name, + cell->node_info->node_id, + sibling_replication_info.upstream_node_id, + sibling_replication_info.upstream_last_seen); + } /* check if WAL replay on node is paused */ if (sibling_replication_info.wal_replay_paused == true)