diff --git a/HISTORY b/HISTORY index 95169f2f..ae297e79 100644 --- a/HISTORY +++ b/HISTORY @@ -1,10 +1,11 @@ -5.0 2019-09-?? +5.0 2019-10-?? general: add PostgreSQL 12 support (Ian) general: parse configuration file using flex (Ian) repmgr: rename "repmgr daemon ..." commands to "repmgr service ..." (Ian) repmgr: improve data directory check (Ian) repmgr: improve extension check during "standby clone" (Ian) repmgr: pass provided log level when executing repmgr remotely (Ian) + repmgrd: fix handling of upstream node change check (Ian) 4.4 2019-06-27 repmgr: improve "daemon status" output (Ian) diff --git a/doc/appendix-release-notes.xml b/doc/appendix-release-notes.xml index e1dc3eab..6fa3a86a 100644 --- a/doc/appendix-release-notes.xml +++ b/doc/appendix-release-notes.xml @@ -228,8 +228,34 @@ conninfo='host=node1 user=repmgr dbname=repmgr connect_timeout=2'pg_monitor are permitted to do this as well. + + + + &repmgrd;: Fix handling of upstream node change check. + + + &repmgrd; has a check to see if the upstream node has unexpectedly + changed, e.g. if the repmgrd service is paused and the PostgreSQL + instance has been pointed to another node. + + + However this check was relying on the node record on the local node + being up-to-date, which may not be the case immediately after a + failover, when the node is still replaying records updated prior + to the node's own record being updated. In this case it will + mistakenly assume the node is following the original primary + and attempt to restart monitoring, which will fail as the original + primary is no longer available. + + + To prevent this, the node's record on the upstream node is checked + to see if the reported upstream node_id matches + the expected node_id. GitHub #587/#588. + + + diff --git a/repmgrd-physical.c b/repmgrd-physical.c index b51b0df9..ee3a8c56 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -2086,7 +2086,6 @@ loop: } } - if (got_SIGHUP) { handle_sighup(&local_conn, STANDBY); @@ -2096,13 +2095,34 @@ loop: if (local_monitoring_state == MS_NORMAL && last_known_upstream_node_id != local_node_info.upstream_node_id) { - log_notice(_("local node %i's upstream appears to have changed, restarting monitoring"), - local_node_info.node_id); - log_detail(_("currently monitoring upstream %i; new upstream is %i"), - last_known_upstream_node_id, - local_node_info.upstream_node_id); - close_connection(&upstream_conn); - return; + /* + * It's possible that after a change of upstream, the local node record will not + * yet have been updated with the new upstream node ID. Therefore we check the + * node record on the upstream, and if that matches "last_known_upstream_node_id", + * take that as the correct value. + */ + + if (monitoring_state == MS_NORMAL) + { + t_node_info node_info_on_upstream = T_NODE_INFO_INITIALIZER; + record_status = get_node_record(primary_conn, config_file_options.node_id, &node_info_on_upstream); + + if (last_known_upstream_node_id == node_info_on_upstream.upstream_node_id) + { + local_node_info.upstream_node_id = last_known_upstream_node_id; + } + } + + if (last_known_upstream_node_id != local_node_info.upstream_node_id) + { + log_notice(_("local node %i's upstream appears to have changed, restarting monitoring"), + local_node_info.node_id); + log_detail(_("currently monitoring upstream %i; new upstream is %i"), + last_known_upstream_node_id, + local_node_info.upstream_node_id); + close_connection(&upstream_conn); + return; + } } log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",