diff --git a/HISTORY b/HISTORY
index ed5168a6..f54c33bd 100644
--- a/HISTORY
+++ b/HISTORY
@@ -8,6 +8,7 @@
repmgr: report error code on follow/rejoin failure due to non-available
replication slot (Ian)
repmgr: ensure "node rejoin" checks for available replication slots (Ian)
+ repmgr: improve "standby switchover" completion checks
5.0 2019-10-15
general: add PostgreSQL 12 support (Ian)
diff --git a/doc/appendix-release-notes.xml b/doc/appendix-release-notes.xml
index 56da4a38..1fea8d54 100644
--- a/doc/appendix-release-notes.xml
+++ b/doc/appendix-release-notes.xml
@@ -45,6 +45,20 @@
+
+ General improvements
+
+
+
+
+ repmgr standby follow:
+ Improve logging and checking of potential failure situations.
+
+
+
+
+
+
Bug fixes
diff --git a/repmgr-action-standby.c b/repmgr-action-standby.c
index 0357806f..dd6f68c7 100644
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -3368,6 +3368,16 @@ do_standby_follow_internal(PGconn *primary_conn, PGconn *follow_target_conn, t_n
* Where running and not already paused, repmgrd will be paused (and
* subsequently unpaused), unless --repmgrd-no-pause provided.
*
+ * Note that this operation can only be considered to have failed completely
+ * ("ERR_SWITCHOVER_FAIL") in these situations:
+ *
+ * - the prerequisites for a switchover are not met
+ * - the demotion candidate could not be shut down cleanly
+ * - the promotion candidate could not be promoted
+ *
+ * All other failures (demotion candidate did not connect to new primary etc.)
+ * are considered partial failures ("ERR_SWITCHOVER_INCOMPLETE")
+ *
* TODO:
* - make connection test timeouts/intervals configurable (see below)
*/
@@ -4601,6 +4611,14 @@ do_standby_switchover(void)
* requested.
*/
initPQExpBuffer(&node_rejoin_options);
+
+ /*
+ * Don't wait for repmgr on the remote node to report the success
+ * of the rejoin operation - we'll check it from here.
+ */
+ appendPQExpBufferStr(&node_rejoin_options,
+ " --no-wait");
+
if (replication_info.last_wal_receive_lsn < remote_last_checkpoint_lsn)
{
KeyValueListCell *cell = NULL;
@@ -4656,7 +4674,7 @@ do_standby_switchover(void)
char *conninfo_normalized = normalize_conninfo_string(local_node_record.conninfo);
appendPQExpBuffer(&remote_command_str,
- "%s-d ",
+ "%s -d ",
node_rejoin_options.data);
appendRemoteShellString(&remote_command_str,
@@ -4698,21 +4716,56 @@ do_standby_switchover(void)
else
{
PQExpBufferData event_details;
+ standy_join_status join_success = check_standby_join(local_conn,
+ &local_node_record,
+ &remote_node_record);
initPQExpBuffer(&event_details);
- appendPQExpBuffer(&event_details,
- "node %i promoted to primary, node %i demoted to standby",
- config_file_options.node_id,
- remote_node_record.node_id);
+ switch (join_success) {
+ case JOIN_FAIL_NO_PING:
+ appendPQExpBuffer(&event_details,
+ _("node \"%s\" (ID: %i) promoted to primary, but demote node \"%s\" (ID: %i) did not beome available"),
+ config_file_options.node_name,
+ config_file_options.node_id,
+ remote_node_record.node_name,
+ remote_node_record.node_id);
+ switchover_success = false;
+
+ break;
+ case JOIN_FAIL_NO_REPLICATION:
+ appendPQExpBuffer(&event_details,
+ _("node \"%s\" (ID: %i) promoted to primary, but demote node \"%s\" (ID: %i) did not connect to the new primary"),
+ config_file_options.node_name,
+ config_file_options.node_id,
+ remote_node_record.node_name,
+ remote_node_record.node_id);
+ switchover_success = false;
+ break;
+ case JOIN_SUCCESS:
+ appendPQExpBuffer(&event_details,
+ _("node \"%s\" (ID: %i) promoted to primary, node \"%s\" (ID: %i) demoted to standby"),
+ config_file_options.node_name,
+ config_file_options.node_id,
+ remote_node_record.node_name,
+ remote_node_record.node_id);
+ }
create_event_notification_extended(local_conn,
&config_file_options,
config_file_options.node_id,
"standby_switchover",
- true,
+ switchover_success,
event_details.data,
&event_info);
+ if (switchover_success == true)
+ {
+ log_notice("%s", event_details.data);
+ }
+ else
+ {
+ log_error("%s", event_details.data);
+ }
termPQExpBuffer(&event_details);
}
@@ -4729,8 +4782,6 @@ do_standby_switchover(void)
clear_node_info_list(&sibling_nodes);
- PQfinish(local_conn);
-
/*
* Clean up remote node (primary demoted to standby). It's possible that the node is
* still starting up, so poll for a while until we get a connection.
@@ -4757,9 +4808,11 @@ do_standby_switchover(void)
/* TODO: double-check whether new standby has attached */
log_warning(_("switchover did not fully complete"));
- log_detail(_("node \"%s\" is now primary but node \"%s\" is not reachable"),
+ log_detail(_("node \"%s\" (ID: %i) is now primary but node \"%s\" (ID: %i) is not reachable"),
local_node_record.node_name,
- remote_node_record.node_name);
+ local_node_record.node_id,
+ remote_node_record.node_name,
+ remote_node_record.node_id);
if (config_file_options.use_replication_slots == true)
{
@@ -4768,22 +4821,47 @@ do_standby_switchover(void)
}
else
{
+ NodeAttached node_attached;
+
+ /*
+ * We were able to connect to the former primary - attempt to drop
+ * this node's former replication slot, if it exists.
+ */
if (config_file_options.use_replication_slots == true)
{
drop_replication_slot_if_exists(remote_conn,
remote_node_record.node_id,
local_node_record.slot_name);
}
- /* TODO warn about any inactive replication slots */
- log_notice(_("switchover was successful"));
- log_detail(_("node \"%s\" is now primary and node \"%s\" is attached as standby"),
- local_node_record.node_name,
- remote_node_record.node_name);
+
+ /*
+ * Do a final check that the standby has connected - it's possible
+ * the standby became reachable but has not connected (or became disconnected).
+ */
+
+ node_attached = is_downstream_node_attached(local_conn,
+ remote_node_record.node_name);
+ if (node_attached == NODE_ATTACHED)
+ {
+ log_notice(_("switchover was successful"));
+ log_detail(_("node \"%s\" is now primary and node \"%s\" is attached as standby"),
+ local_node_record.node_name,
+ remote_node_record.node_name);
+ }
+ else
+ {
+ log_notice(_("switchover is incomplete"));
+ log_detail(_("node \"%s\" is now primary but node \"%s\" is not attached as standby"),
+ local_node_record.node_name,
+ remote_node_record.node_name);
+ switchover_success = false;
+ }
}
PQfinish(remote_conn);
+ PQfinish(local_conn);
/*
* Attempt to unpause all paused repmgrd instances, unless user explicitly
@@ -4878,7 +4956,6 @@ do_standby_switchover(void)
exit(ERR_SWITCHOVER_INCOMPLETE);
}
-
return;
}