diff --git a/HISTORY b/HISTORY
index 30ea7bb1..241d3d0e 100644
--- a/HISTORY
+++ b/HISTORY
@@ -1,5 +1,6 @@
-4.0.4 2018-??-??
+4.0.4 2018-03-??
repmgr: ensure "node rejoin" honours "--dry-run" option; GitHub #383 (Ian)
+ repmgrd: improve detection of status change from primary to standby (Ian)
4.0.3 2018-02-15
repmgr: improve switchover handling when "pg_ctl" used to control the
diff --git a/dbutils.c b/dbutils.c
index 22354997..e6317432 100644
--- a/dbutils.c
+++ b/dbutils.c
@@ -2344,8 +2344,7 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active)
initPQExpBuffer(&query);
- appendPQExpBuffer(
- &query,
+ appendPQExpBuffer(&query,
"UPDATE repmgr.nodes SET active = %s "
" WHERE node_id = %i",
active == true ? "TRUE" : "FALSE",
@@ -2370,6 +2369,40 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active)
}
+bool
+update_node_record_set_active_standby(PGconn *conn, int this_node_id)
+{
+ PQExpBufferData query;
+ PGresult *res = NULL;
+ /* Mark node this_node_id in repmgr.nodes as an active standby; returns true on success, false if the UPDATE fails. */
+ initPQExpBuffer(&query);
+ /* one UPDATE sets both type and active for the row matching node_id */
+ appendPQExpBuffer(&query,
+ "UPDATE repmgr.nodes "
+ " SET type = 'standby', "
+ " active = TRUE "
+ " WHERE node_id = %i",
+ this_node_id);
+ /* emit the generated SQL at LOG_DEBUG level */
+ log_verbose(LOG_DEBUG, "update_node_record_set_active_standby():\n %s", query.data);
+ /* the query text is not needed once PQexec() has returned, so free it before checking the result */
+ res = PQexec(conn, query.data);
+ termPQExpBuffer(&query);
+ /* any status other than PGRES_COMMAND_OK is reported via log_error() and treated as failure */
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ {
+ log_error(_("unable to update node record:\n %s"),
+ PQerrorMessage(conn));
+ PQclear(res);
+ return false;
+ }
+
+ PQclear(res);
+
+ return true;
+}
+
+
bool
update_node_record_set_primary(PGconn *conn, int this_node_id)
{
diff --git a/dbutils.h b/dbutils.h
index 6a9b5a47..ce7eff36 100644
--- a/dbutils.h
+++ b/dbutils.h
@@ -428,6 +428,7 @@ bool truncate_node_records(PGconn *conn);
bool update_node_record_set_active(PGconn *conn, int this_node_id, bool active);
bool update_node_record_set_primary(PGconn *conn, int this_node_id);
+bool update_node_record_set_active_standby(PGconn *conn, int this_node_id);
bool update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id);
bool update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active);
bool update_node_record_conn_priority(PGconn *conn, t_configuration_options *options);
diff --git a/doc/repmgrd-degraded-monitoring.sgml b/doc/repmgrd-degraded-monitoring.sgml
index b2f3ac71..923331ad 100644
--- a/doc/repmgrd-degraded-monitoring.sgml
+++ b/doc/repmgrd-degraded-monitoring.sgml
@@ -40,7 +40,7 @@
- repmgrd is monitoring the primary node, but it is not available
+ repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary)
@@ -69,7 +69,15 @@
By default, repmgrd will continue in degraded monitoring mode indefinitely.
However a timeout (in seconds) can be set with degraded_monitoring_timeout,
after which repmgrd will terminate.
-
+
+
+ If repmgrd is monitoring a primary node which has been stopped
+ and manually restarted as a standby attached to a new primary, it will automatically detect
+ the status change and update the node record to reflect the node's new status
+ as an active standby. It will then resume monitoring the node as a standby.
+
+
+
diff --git a/repmgrd-physical.c b/repmgrd-physical.c
index 8fbb213a..94547621 100644
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
@@ -325,7 +325,6 @@ monitor_streaming_primary(void)
else
{
local_node_info.node_status = NODE_STATUS_UP;
- monitoring_state = MS_NORMAL;
initPQExpBuffer(&event_details);
@@ -353,54 +352,83 @@ monitor_streaming_primary(void)
else
{
RecordStatus record_status;
- int i = 0;
log_debug("primary node id is now %i", primary_node_id);
- /*
- * poll for a while until record type is returned as "STANDBY" - it's possible
- * that there's a gap between the server being restarted and the record
- * being updated
- */
- for (i = 0; i < 30; i++)
- {
- /*
- * try and refresh the local node record from the primary, as the updated
- * local node record may not have been replicated yet
- */
-
- record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
-
- if (record_status == RECORD_FOUND)
- {
- log_debug("type = %s", get_node_type_string(local_node_info.type));
-
- if (local_node_info.type == STANDBY)
- {
- PQfinish(new_primary_conn);
-
- /* XXX add event notification */
- return;
- }
- }
- sleep(1);
- }
-
- PQfinish(new_primary_conn);
+ record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
if (record_status == RECORD_FOUND)
{
- log_warning(_("repmgr node record is still %s"), get_node_type_string(local_node_info.type));
+ bool resume_monitoring = true;
+
+ log_debug("node %i is registered with type = %s",
+ config_file_options.node_id,
+ get_node_type_string(local_node_info.type));
+
+ /*
+ * node has recovered but metadata not updated - we can do that ourselves
+ */
+ if (local_node_info.type == PRIMARY)
+ {
+ log_notice(_("node \"%s\" (ID: %i) still registered as primary, setting to standby"),
+ config_file_options.node_name,
+ config_file_options.node_id);
+
+ if (update_node_record_set_active_standby(new_primary_conn, config_file_options.node_id) == false)
+ {
+ resume_monitoring = false;
+ }
+ else
+ {
+ record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
+
+ if (record_status != RECORD_FOUND)
+ {
+ resume_monitoring = false;
+ }
+ }
+ }
+
+ if (resume_monitoring == true)
+ {
+ monitoring_state = MS_NORMAL;
+
+ appendPQExpBuffer(&event_details,
+ _("former primary has been restored as standby after %i seconds, updating node record and resuming monitoring"),
+ degraded_monitoring_elapsed);
+
+ create_event_notification(new_primary_conn,
+ &config_file_options,
+ config_file_options.node_id,
+ "repmgrd_standby_reconnect",
+ true,
+ event_details.data);
+ log_notice("%s", event_details.data);
+ termPQExpBuffer(&event_details);
+
+ PQfinish(new_primary_conn);
+
+ /* restart monitoring as standby */
+ return;
+ }
}
- else
+ else if (record_status == RECORD_NOT_FOUND)
{
- log_error(_("no metadata record found for this node"));
+ log_error(_("no metadata record found for this node on current primary %i"), primary_node_id);
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
+
+ PQfinish(new_primary_conn);
+
+ /* TODO: add event notification */
+ terminate(ERR_BAD_CONFIG);
}
+
}
}
else
{
+ monitoring_state = MS_NORMAL;
+
appendPQExpBuffer(&event_details,
_("reconnected to primary node after %i seconds, resuming monitoring"),
degraded_monitoring_elapsed);