diff --git a/HISTORY b/HISTORY index 30ea7bb1..241d3d0e 100644 --- a/HISTORY +++ b/HISTORY @@ -1,5 +1,6 @@ -4.0.4 2018-??-?? +4.0.4 2018-03-?? repmgr: ensure "node rejoin" honours "--dry-run" option; GitHub #383 (Ian) + repmgrd: improve detection of status change from primary to standby (Ian) 4.0.3 2018-02-15 repmgr: improve switchover handling when "pg_ctl" used to control the diff --git a/dbutils.c b/dbutils.c index 22354997..e6317432 100644 --- a/dbutils.c +++ b/dbutils.c @@ -2344,8 +2344,7 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active) initPQExpBuffer(&query); - appendPQExpBuffer( - &query, + appendPQExpBuffer(&query, "UPDATE repmgr.nodes SET active = %s " " WHERE node_id = %i", active == true ? "TRUE" : "FALSE", @@ -2370,6 +2369,40 @@ update_node_record_set_active(PGconn *conn, int this_node_id, bool active) } +bool +update_node_record_set_active_standby(PGconn *conn, int this_node_id) +{ + PQExpBufferData query; + PGresult *res = NULL; + + initPQExpBuffer(&query); + + appendPQExpBuffer(&query, + "UPDATE repmgr.nodes " + " SET type = 'standby', " + " active = TRUE " + " WHERE node_id = %i", + this_node_id); + + log_verbose(LOG_DEBUG, "update_node_record_set_active_standby():\n %s", query.data); + + res = PQexec(conn, query.data); + termPQExpBuffer(&query); + + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + log_error(_("unable to update node record:\n %s"), + PQerrorMessage(conn)); + PQclear(res); + return false; + } + + PQclear(res); + + return true; +} + + bool update_node_record_set_primary(PGconn *conn, int this_node_id) { diff --git a/dbutils.h b/dbutils.h index 6a9b5a47..ce7eff36 100644 --- a/dbutils.h +++ b/dbutils.h @@ -428,6 +428,7 @@ bool truncate_node_records(PGconn *conn); bool update_node_record_set_active(PGconn *conn, int this_node_id, bool active); bool update_node_record_set_primary(PGconn *conn, int this_node_id); +bool update_node_record_set_active_standby(PGconn *conn, int this_node_id); bool 
update_node_record_set_upstream(PGconn *conn, int this_node_id, int new_upstream_node_id); bool update_node_record_status(PGconn *conn, int this_node_id, char *type, int upstream_node_id, bool active); bool update_node_record_conn_priority(PGconn *conn, t_configuration_options *options); diff --git a/doc/repmgrd-degraded-monitoring.sgml b/doc/repmgrd-degraded-monitoring.sgml index b2f3ac71..923331ad 100644 --- a/doc/repmgrd-degraded-monitoring.sgml +++ b/doc/repmgrd-degraded-monitoring.sgml @@ -40,7 +40,7 @@ - repmgrd is monitoring the primary node, but it is not available + repmgrd is monitoring the primary node, but it is not available (and no other node has been promoted as primary) @@ -69,7 +69,15 @@ By default, repmgrd will continue in degraded monitoring mode indefinitely. However a timeout (in seconds) can be set with degraded_monitoring_timeout, after which repmgrd will terminate. - + + + If repmgrd is monitoring a primary node which has been stopped + and manually restarted as a standby attached to a new primary, it will automatically detect + the status change and update the node record to reflect the node's new status + as an active standby. It will then resume monitoring the node as a standby. 
+ + + diff --git a/repmgrd-physical.c b/repmgrd-physical.c index 8fbb213a..94547621 100644 --- a/repmgrd-physical.c +++ b/repmgrd-physical.c @@ -325,7 +325,6 @@ monitor_streaming_primary(void) else { local_node_info.node_status = NODE_STATUS_UP; - monitoring_state = MS_NORMAL; initPQExpBuffer(&event_details); @@ -353,54 +352,83 @@ monitor_streaming_primary(void) else { RecordStatus record_status; - int i = 0; log_debug("primary node id is now %i", primary_node_id); - /* - * poll for a while until record type is returned as "STANDBY" - it's possible - * that there's a gap between the server being restarted and the record - * being updated - */ - for (i = 0; i < 30; i++) - { - /* - * try and refresh the local node record from the primary, as the updated - * local node record may not have been replicated yet - */ - - record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info); - - if (record_status == RECORD_FOUND) - { - log_debug("type = %s", get_node_type_string(local_node_info.type)); - - if (local_node_info.type == STANDBY) - { - PQfinish(new_primary_conn); - - /* XXX add event notification */ - return; - } - } - sleep(1); - } - - PQfinish(new_primary_conn); + record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info); if (record_status == RECORD_FOUND) { - log_warning(_("repmgr node record is still %s"), get_node_type_string(local_node_info.type)); + bool resume_monitoring = true; + + log_debug("node %i is registered with type = %s", + config_file_options.node_id, + get_node_type_string(local_node_info.type)); + + /* + * node has recovered but metadata not updated - we can do that ourselves, + */ + if (local_node_info.type == PRIMARY) + { + log_notice(_("node \"%s\" (ID: %i) still registered as primary, setting to standby"), + config_file_options.node_name, + config_file_options.node_id); + + if (update_node_record_set_active_standby(new_primary_conn, config_file_options.node_id) == false) 
+ { + resume_monitoring = false; + } + else + { + record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info); + + if (record_status != RECORD_FOUND) + { + resume_monitoring = false; + } + } + } + + if (resume_monitoring == true) + { + monitoring_state = MS_NORMAL; + + appendPQExpBuffer(&event_details, + _("former primary has been restored as standby after %i seconds, updating node record and resuming monitoring"), + degraded_monitoring_elapsed); + + create_event_notification(new_primary_conn, + &config_file_options, + config_file_options.node_id, + "repmgrd_standby_reconnect", + true, + event_details.data); + log_notice("%s", event_details.data); + termPQExpBuffer(&event_details); + + PQfinish(new_primary_conn); + + /* restart monitoring as standby */ + return; + } } - else + else if (record_status == RECORD_NOT_FOUND) { - log_error(_("no metadata record found for this node")); + log_error(_("no metadata record found for this node on current primary %i"), primary_node_id); log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node")); + + PQfinish(new_primary_conn); + + /* add event notification */ + terminate(ERR_BAD_CONFIG); } + } } else { + monitoring_state = MS_NORMAL; + appendPQExpBuffer(&event_details, _("reconnected to primary node after %i seconds, resuming monitoring"), degraded_monitoring_elapsed);