Bump version number

4.3
Fix default return value in alter_system_int()
2026-03-23 23:26:30 +00:00 · 2019-04-01 15:25:48 +09:00 · 2019-04-01 14:52:37 +09:00 · 2019-04-01 12:24:57 +09:00 · 2019-04-01 11:29:16 +09:00 · 2019-04-01 11:03:47 +09:00
33 changed files with 1328 additions and 698 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,9 @@ lib*.pc
 # other
 /.lineno
 *.dSYM
+*.orig
+*.rej
+
 # generated binaries
 repmgr
 repmgrd
--- a/4
+++ b/4
@@ -15,6 +15,8 @@
        repmgr: add sanity check for correct extension version (Ian)
        repmgr: ensure "witness register --dry-run" does not attempt to read node
          tables if repmgr extension not installed; GitHub #513 (Ian)
+        repmgr: ensure "standby register" fails when --upstream-node-id is the
+          same as the local node ID (Ian)
        repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
        repmgrd: on a cascaded standby, don't fail over if "failover=manual";
          GitHub #531 (Ian)
@@ -22,6 +24,8 @@
          candidates (Ian)
        repmgrd: add option "connection_check_type" (Ian)
        repmgrd: improve witness monitoring when primary node not available (Ian)
+		repmgrd: handle situation where a primary has unexpectedly appeared
+		  during failover; GitHub #420 (Ian)

 4.2     2018-10-24
        repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover";
--- a/configfile.c
+++ b/configfile.c
@@ -484,7 +484,14 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 			node_id_found = true;
 		}
 		else if (strcmp(name, "node_name") == 0)
-			strncpy(options->node_name, value, MAXLEN);
+		{
+			if (strlen(value) < sizeof(options->node_name))
+				strncpy(options->node_name, value, sizeof(options->node_name));
+			else
+				item_list_append_format(error_list,
+										_("value for \"node_name\" must contain fewer than %lu characters"),
+										sizeof(options->node_name));
+		}
 		else if (strcmp(name, "conninfo") == 0)
 			strncpy(options->conninfo, value, MAXLEN);
 		else if (strcmp(name, "data_directory") == 0)
@@ -494,11 +501,12 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *

 		else if (strcmp(name, "replication_user") == 0)
 		{
-			if (strlen(value) < NAMEDATALEN)
-				strncpy(options->replication_user, value, NAMEDATALEN);
+			if (strlen(value) < sizeof(options->replication_user))
+				strncpy(options->replication_user, value, sizeof(options->replication_user));
 			else
-				item_list_append(error_list,
-								 _("value for \"replication_user\" must contain fewer than " STR(NAMEDATALEN) " characters"));
+				item_list_append_format(error_list,
+										_("value for \"replication_user\" must contain fewer than %lu characters"),
+										sizeof(options->replication_user));
 		}
 		else if (strcmp(name, "pg_bindir") == 0)
 			strncpy(options->pg_bindir, value, MAXPGPATH);
@@ -645,7 +653,7 @@ _parse_config(t_configuration_options *options, ItemList *error_list, ItemList *
 			else
 			{
 				item_list_append(error_list,
-								 _("value for \"connection_check_type\" must be \"ping\" or \"connection\"\n"));
+								 _("value for \"connection_check_type\" must be \"ping\", \"connection\" or \"query\"\n"));
 			}
 		}
 		else if (strcmp(name, "primary_visibility_consensus") == 0)
@@ -1196,7 +1204,7 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
 		return false;
 	}

-	if (strncmp(new_options.node_name, orig_options->node_name, MAXLEN) != 0)
+	if (strncmp(new_options.node_name, orig_options->node_name, sizeof(orig_options->node_name)) != 0)
 	{
 		log_warning(_("\"node_name\" cannot be changed, keeping current configuration"));
 		return false;
@@ -1398,7 +1406,7 @@ reload_config(t_configuration_options *orig_options, t_server_type server_type)
 	{
 		orig_options->connection_check_type = new_options.connection_check_type;
 		log_info(_("\"connection_check_type\" is now \"%s\""),
-				 new_options.connection_check_type == CHECK_PING ? "ping" : "connection");
+				 print_connection_check_type(new_options.connection_check_type));
 		config_changed = true;
 	}

@@ -2017,3 +2025,21 @@ parse_pg_basebackup_options(const char *pg_basebackup_options, t_basebackup_opti

 	return backup_options_ok;
 }
+
+
+const char *
+print_connection_check_type(ConnectionCheckType type)
+{
+	switch (type)
+	{
+		case CHECK_PING:
+			return "ping";
+		case CHECK_QUERY:
+			return "query";
+		case CHECK_CONNECTION:
+			return "connection";
+	}
+
+	/* should never reach here */
+	return "UNKNOWN";
+}
--- a/configfile.h
+++ b/configfile.h
@@ -76,7 +76,7 @@ typedef struct
 {
 	/* node information */
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	char		conninfo[MAXLEN];
 	char		replication_user[NAMEDATALEN];
 	char		data_directory[MAXPGPATH];
@@ -329,5 +329,6 @@ void free_parsed_argv(char ***argv_array);
 /* called by repmgr-client and repmgrd */
 void		exit_with_cli_errors(ItemList *error_list, const char *repmgr_command);
 void		print_item_list(ItemList *item_list);
+const char *print_connection_check_type(ConnectionCheckType type);

 #endif							/* _REPMGR_CONFIGFILE_H_ */
--- a/controldata.c
+++ b/controldata.c
@@ -301,6 +301,8 @@ get_controlfile(const char *DataDir)
 					ControlFilePath);
 		log_detail("%s", strerror(errno));

+		close(fd);
+
 		return control_file_info;
 	}

--- a/dbutils.c
+++ b/dbutils.c
@@ -43,6 +43,8 @@ int			bdr_version_num = UNKNOWN_BDR_VERSION_NUM;
 static void log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
 __attribute__((format(PG_PRINTF_ATTRIBUTE, 3, 4)));

+static bool _is_server_available(const char *conninfo, bool quiet);
+
 static PGconn *_establish_db_connection(const char *conninfo,
 						 const bool exit_on_error,
 						 const bool log_notice,
@@ -67,16 +69,19 @@ void
 log_db_error(PGconn *conn, const char *query_text, const char *fmt,...)
 {
 	va_list		ap;
+	char		buf[MAXLEN];
+	int			retval;

 	va_start(ap, fmt);
-
-	log_error(fmt, ap);
-
+	retval = vsnprintf(buf, MAXLEN, fmt, ap);
 	va_end(ap);

-	if (conn != NULL && PQstatus(conn) == CONNECTION_OK)
+	if (retval < MAXLEN)
+		log_error("%s", buf);
+
+	if (conn != NULL)
 	{
-		log_detail("%s", PQerrorMessage(conn));
+		log_detail("\n%s", PQerrorMessage(conn));
 	}

 	if (query_text != NULL)
@@ -190,13 +195,13 @@ _establish_db_connection(const char *conninfo, const bool exit_on_error, const b
 		{
 			if (log_notice)
 			{
-				log_notice(_("connection to database failed:\n  %s"),
-						   PQerrorMessage(conn));
+				log_notice(_("connection to database failed"));
+				log_detail("\n%s", PQerrorMessage(conn));
 			}
 			else
 			{
-				log_error(_("connection to database failed:\n  %s"),
-						  PQerrorMessage(conn));
+				log_error(_("connection to database failed"));
+				log_detail("\n%s", PQerrorMessage(conn));
 			}
 			log_detail(_("attempted to connect using:\n  %s"),
 					   connection_string);
@@ -287,8 +292,9 @@ establish_db_connection_by_params(t_conninfo_param_list *param_list,
 	/* Check to see that the backend connection was successfully made */
 	if ((PQstatus(conn) != CONNECTION_OK))
 	{
-		log_error(_("connection to database failed:\n	%s"),
-				  PQerrorMessage(conn));
+		log_error(_("connection to database failed"));
+		log_detail("\n%s", PQerrorMessage(conn));
+
 		if (exit_on_error)
 		{
 			PQfinish(conn);
@@ -338,7 +344,9 @@ is_superuser_connection(PGconn *conn, t_connection_user *userinfo)

 	if (userinfo != NULL)
 	{
-		strncpy(userinfo->username, current_user, MAXLEN);
+		snprintf(userinfo->username,
+				 sizeof(userinfo->username),
+				 "%s", current_user);
 		userinfo->is_superuser = is_superuser;
 	}

@@ -1073,7 +1081,7 @@ get_pg_setting(PGconn *conn, const char *setting, char *output)
 	{
 		if (strcmp(PQgetvalue(res, i, 0), setting) == 0)
 		{
-			strncpy(output, PQgetvalue(res, i, 1), MAXLEN);
+			snprintf(output, MAXLEN, "%s", PQgetvalue(res, i, 1));
 			success = true;
 			break;
 		}
@@ -1101,7 +1109,7 @@ alter_system_int(PGconn *conn, const char *name, int value)
 {
 	PQExpBufferData query;
 	PGresult   *res = NULL;
-	bool		success = false;
+	bool		success = true;

 	initPQExpBuffer(&query);
 	appendPQExpBuffer(&query,
@@ -1117,7 +1125,6 @@ alter_system_int(PGconn *conn, const char *name, int value)
 		success = false;
 	}

-
 	termPQExpBuffer(&query);
 	PQclear(res);

@@ -1174,7 +1181,7 @@ get_cluster_size(PGconn *conn, char *size)
 	}
 	else
 	{
-		strncpy(size, PQgetvalue(res, 0, 0), MAXLEN);
+		snprintf(size, MAXLEN, "%s", PQgetvalue(res, 0, 0));
 	}

 	termPQExpBuffer(&query);
@@ -1222,7 +1229,7 @@ get_server_version(PGconn *conn, char *server_version_buf)
 		 * first space.
 		 */

-		strncpy(_server_version_buf, PQgetvalue(res, 0, 1), MAXVERSIONSTR);
+		snprintf(_server_version_buf, MAXVERSIONSTR, "%s", PQgetvalue(res, 0, 1));

 		for (i = 0; i < MAXVERSIONSTR; i++)
 		{
@@ -1349,7 +1356,8 @@ _get_primary_connection(PGconn *conn,

 		/* initialize with the values of the current node being processed */
 		node_id = atoi(PQgetvalue(res, i, 0));
-		strncpy(remote_conninfo, PQgetvalue(res, i, 1), MAXCONNINFO);
+		snprintf(remote_conninfo, MAXCONNINFO, "%s", PQgetvalue(res, i, 1));
+
 		log_verbose(LOG_INFO,
 					_("checking if node %i is primary"),
 					node_id);
@@ -1994,9 +2002,13 @@ get_repmgr_extension_status(PGconn *conn, t_extension_versions *extversions)
 		/* caller wants to know which versions are installed/available */
 		if (extversions != NULL)
 		{
-			strncpy(extversions->default_version, PQgetvalue(res, 0, 2), 7);
+			snprintf(extversions->default_version,
+					 sizeof(extversions->default_version),
+					 "%s", PQgetvalue(res, 0, 2));
 			extversions->default_version_num = available_version;
-			strncpy(extversions->installed_version, PQgetvalue(res, 0, 4), 7);
+			snprintf(extversions->installed_version,
+					 sizeof(extversions->installed_version),
+					 "%s", PQgetvalue(res, 0, 4));
 			extversions->installed_version_num = installed_version;
 		}

@@ -2197,17 +2209,17 @@ _populate_node_record(PGresult *res, t_node_info *node_info, int row, bool init_
 		node_info->upstream_node_id = atoi(PQgetvalue(res, row, 2));
 	}

-	strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN);
-	strncpy(node_info->conninfo, PQgetvalue(res, row, 4), MAXLEN);
-	strncpy(node_info->repluser, PQgetvalue(res, row, 5), NAMEDATALEN);
-	strncpy(node_info->slot_name, PQgetvalue(res, row, 6), MAXLEN);
-	strncpy(node_info->location, PQgetvalue(res, row, 7), MAXLEN);
+	snprintf(node_info->node_name, sizeof(node_info->node_name), "%s", PQgetvalue(res, row, 3));
+	snprintf(node_info->conninfo, sizeof(node_info->conninfo), "%s", PQgetvalue(res, row, 4));
+	snprintf(node_info->repluser, sizeof(node_info->repluser), "%s", PQgetvalue(res, row, 5));
+	snprintf(node_info->slot_name, sizeof(node_info->slot_name), "%s", PQgetvalue(res, row, 6));
+	snprintf(node_info->location, sizeof(node_info->location), "%s", PQgetvalue(res, row, 7));
 	node_info->priority = atoi(PQgetvalue(res, row, 8));
 	node_info->active = atobool(PQgetvalue(res, row, 9));
-	strncpy(node_info->config_file, PQgetvalue(res, row, 10), MAXPGPATH);
+	snprintf(node_info->config_file, sizeof(node_info->config_file), "%s", PQgetvalue(res, row, 10));

 	/* This won't normally be set */
-	strncpy(node_info->upstream_node_name, PQgetvalue(res, row, 11), MAXLEN);
+	snprintf(node_info->upstream_node_name, sizeof(node_info->upstream_node_name), "%s", PQgetvalue(res, row, 11));

 	/* Set remaining struct fields with default values */

@@ -3461,11 +3473,15 @@ config_file_list_add(t_configfile_list *list, const char *file, const char *file
 	}


-	strncpy(list->files[list->entries]->filepath, file, MAXPGPATH);
+	snprintf(list->files[list->entries]->filepath,
+			 sizeof(list->files[list->entries]->filepath),
+			 "%s", file);
 	canonicalize_path(list->files[list->entries]->filepath);

+	snprintf(list->files[list->entries]->filename,
+			 sizeof(list->files[list->entries]->filename),
+			 "%s", filename);

-	strncpy(list->files[list->entries]->filename, filename, MAXPGPATH);
 	list->files[list->entries]->in_data_directory = in_data_dir;

 	list->entries++;
@@ -3545,9 +3561,10 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
 	log_verbose(LOG_DEBUG, "_create_event(): event is \"%s\" for node %i", event, node_id);

 	/*
-	 * Only attempt to write a record if a connection handle was provided.
+	 * Only attempt to write a record if a connection handle was provided,
+	 * and the connection handle points to a node which is not in recovery.
 	 */
-	if (conn != NULL && PQstatus(conn) == CONNECTION_OK)
+	if (conn != NULL && PQstatus(conn) == CONNECTION_OK && get_recovery_type(conn) == RECTYPE_PRIMARY)
 	{
 		int			n_node_id = htonl(node_id);
 		char	   *t_successful = successful ? "TRUE" : "FALSE";
@@ -3601,7 +3618,7 @@ _create_event(PGconn *conn, t_configuration_options *options, int node_id, char
 		else
 		{
 			/* Store timestamp to send to the notification command */
-			strncpy(event_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
+			snprintf(event_timestamp, MAXLEN, "%s", PQgetvalue(res, 0, 0));
 		}

 		termPQExpBuffer(&query);
@@ -4036,8 +4053,12 @@ get_slot_record(PGconn *conn, char *slot_name, t_replication_slot *record)
 	}
 	else
 	{
-		strncpy(record->slot_name, PQgetvalue(res, 0, 0), MAXLEN);
-		strncpy(record->slot_type, PQgetvalue(res, 0, 1), MAXLEN);
+		snprintf(record->slot_name,
+				 sizeof(record->slot_name),
+				 "%s", PQgetvalue(res, 0, 0));
+		snprintf(record->slot_type,
+				 sizeof(record->slot_type),
+				 "%s", PQgetvalue(res, 0, 1));
 		record->active = atobool(PQgetvalue(res, 0, 2));
 	}

@@ -4168,7 +4189,8 @@ get_tablespace_name_by_location(PGconn *conn, const char *location, char *name)
 	}
 	else
 	{
-		strncpy(name, PQgetvalue(res, 0, 0), MAXLEN);
+		snprintf(name, MAXLEN,
+				 "%s", PQgetvalue(res, 0, 0));
 	}

 	termPQExpBuffer(&query);
@@ -4202,7 +4224,7 @@ cancel_query(PGconn *conn, int timeout)
 	if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
 	{
 		log_warning(_("unable to cancel current query"));
-		log_detail("%s", errbuf);
+		log_detail("\n%s", errbuf);
 		PQfreeCancel(pgcancel);
 		return false;
 	}
@@ -4232,7 +4254,7 @@ wait_connection_availability(PGconn *conn, int timeout)
 	long long	timeout_ms;

 	/* calculate timeout in microseconds */
-	timeout_ms = timeout * 1000000;
+	timeout_ms = (long long) timeout * 1000000;

 	while (timeout_ms > 0)
 	{
@@ -4291,13 +4313,33 @@ wait_connection_availability(PGconn *conn, int timeout)

 bool
 is_server_available(const char *conninfo)
+{
+	return _is_server_available(conninfo, false);
+}
+
+
+bool
+is_server_available_quiet(const char *conninfo)
+{
+	return _is_server_available(conninfo, true);
+}
+
+
+static bool
+_is_server_available(const char *conninfo, bool quiet)
 {
 	PGPing		status = PQping(conninfo);

-	log_verbose(LOG_DEBUG, "is_server_available(): ping status for %s is %i", conninfo, (int)status);
+	log_verbose(LOG_DEBUG, "is_server_available(): ping status for \"%s\" is %s", conninfo, print_pqping_status(status));
 	if (status == PQPING_OK)
 		return true;

+	if (quiet == false)
+	{
+		log_warning(_("unable to ping \"%s\""), conninfo);
+		log_detail(_("PQping() returned \"%s\""), print_pqping_status(status));
+	}
+
 	return false;
 }

@@ -4310,10 +4352,17 @@ is_server_available_params(t_conninfo_param_list *param_list)
 									  false);

 	/* deparsing the param_list adds overhead, so only do it if needed  */
-	if (log_level == LOG_DEBUG)
+	if (log_level == LOG_DEBUG || status != PQPING_OK)
 	{
 		char *conninfo_str = param_list_to_string(param_list);
-		log_verbose(LOG_DEBUG, "is_server_available_params(): ping status for %s is %i", conninfo_str, (int)status);
+		log_verbose(LOG_DEBUG, "is_server_available_params(): ping status for \"%s\" is %s", conninfo_str, print_pqping_status(status));
+
+		if (status != PQPING_OK)
+		{
+			log_warning(_("unable to ping \"%s\""), conninfo_str);
+			log_detail(_("PQping() returned \"%s\""), print_pqping_status(status));
+		}
+
 		pfree(conninfo_str);
 	}

@@ -4351,7 +4400,7 @@ connection_ping_reconnect(PGconn *conn)
 	if (PQstatus(conn) != CONNECTION_OK)
 	{
 		log_warning(_("connection error, attempting to reset"));
-		log_detail("%s", PQerrorMessage(conn));
+		log_detail("\n%s", PQerrorMessage(conn));
 		PQreset(conn);
 		ping_result = connection_ping(conn);
 	}
@@ -4883,6 +4932,7 @@ void
 init_replication_info(ReplInfo *replication_info)
 {
 	memset(replication_info->current_timestamp, 0, sizeof(replication_info->current_timestamp));
+	replication_info->in_recovery = false;
 	replication_info->last_wal_receive_lsn = InvalidXLogRecPtr;
 	replication_info->last_wal_replay_lsn = InvalidXLogRecPtr;
 	memset(replication_info->last_xact_replay_timestamp, 0, sizeof(replication_info->last_xact_replay_timestamp));
@@ -4903,6 +4953,7 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
 	initPQExpBuffer(&query);
 	appendPQExpBufferStr(&query,
 						 " SELECT ts, "
+						 "        in_recovery, "
 						 "        last_wal_receive_lsn, "
 						 "        last_wal_replay_lsn, "
 						 "        last_xact_replay_timestamp, "
@@ -4920,6 +4971,7 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
 						 "        upstream_last_seen "
 						 "   FROM ( "
 						 " SELECT CURRENT_TIMESTAMP AS ts, "
+						 "        pg_catalog.pg_is_in_recovery() AS in_recovery, "
 						 "        pg_catalog.pg_last_xact_replay_timestamp() AS last_xact_replay_timestamp, ");


@@ -4985,14 +5037,19 @@ get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replicatio
 	}
 	else
 	{
-		strncpy(replication_info->current_timestamp, PQgetvalue(res, 0, 0), MAXLEN);
-		replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 1));
-		replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 2));
-		strncpy(replication_info->last_xact_replay_timestamp, PQgetvalue(res, 0, 3), MAXLEN);
-		replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 4));
-		replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 5));
-		replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 6));
-		replication_info->upstream_last_seen = atoi(PQgetvalue(res, 0, 7));
+		snprintf(replication_info->current_timestamp,
+				 sizeof(replication_info->current_timestamp),
+				 "%s", PQgetvalue(res, 0, 0));
+		replication_info->in_recovery = atobool(PQgetvalue(res, 0, 1));
+		replication_info->last_wal_receive_lsn = parse_lsn(PQgetvalue(res, 0, 2));
+		replication_info->last_wal_replay_lsn = parse_lsn(PQgetvalue(res, 0, 3));
+		snprintf(replication_info->last_xact_replay_timestamp,
+				 sizeof(replication_info->last_xact_replay_timestamp),
+				 "%s", PQgetvalue(res, 0, 4));
+		replication_info->replication_lag_time = atoi(PQgetvalue(res, 0, 5));
+		replication_info->receiving_streamed_wal = atobool(PQgetvalue(res, 0, 6));
+		replication_info->wal_replay_paused = atobool(PQgetvalue(res, 0, 7));
+		replication_info->upstream_last_seen = atoi(PQgetvalue(res, 0, 8));
 	}

 	termPQExpBuffer(&query);
@@ -5038,13 +5095,12 @@ get_replication_lag_seconds(PGconn *conn)
 		log_warning("%s", PQerrorMessage(conn));
 		PQclear(res);

-		/* XXX magic number */
-		return -1;
+		return UNKNOWN_REPLICATION_LAG;
 	}

 	if (!PQntuples(res))
 	{
-		return -1;
+		return UNKNOWN_REPLICATION_LAG;
 	}

 	lag_seconds = atoi(PQgetvalue(res, 0, 0));
@@ -5502,7 +5558,9 @@ get_default_bdr_replication_set(PGconn *conn)
 		/* For BDR2, we use a custom replication set */
 		namelen = strlen(BDR2_REPLICATION_SET_NAME);
 		default_replication_set = pg_malloc0(namelen + 1);
-		strncpy(default_replication_set, BDR2_REPLICATION_SET_NAME, namelen);
+		snprintf(default_replication_set,
+				 namelen + 1,
+				 "%s", BDR2_REPLICATION_SET_NAME);

 		return default_replication_set;
 	}
@@ -5532,7 +5590,9 @@ get_default_bdr_replication_set(PGconn *conn)
 	namelen = strlen(PQgetvalue(res, 0, 0));
 	default_replication_set = pg_malloc0(namelen + 1);

-	strncpy(default_replication_set, PQgetvalue(res, 0, 0), namelen);
+	snprintf(default_replication_set,
+			 namelen + 1,	/* buffer is namelen + 1 bytes; size must include the NUL or the last character is lost */
+			 "%s", PQgetvalue(res, 0, 0));

 	PQclear(res);

@@ -5753,7 +5813,9 @@ get_bdr_other_node_name(PGconn *conn, int node_id, char *node_name)

 	if (PQresultStatus(res) == PGRES_TUPLES_OK)
 	{
-		strncpy(node_name, PQgetvalue(res, 0, 0), MAXLEN);
+		snprintf(node_name,
+				 NAMEDATALEN,
+				 "%s", PQgetvalue(res, 0, 0));
 	}
 	else
 	{
@@ -5936,12 +5998,12 @@ _populate_bdr_node_records(PGresult *res, BdrNodeInfoList *node_list)
 static void
 _populate_bdr_node_record(PGresult *res, t_bdr_node_info *node_info, int row)
 {
-	strncpy(node_info->node_sysid, PQgetvalue(res, row, 0), MAXLEN);
+	snprintf(node_info->node_sysid, sizeof(node_info->node_sysid), "%s", PQgetvalue(res, row, 0));
 	node_info->node_timeline = atoi(PQgetvalue(res, row, 1));
 	node_info->node_dboid = atoi(PQgetvalue(res, row, 2));
-	strncpy(node_info->node_name, PQgetvalue(res, row, 3), MAXLEN);
-	strncpy(node_info->node_local_dsn, PQgetvalue(res, row, 4), MAXLEN);
-	strncpy(node_info->peer_state_name, PQgetvalue(res, row, 5), MAXLEN);
+	snprintf(node_info->node_name, sizeof(node_info->node_name), "%s", PQgetvalue(res, row, 3));
+	snprintf(node_info->node_local_dsn, sizeof(node_info->node_local_dsn), "%s", PQgetvalue(res, row, 4));
+	snprintf(node_info->peer_state_name, sizeof(node_info->peer_state_name), "%s", PQgetvalue(res, row, 5));
 }


--- a/dbutils.h
+++ b/dbutils.h
@@ -134,8 +134,8 @@ typedef struct s_node_info
 	int			node_id;
 	int			upstream_node_id;
 	t_server_type type;
-	char		node_name[MAXLEN];
-	char		upstream_node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
+	char		upstream_node_name[NAMEDATALEN];
 	char		conninfo[MAXLEN];
 	char		repluser[NAMEDATALEN];
 	char		location[MAXLEN];
@@ -302,6 +302,7 @@ typedef struct BdrNodeInfoList
 typedef struct
 {
 	char		current_timestamp[MAXLEN];
+	bool		in_recovery;
 	XLogRecPtr	last_wal_receive_lsn;
 	XLogRecPtr	last_wal_replay_lsn;
 	char		last_xact_replay_timestamp[MAXLEN];
@@ -517,6 +518,7 @@ int			wait_connection_availability(PGconn *conn, int timeout);

 /* node availability functions */
 bool		is_server_available(const char *conninfo);
+bool		is_server_available_quiet(const char *conninfo);
 bool		is_server_available_params(t_conninfo_param_list *param_list);
 ExecStatusType	connection_ping(PGconn *conn);
 ExecStatusType	connection_ping_reconnect(PGconn *conn);
--- a/dirutil.c
+++ b/dirutil.c
@@ -276,6 +276,8 @@ is_pg_running(const char *path)
 			log_warning(_("invalid data in PostgreSQL PID file \"%s\""), path);
 		}

+		fclose(pidf);
+
 		return PG_DIR_NOT_RUNNING;
 	}

--- a/doc/appendix-release-notes.sgml
+++ b/doc/appendix-release-notes.sgml
@@ -152,13 +152,13 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
          <listitem>
            <para>
              <application>repmgrd</application> will no longer consider nodes where <application>repmgrd</application>
-	      is not running as promotion candidates.
+			  is not running as promotion candidates.
+            </para>
+            <para>
+              Previously, if <application>repmgrd</application> was not running on a node, but
+              that node qualified as the promotion candidate, it would never be promoted due to
+              the absence of a running <application>repmgrd</application>.
            </para>
-	    <para>
-	      Previously, if <application>repmgrd</application> was not running on a node, but
-	      that node qualified as the promotion candidate, it would never be promoted due to
-	      the absence of a running <application>repmgrd</application>.
-	    </para>
          </listitem>

          <listitem>
@@ -188,6 +188,14 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
            </para>
          </listitem>

+         <listitem>
+            <para>
+			  In a failover situation, <application>repmgrd</application> will not attempt to promote a
+			  node if another primary has already appeared (e.g. by being promoted manually).
+			  GitHub #420.
+			</para>
+          </listitem>
+
 		</itemizedlist>
 	  </para>
 	</sect2>
@@ -208,7 +216,7 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
          <listitem>
            <para>
              &repmgr;: when executing <command><link linkend="repmgr-witness-register">repmgr witness register</link></command>,
-              chech the node to connected is actually the primary (i.e. not the witness server).  GitHub #528.
+              check the node being connected to is actually the primary (i.e. not the witness server).  GitHub #528.
            </para>
          </listitem>

@@ -246,25 +254,32 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
          <listitem>
            <para>
              <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command>:
-	      fix display of node IDs with multiple digits.
+	          fix display of node IDs with multiple digits.
            </para>
-	  </listitem>
+	      </listitem>

          <listitem>
            <para>
              ensure <command><link linkend="repmgr-primary-unregister">repmgr primary unregister</link></command>
-	      behaves correctly when executed on a witness server. GitHub #548.
+	          behaves correctly when executed on a witness server. GitHub #548.
            </para>
-	  </listitem>
+	      </listitem>
+
+          <listitem>
+            <para>
+              ensure <command><link linkend="repmgr-standby-register">repmgr standby register</link></command>
+	          fails when <option>--upstream-node-id</option> is the same as the local node ID.
+            </para>
+	      </listitem>

          <listitem>
            <para>
              <command><link linkend="repmgr-node-check">repmgr node check</link></command>
-	      will only consider physical replication slots, as the purpose
-	      of slot checks is to warn about potential issues with
-	      streaming replication standbys which are no longer attached.
-	    </para>
-	  </listitem>
+	          will only consider physical replication slots, as the purpose
+	          of slot checks is to warn about potential issues with
+	          streaming replication standbys which are no longer attached.
+	        </para>
+	      </listitem>

        </itemizedlist>
      </para>
--- a/doc/configuration-file-required-settings.sgml
+++ b/doc/configuration-file-required-settings.sgml
@@ -39,6 +39,10 @@
       called <varname>standby1</varname> (for example), things will be confusing
       to say the least.
     </para>
+     <para>
+       The string's maximum length is 63 characters and it should
+       contain only printable ASCII characters.
+     </para>
    </listitem>
   </varlistentry>

--- a/doc/configuring-witness-server.sgml
+++ b/doc/configuring-witness-server.sgml
@@ -1,93 +0,0 @@
-<chapter id="using-witness-server">
- <indexterm>
-  <primary>witness server</primary>
- </indexterm>
-
-
- <title>Using a witness server</title>
- <para>
-   A <xref linkend="witness-server"> is a normal PostgreSQL instance which
-   is not part of the streaming replication cluster; its purpose is, if a
-   failover situation occurs, to provide proof that it is the primary server
-   itself which is unavailable, rather than e.g. a network split between
-   different physical locations.
- </para>
-
- <para>
-   A typical use case for a witness server is a two-node streaming replication
-   setup, where the primary and standby are in different locations (data centres).
-   By creating a witness server in the same location (data centre) as the primary,
-   if the primary becomes unavailable it's possible for the standby to decide whether
-   it can promote itself without risking a "split brain" scenario: if it can't see either the
-   witness or the primary server, it's likely there's a network-level interruption
-   and it should not promote itself. If it can see the witness but not the primary,
-   this proves there is no network interruption and the primary itself is unavailable,
-   and it can therefore promote itself (and ideally take action to fence the
-   former primary).
- </para>
- <note>
-   <para>
-     <emphasis>Never</emphasis> install a witness server on the same physical host
-     as another node in the replication cluster managed by &repmgr; - it's essential
-     the witness is not affected in any way by failure of another node.
-   </para>
- </note>
- <para>
-   For more complex replication scenarios,e.g. with multiple datacentres, it may
-   be preferable to use location-based failover, which ensures that only nodes
-   in the same location as the primary will ever be promotion candidates;
-   see <xref linkend="repmgrd-network-split"> for more details.
- </para>
-
- <note>
-   <simpara>
-     A witness server will only be useful if <application>repmgrd</application>
-     is in use.
-   </simpara>
- </note>
-
- <sect1 id="creating-witness-server">
-   <title>Creating a witness server</title>
- <para>
-   To create a witness server, set up a normal PostgreSQL instance on a server
-   in the same physical location as the cluster's primary server.
- </para>
- <para>
-   This instance should <emphasis>not</emphasis> be on the same physical host as the primary server,
-   as otherwise if the primary server fails due to hardware issues, the witness
-   server will be lost too.
- </para>
- <note>
-   <simpara>
-     &repmgr; 3.3 and earlier provided a <command>repmgr create witness</command>
-     command, which would automatically create a PostgreSQL instance. However
-     this often resulted in an unsatisfactory, hard-to-customise instance.
-   </simpara>
- </note>
- <para>
-   The witness server should be configured in the same way as a normal
-   &repmgr; node; see section <xref linkend="configuration">.
- </para>
- <para>
-   Register the witness server with <xref linkend="repmgr-witness-register">.
-   This will create the &repmgr; extension on the witness server, and make
-   a copy of the &repmgr; metadata.
- </para>
- <note>
-   <simpara>
-    As the witness server is not part of the replication cluster, further
-    changes to the &repmgr; metadata will be synchronised by
-    <application>repmgrd</application>.
-   </simpara>
- </note>
- <para>
-   Once the witness server has been configured, <application>repmgrd</application>
-   should be started; for more details see <xref linkend="repmgrd-witness-server">.
- </para>
-
- <para>
-  To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
- </para>
-
- </sect1>
-</chapter>
--- a/doc/filelist.sgml
+++ b/doc/filelist.sgml
@@ -45,7 +45,6 @@
 <!ENTITY promoting-standby  SYSTEM "promoting-standby.sgml">
 <!ENTITY follow-new-primary  SYSTEM "follow-new-primary.sgml">
 <!ENTITY switchover  SYSTEM "switchover.sgml">
-<!ENTITY configuring-witness-server SYSTEM "configuring-witness-server.sgml">

 <!ENTITY event-notifications  SYSTEM "event-notifications.sgml">
 <!ENTITY upgrading-repmgr  SYSTEM "upgrading-repmgr.sgml">
--- a/doc/install-source.sgml
+++ b/doc/install-source.sgml
@@ -136,6 +136,16 @@ deb-src http://apt.postgresql.org/pub/repos/apt/ stretch-pgdg main</programlisti
         </itemizedlist>
       </para>
     </note>
+
+     <tip>
+       <para>
+         If building against PostgreSQL 11 or later configured with the <option>--with-llvm</option> option
+         (this is the case with the PGDG-provided packages) you'll also need to install the
+         <literal>llvm-toolset-7-clang</literal> package. This is available via the
+         <ulink url="https://wiki.centos.org/AdditionalResources/Repositories/SCL">Software Collections (SCL) Repository</ulink>.
+       </para>
+     </tip>
+
    </listitem>
   </itemizedlist>
  </para>
--- a/doc/repmgr.sgml
+++ b/doc/repmgr.sgml
@@ -73,7 +73,6 @@
  &promoting-standby;
  &follow-new-primary;
  &switchover;
-  &configuring-witness-server;
  &event-notifications;
  &upgrading-repmgr;
 </part>
--- a/doc/repmgrd-automatic-failover.sgml
+++ b/doc/repmgrd-automatic-failover.sgml
@@ -23,37 +23,92 @@
   <primary>witness server</primary>
   <secondary>repmgrd</secondary>
 </indexterm>
+ <title>Using a witness server</title>
+ <para>
+   A <xref linkend="witness-server"> is a normal PostgreSQL instance which
+   is not part of the streaming replication cluster; its purpose is, if a
+   failover situation occurs, to provide proof that it is the primary server
+   itself which is unavailable, rather than e.g. a network split between
+   different physical locations.
+ </para>

- <title>Using a witness server with repmgrd</title>
 <para>
-   In a situation caused e.g. by a network interruption between two
-   data centres, it's important to avoid a &quot;split-brain&quot; situation where
-   both sides of the network assume they are the active segment and the
-   side without an active primary unilaterally promotes one of its standbys.
- </para>
- <para>
-   To prevent this situation happening, it's essential to ensure that one
-   network segment has a &quot;voting majority&quot;, so other segments will know
-   they're in the minority and not attempt to promote a new primary. Where
-   an odd number of servers exists, this is not an issue. However, if each
-   network has an even number of nodes, it's necessary to provide some way
-   of ensuring a majority, which is where the witness server becomes useful.
- </para>
- <para>
-   This is not a fully-fledged standby node and is not integrated into
-   replication, but it effectively represents the &quot;casting vote&quot; when
-   deciding which network segment has a majority. A witness server can
-   be set up using <link linkend="repmgr-witness-register"><command>repmgr witness register</command></link>;
-   see also section <link linkend="using-witness-server">Using a witness server</link>.
+   A typical use case for a witness server is a two-node streaming replication
+   setup, where the primary and standby are in different locations (data centres).
+   By creating a witness server in the same location (data centre) as the primary,
+   if the primary becomes unavailable it's possible for the standby to decide whether
+   it can promote itself without risking a "split brain" scenario: if it can't see either the
+   witness or the primary server, it's likely there's a network-level interruption
+   and it should not promote itself. If it can see the witness but not the primary,
+   this proves there is no network interruption and the primary itself is unavailable,
+   and it can therefore promote itself (and ideally take action to fence the
+   former primary).
 </para>
 <note>
   <para>
-     It only
-     makes sense to create a witness server in conjunction with running
-     <application>repmgrd</application>; the witness server will require its own
-     <application>repmgrd</application> instance.
+     <emphasis>Never</emphasis> install a witness server on the same physical host
+     as another node in the replication cluster managed by &repmgr; - it's essential
+     the witness is not affected in any way by failure of another node.
   </para>
 </note>
+ <para>
+   For more complex replication scenarios, e.g. with multiple data centres, it may
+   be preferable to use location-based failover, which ensures that only nodes
+   in the same location as the primary will ever be promotion candidates;
+   see <xref linkend="repmgrd-network-split"> for more details.
+ </para>
+
+ <note>
+   <simpara>
+     A witness server will only be useful if <application>repmgrd</application>
+     is in use.
+   </simpara>
+ </note>
+
+ <sect2 id="creating-witness-server">
+   <title>Creating a witness server</title>
+ <para>
+   To create a witness server, set up a normal PostgreSQL instance on a server
+   in the same physical location as the cluster's primary server.
+ </para>
+ <para>
+   This instance should <emphasis>not</emphasis> be on the same physical host as the primary server,
+   as otherwise if the primary server fails due to hardware issues, the witness
+   server will be lost too.
+ </para>
+ <note>
+   <simpara>
+     &repmgr; 3.3 and earlier provided a <command>repmgr witness create</command>
+     command, which would automatically create a PostgreSQL instance. However
+     this often resulted in an unsatisfactory, hard-to-customise instance.
+   </simpara>
+ </note>
+ <para>
+   The witness server should be configured in the same way as a normal
+   &repmgr; node; see section <xref linkend="configuration">.
+ </para>
+ <para>
+   Register the witness server with <xref linkend="repmgr-witness-register">.
+   This will create the &repmgr; extension on the witness server, and make
+   a copy of the &repmgr; metadata.
+ </para>
+ <note>
+   <simpara>
+    As the witness server is not part of the replication cluster, further
+    changes to the &repmgr; metadata will be synchronised by
+    <application>repmgrd</application>.
+   </simpara>
+ </note>
+ <para>
+   Once the witness server has been configured, <application>repmgrd</application>
+   should be started.
+ </para>
+
+ <para>
+  To unregister a witness server, use <xref linkend="repmgr-witness-unregister">.
+ </para>
+
+ </sect2>

 </sect1>

--- a/doc/repmgrd-configuration.sgml
+++ b/doc/repmgrd-configuration.sgml
@@ -53,7 +53,7 @@

        </varlistentry>

-        <varlistentry>
+        <varlistentry id="connection-check-type">

          <indexterm>
            <primary>connection_check_type</primary>
--- a/doc/repmgrd-overview.sgml
+++ b/doc/repmgrd-overview.sgml
@@ -13,6 +13,66 @@
    failover and updating standbys to follow the new primary, as well as
    providing monitoring information about the state of each standby.
  </para>
+  <para>
+    <application>repmgrd</application> is designed to be straightforward to set up
+    and does not require additional external infrastructure.
+  </para>
+  <para>
+    Functionality provided by <application>repmgrd</application> includes:
+    <itemizedlist spacing="compact" mark="bullet">
+
+       <listitem>
+         <simpara>
+           wide range of <link linkend="repmgrd-basic-configuration">configuration options</link>
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           option to execute custom scripts (&quot;<link linkend="event-notifications">event notifications</link>&quot;)
+           at different points in the failover sequence
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           ability to <link linkend="repmgrd-pausing">pause repmgrd</link>
+           operation on all nodes with a
+           <link linkend="repmgr-daemon-pause"><command>single command</command></link>
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           optional <link linkend="repmgrd-witness-server">witness server</link>
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           &quot;location&quot; configuration option to restrict
+           potential promotion candidates to a single location
+           (e.g. when nodes are spread over multiple data centres)
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           <link linkend="connection-check-type">choice of method</link> to determine node availability
+           (PostgreSQL ping, query execution or new connection)
+         </simpara>
+       </listitem>
+
+       <listitem>
+         <simpara>
+           retention of monitoring statistics (optional)
+         </simpara>
+       </listitem>
+
+
+    </itemizedlist>
+
+  </para>

  <sect1 id="repmgrd-demonstration">

@@ -22,12 +82,12 @@
  and two standbys streaming directly from the primary) so that the cluster looks
  something like this:
  <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster show
-     ID | Name  | Role    | Status    | Upstream | Location | Connection string
-    ----+-------+---------+-----------+----------+----------+--------------------------------------
-     1  | node1 | primary | * running |          | default  | host=node1 dbname=repmgr user=repmgr
-     2  | node2 | standby |   running | node1    | default  | host=node2 dbname=repmgr user=repmgr
-     3  | node3 | standby |   running | node1    | default  | host=node3 dbname=repmgr user=repmgr</programlisting>
+    $ repmgr -f /etc/repmgr.conf cluster show --compact
+     ID | Name  | Role    | Status    | Upstream | Location | Prio.
+    ----+-------+---------+-----------+----------+----------+-------
+     1  | node1 | primary | * running |          | default  | 100
+     2  | node2 | standby |   running | node1    | default  | 100
+     3  | node3 | standby |   running | node1    | default  | 100</programlisting>
 </para>

 <tip>
@@ -40,10 +100,11 @@
  Start <application>repmgrd</application> on each standby and verify that it's running by examining the
  log output, which at log level <literal>INFO</literal> will look like this:
  <programlisting>
-    [2017-08-24 17:31:00] [NOTICE] using configuration file "/etc/repmgr.conf"
-    [2017-08-24 17:31:00] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr"
-    [2017-08-24 17:31:00] [NOTICE] starting monitoring of node <literal>node2</literal> (ID: 2)
-    [2017-08-24 17:31:00] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
+    [2019-03-15 06:32:05] [NOTICE] repmgrd (repmgrd 4.3) starting up
+    [2019-03-15 06:32:05] [INFO] connecting to database "host=node2 dbname=repmgr user=repmgr connect_timeout=2"
+    INFO:  set_repmgrd_pid(): provided pidfile is /var/run/repmgr/repmgrd-11.pid
+    [2019-03-15 06:32:05] [NOTICE] starting monitoring of node "node2" (ID: 2)
+    [2019-03-15 06:32:05] [INFO] monitoring connection to upstream node "node1" (node ID: 1)</programlisting>
 </para>
 <para>
  Each <application>repmgrd</application> should also have recorded its successful startup as an event:
@@ -51,9 +112,9 @@
    $ repmgr -f /etc/repmgr.conf cluster event --event=repmgrd_start
     Node ID | Name  | Event         | OK | Timestamp           | Details
    ---------+-------+---------------+----+---------------------+-------------------------------------------------------------
-     3       | node3 | repmgrd_start | t  | 2017-08-24 17:35:54 | monitoring connection to upstream node "node1" (node ID: 1)
-     2       | node2 | repmgrd_start | t  | 2017-08-24 17:35:50 | monitoring connection to upstream node "node1" (node ID: 1)
-     1       | node1 | repmgrd_start | t  | 2017-08-24 17:35:46 | monitoring cluster primary "node1" (node ID: 1)  </programlisting>
+     3       | node3 | repmgrd_start | t  | 2019-03-14 04:17:30 | monitoring connection to upstream node "node1" (node ID: 1)
+     2       | node2 | repmgrd_start | t  | 2019-03-14 04:11:47 | monitoring connection to upstream node "node1" (node ID: 1)
+     1       | node1 | repmgrd_start | t  | 2019-03-14 04:04:31 | monitoring cluster primary "node1" (node ID: 1)</programlisting>
 </para>
 <para>
  Now stop the current primary server with e.g.:
@@ -67,55 +128,59 @@
  decision is made. This is an extract from the log of a standby server (<literal>node2</literal>)
  which has promoted to new primary after failure of the original primary (<literal>node1</literal>).
  <programlisting>
-    [2017-08-24 23:32:01] [INFO] node "node2" (node ID: 2) monitoring upstream node "node1" (node ID: 1) in normal state
-    [2017-08-24 23:32:08] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
-    [2017-08-24 23:32:08] [INFO] checking state of node 1, 1 of 5 attempts
-    [2017-08-24 23:32:08] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:09] [INFO] checking state of node 1, 2 of 5 attempts
-    [2017-08-24 23:32:09] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:10] [INFO] checking state of node 1, 3 of 5 attempts
-    [2017-08-24 23:32:10] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:11] [INFO] checking state of node 1, 4 of 5 attempts
-    [2017-08-24 23:32:11] [INFO] sleeping 1 seconds until next reconnection attempt
-    [2017-08-24 23:32:12] [INFO] checking state of node 1, 5 of 5 attempts
-    [2017-08-24 23:32:12] [WARNING] unable to reconnect to node 1 after 5 attempts
-    INFO:  setting voting term to 1
-    INFO:  node 2 is candidate
-    INFO:  node 3 has received request from node 2 for electoral term 1 (our term: 0)
-    [2017-08-24 23:32:12] [NOTICE] this node is the winner, will now promote self and inform other nodes
-    INFO: connecting to standby database
-    NOTICE: promoting standby
-    DETAIL: promoting server using 'pg_ctl -l /var/log/postgres/startup.log -w -D '/var/lib/pgsql/data' promote'
-    INFO: reconnecting to promoted server
+    [2019-03-15 06:37:50] [WARNING] unable to connect to upstream node "node1" (node ID: 1)
+    [2019-03-15 06:37:50] [INFO] checking state of node 1, 1 of 3 attempts
+    [2019-03-15 06:37:50] [INFO] sleeping 5 seconds until next reconnection attempt
+    [2019-03-15 06:37:55] [INFO] checking state of node 1, 2 of 3 attempts
+    [2019-03-15 06:37:55] [INFO] sleeping 5 seconds until next reconnection attempt
+    [2019-03-15 06:38:00] [INFO] checking state of node 1, 3 of 3 attempts
+    [2019-03-15 06:38:00] [WARNING] unable to reconnect to node 1 after 3 attempts
+    [2019-03-15 06:38:00] [INFO] primary and this node have the same location ("default")
+    [2019-03-15 06:38:00] [INFO] local node's last receive lsn: 0/900CBF8
+    [2019-03-15 06:38:00] [INFO] node 3 last saw primary node 12 second(s) ago
+    [2019-03-15 06:38:00] [INFO] last receive LSN for sibling node "node3" (ID: 3) is: 0/900CBF8
+    [2019-03-15 06:38:00] [INFO] node "node3" (ID: 3) has same LSN as current candidate "node2" (ID: 2)
+    [2019-03-15 06:38:00] [INFO] visible nodes: 2; total nodes: 2; no nodes have seen the primary within the last 4 seconds
+    [2019-03-15 06:38:00] [NOTICE] promotion candidate is "node2" (ID: 2)
+    [2019-03-15 06:38:00] [NOTICE] this node is the winner, will now promote itself and inform other nodes
+    [2019-03-15 06:38:00] [INFO] promote_command is:
+      "/usr/pgsql-11/bin/repmgr -f /etc/repmgr/11/repmgr.conf standby promote"
+    NOTICE: promoting standby to primary
+    DETAIL: promoting server "node2" (ID: 2) using "/usr/pgsql-11/bin/pg_ctl  -w -D '/var/lib/pgsql/11/data' promote"
+    NOTICE: waiting up to 60 seconds (parameter "promote_check_timeout") for promotion to complete
    NOTICE: STANDBY PROMOTE successful
-    DETAIL: node 2 was successfully promoted to primary
+    DETAIL: server "node2" (ID: 2) was successfully promoted to primary
+    [2019-03-15 06:38:01] [INFO] 3 followers to notify
+    [2019-03-15 06:38:01] [NOTICE] notifying node "node3" (node ID: 3) to follow node 2
    INFO:  node 3 received notification to follow node 2
-    [2017-08-24 23:32:13] [INFO] switching to primary monitoring mode</programlisting>
+    [2019-03-15 06:38:01] [INFO] switching to primary monitoring mode
+    [2019-03-15 06:38:01] [NOTICE] monitoring cluster primary "node2" (node ID: 2)</programlisting>
 </para>
 <para>
  The cluster status will now look like this, with the original primary (<literal>node1</literal>)
  marked as inactive, and standby <literal>node3</literal> now following the new primary
  (<literal>node2</literal>):
  <programlisting>
-    $ repmgr -f /etc/repmgr.conf cluster show
-     ID | Name  | Role    | Status    | Upstream | Location | Connection string
-    ----+-------+---------+-----------+----------+----------+----------------------------------------------------
-     1  | node1 | primary | - failed  |          | default  | host=node1 dbname=repmgr user=repmgr
-     2  | node2 | primary | * running |          | default  | host=node2 dbname=repmgr user=repmgr
-     3  | node3 | standby |   running | node2    | default  | host=node3 dbname=repmgr user=repmgr</programlisting>
+    $ repmgr -f /etc/repmgr.conf cluster show --compact
+     ID | Name  | Role    | Status    | Upstream | Location | Prio.
+    ----+-------+---------+-----------+----------+----------+-------
+     1  | node1 | primary | - failed  |          | default  | 100
+     2  | node2 | primary | * running |          | default  | 100
+     3  | node3 | standby |   running | node2    | default  | 100</programlisting>

 </para>
 <para>
-  <command>repmgr cluster event</command> will display a summary of what happened to each server
-  during the failover:
+   <link linkend="repmgr-cluster-event"><command>repmgr cluster event</command></link> will display a summary of
+   what happened to each server during the failover:
  <programlisting>
    $ repmgr -f /etc/repmgr.conf cluster event
-     Node ID | Name  | Event                    | OK | Timestamp           | Details
-    ---------+-------+--------------------------+----+---------------------+-----------------------------------------------------------------------------------
-     3       | node3 | repmgrd_failover_follow  | t  | 2017-08-24 23:32:16 | node 3 now following new upstream node 2
-     3       | node3 | standby_follow           | t  | 2017-08-24 23:32:16 | node 3 is now attached to node 2
-     2       | node2 | repmgrd_failover_promote | t  | 2017-08-24 23:32:13 | node 2 promoted to primary; old primary 1 marked as failed
-     2       | node2 | standby_promote          | t  | 2017-08-24 23:32:13 | node 2 was successfully promoted to primary</programlisting>
+     Node ID | Name  | Event                      | OK | Timestamp           | Details
+    ---------+-------+----------------------------+----+---------------------+-------------------------------------------------------------
+     3       | node3 | repmgrd_failover_follow    | t  | 2019-03-15 06:38:03 | node 3 now following new upstream node 2
+     3       | node3 | standby_follow             | t  | 2019-03-15 06:38:02 | standby attached to upstream node "node2" (node ID: 2)
+     2       | node2 | repmgrd_reload             | t  | 2019-03-15 06:38:01 | monitoring cluster primary "node2" (node ID: 2)
+     2       | node2 | repmgrd_failover_promote   | t  | 2019-03-15 06:38:01 | node 2 promoted to primary; old primary 1 marked as failed
+     2       | node2 | standby_promote            | t  | 2019-03-15 06:38:01 | server "node2" (ID: 2) was successfully promoted to primary</programlisting>
 </para>

  </sect1>
--- a/log.c
+++ b/log.c
@@ -85,7 +85,7 @@ _stderr_log_with_level(const char *level_name, int level, const char *fmt, va_li

 			time(&t);
 			tm = localtime(&t);
-			strftime(buf, 100, "[%Y-%m-%d %H:%M:%S]", tm);
+			strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M:%S]", tm);
 			fprintf(stderr, "%s [%s] ", buf, level_name);
 		}
 		else
--- a/repmgr-action-bdr.c
+++ b/repmgr-action-bdr.c
@@ -216,7 +216,7 @@ do_bdr_register(void)
 				ExtensionStatus other_node_extension_status = REPMGR_UNKNOWN;

 				/* skip the local node */
-				if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, MAXLEN) == 0)
+				if (strncmp(node_info.node_name, bdr_cell->node_info->node_name, sizeof(node_info.node_name)) == 0)
 				{
 					continue;
 				}
@@ -304,9 +304,9 @@ do_bdr_register(void)
 	node_info.active = true;
 	node_info.priority = config_file_options.priority;

-	strncpy(node_info.node_name, config_file_options.node_name, MAXLEN);
-	strncpy(node_info.location, config_file_options.location, MAXLEN);
-	strncpy(node_info.conninfo, config_file_options.conninfo, MAXLEN);
+	strncpy(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name));
+	strncpy(node_info.location, config_file_options.location, sizeof(node_info.location));
+	strncpy(node_info.conninfo, config_file_options.conninfo, sizeof(node_info.conninfo));

 	if (record_status == RECORD_FOUND)
 	{
@@ -330,7 +330,7 @@ do_bdr_register(void)
 		 * name set when the node was registered.
 		 */

-		if (strncmp(node_info.node_name, config_file_options.node_name, MAXLEN) != 0)
+		if (strncmp(node_info.node_name, config_file_options.node_name, sizeof(node_info.node_name)) != 0)
 		{
 			log_error(_("a record for node %i is already registered with node_name \"%s\""),
 					  config_file_options.node_id, node_info.node_name);
--- a/repmgr-action-cluster.c
+++ b/repmgr-action-cluster.c
@@ -156,7 +156,7 @@ do_cluster_show(void)
 		else
 		{
 			/* check if node is reachable, but just not letting us in */
-			if (is_server_available(cell->node_info->conninfo))
+			if (is_server_available_quiet(cell->node_info->conninfo))
 				cell->node_info->node_status = NODE_STATUS_REJECTED;
 			else
 				cell->node_info->node_status = NODE_STATUS_DOWN;
@@ -1063,7 +1063,9 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, Ite
 		matrix_rec_list[i] = (t_node_matrix_rec *) pg_malloc0(sizeof(t_node_matrix_rec));

 		matrix_rec_list[i]->node_id = cell->node_info->node_id;
-		strncpy(matrix_rec_list[i]->node_name, cell->node_info->node_name, MAXLEN);
+		strncpy(matrix_rec_list[i]->node_name,
+				cell->node_info->node_name,
+				sizeof(cell->node_info->node_name));

 		/*
 		 * Find the maximum length of a node name
@@ -1278,7 +1280,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item

 		cube[h] = (t_node_status_cube *) pg_malloc(sizeof(t_node_status_cube));
 		cube[h]->node_id = cell->node_info->node_id;
-		strncpy(cube[h]->node_name, cell->node_info->node_name, MAXLEN);
+		strncpy(cube[h]->node_name, cell->node_info->node_name, sizeof(cell->node_info->node_name));

 		/*
 		 * Find the maximum length of a node name
@@ -1300,7 +1302,7 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, Item
 			/* we don't need the name here */
 			cube[h]->matrix_list_rec[i]->node_name[0] = '\0';

-			cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec) * nodes.node_count);
+			cube[h]->matrix_list_rec[i]->node_status_list = (t_node_status_rec **) pg_malloc0(sizeof(t_node_status_rec *) * nodes.node_count);

 			j = 0;

--- a/repmgr-action-cluster.h
+++ b/repmgr-action-cluster.h
@@ -30,14 +30,14 @@ typedef struct
 typedef struct
 {
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	t_node_status_rec **node_status_list;
 } t_node_matrix_rec;

 typedef struct
 {
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	t_node_matrix_rec **matrix_list_rec;
 } t_node_status_cube;

--- a/repmgr-action-node.c
+++ b/repmgr-action-node.c
@@ -1408,7 +1408,7 @@ do_node_check_replication_lag(PGconn *conn, OutputMode mode, t_node_info *node_i
 					break;
 			}
 		}
-		else if (lag_seconds < 0)
+		else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
 		{
 			status = CHECK_STATUS_UNKNOWN;

@@ -2476,6 +2476,8 @@ do_node_rejoin(void)

 						termPQExpBuffer(&slotdir_ent_path);
 					}
+
+					closedir(slotdir);
 				}
 				termPQExpBuffer(&slotdir_path);
 			}
@@ -2784,6 +2786,7 @@ _do_node_archive_config(void)

 	arcdir = opendir(archive_dir.data);

+	/* always attempt to open the directory */
 	if (arcdir == NULL)
 	{
 		log_error(_("unable to open archive directory \"%s\""),
@@ -2829,10 +2832,11 @@ _do_node_archive_config(void)

 			termPQExpBuffer(&arcdir_ent_path);
 		}
-
-		closedir(arcdir);
 	}

+	closedir(arcdir);
+
+
 	/*
 	 * extract list of config files from --config-files
 	 */
@@ -3104,11 +3108,12 @@ copy_file(const char *src_file, const char *dest_file)
 	int			a = 0;

 	ptr_old = fopen(src_file, "r");
-	ptr_new = fopen(dest_file, "w");

 	if (ptr_old == NULL)
 		return false;

+	ptr_new = fopen(dest_file, "w");
+
 	if (ptr_new == NULL)
 	{
 		fclose(ptr_old);
--- a/repmgr-action-standby.c
+++ b/repmgr-action-standby.c
@@ -213,7 +213,7 @@ do_standby_clone(void)
 		param_set(&recovery_conninfo, "application_name", config_file_options.node_name);

 		get_conninfo_value(config_file_options.conninfo, "application_name", application_name);
-		if (strlen(application_name) && strncmp(application_name, config_file_options.node_name, MAXLEN) != 0)
+		if (strlen(application_name) && strncmp(application_name, config_file_options.node_name, sizeof(config_file_options.node_name)) != 0)
 		{
 			log_notice(_("\"application_name\" is set in repmgr.conf but will be replaced by the node name"));
 		}
@@ -1305,8 +1305,7 @@ do_standby_register(void)
 			log_error(_("unable to connect to local node \"%s\" (ID: %i)"),
 					  config_file_options.node_name,
 					  config_file_options.node_id);
-			log_detail("%s",
-					   PQerrorMessage(conn));
+			log_detail("\n%s", PQerrorMessage(conn));
 			log_hint(_("to register a standby which is not running, provide primary connection parameters and use option -F/--force"));

 			exit(ERR_BAD_CONFIG);
@@ -1436,6 +1435,17 @@ do_standby_register(void)
 		RecordStatus upstream_record_status = RECORD_NOT_FOUND;
 		t_node_info upstream_node_record = T_NODE_INFO_INITIALIZER;

+		if (runtime_options.upstream_node_id == config_file_options.node_id)
+		{
+			log_error(_("provided node ID for --upstream-node-id (%i) is the same as the configured local node ID (%i)"),
+					  runtime_options.upstream_node_id,
+					  config_file_options.node_id);
+			PQfinish(primary_conn);
+			if (PQstatus(conn) == CONNECTION_OK)
+				PQfinish(conn);
+			exit(ERR_BAD_CONFIG);
+		}
+
 		upstream_record_status = get_node_record(primary_conn,
 												 runtime_options.upstream_node_id,
 												 &upstream_node_record);
@@ -1887,7 +1897,7 @@ do_standby_unregister(void)
 	if (PQstatus(primary_conn) != CONNECTION_OK)
 	{
 		log_error(_("unable to connect to primary server"));
-		log_detail("%s", PQerrorMessage(conn));
+		log_detail("\n%s", PQerrorMessage(conn));
 		exit(ERR_BAD_CONFIG);
 	}

@@ -2269,6 +2279,7 @@ void
 do_standby_follow(void)
 {
 	PGconn	   *local_conn = NULL;
+	t_node_info local_node_record = T_NODE_INFO_INITIALIZER;

 	PGconn	   *primary_conn = NULL;
 	int			primary_node_id = UNKNOWN_NODE_ID;
@@ -2307,6 +2318,19 @@ do_standby_follow(void)
 	if (PQserverVersion(local_conn) < 90400)
 		check_93_config();

+	/* attempt to retrieve local node record */
+	record_status = get_node_record(local_conn,
+									config_file_options.node_id,
+									&local_node_record);
+
+	if (record_status != RECORD_FOUND)
+	{
+		log_error(_("unable to retrieve record for local node %i"),
+				  config_file_options.node_id);
+		PQfinish(local_conn);
+		exit(ERR_BAD_CONFIG);
+	}
+
 	/*
 	 * --upstream-node-id provided - attempt to follow that node
 	 */
@@ -2551,6 +2575,9 @@ do_standby_follow(void)

 		conn_to_param_list(local_conn, &local_repl_conninfo);

+		/* Set the replication user from the node record */
+		param_set(&local_repl_conninfo, "user", local_node_record.repluser);
+
 		param_set(&local_repl_conninfo, "replication", "1");

 		local_repl_conn = establish_db_connection_by_params(&local_repl_conninfo, false);
@@ -3884,7 +3911,7 @@ do_standby_switchover(void)
 			log_detail(_("lag is %i seconds (warning threshold: %i)"),
 					   lag_seconds, config_file_options.replication_lag_warning);
 		}
-		else if (lag_seconds < 0)
+		else if (lag_seconds == UNKNOWN_REPLICATION_LAG)
 		{
 			if (runtime_options.force == false)
 			{
@@ -3986,13 +4013,14 @@ do_standby_switchover(void)

 		for (cell = all_nodes.head; cell; cell = cell->next)
 		{
-			cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
-
 			repmgrd_info[i] = pg_malloc0(sizeof(RepmgrdInfo));
 			repmgrd_info[i]->node_id = cell->node_info->node_id;
 			repmgrd_info[i]->pid = UNKNOWN_PID;
 			repmgrd_info[i]->paused = false;
 			repmgrd_info[i]->running = false;
+			repmgrd_info[i]->pg_running = true;
+
+			cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);

 			if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
 			{
@@ -4003,11 +4031,16 @@ do_standby_switchover(void)
 				repmgrd_info[i]->pg_running = false;

 				item_list_append_format(&repmgrd_connection_errors,
-										_("unable to connect to node \"%s\" (ID %i)"),
+										_("unable to connect to node \"%s\" (ID %i):\n%s"),
 										cell->node_info->node_name,
-										cell->node_info->node_id);
+										cell->node_info->node_id,
+										PQerrorMessage(cell->node_info->conn));
+
+				PQfinish(cell->node_info->conn);
+				cell->node_info->conn = NULL;

 				unreachable_node_count++;
+				i++;
 				continue;
 			}

@@ -4069,11 +4102,37 @@ do_standby_switchover(void)

 		}

+		/* pause repmgrd on all reachable nodes */
 		if (repmgrd_running_count > 0)
 		{
 			i = 0;
 			for (cell = all_nodes.head; cell; cell = cell->next)
 			{
+
+				/*
+				 * Skip if node was unreachable
+				 */
+				if (repmgrd_info[i]->pg_running == false)
+				{
+					log_warning(_("node %s (ID %i) unreachable, unable to pause repmgrd"),
+								cell->node_info->node_name,
+								cell->node_info->node_id);
+					i++;
+					continue;
+				}
+
+
+				/*
+				 * Skip if repmgrd not running on node
+				 */
+				if (repmgrd_info[i]->running == false)
+				{
+					log_warning(_("repmgrd not running on node %s (ID %i)"),
+								cell->node_info->node_name,
+								cell->node_info->node_id);
+					i++;
+					continue;
+				}
 				/*
 				 * Skip if node is already paused. Note we won't unpause these, to
 				 * leave the repmgrd instances in the cluster in the same state they
@@ -4113,8 +4172,11 @@ do_standby_switchover(void)
 			/* close all connections - we'll reestablish later */
 			for (cell = all_nodes.head; cell; cell = cell->next)
 			{
-				PQfinish(cell->node_info->conn);
-				cell->node_info->conn = NULL;
+				if (cell->node_info->conn != NULL)
+				{
+					PQfinish(cell->node_info->conn);
+					cell->node_info->conn = NULL;
+				}
 			}
 		}
 	}
@@ -4188,6 +4250,7 @@ do_standby_switchover(void)
 	 */
 	if (runtime_options.dry_run == true)
 	{
+		/* we use a buffer here as it will be modified by string_remove_trailing_newlines() */
 		char		shutdown_command[MAXLEN] = "";

 		strncpy(shutdown_command, command_output.data, MAXLEN);
@@ -4304,6 +4367,9 @@ do_standby_switchover(void)
 	if (PQstatus(local_conn) != CONNECTION_OK)
 	{
 		log_warning(_("connection to local node lost, reconnecting..."));
+		log_detail("\n%s", PQerrorMessage(local_conn));
+		PQfinish(local_conn);
+
 		local_conn = establish_db_connection(config_file_options.conninfo, false);

 		if (PQstatus(local_conn) != CONNECTION_OK)
@@ -4719,9 +4785,10 @@ do_standby_switchover(void)
 				else
 				{
 					item_list_append_format(&repmgrd_unpause_errors,
-											_("unable to connect to node \"%s\" (ID %i)"),
+											_("unable to connect to node \"%s\" (ID %i):\n%s"),
 											cell->node_info->node_name,
-											cell->node_info->node_id);
+											cell->node_info->node_id,
+											PQerrorMessage(cell->node_info->conn));
 					error_node_count++;
 				}

@@ -4733,6 +4800,8 @@ do_standby_switchover(void)
 				PQExpBufferData detail;
 				ItemListCell *cell;

+				initPQExpBuffer(&detail);
+
 				for (cell = repmgrd_unpause_errors.head; cell; cell = cell->next)
 				{
 					appendPQExpBuffer(&detail,
@@ -6015,10 +6084,11 @@ run_file_backup(t_node_info *node_record)
 				 * Remove prefix
 				 */
 				p = string_skip_prefix(prefix, output);
+
 				if (p == NULL)
 				{
-					log_error("unexpected output from \"barman list-files\": %s",
-							  output);
+					log_error("unexpected output from \"barman list-files\"");
+					log_detail("%s", output);
 					exit(ERR_BARMAN);
 				}

@@ -6036,6 +6106,14 @@ run_file_backup(t_node_info *node_record)
 					strncat(prefix, backup_id, MAXLEN - 1);
 					strncat(prefix, "/", MAXLEN - 1);
 					p = string_skip_prefix(backup_id, p);
+
+					if (p == NULL)
+					{
+						log_error("unexpected output from \"barman list-files\"");
+						log_detail("%s", output);
+						exit(ERR_BARMAN);
+					}
+
 					p = string_skip_prefix("/", p);

 					/*
@@ -6047,8 +6125,8 @@ run_file_backup(t_node_info *node_record)
 									basebackups_directory,
 									backup_id,
 									local_repmgr_tmp_directory);
-					(void) local_command(
-										 command,
+
+					(void) local_command(command,
 										 NULL);

 					/*
@@ -6372,6 +6450,8 @@ run_file_backup(t_node_info *node_record)

 		if (fputs(tablespace_map.data, tablespace_map_file) == EOF)
 		{
+			fclose(tablespace_map_file);
+
 			log_error(_("unable to write to tablespace_map file \"%s\""), tablespace_map_filename.data);

 			r = ERR_BAD_BASEBACKUP;
--- a/repmgr-action-witness.c
+++ b/repmgr-action-witness.c
@@ -56,8 +56,7 @@ do_witness_register(void)
 		log_error(_("unable to connect to witness node \"%s\" (ID: %i)"),
 				  config_file_options.node_name,
 				  config_file_options.node_id);
-		log_detail("%s",
-				   PQerrorMessage(witness_conn));
+		log_detail("\n%s", PQerrorMessage(witness_conn));
 		log_hint(_("the witness node must be running before it can be registered"));
 		exit(ERR_BAD_CONFIG);
 	}
@@ -411,7 +410,7 @@ do_witness_unregister(void)
 			log_error(_("unable to connect to node \"%s\" (ID: %i)"),
 					  config_file_options.node_name,
 					  config_file_options.node_id);
-			log_detail("%s", PQerrorMessage(local_conn));
+			log_detail("\n%s", PQerrorMessage(local_conn));
 			exit(ERR_BAD_CONFIG);
 		}

@@ -437,7 +436,7 @@ do_witness_unregister(void)
 	if (PQstatus(primary_conn) != CONNECTION_OK)
 	{
 		log_error(_("unable to connect to primary"));
-		log_detail("%s", PQerrorMessage(primary_conn));
+		log_detail("\n%s", PQerrorMessage(primary_conn));

 		if (local_node_available == true)
 		{
--- a/repmgr-client-global.h
+++ b/repmgr-client-global.h
@@ -70,7 +70,7 @@ typedef struct

 	/* general node options */
 	int			node_id;
-	char		node_name[MAXLEN];
+	char		node_name[NAMEDATALEN];
 	char		data_dir[MAXPGPATH];
 	int			remote_node_id;

--- a/repmgr-client.c
+++ b/repmgr-client.c
@@ -356,9 +356,15 @@ main(int argc, char **argv)

 				/* --node-name */
 			case OPT_NODE_NAME:
-				strncpy(runtime_options.node_name, optarg, MAXLEN);
+			{
+				if (strlen(optarg) < sizeof(runtime_options.node_name))
+					strncpy(runtime_options.node_name, optarg, sizeof(runtime_options.node_name));
+				else
+					item_list_append_format(&cli_errors,
+											_("value for \"--node-name\" must contain fewer than %lu characters"),
+											sizeof(runtime_options.node_name));
 				break;
-
+			}
 				/* --remote-node-id */
 			case OPT_REMOTE_NODE_ID:
 				runtime_options.remote_node_id = repmgr_atoi(optarg, "--remote-node-id", &cli_errors, MIN_NODE_ID);
@@ -1674,6 +1680,8 @@ check_cli_parameters(const int action)
 				item_list_append_format(&cli_warnings,
 										_("--replication-user ignored when executing %s"),
 										action_name(action));
+				break;
+
 			default:
 				item_list_append_format(&cli_warnings,
 										_("--replication-user not required when executing %s"),
@@ -2457,6 +2465,7 @@ get_superuser_connection(PGconn **conn, PGconn **superuser_conn, PGconn **privil
 	if (PQstatus(*conn) != CONNECTION_OK)
 	{
 		log_error(_("no database connection available"));
+		log_detail("\n%s", PQerrorMessage(*conn));
 		exit(ERR_INTERNAL);
 	}

@@ -3000,7 +3009,7 @@ init_node_record(t_node_info *node_record)
 		strncpy(node_record->location, "default", MAXLEN);


-	strncpy(node_record->node_name, config_file_options.node_name, MAXLEN);
+	strncpy(node_record->node_name, config_file_options.node_name, sizeof(node_record->node_name));
 	strncpy(node_record->conninfo, config_file_options.conninfo, MAXLEN);
 	strncpy(node_record->config_file, config_file_path, MAXPGPATH);

@@ -3054,9 +3063,6 @@ can_use_pg_rewind(PGconn *conn, const char *data_directory, PQExpBufferData *rea
 	/* "full_page_writes" must be on in any case */
 	if (guc_set(conn, "full_page_writes", "=", "off"))
 	{
-		if (can_use == false)
-			appendPQExpBuffer(reason, "; ");
-
 		appendPQExpBuffer(reason,
 						  _("\"full_page_writes\" must be set to \"on\""));

@@ -3143,6 +3149,8 @@ drop_replication_slot_if_exists(PGconn *conn, int node_id, char *slot_name)
 /*
 * Here we'll perform some timeline sanity checks to ensure the follow target
 * can actually be followed.
+ *
+ * See also comment for check_node_can_follow() in repmgrd-physical.c .
 */
 bool
 check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *follow_target_conn, t_node_info *follow_target_node_record, bool is_rejoin)
@@ -3233,6 +3241,7 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
 		return false;
 	}

+	/* timelines are the same - check relative positions */
 	if (follow_target_identification.timeline == local_tli)
 	{
 		XLogRecPtr follow_target_xlogpos = get_node_current_lsn(follow_target_conn);
@@ -3244,7 +3253,6 @@ check_node_can_attach(TimeLineID local_tli, XLogRecPtr local_xlogpos, PGconn *fo
 			return false;
 		}

-		/* timeline is the same - check relative positions */
 		if (local_xlogpos <= follow_target_xlogpos)
 		{
 			log_info(_("timelines are same, this server is not ahead"));
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -7,7 +7,8 @@
 # parameter will be treated as empty or false.
 #
 # IMPORTANT: string values can be provided as-is, or enclosed in single quotes
-# (but not double-quotes, which will be interpreted as part of the string), e.g.:
+# (but not double-quotes, which will be interpreted as part of the string),
+# e.g.:
 #
 #  node_name=foo
 #  node_name = 'foo'
@@ -24,9 +25,11 @@
 				 # using the server's hostname or another identifier
 				 # unambiguously associated with the server to avoid
 				 # confusion. Avoid choosing names which reflect the
-				 # node's current role, e.g. "primary" or "standby1",
+				 # node's current role, e.g. 'primary' or 'standby1',
 				 # as roles can change and it will be confusing if
-				 # the current primary is called "standby1".
+				 # the current primary is called 'standby1'.
+                                 # The string's maximum length is 63 characters and it should
+                                 # contain only printable ASCII characters.

 #conninfo=''			 # Database connection information as a conninfo string.
 				 # All servers in the cluster must be able to connect to
--- a/repmgr.h
+++ b/repmgr.h
@@ -55,6 +55,7 @@
 #define UNKNOWN_TIMELINE_ID -1
 #define UNKNOWN_SYSTEM_IDENTIFIER 0
 #define UNKNOWN_PID			-1
+#define UNKNOWN_REPLICATION_LAG	-1

 #define NODE_NOT_FOUND		-1
 #define NO_UPSTREAM_NODE	-1
--- a/repmgr_version.h.in
+++ b/repmgr_version.h.in
@@ -1,3 +1,3 @@
 #define REPMGR_VERSION_DATE ""
-#define REPMGR_VERSION "4.3rc1"
+#define REPMGR_VERSION "4.3"
 #define REPMGR_VERSION_NUM 40300
--- a/repmgrd-bdr.c
+++ b/repmgrd-bdr.c
@@ -68,7 +68,6 @@ monitor_bdr(void)
 	t_bdr_node_info bdr_node_info = T_BDR_NODE_INFO_INITIALIZER;
 	RecordStatus record_status;
 	NodeInfoListCell *cell;
-	PQExpBufferData event_details;
 	instr_time	log_status_interval_start;

 	/* sanity check local database */
@@ -229,6 +228,7 @@ monitor_bdr(void)
 								if (cell->node_info->node_status == NODE_STATUS_UP)
 								{
 									int			node_unreachable_elapsed = calculate_elapsed(node_unreachable_start);
+									PQExpBufferData event_details;

 									initPQExpBuffer(&event_details);

@@ -366,7 +366,6 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)
 {
 	PGconn	   *next_node_conn = NULL;
 	NodeInfoListCell *cell;
-	PQExpBufferData event_details;
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
 	t_node_info target_node = T_NODE_INFO_INITIALIZER;
 	t_node_info failed_node = T_NODE_INFO_INITIALIZER;
@@ -460,45 +459,49 @@ do_bdr_failover(NodeInfoList *nodes, t_node_info *monitored_node)

 	log_debug("this node is the failover handler");

-	initPQExpBuffer(&event_details);
+	{
+		PQExpBufferData event_details;

-	event_info.conninfo_str = target_node.conninfo;
-	event_info.node_name = target_node.node_name;
+		initPQExpBuffer(&event_details);

-	/* update node record on the active node */
-	update_node_record_set_active(next_node_conn, monitored_node->node_id, false);
+		event_info.conninfo_str = target_node.conninfo;
+		event_info.node_name = target_node.node_name;

-	log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id);
+		/* update node record on the active node */
+		update_node_record_set_active(next_node_conn, monitored_node->node_id, false);

-	appendPQExpBuffer(&event_details,
-					  _("node \"%s\" (ID: %i) detected as failed; next available node is \"%s\" (ID: %i)"),
-					  monitored_node->node_name,
-					  monitored_node->node_id,
-					  target_node.node_name,
-					  target_node.node_id);
+		log_notice(_("setting node record for node %i to inactive"), monitored_node->node_id);

-	/*
-	 * Create an event record
-	 *
-	 * If we were able to connect to another node, we'll update the event log
-	 * there.
-	 *
-	 * In any case the event notification command will be triggered with the
-	 * event "bdr_failover"
-	 */
+		appendPQExpBuffer(&event_details,
+						  _("node \"%s\" (ID: %i) detected as failed; next available node is \"%s\" (ID: %i)"),
+						  monitored_node->node_name,
+						  monitored_node->node_id,
+						  target_node.node_name,
+						  target_node.node_id);
+
+		/*
+		 * Create an event record
+		 *
+		 * If we were able to connect to another node, we'll update the event log
+		 * there.
+		 *
+		 * In any case the event notification command will be triggered with the
+		 * event "bdr_failover"
+		 */


-	create_event_notification_extended(next_node_conn,
-									   &config_file_options,
-									   monitored_node->node_id,
-									   "bdr_failover",
-									   true,
-									   event_details.data,
-									   &event_info);
+		create_event_notification_extended(next_node_conn,
+										   &config_file_options,
+										   monitored_node->node_id,
+										   "bdr_failover",
+										   true,
+										   event_details.data,
+										   &event_info);

-	log_info("%s", event_details.data);
+		log_info("%s", event_details.data);

-	termPQExpBuffer(&event_details);
+		termPQExpBuffer(&event_details);
+	}

 	unset_bdr_failover_handler(next_node_conn);

@@ -513,7 +516,6 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
 {
 	PGconn	   *recovered_node_conn;

-	PQExpBufferData event_details;
 	t_event_info event_info = T_EVENT_INFO_INITIALIZER;
 	int			i;
 	bool		slot_reactivated = false;
@@ -543,6 +545,8 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
 	 */
 	if (PQstatus(local_conn) != CONNECTION_OK)
 	{
+		PQExpBufferData event_details;
+
 		local_conn = NULL;
 		log_warning(_("unable to reconnect to local node"));

@@ -613,49 +617,50 @@ do_bdr_recovery(NodeInfoList *nodes, t_node_info *monitored_node)
 	node_recovery_elapsed = calculate_elapsed(degraded_monitoring_start);
 	monitored_node->monitoring_state = MS_NORMAL;

-
-	initPQExpBuffer(&event_details);
-
-	appendPQExpBuffer(&event_details,
-					  _("node \"%s\" (ID: %i) has recovered after %i seconds"),
-					  monitored_node->node_name,
-					  monitored_node->node_id,
-					  node_recovery_elapsed);
-
-	log_notice("%s", event_details.data);
-
-
-	/* other node will generate the event */
-	if (monitored_node->node_id == local_node_info.node_id)
 	{
+		PQExpBufferData event_details;
+
+		initPQExpBuffer(&event_details);
+
+		appendPQExpBuffer(&event_details,
+						  _("node \"%s\" (ID: %i) has recovered after %i seconds"),
+						  monitored_node->node_name,
+						  monitored_node->node_id,
+						  node_recovery_elapsed);
+
+		log_notice("%s", event_details.data);
+
+
+		/* other node will generate the event */
+		if (monitored_node->node_id == local_node_info.node_id)
+		{
+			termPQExpBuffer(&event_details);
+			PQfinish(recovered_node_conn);
+
+			return;
+		}
+
+
+		/* generate the event on the currently active node only */
+		if (monitored_node->node_id != local_node_info.node_id)
+		{
+			event_info.conninfo_str = monitored_node->conninfo;
+			event_info.node_name = monitored_node->node_name;
+
+			create_event_notification_extended(local_conn,
+											   &config_file_options,
+											   config_file_options.node_id,
+											   "bdr_recovery",
+											   true,
+											   event_details.data,
+											   &event_info);
+		}
+
 		termPQExpBuffer(&event_details);
-		PQfinish(recovered_node_conn);
-
-		return;
 	}

-
-	/* generate the event on the currently active node only */
-	if (monitored_node->node_id != local_node_info.node_id)
-	{
-		event_info.conninfo_str = monitored_node->conninfo;
-		event_info.node_name = monitored_node->node_name;
-
-		create_event_notification_extended(
-										   local_conn,
-										   &config_file_options,
-										   config_file_options.node_id,
-										   "bdr_recovery",
-										   true,
-										   event_details.data,
-										   &event_info);
-	}
-
-
 	update_node_record_set_active(local_conn, monitored_node->node_id, true);

-	termPQExpBuffer(&event_details);
-
 	PQfinish(recovered_node_conn);

 	return;
--- a/repmgrd-physical.c
+++ b/repmgrd-physical.c
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -433,7 +433,7 @@ main(int argc, char **argv)
 		if (extension_status == REPMGR_UNKNOWN)
 		{
 			log_error(_("unable to determine status of \"repmgr\" extension"));
-			log_detail("%s", PQerrorMessage(local_conn));
+			log_detail("\n%s", PQerrorMessage(local_conn));
 			close_connection(&local_conn);
 			exit(ERR_DB_QUERY);
 		}
@@ -561,6 +561,8 @@ start_monitoring(void)
 			   local_node_info.node_name,
 			   local_node_info.node_id);

+	log_info(_("\"connection_check_type\" set to \"%s\""), print_connection_check_type(config_file_options.connection_check_type));
+
 	while (true)
 	{
 		switch (local_node_info.type)
@@ -846,6 +848,7 @@ check_upstream_connection(PGconn **conn, const char *conninfo)
 		if (PQstatus(test_conn) != CONNECTION_OK)
 		{
 			log_warning(_("unable to connect to \"%s\""), conninfo);
+			log_detail("\n%s", PQerrorMessage(test_conn));
 			success = false;
 		}
 		PQfinish(test_conn);
--- a/sysutils.c
+++ b/sysutils.c
@@ -310,18 +310,26 @@ enable_wal_receiver(PGconn *conn, bool wait_startup)
 	if (wal_retrieve_retry_interval > WALRECEIVER_DISABLE_TIMEOUT_VALUE)
 	{
 		int new_wal_retrieve_retry_interval = wal_retrieve_retry_interval - WALRECEIVER_DISABLE_TIMEOUT_VALUE;
+		bool success;
+
 		log_notice(_("setting \"wal_retrieve_retry_interval\" to %i ms"),
 				   new_wal_retrieve_retry_interval);

-		// XXX handle error
-		alter_system_int(conn,
-						 "wal_retrieve_retry_interval",
-						 new_wal_retrieve_retry_interval);
+		success = alter_system_int(conn,
+								   "wal_retrieve_retry_interval",
+								   new_wal_retrieve_retry_interval);
+
+		if (success == false)
+		{
+			log_warning(_("unable to change \"wal_retrieve_retry_interval\""));
+			return UNKNOWN_PID;
+		}
+
 		pg_reload_conf(conn);
 	}
 	else
 	{
-		// XXX add threshold sanity check
+		/* TODO: add threshold sanity check */
 		log_info(_("\"wal_retrieve_retry_interval\" is %i, not changing"),
 				 wal_retrieve_retry_interval);
 	}
Author	SHA1	Message	Date
Ian Barwick	b4ca6851ab	Bump version number 4.3	2019-04-01 15:25:48 +09:00
Ian Barwick	347948b79f	Fix default return value in alter_system_int()	2019-04-01 14:52:37 +09:00
Ian Barwick	83e492d4ef	Add is_server_available_quiet() For use in cases where the caller collates node availability information and doesn't want to prematurely emit log output.	2019-04-01 12:24:57 +09:00
Ian Barwick	1906ea89bd	Improve copying of strings from database results Where feasible, specify the maximum string length via sizeof(), and use snprintf() in place of strncpy().	2019-04-01 11:29:16 +09:00
Ian Barwick	eab4fd2795	Handle unhandled error situation in enable_wal_receiver()	2019-04-01 11:03:47 +09:00
Ian Barwick	3f1fe9b6c2	Update BDR repmgrd to handle node_name as a max 63 char string Follow-up from commit `1953ec7`.	2019-03-28 14:29:03 +09:00
Ian Barwick	e672f7e3ee	Handle potential NULL return from string_skip_prefix()	2019-03-28 12:46:03 +09:00
Ian Barwick	fd86160dff	Add missing break	2019-03-28 12:45:12 +09:00
Ian Barwick	f19cf62f09	Update code comment	2019-03-28 12:45:09 +09:00
Ian Barwick	8018ba97d6	Remove logically dead code	2019-03-28 12:36:05 +09:00
Ian Barwick	73554c6e16	Prevent potential file descriptor resource leak	2019-03-28 12:29:42 +09:00
Ian Barwick	f23a93e12d	Put closedir call in correct location	2019-03-28 12:10:16 +09:00
Ian Barwick	d9947a46e8	Add various missing close() calls	2019-03-28 12:10:12 +09:00
Ian Barwick	e3a632e29d	Use correct argument for sizeof()	2019-03-28 11:04:57 +09:00
Ian Barwick	939cbd0721	Cast "int" to "long long"	2019-03-28 11:04:53 +09:00
Ian Barwick	c45c5abfb8	doc: note valid characters for "node_name" "node_name" will be used as "application_name", so should only contain characters valid for that; see: https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-APPLICATION-NAME Not yet enforced.	2019-03-28 10:58:23 +09:00
Ian Barwick	1953ec7459	Restrict "node_name" to maximum 63 characters In "recovery.conf", the configuration parameter "node_name" is used as the "application_name" value, which will be truncated by PostgreSQL to 63 characters (NAMEDATALEN - 1). repmgr sometimes needs to be able to extract the application name from pg_stat_replication to determine if a node is connected (e.g. when executing "repmgr standby register"), so the comparison will fail if "node_name" exceeds 63 characters.	2019-03-28 10:58:18 +09:00
Ian Barwick	a6eacca6e4	standby register: fail if --upstream-node-id is the local node ID	2019-03-27 14:27:59 +09:00
Ian Barwick	948e076ad9	log_db_error(): fix formatted message handling	2019-03-27 14:27:55 +09:00
Ian Barwick	a3bd9d33ff	Use sizeof(buf) rather than hard-coding value	2019-03-27 14:27:50 +09:00
Ian Barwick	9dc928a7d5	repmgrd: clean up PQExpBuffer handling Unless the PQExpBuffer is required for the duration of the function, ensure it's always a variable local to the relevant code block. This mitigates the risk of accidentally accessing a generically named PQExpBuffer which hasn't been initialised or was previously terminated.	2019-03-26 13:39:00 +09:00
Ian Barwick	9acf7bdfea	repmgrd: don't terminate uninitialized PQExpBuffer	2019-03-26 13:38:55 +09:00
Ian Barwick	29acd10f37	doc: update release notes	2019-03-22 15:42:12 +09:00
Ian Barwick	9df511eee3	doc: fix syntax	2019-03-22 15:41:44 +09:00
Ian Barwick	6441db23ff	repmgrd: during failover, check if a node was already promoted Previously, repmgrd assumed that during a failover, there would not already be another primary node. However it's possible a node was promoted manually. While this is not a desirable situation, it's conceivable this could happen in the wild, so we should check for it and react accordingly. Also sanity-check that the follow target can actually be followed. Addresses issue raised in GitHub #420.	2019-03-22 15:15:49 +09:00
Ian Barwick	7792de3543	standby follow: set replication user when connecting to local node	2019-03-22 10:12:35 +09:00
Ian Barwick	94fe3e395e	standby switchover: don't attempt to pause repmgrd on unreachable nodes	2019-03-22 10:12:28 +09:00
Ian Barwick	ff26173b1e	doc: add note about compiling against Pg11 and later with the --with-llvm option	2019-03-22 10:12:23 +09:00
Ian Barwick	4c11a57334	use a constant to denote unknown replication lag	2019-03-22 10:12:19 +09:00
Ian Barwick	1d2d6e3587	doc: consolidate witness server documentation	2019-03-20 16:30:09 +09:00
Ian Barwick	c03913d32a	doc: various improvements to repmgrd documentation	2019-03-20 16:10:38 +09:00
Ian Barwick	37a41a66f9	Check node recovery type before attempting to write an event record In some corner cases (e.g. immediately after a switchover) where the current primary has not yet been determined, the provided connection might not be writeable. This prevents error messages such as "cannot execute INSERT in a read-only transaction" generating unnecessary noise in the logs.	2019-03-20 12:14:53 +09:00
Ian Barwick	4c2c8ecbab	Fix logging related to "connection_check_type" Also log the selected type at repmgrd startup.	2019-03-20 12:13:51 +09:00
Ian Barwick	b84b6180ee	repmgrd: improve witness node monitoring Mainly fix a couple of places where "standby" was hard-coded into a log message which can apply either to a witness or a standby.	2019-03-20 12:13:47 +09:00
Ian Barwick	58f55222d9	Explicitly log PQping() failures	2019-03-20 12:13:44 +09:00
Ian Barwick	5cbaff8d0a	Improve database connection failure logging Log the output of PQerrorMessage() in a couple of places where it was missing. Additionally, always log the output of PQerrorMessage() starting with a blank line, otherwise the first line looks like it was emitted by repmgr, and it's harder to scan the error message. Before: [2019-03-20 11:24:15] [DETAIL] could not connect to server: Connection refused Is the server running on host "localhost" (::1) and accepting TCP/IP connections on port 5501? could not connect to server: Connection refused Is the server running on host "localhost" (127.0.0.1) and accepting TCP/IP connections on port 5501? After: [2019-03-20 11:27:21] [DETAIL] could not connect to server: Connection refused Is the server running on host "localhost" (::1) and accepting TCP/IP connections on port 5501? could not connect to server: Connection refused Is the server running on host "localhost" (127.0.0.1) and accepting TCP/IP connections on port 5501?	2019-03-20 12:13:40 +09:00
Ian Barwick	a38e229e61	check_primary_status(): handle case where recovery type unknown	2019-03-20 12:13:34 +09:00
Ian Barwick	272abdd483	Refactor check_primary_status() Reduce nested if/else branching, and improve documentation.	2019-03-20 12:13:08 +09:00
Ian Barwick	b4f6043abc	Update .gitignore Ignore artefacts from failed patch application.	2019-03-20 12:11:57 +09:00
Ian Barwick	a7f3f899ff	doc: update repmgrd example output	2019-03-20 12:10:31 +09:00