Fix HISTORY to show from newest to oldest

Fix tabs in HISTORY
Fix typos in RELEASE NOTES
2026-03-23 15:16:29 +00:00 · 2012-07-27 11:26:18 -05:00 · 2012-07-27 11:20:56 -05:00 · 2012-07-27 11:15:50 -05:00 · 2012-07-21 17:49:38 -05:00 · 2012-07-21 12:06:33 -05:00
29 changed files with 2585 additions and 540 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 *~
 *.o
+*.so
 repmgr
 repmgrd
 README.htm*
 README.pdf
+sql/repmgr_funcs.so
+sql/repmgr_funcs.sql
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-Copyright (c) 2010-2011, 2ndQuadrant Limited
+Copyright (c) 2010-2012, 2ndQuadrant Limited
 All rights reserved.

 This program is free software: you can redistribute it and/or modify
--- a/34
+++ b/34
@@ -1,5 +1,30 @@
-1.0.0   2010-12-05
-        First public release
+2.0beta 2012-07-27
+        Make CLONE command try to make an exact copy including $PGDATA location (Cedric) 
+        Add detection of master failure (Jaime)
+        Add the notion of a witness server (Jaime)
+        Add autofailover capabilities (Jaime)
+        Add a configuration parameter to indicate the script to execute on failover or follow (Jaime)
+        Make the monitoring optional and turned off by default, it can be turned on with --monitoring-history switch (Jaime)
+        Add tunables to specify number of retries to reconnect to master and the time between them (Jaime)
+
+1.2.0	2012-07-27
+		Test ssh connection before trying to rsync (Cédric)
+        Add CLUSTER SHOW command (Carlo)
+        Add CLUSTER CLEANUP command (Jaime)
+        Add function write_primary_conninfo (Marco)
+        Teach repmgr how to get tablespace's location in different pg version (Jaime)
+		Improve version message (Carlo)
+
+1.1.1	2012-04-18
+        Add --ignore-rsync-warning (Cédric)
+        Add strnlen for compatibility with OS X (Greg)
+        Improve performance of the repl_status view (Jaime)
+        Remove last argument from log_err (Jaime, Reported by Jeroen Dekkers)
+        Complete documentation about possible error conditions (Jaime)
+        Document how to clean history (Jaime)
+
+1.1.0   2011-03-09
+        Make options -U, -R and -p not mandatory (Jaime)

 1.1.0b1 2011-02-24 
        Fix missing "--force" option in help (Greg Smith)
@@ -28,6 +53,5 @@
        Map old verbose flag into a useful setting for the new logger (Greg)
        Document repmgrd startup restrictions and log info about them (Greg)

-1.1.0   2011-03-09
-        Make options -U, -R and -p not mandatory (Jaime)
-
+1.0.0   2010-12-05
+        First public release
--- a/12
+++ b/12
@@ -1,6 +1,6 @@
 #
 # Makefile
-# Copyright (c) 2ndQuadrant, 2010-2011
+# Copyright (c) 2ndQuadrant, 2010-2012

 repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o
 repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o
@@ -11,9 +11,11 @@ PG_CPPFLAGS = -I$(libpq_srcdir)
 PG_LIBS = $(libpq_pgport)

 all:  repmgrd repmgr
+	$(MAKE) -C sql

 repmgrd: $(repmgrd_OBJS)
 	$(CC) $(CFLAGS) $(repmgrd_OBJS) $(PG_LIBS) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o repmgrd
+	$(MAKE) -C sql

 repmgr: $(repmgr_OBJS)
 	$(CC) $(CFLAGS) $(repmgr_OBJS) $(PG_LIBS) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o repmgr
@@ -33,6 +35,7 @@ endif
 install:
 	$(INSTALL_PROGRAM) repmgrd$(X) '$(DESTDIR)$(bindir)'
 	$(INSTALL_PROGRAM) repmgr$(X) '$(DESTDIR)$(bindir)'
+	$(MAKE) -C sql install

 ifneq (,$(DATA)$(DATA_built))
 	@for file in $(addprefix $(srcdir)/, $(DATA)) $(DATA_built); do \
@@ -45,10 +48,17 @@ clean:
 	rm -f *.o
 	rm -f repmgrd
 	rm -f repmgr
+	$(MAKE) -C sql clean

 deb: repmgrd repmgr
 	mkdir -p ./debian/usr/bin
 	cp repmgrd repmgr ./debian/usr/bin/
+	mkdir -p ./debian/usr/share/postgresql/9.0/contrib/
+	cp sql/repmgr_funcs.sql ./debian/usr/share/postgresql/9.0/contrib/
+	cp sql/uninstall_repmgr_funcs.sql ./debian/usr/share/postgresql/9.0/contrib/
+	mkdir -p ./debian/usr/lib/postgresql/9.0/lib/
+	cp sql/repmgr_funcs.so ./debian/usr/lib/postgresql/9.0/lib/
 	dpkg-deb --build debian
 	mv debian.deb ../postgresql-repmgr-9.0_1.0.0.deb
+	rm -rf ./debian/usr

--- a/README.rst
+++ b/README.rst
@@ -5,7 +5,7 @@ repmgr: Replication Manager for PostgreSQL clusters
 Introduction
 ============

-PostgreSQL 9.0 allow us to have replicated Hot Standby servers 
+PostgreSQL 9+ allows us to have replicated Hot Standby servers
 which we can query and/or use for high availability.

 While the main components of the feature are included with
@@ -20,6 +20,17 @@ databases as a single cluster.  repmgr includes two components:
 * repmgrd: management and monitoring daemon that watches the cluster
  and can automate remote actions.

+Supported Releases
+------------------
+
+repmgr works with PostgreSQL version 9.0 and later.
+
+There are currently no incompatibilities when moving repmgr from PostgreSQL 9.0
+to 9.1, so your 9.0 configuration will work with 9.1.
+
+Additional parameters must be added to postgresql.conf to take advantage of
+the new 9.1 features such as synchronous replication or hot standby feedback.
+
 Requirements
 ------------

@@ -309,7 +320,7 @@ keys and a maching authorization file to a privledged user on the other system::
  [postgres@node1]$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
  [postgres@node1]$ chmod go-rwx ~/.ssh/*
  [postgres@node1]$ cd ~/.ssh
-  [postgres@node1]$ scp id_rsa.pub id_rsa authorized_keys postgres@node2:
+  [postgres@node1]$ scp id_rsa.pub id_rsa authorized_keys user@node2:

 Login as a user on the other system, and install the files into the postgres 
 user's account::
@@ -378,7 +389,7 @@ walkthrough assumes the following setup:

 * Another standby server called "node3" with a similar configuration to "node2".

-* The Postgress installation in each of the above is defined as $PGDATA, 
+* The Postgres installation in each of the above is defined as $PGDATA, 
  which is represented here as ``/var/lib/pgsql/9.0/data``
  
 Creating some sample data
@@ -503,12 +514,14 @@ following the standard directory structure of a RHEL system.  It should contain:

  cluster=test
  node=1
+  node_name=earth
  conninfo='host=node1 user=repmgr dbname=pgbench'

 On "node2" create the file ``/var/lib/pgsql/repmgr/repmgr.conf`` with::

  cluster=test
  node=2
+  node_name=mars
  conninfo='host=node2 user=repmgr dbname=pgbench'

 The STANDBY CLONE process should have created a recovery.conf file on
@@ -701,12 +714,14 @@ and it should contain::

  cluster=test
  node=1
+  node_name=earth
  conninfo='host=127.0.0.1 dbname=testdb'

 On "standby" create the file ``/home/standby/repmgr/repmgr.conf`` with::

  cluster=test
  node=2
+  node_name=mars
  conninfo='host=127.0.0.1 dbname=testdb'

 Next, with "prime" server running, we want to use the ``clone standby`` command
@@ -814,6 +829,22 @@ and on "prime."

 The servers are now again acting as primary on "prime" and standby on "standby".

+Maintenance of monitor history
+------------------------------
+
+Once you have changed roles (with a failover or to restore the original roles)
+you will end up with some records saying that node1 is the primary and other
+records saying that node2 is the primary, which can be confusing.
+Also, if you don't do anything about it the monitor history will keep growing.
+For both of those reasons you will sometimes want to do some maintenance on the
+``repl_monitor`` table.
+
+If you want to clean the history after a few days you can execute the  
+CLUSTER CLEANUP command in a cron. For example to keep just one day of history
+you can put this in your crontab::
+
+  0 1 * * *   repmgr cluster cleanup -k 1 -f ~/repmgr.conf
+
 Configuration and command reference
 ===================================

@@ -862,6 +893,7 @@ The output from this program looks like this::
    -f, --config_file=PATH     path to the configuration file
    -R, --remote-user=USERNAME database server username for rsync
    -w, --wal-keep-segments=VALUE  minimum value for the GUC wal_keep_segments (default: 5000)
+    -I, --ignore-rsync-warning ignore rsync partial transfer warning
    -F, --force                force potentially dangerous operations to happen

  repmgr performs some tasks like clone a node, promote it or making follow another node and then exits.
@@ -941,6 +973,26 @@ its port if is different from the default one.

        ./repmgr standby follow

+* cluster show 
+
+    * Shows the role (standby/master) and connection string for all nodes configured 
+      in the cluster or "FAILED" if the node doesn't respond. This allows us to know
+      which nodes are alive and which ones need attention, and to get an idea of the
+      structure of a cluster we have only just been given access to.  Example::
+
+        ./repmgr cluster show
+
+* cluster cleanup 
+
+    * Cleans the monitor's history from repmgr tables. This keeps the repl_monitor table
+      from growing excessively, which in turn affects repl_status view performance, and
+      also keeps the disk space used by repmgr under control. This command can be run
+      manually or from cron to do the cleanup periodically.
+      There is also a --keep-history (-k) option to indicate how many days of history we
+      want to keep, so the command will clean up history older than "keep-history" days. Example::
+
+        ./repmgr cluster cleanup -k 2
+
 repmgrd Daemon
 --------------

@@ -961,6 +1013,7 @@ The output from this program looks like this::
    --help                    show this help, then exit
    --version                 output version information, then exit
    --verbose                 output verbose activity information
+    --monitoring-history      track advance or lag of the replication in every standby in repl_monitor
    -f, --config_file=PATH    database to connect to
  
  repmgrd monitors a cluster of servers.
@@ -991,6 +1044,10 @@ Lag monitoring
 repmgrd helps monitor a set of master and standby servers.  You can
 see which node is the current master, as well as how far behind each
 is from current.
+To activate the monitor capabilities of repmgr you must include the
+option --monitoring-history when running it::
+
+  repmgrd --monitoring-history --config-file=/path/to/repmgr.conf &

 To look at the current lag between primary and each node listed
 in ``repl_node``, consult the ``repl_status`` view::
@@ -1028,10 +1085,16 @@ License and Contributions
 =========================

 repmgr is licensed under the GPL v3.  All of its code and documentation is
-Copyright 2010-2011, 2ndQuadrant Limited.  See the files COPYRIGHT and LICENSE for
+Copyright 2010-2012, 2ndQuadrant Limited.  See the files COPYRIGHT and LICENSE for
 details.

-Contributions to repmgr are welcome, and listed in the file CREDITS.
+Main sponsorship of repmgr has been from 2ndQuadrant customers.
+
+Additional work has been sponsored by the 4CaaST project for cloud computing,
+which has received funding from the European Union's Seventh Framework Programme
+(FP7/2007-2013) under grant agreement 258862.
+
+Contributions to repmgr are welcome, and will be listed in the file CREDITS.
 2ndQuadrant Limited requires that any contributions provide a copyright
 assignment and a disclaimer of any work-for-hire ownership claims from the
 employer of the developer.  This lets us make sure that all of the repmgr
@@ -1047,3 +1110,35 @@ Code in repmgr is formatted to a consistent style using the following command::

 Contributors should reformat their code similarly before submitting code to
 the project, in order to minimize merge conflicts with other work.
+
+Support and Assistance
+======================
+
+2ndQuadrant provides 24x7 production support for repmgr, and can also help you
+configure it correctly, verify an installation and train you in running a
+robust replication cluster.
+
+There is a mailing list/forum to discuss contributions or issues
+http://groups.google.com/group/repmgr
+
+#repmgr is registered in freenode IRC
+
+Further information is available at http://www.repmgr.org/
+
+We'd love to hear from you about how you use repmgr. Case studies and
+news are always welcome. Send us an email at info@2ndQuadrant.com, or
+send a postcard to
+
+repmgr
+c/o 2ndQuadrant
+7200 The Quorum
+Oxford Business Park North
+Oxford
+OX4 2JZ
+
+Thanks from the repmgr core team
+
+Jaime Casanova
+Simon Riggs
+Greg Smith
+Cedric Villemain
--- a/autofailover_quick_setup.rst
+++ b/autofailover_quick_setup.rst
@@ -0,0 +1,213 @@
+=====================================================
+ PostgreSQL Automatic Fail-Over - User Documentation
+=====================================================
+
+Automatic Failover
+==================
+
+repmgr allows setups for automatic failover when it detects the failure of the master node.
+Following is a quick setup for this.
+
+Installation
+============
+
+For convenience, we define:
+
+  * node1 is the fully qualified hostname of the Master server, IP 192.168.1.10
+  * node2 is the fully qualified hostname of the Standby server, IP 192.168.1.11
+  * witness is the fully qualified hostname of the server used as witness, IP 192.168.1.12
+
+:Note: It is not recommended to use names that describe the status of a server, like
+       «masterserver»; such a name leads to confusion once a failover takes place and
+       the Master is now on the «standbyserver».
+
+Summary
+-------
+
+2 PostgreSQL servers are involved in the replication.  Automatic fail-over needs
+a vote to decide which server it should promote, thus an odd number is required
+and a witness-repmgrd is installed on a third server where it uses a PostgreSQL
+cluster to communicate with other repmgrd daemons.
+
+1. Install PostgreSQL in all the servers involved (including the server used for
+witness)
+2. Install repmgr in all the servers involved (including the server used for witness)
+3. Configure the Master PostgreSQL
+4. Clone the Master to the Standby using "repmgr standby clone" command
+5. Configure repmgr in all the servers involved (including the server used for witness)
+6. Register Master and Standby nodes
+7. Initiate witness server
+8. Start the repmgrd daemons in all nodes
+
+:Note: A complete High-Availability design needs at least 3 servers to still have
+       a backup node after a first failure.
+
+Install PostgreSQL
+------------------
+
+You can install PostgreSQL using any of the recommended methods. You should ensure
+it's 9.0 or later.
+
+Install repmgr
+--------------
+
+Install repmgr following the steps in the README.
+
+Configure PostgreSQL
+--------------------
+
+Log in node1.
+
+Edit the file postgresql.conf and modify the parameters::
+
+  listen_addresses='*'
+  wal_level = 'hot_standby'
+  archive_mode = on
+  archive_command = 'cd .'	 # we can also use exit 0, anything that 
+                             # just does nothing
+  max_wal_senders = 10
+  wal_keep_segments = 5000   # 80 GB required on pg_xlog
+  hot_standby = on
+  shared_preload_libraries = 'repmgr_funcs'
+
+Edit the file pg_hba.conf and add lines for the replication::
+
+  host     repmgr           repmgr      127.0.0.1/32            trust
+  host     repmgr           repmgr      192.168.1.10/30         trust
+  host     replication      all         192.168.1.10/30         trust
+
+:Note: It is also possible to use a password authentication (md5), .pgpass file
+       should be edited to allow connection between each node.
+
+Create the user and database to manage replication::
+
+  su - postgres
+  createuser -s repmgr
+  createdb -O repmgr repmgr
+  psql -f /usr/share/postgresql/9.0/contrib/repmgr_funcs.sql repmgr
+
+Restart the PostgreSQL server::
+
+  pg_ctl -D $PGDATA restart
+
+And check everything is fine in the server log.
+
+Create the ssh-key for the postgres user and copy it to other servers::
+
+  su - postgres
+  ssh-keygen             # /!\ do not use a passphrase /!\
+  cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
+  chmod 600 ~/.ssh/authorized_keys
+  exit
+  rsync -avz ~postgres/.ssh/authorized_keys node2:~postgres/.ssh/
+  rsync -avz ~postgres/.ssh/authorized_keys witness:~postgres/.ssh/
+  rsync -avz ~postgres/.ssh/id_rsa* node2:~postgres/.ssh/
+  rsync -avz ~postgres/.ssh/id_rsa* witness:~postgres/.ssh/
+
+Clone Master
+------------
+
+Log in node2.
+
+Clone the node1 (the current Master)::
+
+  su - postgres
+  repmgr -d repmgr -U repmgr standby clone node1
+
+Start the PostgreSQL server::
+
+  pg_ctl -D $PGDATA start
+
+And check everything is fine in the server log.
+
+Configure repmgr
+----------------
+
+Log in each server and configure repmgr by editing the file
+/etc/repmgr/repmgr.conf::
+
+  cluster=my_cluster
+  node=1
+  node_name=earth
+  conninfo='host=192.168.1.10 dbname=repmgr user=repmgr'
+  master_response_timeout=60
+  reconnect_attempts=6
+  reconnect_interval=10
+  failover=automatic
+  promote_command='promote_command.sh'
+  follow_command='repmgr standby follow -f /etc/repmgr/repmgr.conf'
+
+* *cluster* is the name of the current replication.
+* *node* is the number of the current node (1, 2 or 3 in the current example).
+* *node_name* is an identifier for every node.
+* *conninfo* is used to connect to the local PostgreSQL server (where the configuration file is) from any node. In the witness server configuration you need to add 'port=5499' to the conninfo.
+* *master_response_timeout* is the maximum amount of time we are going to wait before deciding the master has died and starting the failover procedure.
+* *reconnect_attempts* is the number of times we will try to reconnect to the master after a failure has been detected and before starting the failover procedure.
+* *reconnect_interval* is the amount of time between retries to reconnect to the master after a failure has been detected and before starting the failover procedure.
+* *failover* configures the failover behavior: *manual* or *automatic*.
+* *promote_command* the command executed to do the failover (including the PostgreSQL failover itself). The command must return 0 on success.
+* *follow_command* the command executed to address the current standby to another Master. The command must return 0 on success.
+
+Register Master and Standby
+---------------------------
+
+Log in node1.
+
+Register the node as Master::
+
+  su - postgres
+  repmgr -f /etc/repmgr/repmgr.conf master register
+
+Log in node2.
+
+Register the node as Standby::
+
+  su - postgres
+  repmgr -f /etc/repmgr/repmgr.conf standby register
+
+Initialize witness server
+-------------------------
+
+Log in witness.
+
+Initialize the witness server::
+
+  su - postgres
+  repmgr -d repmgr -U repmgr -h 192.168.1.10 -D $WITNESS_PGDATA -f /etc/repmgr/repmgr.conf witness create node1
+
+It needs information to connect to the master to copy the configuration of the cluster, and it also needs to know where it should initialize its own $PGDATA.
+As part of the process it also asks for the superuser password so it can connect when needed.
+
+Start the repmgrd daemons
+-------------------------
+
+Log in node2 and witness, and start the daemon on each::
+
+  su - postgres
+  repmgrd -f /etc/repmgr/repmgr.conf > /var/log/postgresql/repmgr.log 2>&1
+
+:Note: The Master does not need a repmgrd daemon.
+
+
+Suspend Automatic behavior
+==========================
+
+Edit the repmgr.conf of the node to remove from automatic processing and change::
+
+	failover=manual
+
+Then, signal repmgrd daemon::
+
+	su - postgres
+	kill -HUP `pidof repmgrd`
+
+TODO : -HUP configuration update is not implemented and it should check its
+	   configuration file against its configuration in DB, updating
+	   accordingly the SQL conf (especially the failover manual or auto).
+	   This allows witness-standby and standby-not-promotable features
+	   and simpler usage of the tool ;)
+
+Usage
+=====
+
+The repmgr documentation is in the README file (how to build, options, etc.)
--- a/check_dir.c
+++ b/check_dir.c
@@ -1,6 +1,6 @@
 /*
 * check_dir.c - Directories management functions
- * Copyright (C) 2ndQuadrant, 2010-2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -31,8 +31,6 @@
 #include "strutil.h"
 #include "log.h"

-static int mkdir_p(char *path, mode_t omode);
-
 /*
 * make sure the directory either doesn't exist or is empty
 * we use this function to check the new data directory and
@@ -124,15 +122,15 @@ set_directory_permissions(char *dir)
 * note that on failure, the path arg has been modified to show the particular
 * directory level we had problems with.
 */
-static int
+int
 mkdir_p(char *path, mode_t omode)
 {
 	struct stat sb;
 	mode_t		numask,
-	         oumask;
+	oumask;
 	int			first,
-	        last,
-	        retval;
+	last,
+	retval;
 	char	   *p;

 	p = path;
@@ -225,8 +223,85 @@ is_pg_dir(char *dir)
 	const size_t buf_sz = 8192;
 	char		 path[buf_sz];
 	struct stat	 sb;
+	int		r;

+	// test pgdata
 	xsnprintf(path, buf_sz, "%s/PG_VERSION", dir);
+	if (stat(path, &sb) == 0)
+		return true;

-	return (stat(path, &sb) == 0) ? true : false;
+	// test tablespace dir
+	sprintf(path, "ls %s/PG_*/ -I*", dir);
+	r = system(path);
+	if (r == 0)
+		return true;
+
+	return false;
+}
+
+
+/*
+ * create_pgdir - make sure "dir" can be used as a PGDATA directory.
+ *
+ * What happens depends on the value check_dir() reports for "dir":
+ *   0      the directory is missing and gets created
+ *   1      the directory is empty and gets its permissions corrected
+ *   2      the directory has content; it is accepted only when is_pg_dir()
+ *          recognises it and "force" is set
+ *   other  the directory could not be accessed
+ *
+ * Returns true when the directory can be used, false when it is not empty
+ * and is_pg_dir() does not recognise it.  Every other failure logs a
+ * message and terminates the process with ERR_BAD_CONFIG.
+ */
+bool
+create_pgdir(char *dir, bool force)
+{
+	int		dir_state = check_dir(dir);
+
+	if (dir_state == 0)
+	{
+		/* The directory is not there yet, so it has to be created */
+		log_info(_("creating directory \"%s\"...\n"), dir);
+
+		if (create_directory(dir))
+			return true;
+
+		log_err(_("couldn't create directory \"%s\"...\n"),
+		        dir);
+		exit(ERR_BAD_CONFIG);
+	}
+
+	if (dir_state == 1)
+	{
+		/* It exists and is empty: correct its permissions and use it */
+		log_info(_("checking and correcting permissions on existing directory %s ...\n"),
+		         dir);
+
+		if (set_directory_permissions(dir))
+			return true;
+
+		log_err(_("could not change permissions of directory \"%s\": %s\n"),
+		        dir, strerror(errno));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	if (dir_state == 2)
+	{
+		/* It exists and already holds something */
+		log_warning(_("directory \"%s\" exists but is not empty\n"),
+		            dir);
+
+		/* Content that is not a PostgreSQL directory is left alone */
+		if (!is_pg_dir(dir))
+			return false;
+
+		/*
+		 * With force the existing PostgreSQL directory is reused, which
+		 * reduces the time needed to restore a node that turned async
+		 * after a failover or anything else.
+		 */
+		if (force)
+			return true;
+
+		log_warning(_("\nThis looks like a PostgreSQL directory.\n"
+		              "If you are sure you want to clone here, "
+		              "please check there is no PostgreSQL server "
+		              "running and use the --force option\n"));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	/* Any other state means the directory could not be accessed */
+	log_err(_("could not access directory \"%s\": %s\n"),
+	        dir, strerror(errno));
+	exit(ERR_BAD_CONFIG);
+
+	/* not reached: exit() does not return */
+	return true;
+}
--- a/check_dir.h
+++ b/check_dir.h
@@ -1,6 +1,6 @@
 /*
 * check_dir.h
- * Copyright (c) 2ndQuadrant, 2010-2011
+ * Copyright (c) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -20,9 +20,11 @@
 #ifndef _REPMGR_CHECK_DIR_H_
 #define _REPMGR_CHECK_DIR_H_

+int mkdir_p(char *path, mode_t omode);
 int  check_dir(char *dir);
 bool create_directory(char *dir);
 bool set_directory_permissions(char *dir);
 bool is_pg_dir(char *dir);
+bool create_pgdir(char *dir, bool force);

 #endif
--- a/config.c
+++ b/config.c
@@ -1,6 +1,6 @@
 /*
 * config.c - Functions to parse the config file
- * Copyright (C) 2ndQuadrant, 2010-2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -18,11 +18,12 @@
 */

 #include "config.h"
-#include "repmgr.h"
+#include "log.h"
 #include "strutil.h"
+#include "repmgr.h"

 void
-parse_config(const char* config_file, t_configuration_options* options)
+parse_config(const char *config_file, t_configuration_options *options)
 {
 	char *s, buff[MAXLINELENGTH];
 	char name[MAXLEN];
@@ -34,15 +35,27 @@ parse_config(const char* config_file, t_configuration_options* options)
 	memset(options->cluster_name, 0, sizeof(options->cluster_name));
 	options->node = -1;
 	memset(options->conninfo, 0, sizeof(options->conninfo));
+	options->failover = MANUAL_FAILOVER;
+	options->priority = 0;
+	memset(options->node_name, 0, sizeof(options->node_name));
+	memset(options->promote_command, 0, sizeof(options->promote_command));
+	memset(options->follow_command, 0, sizeof(options->follow_command));
 	memset(options->rsync_options, 0, sizeof(options->rsync_options));

+	/* if nothing has been provided defaults to 60 */
+	options->master_response_timeout = 60;
+
+	/* it defaults to 6 retries with a time between retries of 10s */
+	options->reconnect_attempts = 6;
+	options->reconnect_intvl = 10;
+
 	/*
 	 * Since some commands don't require a config file at all, not
 	 * having one isn't necessarily a problem.
 	 */
 	if (fp == NULL)
 	{
-		fprintf(stderr, _("Did not find the configuration file '%s', continuing\n"), config_file);
+		log_err(_("Did not find the configuration file '%s', continuing\n"), config_file);
 		return;
 	}

@@ -69,8 +82,37 @@ parse_config(const char* config_file, t_configuration_options* options)
 			strncpy (options->loglevel, value, MAXLEN);
 		else if (strcmp(name, "logfacility") == 0)
 			strncpy (options->logfacility, value, MAXLEN);
+		else if (strcmp(name, "failover") == 0)
+		{
+			char failoverstr[MAXLEN];
+			strncpy(failoverstr, value, MAXLEN);
+
+			if (strcmp(failoverstr, "manual") == 0)
+				options->failover = MANUAL_FAILOVER;
+			else if (strcmp(failoverstr, "automatic") == 0)
+				options->failover = AUTOMATIC_FAILOVER;
+			else
+			{
+				log_warning(_("value for failover option is incorrect, it should be automatic or manual. Defaulting to manual.\n"));
+				options->failover = MANUAL_FAILOVER;
+			}
+		}
+		else if (strcmp(name, "priority") == 0)
+			options->priority = atoi(value);
+		else if (strcmp(name, "node_name") == 0)
+			strncpy(options->node_name, value, MAXLEN);
+		else if (strcmp(name, "promote_command") == 0)
+			strncpy(options->promote_command, value, MAXLEN);
+		else if (strcmp(name, "follow_command") == 0)
+			strncpy(options->follow_command, value, MAXLEN);
+		else if (strcmp(name, "master_response_timeout") == 0)
+			options->master_response_timeout = atoi(value);
+		else if (strcmp(name, "reconnect_attempts") == 0)
+			options->reconnect_attempts = atoi(value);
+		else if (strcmp(name, "reconnect_interval") == 0)
+			options->reconnect_intvl = atoi(value);
 		else
-			printf ("WARNING: %s/%s: Unknown name/value pair!\n", name, value);
+			log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
 	}

 	/* Close file */
@@ -79,19 +121,36 @@ parse_config(const char* config_file, t_configuration_options* options)
 	/* Check config settings */
 	if (strnlen(options->cluster_name, MAXLEN)==0)
 	{
-		fprintf(stderr, "Cluster name is missing. "
-		        "Check the configuration file.\n");
+		log_err(_("Cluster name is missing. Check the configuration file.\n"));
 		exit(ERR_BAD_CONFIG);
 	}

 	if (options->node == -1)
 	{
-		fprintf(stderr, "Node information is missing. "
-		        "Check the configuration file.\n");
+		log_err(_("Node information is missing. Check the configuration file.\n"));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	if (options->master_response_timeout <= 0)
+	{
+		log_err(_("Master response timeout must be greater than zero. Check the configuration file.\n"));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	if (options->reconnect_attempts < 0)
+	{
+		log_err(_("Reconnect attempts must be zero or greater. Check the configuration file.\n"));
+		exit(ERR_BAD_CONFIG);
+	}
+
+	if (options->reconnect_intvl <= 0)
+	{
+		log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
 		exit(ERR_BAD_CONFIG);
 	}
 }

+
 char *
 trim (char *s)
 {
@@ -144,3 +203,99 @@ parse_line(char *buff, char *name, char *value)
 	value[j] = '\0';
 	trim(value);
 }
+
+/*
+ * reload_configuration - re-read config_file and, when the new settings are
+ * acceptable, copy them over *orig_options.
+ *
+ * The cluster name, node number and node name are not allowed to change on
+ * reload, the timeout/retry settings must be in range, and the new conninfo
+ * must yield a working connection.
+ *
+ * Returns true when *orig_options was updated and false when the current
+ * configuration was kept (a warning saying why is logged).  Note that
+ * parse_config() itself calls exit(ERR_BAD_CONFIG) for some invalid files,
+ * so a reload can also terminate the process.
+ */
+bool
+reload_configuration(char *config_file, t_configuration_options *orig_options)
+{
+	PGconn	*conn;
+
+	t_configuration_options new_options;
+
+	/*
+	 * Re-read the configuration file: repmgr.conf
+	 */
+	log_info(_("Reloading configuration file and updating repmgr tables\n"));
+	parse_config(config_file, &new_options);
+
+	/* parse_config() leaves node at -1 when the file could not be opened */
+	if (new_options.node == -1)
+	{
+		log_warning(_("\nCannot load new configuration, will keep current one.\n"));
+		return false;
+	}
+
+	if (strcmp(new_options.cluster_name, orig_options->cluster_name) != 0)
+	{
+		log_warning(_("\nCannot change cluster name, will keep current configuration.\n"));
+		return false;
+	}
+
+	if (new_options.node != orig_options->node)
+	{
+		log_warning(_("\nCannot change node number, will keep current configuration.\n"));
+		return false;
+	}
+
+	/*
+	 * node_name is a char array, so its contents must be compared with
+	 * strcmp().  Comparing the two arrays with != only compares their
+	 * addresses, which always differ, and would reject every reload.
+	 */
+	if (strcmp(new_options.node_name, orig_options->node_name) != 0)
+	{
+		log_warning(_("\nCannot change standby name, will keep current configuration.\n"));
+		return false;
+	}
+
+	if (new_options.failover != MANUAL_FAILOVER && new_options.failover != AUTOMATIC_FAILOVER)
+	{
+		log_warning(_("\nNew value for failover is not valid. Should be MANUAL or AUTOMATIC.\n"));
+		return false;
+	}
+
+	if (new_options.master_response_timeout <= 0)
+	{
+		log_warning(_("\nNew value for master_response_timeout is not valid. Should be greater than zero.\n"));
+		return false;
+	}
+
+	if (new_options.reconnect_attempts < 0)
+	{
+		log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
+		return false;
+	}
+
+	if (new_options.reconnect_intvl < 0)
+	{
+		log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
+		return false;
+	}
+
+	/* Test conninfo string */
+	conn = establishDBConnection(new_options.conninfo, false);
+	if (!conn || (PQstatus(conn) != CONNECTION_OK))
+	{
+		log_warning(_("\nconninfo string is not valid, will keep current configuration.\n"));
+
+		/*
+		 * A connection attempt that failed still owns a PGconn object, so
+		 * release it here instead of leaking it on every bad reload.
+		 * NOTE(review): assumes establishDBConnection() hands back the
+		 * failed connection unfinished when exit_on_error is false, as
+		 * the PQstatus() check above implies -- confirm.
+		 */
+		if (conn)
+			PQfinish(conn);
+		return false;
+	}
+	PQfinish(conn);
+
+	/* Configuration seems ok, will load new values */
+	strcpy(orig_options->cluster_name, new_options.cluster_name);
+	orig_options->node = new_options.node;
+	strcpy(orig_options->conninfo, new_options.conninfo);
+	orig_options->failover = new_options.failover;
+	orig_options->priority = new_options.priority;
+	strcpy(orig_options->node_name, new_options.node_name);
+	strcpy(orig_options->promote_command, new_options.promote_command);
+	strcpy(orig_options->follow_command, new_options.follow_command);
+	strcpy(orig_options->rsync_options, new_options.rsync_options);
+	orig_options->master_response_timeout = new_options.master_response_timeout;
+	orig_options->reconnect_attempts = new_options.reconnect_attempts;
+	orig_options->reconnect_intvl = new_options.reconnect_intvl;
+	/*
+	 * XXX These ones can change with a simple SIGHUP?
+
+		strcpy (orig_options->loglevel, new_options.loglevel);
+		strcpy (orig_options->logfacility, new_options.logfacility);
+
+		logger_shutdown();
+		XXX do we have progname here ?
+		logger_init(progname, orig_options.loglevel, orig_options.logfacility);
+	*/
+
+	return true;
+}
--- a/config.h
+++ b/config.h
@@ -1,6 +1,6 @@
 /*
 * config.h
- * Copyright (c) 2ndQuadrant, 2010-2011
+ * Copyright (c) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -28,13 +28,22 @@ typedef struct
 	char cluster_name[MAXLEN];
 	int node;
 	char conninfo[MAXLEN];
+	int failover;
+	int priority;
+	char node_name[MAXLEN];
+	char promote_command[MAXLEN];
+	char follow_command[MAXLEN];
 	char loglevel[MAXLEN];
 	char logfacility[MAXLEN];
 	char rsync_options[QUERY_STR_LEN];
+	int  master_response_timeout;
+	int  reconnect_attempts;
+	int  reconnect_intvl;
 } t_configuration_options;

-void parse_config(const char* config_file, t_configuration_options* options);
+void parse_config(const char *config_file, t_configuration_options *options);
 void parse_line(char *buff, char *name, char *value);
 char *trim(char *s);
+bool reload_configuration(char *config_file, t_configuration_options *orig_options);

 #endif
--- a/dbutils.c
+++ b/dbutils.c
@@ -1,6 +1,6 @@
 /*
 * dbutils.c - Database connection/management functions
- * Copyright (C) 2ndQuadrant, 2010-2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -17,6 +17,8 @@
 *
 */

+#include <unistd.h>
+
 #include "repmgr.h"
 #include "strutil.h"
 #include "log.h"
@@ -25,7 +27,12 @@ PGconn *
 establishDBConnection(const char *conninfo, const bool exit_on_error)
 {
 	/* Make a connection to the database */
-	PGconn *conn = PQconnectdb(conninfo);
+	PGconn *conn = NULL;
+	char    connection_string[MAXLEN];
+
+	/* bounded formatting: strcpy+strcat could run past MAXLEN with a long conninfo */
+	maxlen_snprintf(connection_string, "%s fallback_application_name='repmgr'", conninfo);
+	conn = PQconnectdb(connection_string);

 	/* Check to see that the backend connection was successfully made */
 	if ((PQstatus(conn) != CONNECTION_OK))
@@ -68,7 +75,7 @@ bool
 is_standby(PGconn *conn)
 {
 	PGresult   *res;
-	bool		result;
+	bool		result = false;

 	res = PQexec(conn, "SELECT pg_is_in_recovery()");

@@ -81,9 +88,7 @@ is_standby(PGconn *conn)
 		exit(ERR_DB_QUERY);
 	}

-	if (strcmp(PQgetvalue(res, 0, 0), "f") == 0)
-		result = false;
-	else
+	if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0)
 		result = true;

 	PQclear(res);
@@ -91,6 +96,84 @@ is_standby(PGconn *conn)
 }


+/* Tell whether node_id is flagged as witness in <schema>.repl_nodes; exits if the query fails */
+bool
+is_witness(PGconn *conn, char *schema, char *cluster, int node_id)
+{
+	char		sqlquery[QUERY_STR_LEN];
+	PGresult   *rs;
+	bool		flagged;
+
+	sqlquery_snprintf(sqlquery, "SELECT witness from %s.repl_nodes where cluster = '%s' and id = %d",
+	                  schema, cluster, node_id);
+	rs = PQexec(conn, sqlquery);
+
+	if (PQresultStatus(rs) != PGRES_TUPLES_OK)
+	{
+		log_err(_("Can't query server mode: %s"), PQerrorMessage(conn));
+		PQclear(rs);
+		PQfinish(conn);
+		exit(ERR_DB_QUERY);
+	}
+
+	/* only a single row whose value reads "t" counts as a witness */
+	flagged = (PQntuples(rs) == 1) && (strcmp(PQgetvalue(rs, 0, 0), "t") == 0);
+	PQclear(rs);
+	return flagged;
+}
+
+
+/* Probe conn: check PQstatus and run 'SELECT 1'; one PQreset retry is allowed before returning false */
+bool
+is_pgup(PGconn *conn, int timeout)
+{
+	char		sqlquery[QUERY_STR_LEN];
+	/* true once a PQreset has been attempted; a second failure returns false */
+	bool		twice = false;
+
+	/* Check the connection status twice in case it changes after reset */
+	for (;;)
+	{
+		if (PQstatus(conn) != CONNECTION_OK)
+		{
+			if (twice)
+				return false;
+			PQreset(conn);  // reconnect, then loop to check the status again
+			twice = true;
+		}
+		else
+		{
+			/*
+			* Cancel/drain whatever is pending, then send a SELECT 1 to check the connection
+			*/
+			CancelQuery(conn, timeout);
+			if (wait_connection_availability(conn, timeout) != 1)
+				goto failed;
+
+			sqlquery_snprintf(sqlquery, "SELECT 1");
+			if (PQsendQuery(conn, sqlquery) == 0)
+			{
+				log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"), 
+								PQerrorMessage(conn));
+				goto failed;
+			}
+			if (wait_connection_availability(conn, timeout) != 1)
+				goto failed;
+
+			break;
+
+failed:
+			// we need to retry, because we might have lost the connection just once
+			if (twice)
+				return false;
+			PQreset(conn);  // reconnect, then loop to run the probe again
+			twice = true;
+		}
+	}
+	return true;
+}
+
+
 /*
 * If postgreSQL version is 9 or superior returns the major version
 * if 8 or inferior returns an empty string
@@ -202,7 +285,7 @@ get_cluster_size(PGconn *conn)
 * connection string is placed there.
 */
 PGconn *
-getMasterConnection(PGconn *standby_conn, int id, char *cluster,
+getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
                    int *master_id, char *master_conninfo_out)
 {
 	PGconn		*master_conn	 = NULL;
@@ -211,7 +294,6 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster,
 	char		 sqlquery[QUERY_STR_LEN];
 	char		 master_conninfo_stack[MAXCONNINFO];
 	char		*master_conninfo = &*master_conninfo_stack;
-	char		 schema_str[MAXLEN];
 	char		 schema_quoted[MAXLEN];

 	int		 i;
@@ -228,10 +310,9 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster,
 	 *
 	 * Assemble the unquoted schema name
 	 */
-	maxlen_snprintf(schema_str, "repmgr_%s", cluster);
 	{
-		char *identifier = PQescapeIdentifier(standby_conn, schema_str,
-		                                      strlen(schema_str));
+		char *identifier = PQescapeIdentifier(standby_conn, schema,
+		                                      strlen(schema));

 		maxlen_snprintf(schema_quoted, "%s", identifier);
 		PQfreemem(identifier);
@@ -241,9 +322,9 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster,
 	log_info(_("finding node list for cluster '%s'\n"),
 	         cluster);

-	sqlquery_snprintf(sqlquery, "SELECT * FROM %s.repl_nodes "
-	                  " WHERE cluster = '%s' and id <> %d",
-	                  schema_quoted, cluster, id);
+	sqlquery_snprintf(sqlquery, "SELECT id, conninfo FROM %s.repl_nodes "
+	                  " WHERE cluster = '%s' and not witness",
+	                  schema_quoted, cluster);

 	res1 = PQexec(standby_conn, sqlquery);
 	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
@@ -259,7 +340,7 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster,
 	{
 		/* initialize with the values of the current node being processed */
 		*master_id = atoi(PQgetvalue(res1, i, 0));
-		strncpy(master_conninfo, PQgetvalue(res1, i, 2), MAXCONNINFO);
+		strncpy(master_conninfo, PQgetvalue(res1, i, 1), MAXCONNINFO);
 		log_info(_("checking role of cluster node '%s'\n"),
 		         master_conninfo);
 		master_conn = establishDBConnection(master_conninfo, false);
@@ -311,3 +392,55 @@ getMasterConnection(PGconn *standby_conn, int id, char *cluster,
 	PQclear(res1);
 	return NULL;
 }
+
+
+/*
+ * Wait until the current query finishes, ignoring any results; this could be
+ * an async command or the cancellation of a query. Polls with a one-second
+ * sleep between passes; "timeout" bounds the number of passes.
+ * Returns 1 if OK; 0 if any error occurred; -1 if the timeout was reached.
+ */
+int
+wait_connection_availability(PGconn *conn, int timeout)
+{
+	PGresult   *res;
+
+	while(timeout-- >= 0)
+	{
+		if (PQconsumeInput(conn) == 0)
+		{
+			log_warning(_("PQconsumeInput: Query could not be sent to primary. %s\n"),
+							PQerrorMessage(conn));
+			return 0;
+		}
+
+		if (PQisBusy(conn) == 0)
+		{
+			res = PQgetResult(conn);
+			if (res == NULL)
+				return 1;	/* idle: succeed even when this was the last allowed pass */
+			PQclear(res);
+		}
+		sleep(1);
+	}
+
+	/* every pass was used and the connection never went idle */
+	return -1;
+}
+
+/* Ask the server to cancel whatever conn is running; a failure is only logged */
+void
+CancelQuery(PGconn *conn, int timeout)
+{
+	/* pre-filled because PQcancel() is skipped when no cancel handle exists */
+	char errbuf[ERRBUFF_SIZE] = "could not get cancel handle";
+	PGcancel *pgcancel;
+
+	/* let any pending command drain first; its outcome is not checked here */
+	wait_connection_availability(conn, timeout);
+	pgcancel = PQgetCancel(conn);
+
+	if (!pgcancel || PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
+		log_warning(_("Can't stop current query: %s\n"), errbuf);
+	PQfreeCancel(pgcancel);
+}
--- a/dbutils.h
+++ b/dbutils.h
@@ -1,6 +1,6 @@
 /*
 * dbutils.h
- * Copyright (c) 2ndQuadrant, 2010-2011
+ * Copyright (c) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -20,16 +20,22 @@
 #ifndef _REPMGR_DBUTILS_H_
 #define _REPMGR_DBUTILS_H_

+#include "strutil.h"
+
 PGconn *establishDBConnection(const char *conninfo, const bool exit_on_error);
 PGconn *establishDBConnectionByParams(const char *keywords[],
                                      const char *values[],
                                      const bool exit_on_error);
-bool	is_standby(PGconn *conn);
+bool    is_standby(PGconn *conn);
+bool    is_witness(PGconn *conn, char *schema, char *cluster, int node_id);
+bool	is_pgup(PGconn *conn, int timeout);
 char   *pg_version(PGconn *conn, char* major_version);
 bool	guc_setted(PGconn *conn, const char *parameter, const char *op,
-                   const char *value);
+                const char *value);
 const char	 *get_cluster_size(PGconn *conn);
-PGconn *getMasterConnection(PGconn *standby_conn, int id, char *cluster,
+PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
                            int *master_id, char *master_conninfo_out);

+int wait_connection_availability(PGconn *conn, int timeout);
+void CancelQuery(PGconn *conn, int timeout);
 #endif
--- a/debian/DEBIAN/control
+++ b/debian/DEBIAN/control
@@ -1,4 +1,4 @@
-Package: repmgr
+Package: repmgr-auto
 Version: 1.0-1
 Section: database
 Priority: optional
--- a/errcode.h
+++ b/errcode.h
@@ -1,6 +1,6 @@
 /*
 * errcode.h
- * Copyright (C) 2ndQuadrant, 2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -33,5 +33,7 @@
 #define ERR_PROMOTED 8
 #define ERR_BAD_PASSWORD 9
 #define ERR_STR_OVERFLOW 10
+#define ERR_FAILOVER_FAIL 11
+#define ERR_BAD_SSH	12

 #endif	/* _ERRCODE_H_ */
--- a/log.c
+++ b/log.c
@@ -1,6 +1,6 @@
 /*
 * log.c - Logging methods
- * Copyright (C) 2ndQuadrant, 2010-2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 * This module is a set of methods for logging (currently only syslog)
 *
--- a/log.h
+++ b/log.h
@@ -1,6 +1,6 @@
 /*
 * log.h
- * Copyright (c) 2ndQuadrant, 2010-2011
+ * Copyright (c) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
--- a/repmgr.c
+++ b/repmgr.c
--- a/repmgr.conf.sample
+++ b/repmgr.conf.sample
@@ -7,11 +7,25 @@ cluster=test

 # Node ID
 node=2
+node_name=standby2

 # Connection information
 conninfo='host=192.168.204.104'
 rsync_options=--archive --checksum --compress --progress --rsh=ssh

+# How many seconds we wait for master response before declaring master failure
+master_response_timeout=60
+
+# How many times we try to reconnect to master before starting failover procedure
+reconnect_attempts=6
+reconnect_interval=10
+
+# Autofailover options
+failover=automatic
+priority=-1
+promote_command='repmgr promote'
+follow_command='repmgr follow'
+
 # Log level: possible values are DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG
 # Default: NOTICE
 loglevel=NOTICE
--- a/repmgr.h
+++ b/repmgr.h
@@ -1,6 +1,6 @@
 /*
 * repmgr.h
- * Copyright (c) 2ndQuadrant, 2010-2011
+ * Copyright (c) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -30,6 +30,7 @@

 #define PRIMARY_MODE		0
 #define STANDBY_MODE		1
+#define WITNESS_MODE		2

 #include "config.h"
 #define MAXFILENAME		1024
@@ -42,6 +43,9 @@
 #define DEFAULT_DBNAME			"postgres"
 #define DEFAULT_REPMGR_SCHEMA_PREFIX	"repmgr_"

+#define MANUAL_FAILOVER		0
+#define AUTOMATIC_FAILOVER	1
+
 /* Run time options type */
 typedef struct
 {
@@ -55,9 +59,15 @@ typedef struct
 	char wal_keep_segments[MAXLEN];
 	bool verbose;
 	bool force;
+	bool ignore_rsync_warn;

 	char masterport[MAXLEN];
+	char localport[MAXLEN];

+	/* parameter used by CLUSTER CLEANUP */
+	int keep_history;
 } t_runtime_options;

+#define SLEEP_MONITOR		2
+
 #endif
--- a/repmgr.sql
+++ b/repmgr.sql
@@ -1,7 +1,7 @@
 /*
 * repmgr.sql
 *
- * Copyright (C) 2ndQuadrant, 2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 */

@@ -14,8 +14,11 @@ CREATE SCHEMA repmgr;
 */
 CREATE TABLE repl_nodes (
  id            integer primary key,
-  cluster   text        not null,       -- Name to identify the cluster
-  conninfo      text    not null
+  cluster   	text    not null,       -- Name to identify the cluster
+  name			text	not null,
+  conninfo      text    not null,
+  priority  	integer not null,
+  witness   	boolean not null default false
 );
 ALTER TABLE repl_nodes OWNER TO repmgr;

@@ -28,13 +31,12 @@ CREATE TABLE repl_monitor (
  standby_node                   INTEGER NOT NULL,
  last_monitor_time                      TIMESTAMP WITH TIME ZONE NOT NULL,
  last_wal_primary_location      TEXT NOT NULL,
-  last_wal_standby_location      TEXT NOT NULL,
+  last_wal_standby_location      TEXT,		-- In case of a witness server this will be NULL
  replication_lag                BIGINT NOT NULL,
  apply_lag                      BIGINT NOT NULL
 );
 ALTER TABLE repl_monitor OWNER TO repmgr;

-
 /*
 * This view shows the latest monitor info about every node.
 * Interesting thing to see:
@@ -46,14 +48,14 @@ ALTER TABLE repl_monitor OWNER TO repmgr;
 * time_lag: how many seconds are we from being up-to-date with master
 */
 CREATE VIEW repl_status AS
-WITH monitor_info AS (SELECT *, ROW_NUMBER() OVER (PARTITION BY primary_node, standby_node
-                                                       ORDER BY last_monitor_time desc)
-                        FROM repl_monitor)
-SELECT primary_node, standby_node, last_monitor_time, last_wal_primary_location,
+SELECT primary_node, standby_node, name AS standby_name, last_monitor_time, last_wal_primary_location,
       last_wal_standby_location, pg_size_pretty(replication_lag) replication_lag,
       pg_size_pretty(apply_lag) apply_lag,
       age(now(), last_monitor_time) AS time_lag
-  FROM monitor_info a
- WHERE row_number = 1;
+ FROM repl_monitor JOIN repl_nodes ON standby_node = id
+WHERE (standby_node, last_monitor_time) IN (SELECT standby_node, MAX(last_monitor_time)
+                                              FROM repl_monitor GROUP BY 1);

 ALTER VIEW repl_status OWNER TO repmgr;
+
+CREATE INDEX idx_repl_status_sort ON repl_monitor(last_monitor_time, standby_node);
--- a/repmgrd.c
+++ b/repmgrd.c
@@ -1,6 +1,6 @@
 /*
 * repmgrd.c - Replication manager daemon
- * Copyright (C) 2ndQuadrant, 2010-2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 * This module connects to the nodes of a replication cluster and monitors
 * how far are they from master
@@ -30,9 +30,24 @@
 #include "config.h"
 #include "log.h"
 #include "strutil.h"
+#include "version.h"

+#include "access/xlogdefs.h"
 #include "libpq/pqsignal.h"

+/*
+ * Struct to keep info about the nodes, used in the voting process in
+ * do_failover()
+ */
+typedef struct nodeInfo
+{
+	int nodeId;
+	XLogRecPtr xlog_location;
+	bool is_ready;
+} nodeInfo;
+
+
+char    myClusterName[MAXLEN];

 /* Local info */
 t_configuration_options local_options;
@@ -50,6 +65,7 @@ const char *progname;

 char	*config_file = DEFAULT_CONFIG_FILE;
 bool	verbose = false;
+bool	monitoring_history = false;
 char	repmgr_schema[MAXLEN];

 /*
@@ -60,35 +76,36 @@ t_configuration_options config = {};

 static void help(const char* progname);
 static void usage(void);
-static void checkClusterConfiguration(PGconn *conn,PGconn *primary);
+static void checkClusterConfiguration(PGconn *conn, PGconn *primary);
 static void checkNodeConfiguration(char *conninfo);
-static void CancelQuery(void);

-static void MonitorExecute(void);
+static void StandbyMonitor(void);
+static void WitnessMonitor(void);
+static bool CheckPrimaryConnection(void);
+static void update_shared_memory(char *last_wal_standby_applied);
+static void update_registration(void);
+static void do_failover(void);

 static unsigned long long int walLocationToBytes(char *wal_location);

+/*
+ * Flag to mark SIGHUP. Whenever the main loop comes around it
+ * will reread the configuration file.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+
+static void handle_sighup(SIGNAL_ARGS);
 static void handle_sigint(SIGNAL_ARGS);
-static void setup_cancel_handler(void);
+static void setup_event_handlers(void);

 #define CloseConnections()	\
 	if (PQisBusy(primaryConn) == 1) \
-		CancelQuery(); \
+		CancelQuery(primaryConn, local_options.master_response_timeout); \
 	if (myLocalConn != NULL) \
 		PQfinish(myLocalConn);	\
 	if (primaryConn != NULL && primaryConn != myLocalConn) \
 		PQfinish(primaryConn);

-/*
- * Every 3 seconds, insert monitor info
- */
-#define MonitorCheck()						  \
-	for (;;)								  \
-	{										  \
-		MonitorExecute();					  \
-		sleep(3);							  \
-	}
-

 int
 main(int argc, char **argv)
@@ -97,6 +114,7 @@ main(int argc, char **argv)
 	{
 		{"config", required_argument, NULL, 'f'},
 		{"verbose", no_argument, NULL, 'v'},
+		{"monitoring-history", no_argument, NULL, 'm'},
 		{NULL, 0, NULL, 0}
 	};

@@ -116,12 +134,12 @@ main(int argc, char **argv)
 		}
 		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
 		{
-			printf("%s (PostgreSQL) " PG_VERSION "\n", progname);
+			printf("%s %s (PostgreSQL %s)\n", progname, REPMGR_VERSION, PG_VERSION);
 			exit(SUCCESS);
 		}
 	}

-	while ((c = getopt_long(argc, argv, "f:v", long_options, &optindex)) != -1)
+	while ((c = getopt_long(argc, argv, "f:vm", long_options, &optindex)) != -1)	/* only -f takes a value */
 	{
 		switch (c)
 		{
@@ -131,13 +149,16 @@ main(int argc, char **argv)
 		case 'v':
 			verbose = true;
 			break;
+		case 'm':
+			monitoring_history = true;
+			break;
 		default:
 			usage();
 			exit(ERR_BAD_CONFIG);
 		}
 	}

-	setup_cancel_handler();
+	setup_event_handlers();

 	/*
 	 * Read the configuration file: repmgr.conf
@@ -145,8 +166,8 @@ main(int argc, char **argv)
 	parse_config(config_file, &local_options);
 	if (local_options.node == -1)
 	{
-		log_err("Node information is missing. "
-		        "Check the configuration file, or provide one if you have not done so.\n");
+		log_err(_("Node information is missing. "
+		          "Check the configuration file, or provide one if you have not done so.\n"));
 		exit(ERR_BAD_CONFIG);
 	}

@@ -164,8 +185,8 @@ main(int argc, char **argv)
 	pg_version(myLocalConn, standby_version);
 	if (strcmp(standby_version, "") == 0)
 	{
-		PQfinish(myLocalConn);
 		log_err(_("%s needs standby to be PostgreSQL 9.0 or better\n"), progname);
+		PQfinish(myLocalConn);
 		exit(ERR_BAD_CONFIG);
 	}

@@ -173,38 +194,129 @@ main(int argc, char **argv)
 	 * Set my server mode, establish a connection to primary
 	 * and start monitor
 	 */
-	myLocalMode = is_standby(myLocalConn) ? STANDBY_MODE : PRIMARY_MODE;
-	if (myLocalMode == PRIMARY_MODE)
+	if (is_witness(myLocalConn, repmgr_schema, local_options.cluster_name, local_options.node))
+		myLocalMode = WITNESS_MODE;
+	else if (is_standby(myLocalConn))
+		myLocalMode = STANDBY_MODE;
+	else /* is the master */
+		myLocalMode = PRIMARY_MODE;
+
+	switch (myLocalMode)
 	{
+	case PRIMARY_MODE:
 		primary_options.node = local_options.node;
 		strncpy(primary_options.conninfo, local_options.conninfo, MAXLEN);
 		primaryConn = myLocalConn;
-	}
-	else
-	{
+
+		checkClusterConfiguration(myLocalConn, primaryConn);
+		checkNodeConfiguration(local_options.conninfo);
+
+		if (reload_configuration(config_file, &local_options))
+		{
+			PQfinish(myLocalConn);
+			myLocalConn = establishDBConnection(local_options.conninfo, true);
+			primaryConn = myLocalConn;
+			update_registration();
+		}
+
+		log_info(_("%s Starting continuous primary connection check\n"), progname);
+		/* Check that primary is still alive, and standbies are sending info */
+		/*
+		 * Every SLEEP_MONITOR seconds, do master checks
+		 * XXX
+		 * Check that standbies are sending info
+		*/
+		for (;;)
+		{
+			if (CheckPrimaryConnection())
+			{
+				/*
+									CheckActiveStandbiesConnections();
+									CheckInactiveStandbies();
+				*/
+				sleep(SLEEP_MONITOR);
+			}
+			else
+			{
+				/* XXX
+				 * May we do something more verbose ?
+				 */
+				exit (1);
+			}
+
+			if (got_SIGHUP)
+			{
+				/* if we can reload, then could need to change myLocalConn */
+				if (reload_configuration(config_file, &local_options))
+				{
+					PQfinish(myLocalConn);
+					myLocalConn = establishDBConnection(local_options.conninfo, true);
+					primaryConn = myLocalConn;
+					update_registration();
+				}
+				got_SIGHUP = false;
+			}
+		}
+		break;
+	case WITNESS_MODE:
+	case STANDBY_MODE:
 		/* I need the id of the primary as well as a connection to it */
 		log_info(_("%s Connecting to primary for cluster '%s'\n"),
 		         progname, local_options.cluster_name);
-		primaryConn = getMasterConnection(myLocalConn, local_options.node,
+		primaryConn = getMasterConnection(myLocalConn, repmgr_schema, 
 		                                  local_options.cluster_name,
-		                                  &primary_options.node,NULL);
+		                                  &primary_options.node, NULL);
 		if (primaryConn == NULL)
 		{
 			CloseConnections();
 			exit(ERR_BAD_CONFIG);
 		}
-	}

-	checkClusterConfiguration(myLocalConn,primaryConn);
-	checkNodeConfiguration(local_options.conninfo);
-	if (myLocalMode == STANDBY_MODE)
-	{
-		log_info(_("%s Starting continuous standby node monitoring\n"), progname);
-		MonitorCheck();
-	}
-	else
-	{
-		log_info(_("%s This is a primary node, program not needed here; exiting'\n"), progname);
+		checkClusterConfiguration(myLocalConn, primaryConn);
+		checkNodeConfiguration(local_options.conninfo);
+
+		if (reload_configuration(config_file, &local_options))
+		{
+			PQfinish(myLocalConn);
+			myLocalConn = establishDBConnection(local_options.conninfo, true);
+			update_registration();
+		}
+
+		/*
+		 * Every SLEEP_MONITOR seconds, do checks
+		 */
+		if (myLocalMode == WITNESS_MODE)
+		{
+			log_info(_("%s Starting continuous witness node monitoring\n"), progname);
+		}
+		else if (myLocalMode == STANDBY_MODE)
+		{
+			log_info(_("%s Starting continuous standby node monitoring\n"), progname);
+		}
+
+		for (;;)
+		{
+			if (myLocalMode == WITNESS_MODE)
+				WitnessMonitor();
+			else if (myLocalMode == STANDBY_MODE)
+				StandbyMonitor();
+			sleep(SLEEP_MONITOR);
+
+			if (got_SIGHUP)
+			{
+				/* if we can reload, then could need to change myLocalConn */
+				if (reload_configuration(config_file, &local_options))
+				{
+					PQfinish(myLocalConn);
+					myLocalConn = establishDBConnection(local_options.conninfo, true);
+					update_registration();
+				}
+				got_SIGHUP = false;
+			}
+		}
+		break;
+	default:
+		log_err(_("%s: Unrecognized mode for node %d\n"), progname, local_options.node);
 	}

 	/* Prevent a double-free */
@@ -220,6 +332,78 @@ main(int argc, char **argv)
 	return 0;
 }

+/*
+ * One witness monitoring pass: check the primary and, if history is on, queue a repl_monitor row
+ */
+static void
+WitnessMonitor(void)
+{
+	char monitor_witness_timestamp[MAXLEN];
+	PGresult	*res;
+
+	/*
+	 * Check if the master is still available; the return value is not used
+	 * here, the connection status is tested right after the call instead.
+	 */
+	CheckPrimaryConnection(); // this takes up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds
+
+	if (PQstatus(primaryConn) != CONNECTION_OK)
+	{
+		/*
+		 * If we can't reconnect, just exit...
+		 * XXX we need to make witness connect to the new master
+		 */
+		PQfinish(myLocalConn);
+		exit(0);
+	}
+
+	/* Fast path for the case where no history is requested */
+	if (!monitoring_history)
+		return;
+
+	/*
+	 * Cancel any query that is still being executed,
+	 * so I can insert the current record
+	 */
+	CancelQuery(primaryConn, local_options.master_response_timeout);
+	if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
+		return;
+
+	/* Get the current timestamp from the local (witness) server */
+	sqlquery_snprintf(sqlquery, "SELECT CURRENT_TIMESTAMP ");
+
+	res = PQexec(myLocalConn, sqlquery);
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn));
+		PQclear(res);
+		/* if there is any error just let it be and retry in next loop */
+		return;
+	}
+
+	strcpy(monitor_witness_timestamp, PQgetvalue(res, 0, 0));
+	PQclear(res);
+
+	/*
+	 * Build the SQL to execute on primary: null standby location, zero lags
+	 */
+	sqlquery_snprintf(sqlquery,
+	                  "INSERT INTO %s.repl_monitor "
+	                  "VALUES(%d, %d, '%s'::timestamp with time zone, "
+	                  " pg_current_xlog_location(), null,  "
+	                  " 0, 0)",
+	                  repmgr_schema, primary_options.node, local_options.node, monitor_witness_timestamp);
+
+	/*
+	 * Execute the query asynchronously, but don't check for a result. We
+	 * will check the result next time we pause for a monitor step.
+	 */
+	log_debug("WitnessMonitor: %s\n", sqlquery);
+	if (PQsendQuery(primaryConn, sqlquery) == 0)
+		log_warning(_("Query could not be sent to primary. %s\n"),
+		            PQerrorMessage(primaryConn));
+}
+

 /*
 * Insert monitor info, this is basically the time and xlog replayed,
@@ -227,7 +411,7 @@ main(int argc, char **argv)
 * Also do the math to see how far are we in bytes for being uptodate
 */
 static void
-MonitorExecute(void)
+StandbyMonitor(void)
 {
 	PGresult *res;
 	char monitor_standby_timestamp[MAXLEN];
@@ -245,50 +429,45 @@ MonitorExecute(void)
 	 * Check if the master is still available, if after 5 minutes of retries
 	 * we cannot reconnect, try to get a new master.
 	 */
-	for (connection_retries = 0; connection_retries < 15; connection_retries++)
-	{
-		if (PQstatus(primaryConn) != CONNECTION_OK)
-		{
-			log_warning(_("Connection to master has been lost, trying to recover...\n"));
-			/* wait 20 seconds between retries */
-			sleep(20);
+	CheckPrimaryConnection(); // this take up to local_options.reconnect_attempts * local_options.reconnect_intvl seconds

-			PQreset(primaryConn);
-		}
-		else
-		{
-			if (connection_retries > 0)
-			{
-				log_notice(_("Connection to master has been restored, continue monitoring.\n"));
-			}
-			break;
-		}
-	}
 	if (PQstatus(primaryConn) != CONNECTION_OK)
 	{
-		log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
-		for (connection_retries = 0; connection_retries < 6; connection_retries++)
+		if (local_options.failover == MANUAL_FAILOVER)
 		{
-			primaryConn = getMasterConnection(myLocalConn, local_options.node,
-			                                  local_options.cluster_name, &primary_options.node,NULL);
-			if (PQstatus(primaryConn) == CONNECTION_OK)
+			log_err(_("We couldn't reconnect to master. Now checking if another node has been promoted.\n"));
+			for (connection_retries = 0; connection_retries < 6; connection_retries++)
 			{
-				/* Connected, we can continue the process so break the loop */
-				log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node);
-				break;
+				primaryConn = getMasterConnection(myLocalConn, repmgr_schema, 
+				                                  local_options.cluster_name, &primary_options.node, NULL);
+				if (PQstatus(primaryConn) == CONNECTION_OK)
+				{
+					/* Connected, we can continue the process so break the loop */
+					log_err(_("Connected to node %d, continue monitoring.\n"), primary_options.node);
+					break;
+				}
+				else
+				{
+					log_err(_("We haven't found a new master, waiting before retry...\n"));
+					/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
+					sleep(300);
+				}
 			}
-			else
+
+			if (PQstatus(primaryConn) != CONNECTION_OK)
 			{
-				log_err(_("We haven't found a new master, waiting before retry...\n"));
-				/* wait 5 minutes before retries, after 6 failures (30 minutes) we stop trying */
-				sleep(300);
+				log_err(_("We couldn't reconnect for long enough, exiting...\n"));
+				exit(ERR_DB_CON);
 			}
 		}
-	}
-	if (PQstatus(primaryConn) != CONNECTION_OK)
-	{
-		log_err(_("We couldn't reconnect for long enough, exiting...\n"));
-		exit(ERR_DB_CON);
+		else if (local_options.failover == AUTOMATIC_FAILOVER)
+		{
+			/*
+			 * When we returns from this function we will have a new primary and
+			 * a new primaryConn
+			 */
+			do_failover();
+		}
 	}

 	/* Check if we still are a standby, we could have been promoted */
@@ -299,13 +478,17 @@ MonitorExecute(void)
 		exit(ERR_PROMOTED);
 	}

+	/* Fast path for the case where no history is requested */
+	if (!monitoring_history)
+		return;
+
 	/*
-	 * first check if there is a command being executed,
-	 * and if that is the case, cancel the query so i can
-	 * insert the current record
+	 * Cancel any query that is still being executed,
+	 * so i can insert the current record
 	 */
-	if (PQisBusy(primaryConn) == 1)
-		CancelQuery();
+	CancelQuery(primaryConn, local_options.master_response_timeout);
+	if (wait_connection_availability(primaryConn, local_options.master_response_timeout) != 1)
+		return;

 	/* Get local xlog info */
 	sqlquery_snprintf(
@@ -316,7 +499,7 @@ MonitorExecute(void)
 	res = PQexec(myLocalConn, sqlquery);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
-		log_err("PQexec failed: %s\n", PQerrorMessage(myLocalConn));
+		log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn));
 		PQclear(res);
 		/* if there is any error just let it be and retry in next loop */
 		return;
@@ -333,7 +516,7 @@ MonitorExecute(void)
 	res = PQexec(primaryConn, sqlquery);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
-		log_err("PQexec failed: %s\n", PQerrorMessage(primaryConn));
+		log_err(_("PQexec failed: %s\n"), PQerrorMessage(primaryConn));
 		PQclear(res);
 		return;
 	}
@@ -364,12 +547,253 @@ MonitorExecute(void)
 	 * Execute the query asynchronously, but don't check for a result. We
 	 * will check the result next time we pause for a monitor step.
 	 */
+	log_debug("StandbyMonitor: %s\n", sqlquery);
 	if (PQsendQuery(primaryConn, sqlquery) == 0)
-		log_warning("Query could not be sent to primary. %s\n",
+		log_warning(_("Query could not be sent to primary. %s\n"),
 		            PQerrorMessage(primaryConn));
 }


+/*
+ * Automatic failover: publish the local replay location, read the location
+ * published by each registered node except the old primary, then run
+ * promote_command or follow_command depending on who the best candidate is.
+ */
+static void
+do_failover(void)
+{
+	PGresult *res1, *res2;
+	char 	sqlquery[8192];
+
+	int		total_nodes = 0;
+	int		visible_nodes = 0;
+	bool	find_best = false;
+
+	int		i;
+	int		r;
+	int 	node;
+	int		max_nodes;
+	char	nodeConninfo[MAXLEN];
+	unsigned int uxlogid, uxrecoff;
+	char last_wal_standby_applied[MAXLEN];
+	PGconn	*nodeConn = NULL;
+
+	/* room for up to 50 nodes, which seems large enough for most scenarios */
+	nodeInfo nodes[50];
+	nodeInfo best_candidate;
+
+	/* first we get info about this node, and update shared memory */
+	sqlquery_snprintf(sqlquery, "SELECT pg_last_xlog_replay_location()");
+	res1 = PQexec(myLocalConn, sqlquery);
+	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
+	{
+		log_err(_("PQexec failed: %s.\nReport an invalid value to not be considered as new primary and exit.\n"), PQerrorMessage(myLocalConn));
+		PQclear(res1);
+		/* no quotes here: update_shared_memory() quotes the value itself */
+		sprintf(last_wal_standby_applied, "%X/%X", 0, 0);
+		update_shared_memory(last_wal_standby_applied);
+		exit(ERR_DB_QUERY);
+	}
+
+	/* write last location in shared memory */
+	update_shared_memory(PQgetvalue(res1, 0, 0));
+	PQclear(res1);
+
+	/*
+	 * we sleep the monitor time + one second
+	 * we bet it should be enough for other repmgrd to update their own data
+	 */
+	sleep(SLEEP_MONITOR + 1);
+
+	/* get a list of standby nodes, including myself */
+	sqlquery_snprintf(sqlquery, "SELECT id, conninfo "
+	        "  FROM %s.repl_nodes "
+	        " WHERE id <> %d "
+	        "   AND cluster = '%s' "
+	        " ORDER BY priority ",
+	        repmgr_schema, primary_options.node, local_options.cluster_name);
+
+	res1 = PQexec(myLocalConn, sqlquery);
+	if (PQresultStatus(res1) != PGRES_TUPLES_OK)
+	{
+		log_err(_("Can't get nodes info: %s\n"), PQerrorMessage(myLocalConn));
+		PQclear(res1);
+		PQfinish(myLocalConn);
+		exit(ERR_DB_QUERY);
+	}
+
+	/* nodes[] is fixed-size: never index past it, whatever the query returns */
+	max_nodes = (int) (sizeof(nodes) / sizeof(nodes[0]));
+	if (PQntuples(res1) > max_nodes)
+		log_warning(_("Found %d nodes, only the first %d are considered\n"), PQntuples(res1), max_nodes);
+
+	/* ask for the locations */
+	for (i = 0; i < PQntuples(res1) && i < max_nodes; i++)
+	{
+		node = atoi(PQgetvalue(res1, i, 0));
+		/* Initialize on false so if we can't reach this node we know that later */
+		nodes[i].is_ready = false;
+		strncpy(nodeConninfo, PQgetvalue(res1, i, 1), MAXLEN);
+		nodeConninfo[MAXLEN - 1] = '\0';	/* strncpy may leave it unterminated */
+		nodeConn = establishDBConnection(nodeConninfo, false);
+		/* if we can't see the node just skip it, but still free the PGconn */
+		if (PQstatus(nodeConn) != CONNECTION_OK)
+		{
+			PQfinish(nodeConn);
+			continue;
+		}
+
+		sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_get_last_standby_location()", repmgr_schema);
+		res2 = PQexec(nodeConn, sqlquery);
+		if (PQresultStatus(res2) != PGRES_TUPLES_OK)
+		{
+			log_info(_("Can't get node's last standby location: %s\n"), PQerrorMessage(nodeConn));
+			log_info(_("Connection details: %s\n"), nodeConninfo);
+			PQclear(res2);
+			PQfinish(nodeConn);
+			continue;
+		}
+
+		visible_nodes++;
+
+		if (sscanf(PQgetvalue(res2, 0, 0), "%X/%X", &uxlogid, &uxrecoff) != 2)
+		{
+			log_info(_("could not parse transaction log location \"%s\"\n"), PQgetvalue(res2, 0, 0));
+			/* unusable value: rank this node lowest instead of using garbage */
+			uxlogid = uxrecoff = 0;
+		}
+
+		nodes[i].nodeId = node;
+		nodes[i].xlog_location.xlogid = uxlogid;
+		nodes[i].xlog_location.xrecoff = uxrecoff;
+		nodes[i].is_ready = true;
+
+		PQclear(res2);
+		PQfinish(nodeConn);
+	}
+	PQclear(res1);
+	/* Close the connection to this server */
+	PQfinish(myLocalConn);
+
+	/*
+	 * total nodes that are registered, include master which is a node but was
+	 * not counted because it's not a standby
+	 */
+	total_nodes = i + 1;
+
+	/*
+	 * am i on the group that should keep alive?
+	 * if i see less than half of total_nodes then i should do nothing
+	 */
+	if (visible_nodes < (total_nodes / 2.0))
+	{
+		log_err(_("Can't reach most of the nodes.\n"
+		          "Let the other standby servers decide which one will be the primary.\n"
+		          "Manual action will be needed to readd this node to the cluster.\n"));
+		exit(ERR_FAILOVER_FAIL);
+	}
+
+	/* determine which one is the best candidate to promote to primary */
+	for (i = 0; i < total_nodes - 1; i++)
+	{
+		if (!nodes[i].is_ready)
+			continue;
+
+		/*
+		 * Nodes are retrieved ordered by priority, so if the current
+		 * best candidate is lower or equal to the next node's wal location
+		 * then assign next node as the new best candidate.  The first ready
+		 * node starts as the candidate.  XLByteLE comes from xlogdefs.h.
+		 */
+		if (!find_best || XLByteLE(best_candidate.xlog_location, nodes[i].xlog_location))
+		{
+			best_candidate = nodes[i];
+			find_best = true;
+		}
+	}
+
+	/* once we know who is the best candidate, promote it */
+	if (find_best && (best_candidate.nodeId == local_options.node))
+	{
+		if (verbose)
+			log_info(_("%s: This node is the best candidate to be the new primary, promoting...\n"),
+			         progname);
+		log_debug(_("promote command is: \"%s\"\n"), local_options.promote_command);
+		r = system(local_options.promote_command);
+		if (r != 0)
+		{
+			log_err(_("%s: promote command failed. You could check and try it manually.\n"), progname);
+			exit(ERR_BAD_CONFIG);
+		}
+	}
+	else if (find_best)
+	{
+		if (verbose)
+			log_info(_("%s: Node %d is the best candidate to be the new primary, we should follow it...\n"),
+			         progname, best_candidate.nodeId);
+		log_debug(_("follow command is: \"%s\"\n"), local_options.follow_command);
+		/*
+		 * New Primary need some time to be promoted.
+		 * The follow command should take care of that.
+		 */
+		r = system(local_options.follow_command);
+		if (r != 0)
+		{
+			log_err(_("%s: follow command failed. You could check and try it manually.\n"), progname);
+			exit(ERR_BAD_CONFIG);
+		}
+	}
+	else
+	{
+		log_err(_("%s: Did not find candidates. You should check and try manually.\n"), progname);
+		exit(ERR_FAILOVER_FAIL);
+	}
+
+	/* and reconnect to the local database */
+	myLocalConn = establishDBConnection(local_options.conninfo, true);
+}
+
+
+static bool
+CheckPrimaryConnection(void)
+{
+	int	attempt = 0;
+
+	/*
+	 * Probe the master up to local_options.reconnect_attempts times,
+	 * sleeping local_options.reconnect_intvl seconds after every failed
+	 * probe.  The loop stops early as soon as a probe succeeds.
+	 *
+	 * One more probe after the loop decides the result: true when the
+	 * master answers, false when it is still unreachable.
+	 */
+	while (attempt < local_options.reconnect_attempts)
+	{
+		if (is_pgup(primaryConn, local_options.master_response_timeout))
+		{
+			/* only announce a recovery if an earlier probe had failed */
+			if (attempt > 0)
+				log_info(_("%s: Connection to master has been restored.\n"), progname);
+			break;
+		}
+
+		log_warning(_("%s: Connection to master has been lost, trying to recover... %i seconds before failover decision\n"),
+		            progname,
+		            (local_options.reconnect_intvl * (local_options.reconnect_attempts - attempt)));
+		/* wait local_options.reconnect_intvl seconds between retries */
+		sleep(local_options.reconnect_intvl);
+		attempt++;
+	}
+
+	if (is_pgup(primaryConn, local_options.master_response_timeout))
+		return true;
+
+	log_err(_("%s: We couldn't reconnect for long enough, exiting...\n"), progname);
+	/* XXX Anything else to do here? */
+	return false;
+}
+
+
 static void
 checkClusterConfiguration(PGconn *conn, PGconn *primary)
 {
@@ -383,7 +807,7 @@ checkClusterConfiguration(PGconn *conn, PGconn *primary)
 	res = PQexec(conn, sqlquery);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
-		log_err("PQexec failed: %s\n", PQerrorMessage(conn));
+		log_err(_("PQexec failed: %s\n"), PQerrorMessage(conn));
 		PQclear(res);
 		CloseConnections();
 		exit(ERR_DB_QUERY);
@@ -398,7 +822,7 @@ checkClusterConfiguration(PGconn *conn, PGconn *primary)
 	 */
 	if (PQntuples(res) == 0)
 	{
-		log_err("The replication cluster is not configured\n");
+		log_err(_("The replication cluster is not configured\n"));
 		PQclear(res);
 		CloseConnections();
 		exit(ERR_BAD_CONFIG);
@@ -425,7 +849,7 @@ checkNodeConfiguration(char *conninfo)
 	res = PQexec(myLocalConn, sqlquery);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
-		log_err("PQexec failed: %s\n", PQerrorMessage(myLocalConn));
+		log_err(_("PQexec failed: %s\n"), PQerrorMessage(myLocalConn));
 		PQclear(res);
 		CloseConnections();
 		exit(ERR_BAD_CONFIG);
@@ -433,30 +857,42 @@ checkNodeConfiguration(char *conninfo)

 	/*
 	 * If there isn't any results then we have not configured this node yet
-	 * in repmgr, if that is the case we will insert the node to the cluster
+	 * in repmgr, if that is the case we will insert the node to the cluster,
+	 * except if it is a witness
 	 */
 	if (PQntuples(res) == 0)
 	{
 		PQclear(res);

+		if (myLocalMode == WITNESS_MODE)
+		{
+			log_err(_("The witness is not configured\n"));
+			CloseConnections();
+			exit(ERR_BAD_CONFIG);
+		}
+
 		/* Adding the node */
 		log_info(_("%s Adding node %d to cluster '%s'\n"),
 		         progname, local_options.node, local_options.cluster_name);
 		sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes "
-		                  "VALUES (%d, '%s', '%s')",
+		                  "VALUES (%d, '%s', '%s', '%s', 0, 'f')",
 		                  repmgr_schema, local_options.node,
 		                  local_options.cluster_name,
+						  local_options.node_name,
 		                  local_options.conninfo);

 		if (!PQexec(primaryConn, sqlquery))
 		{
-			log_err("Cannot insert node details, %s\n",
+			log_err(_("Cannot insert node details, %s\n"),
 			        PQerrorMessage(primaryConn));
 			CloseConnections();
 			exit(ERR_BAD_CONFIG);
 		}
 	}
-	PQclear(res);
+	else 
+	{
+		PQclear(res);
+	}
 }


@@ -468,7 +904,7 @@ walLocationToBytes(char *wal_location)

 	if (sscanf(wal_location, "%X/%X", &xlogid, &xrecoff) != 2)
 	{
-		log_err("wrong log location format: %s\n", wal_location);
+		log_err(_("wrong log location format: %s\n"), wal_location);
 		return 0;
 	}
 	return (( (long long) xlogid * 16 * 1024 * 1024 * 255) + xrecoff);
@@ -490,6 +926,7 @@ void help(const char *progname)
 	printf(_("  --help                    show this help, then exit\n"));
 	printf(_("  --version                 output version information, then exit\n"));
 	printf(_("  --verbose                 output verbose activity information\n"));
+	printf(_("  --monitoring-history      track advance or lag of the replication in every standby in repl_monitor\n"));
 	printf(_("  -f, --config_file=PATH    configuration file\n"));
 	printf(_("\n%s monitors a cluster of servers.\n"), progname);
 }
@@ -500,27 +937,60 @@ static void
 handle_sigint(SIGNAL_ARGS)
 {
 	CloseConnections();
+	exit(1);
 }

+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+handle_sighup(SIGNAL_ARGS)
+{
+	got_SIGHUP = true;	/* the handler only records the signal */
+}

 static void
-setup_cancel_handler(void)
+setup_event_handlers(void)
 {
+	pqsignal(SIGHUP, handle_sighup);
 	pqsignal(SIGINT, handle_sigint);
 }
 #endif


 static void
-CancelQuery(void)
+update_shared_memory(char *last_wal_standby_applied)
 {
-	char errbuf[ERRBUFF_SIZE];
-	PGcancel *pgcancel;
+	PGresult *res;
 
-	pgcancel = PQgetCancel(primaryConn);
+	sqlquery_snprintf(sqlquery, "SELECT %s.repmgr_update_standby_location('%s')",
+	                  repmgr_schema, last_wal_standby_applied);
 
-	if (!pgcancel || PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
-		log_warning("Can't stop current query: %s\n", errbuf);
-
-	PQfreeCancel(pgcancel);
+	/* Best effort: a failure to publish the location is reported, not fatal */
+	res = PQexec(myLocalConn, sqlquery);
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		log_warning(_("Cannot update this standby's shared memory: %s\n"), PQerrorMessage(myLocalConn));
+		/* XXX is this enough reason to terminate this repmgrd? */
+	}
+	PQclear(res);
+}
+
+static void
+update_registration(void)
+{
+	PGresult *res;
+	/* Rewrite this node's conninfo and priority in repl_nodes from local_options */
+	sqlquery_snprintf(sqlquery, "UPDATE %s.repl_nodes "
+	                  "   SET conninfo = '%s', "
+	                  "       priority = %d "
+	                  " WHERE id = %d",
+	                  repmgr_schema, local_options.conninfo, local_options.priority, local_options.node);
+	/* the UPDATE is sent over primaryConn; any failure terminates the process */
+	res = PQexec(primaryConn, sqlquery);
+	if (PQresultStatus(res) != PGRES_COMMAND_OK)
+	{
+		log_err(_("Cannot update registration: %s\n"), PQerrorMessage(primaryConn));
+		CloseConnections();
+		exit(ERR_DB_CON);
+	}
+	PQclear(res);
 }
--- a/sql/Makefile
+++ b/sql/Makefile
@@ -0,0 +1,23 @@
+#
+# Makefile
+# Copyright (c) 2ndQuadrant, 2010
+#
+
+MODULE_big = repmgr_funcs
+DATA_built = repmgr_funcs.sql
+DATA = uninstall_repmgr_funcs.sql
+OBJS = repmgr_funcs.o
+
+# Build against an installed PostgreSQL with "make USE_PGXS=1"; override
+# PG_CONFIG to pick the pg_config of a specific installation.  Without
+# USE_PGXS this directory must live at contrib/repmgr/sql in the source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/repmgr/sql
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
--- a/sql/repmgr_funcs.c
+++ b/sql/repmgr_funcs.c
@@ -0,0 +1,189 @@
+/*
+ * repmgr_funcs.c
+ * Copyright (c) 2ndQuadrant, 2010
+ *
+ * Shared memory state management and some backend functions in SQL
+ */
+
+#include "postgres.h"
+#include "fmgr.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/procarray.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+
+/* same definition as the one in xlog_internal.h */
+#define MAXFNAMELEN 	64
+
+PG_MODULE_MAGIC;
+
+/*
+ * Global shared state
+ */
+typedef struct repmgrSharedState
+{
+    LWLockId    lock;           		/* protects reads/writes of location */
+    char		location[MAXFNAMELEN];	/* last known xlog location */
+} repmgrSharedState;
+
+/* Links to shared memory state */
+static repmgrSharedState *shared_state = NULL;
+
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+void        _PG_init(void);
+void        _PG_fini(void);
+
+static void repmgr_shmem_startup(void);
+static Size repmgr_memsize(void);
+
+static bool repmgr_set_standby_location(char *locationstr);
+
+Datum repmgr_update_standby_location(PG_FUNCTION_ARGS);
+Datum repmgr_get_last_standby_location(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(repmgr_update_standby_location);
+PG_FUNCTION_INFO_V1(repmgr_get_last_standby_location);
+
+
+/*
+ * Module load callback: request shared memory and one LWLock, hook shmem startup
+ */
+void
+_PG_init(void)
+{
+	/*
+	 * In order to create our shared memory area, we have to be loaded via
+	 * shared_preload_libraries.  If not, fall out without hooking into any of
+	 * the main system.  (We don't throw error here because it seems useful to
+	 * allow the repmgr functions to be created even when the
+	 * module isn't active.  The functions must protect themselves against
+	 * being called then, however.)
+	 */
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+
+	/*
+	 * Request additional shared resources.  (These are no-ops if we're not in
+	 * the postmaster process.)  We'll allocate or attach to the shared
+	 * resources in repmgr_shmem_startup().
+	 */
+	RequestAddinShmemSpace(repmgr_memsize());
+	RequestAddinLWLocks(1);
+
+	/*
+	 * Install hooks, remembering the previous one so it can be chained.
+	 */
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = repmgr_shmem_startup;
+}
+
+/*
+ * Module unload callback: put back the shmem startup hook saved by _PG_init
+ */
+void
+_PG_fini(void)
+{
+	/* Uninstall hooks. */
+	shmem_startup_hook = prev_shmem_startup_hook;
+}
+
+/*
+ * shmem_startup hook: allocate or attach to the repmgr shared state.
+ */
+static void
+repmgr_shmem_startup(void)
+{
+	bool		found;
+
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
+
+	/* reset in case this is a restart within the postmaster */
+	shared_state = NULL;
+
+	/*
+	 * Create or attach to the shared memory state
+	 */
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	shared_state = ShmemInitStruct("repmgr shared state",
+						   sizeof(repmgrSharedState),
+						   &found);
+
+	if (!found)
+	{
+		/* First time through: assign the lock and start from location 0/0 */
+		shared_state->lock = LWLockAssign();
+		snprintf(shared_state->location,
+				sizeof(shared_state->location), "%X/%X", 0, 0);
+	}
+
+	LWLockRelease(AddinShmemInitLock);
+}
+
+
+/*
+ * Shared memory space needed: the MAXALIGN'ed size of repmgrSharedState.
+ */
+static Size
+repmgr_memsize(void)
+{
+    return MAXALIGN(sizeof(repmgrSharedState));
+}
+
+
+static bool
+repmgr_set_standby_location(char *locationstr)
+{
+	/* No shared state (see _PG_init): nothing to store, report failure */
+	if (!shared_state)
+		return false;
+
+	LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+	/* strlcpy always NUL-terminates, unlike strncpy on over-long input */
+	strlcpy(shared_state->location, locationstr, MAXFNAMELEN);
+	LWLockRelease(shared_state->lock);
+	return true;
+}
+
+
+/* SQL Functions */
+
+/* Return the last xlog location stored in shared memory, or NULL without shared state */
+Datum
+repmgr_get_last_standby_location(PG_FUNCTION_ARGS)
+{
+	char location[MAXFNAMELEN];
+
+	/* Safety check... */
+	if (!shared_state)
+		PG_RETURN_NULL();
+
+	LWLockAcquire(shared_state->lock, LW_SHARED);
+	strncpy(location, shared_state->location, MAXFNAMELEN);
+	LWLockRelease(shared_state->lock);
+	location[MAXFNAMELEN - 1] = '\0';	/* strncpy may leave it unterminated */
+	PG_RETURN_TEXT_P(cstring_to_text(location));
+}
+
+
+/* Store the xlog location reported by this standby in shared memory */
+Datum
+repmgr_update_standby_location(PG_FUNCTION_ARGS)
+{
+	text	   *location_arg = PG_GETARG_TEXT_P(0);
+	bool		stored;
+
+	/* nothing to update when the shared state was never set up */
+	if (!shared_state)
+		PG_RETURN_BOOL(false);
+
+	stored = repmgr_set_standby_location(text_to_cstring(location_arg));
+
+	PG_RETURN_BOOL(stored);
+}
--- a/sql/repmgr_funcs.sql.in
+++ b/sql/repmgr_funcs.sql.in
@@ -0,0 +1,15 @@
+/*
+ * repmgr_funcs.sql
+ * Copyright (c) 2ndQuadrant, 2010
+ *
+ */
+
+-- SET SEARCH_PATH TO 'repmgr';
+
+CREATE FUNCTION repmgr_update_standby_location(text) RETURNS boolean
+AS 'MODULE_PATHNAME', 'repmgr_update_standby_location'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION repmgr_get_last_standby_location() RETURNS text
+AS 'MODULE_PATHNAME', 'repmgr_get_last_standby_location'
+LANGUAGE C STRICT;
--- a/sql/uninstall_repmgr_funcs.sql
+++ b/sql/uninstall_repmgr_funcs.sql
@@ -0,0 +1,2 @@
+DROP FUNCTION repmgr_update_standby_location(text);
+DROP FUNCTION repmgr_get_last_standby_location();
--- a/strutil.c
+++ b/strutil.c
@@ -1,7 +1,7 @@
 /*
 * strutil.c
 *
- * Copyright (C) 2ndQuadrant, 2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@@ -27,6 +27,15 @@

 static int xvsnprintf(char *str, size_t size, const char *format, va_list ap);

+/* Provide strnlen where it is missing (e.g. OS X). NOTE(review): strnlen is usually a function, not a macro, so this guard is probably always true - confirm */
+#ifndef strnlen
+size_t
+strnlen(const char *s, size_t n)
+{
+	const char *nul = (const char *) memchr(s, '\0', n);
+	return (nul == NULL) ? n : (size_t) (nul - s);
+}
+#endif

 static int
 xvsnprintf(char *str, size_t size, const char *format, va_list ap)
--- a/strutil.h
+++ b/strutil.h
@@ -1,6 +1,6 @@
 /*
 * strutil.h
- * Copyright (C) 2ndQuadrant, 2010-2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 *
 * This program is free software: you can redistribute it and/or modify
@@ -35,4 +35,9 @@ extern int xsnprintf(char *str, size_t size, const char *format, ...);
 extern int sqlquery_snprintf(char *str, const char *format, ...);
 extern int maxlen_snprintf(char *str, const char *format, ...);

+/* Add strnlen on platforms that don't have it, like OS X */
+#ifndef strnlen
+extern size_t strnlen(const char *s, size_t n);
+#endif
+
 #endif	/* _STRUTIL_H_ */
--- a/uninstall_repmgr.sql
+++ b/uninstall_repmgr.sql
@@ -1,7 +1,7 @@
 /*
 * uninstall_repmgr.sql
 *
- * Copyright (C) 2ndQuadrant, 2010-2011
+ * Copyright (C) 2ndQuadrant, 2010-2012
 *
 */

--- a/version.h
+++ b/version.h
@@ -0,0 +1,4 @@
+#ifndef _VERSION_H_
+#define _VERSION_H_
+#define REPMGR_VERSION "2.0beta1"
+#endif