Compare commits

...

142 Commits

Author SHA1 Message Date
Christian Kruse
069f9ff2ed version push 2014-03-17 14:26:56 +01:00
Christian Kruse
b8ade8e908 fixing some documentation errors 2014-03-10 15:51:55 +01:00
Christian Kruse
c0abb3be31 Merge branch 'master' into REL2_0_STABLE 2014-03-06 15:23:52 +01:00
Christian Kruse
fed5c77653 various improvements and bugfixes in the init script 2014-03-06 15:23:22 +01:00
Christian Kruse
8429b43edf Merge pull request #14 from wamonite/fix_follow_user
fix: store the master connection user name on standby follow
2014-03-06 15:20:02 +01:00
Warren Moore
7e55ce737d fix: store the master connection user name on standby follow 2014-03-05 16:49:56 +00:00
Christian Kruse
98c7635fb5 fixing more compiler warnings 2014-03-04 17:58:36 +01:00
Christian Kruse
90ecb2b107 fix: check return values of freopen()
Some compilers complain about not checking the return value of freopen(),
so we check it
2014-03-04 15:32:48 +01:00
Christian Kruse
50b9022a41 fix: don't use Windows newlines 2014-03-04 12:59:23 +01:00
Christian Kruse
150ccc0662 add option to avoid repmgrd started upon installation
Now repmgr.repmgrd.default has another option: REPMGRD_ENABLED. Valid
values are either yes or no.
2014-03-04 12:46:05 +01:00
Christian Kruse
0a71123920 Merge branch 'master' into REL2_0_STABLE 2014-03-03 09:25:08 +01:00
Christian Kruse
0ff14a2aa1 avoid compiler warnings 2014-02-21 13:47:29 +01:00
Christian Kruse
5215265694 fix: now CloseConnections() is much more safe 2014-02-18 17:06:36 +01:00
Christian Kruse
e45ac25348 fix: progname is const, do not free it
The leak is irrelevant
2014-02-18 16:45:35 +01:00
Christian Kruse
a1ce01f033 fix: fixed some leaks 2014-02-18 16:35:29 +01:00
Christian Kruse
516cde621a fix: strcpy() on overlapping memory regions is invalid 2014-02-18 15:42:20 +01:00
Christian Kruse
f0807923a3 fix: gettimeofday() expects two arguments 2014-02-18 15:33:56 +01:00
Christian Kruse
10ca8037f8 added some more log messages
Now we should be able to distinguish different events more easily
2014-02-18 14:10:12 +01:00
Christian Kruse
0dc46f0dc8 fix: set connection to NULL when finishing it
This will avoid CloseConnections() to try to close an already closed connection.
2014-02-18 13:42:49 +01:00
Christian Kruse
c3b58658ad fixing repmgr repl_status columns
repmgr repl_status had the column time_lag which was documented to be
the time a standby is behind master. In fact it only works like this
when viewed on the standby and not on the master: there it only was the
time of the last status update. We dropped that column and replaced it
by a new column „communication_time_lag“ which is the content of the
repl_status column on the master. On the standby we contain the time of
the last update in shared mem though refer always to the correct time
nonetheless where repl_status is queried. We also added a new column,
„replication_time_lag“, which refers to the apply delay.
2014-02-15 01:35:27 +01:00
Christian Kruse
18f1fed77f fixing wait_connection_availability()
wait_connection_availability() did take at least 2 seconds per call in
the old incarnation. Now we may finish a call without any sleep at all
when the result is already ready at the time called
2014-02-15 01:31:12 +01:00
Christian Kruse
d58fd080ca flush stderr after a log message appears
We had the problem that the log file appeared empty for a long time due
to file buffers. Thus we call fflush() after every log message so the
log file gets written out to disk quickly
2014-02-15 01:29:12 +01:00
Christian Kruse
c4ac2d3343 fixing PQexec() calls
fixing several calls where we did not check the result status but only
the return value; the query may fail nonetheless
2014-02-15 01:27:53 +01:00
Christian Kruse
a72c2296e9 Merge branch 'master' into REL2_0_STABLE 2014-02-11 09:28:40 +01:00
Christian Kruse
5ff1beeea7 do not enable autofailover by default
Autofailover is an experimental feature which should not be enabled by
default. The user has to be aware of what he is doing when enabling it.
2014-02-11 09:27:31 +01:00
Christian Kruse
9c3d79147b now version.h contains the right version 2014-02-07 21:47:39 +01:00
Christian Kruse
ca470647cb cleanup of usage text
Now it properly aligns and breaks at 78 characters.
2014-01-30 14:26:17 +01:00
Christian Kruse
62ee287e3f updated TODO 2014-01-30 14:10:14 +01:00
Christian Kruse
729a1b848a release notes for 2.0 stable 2014-01-30 13:59:17 +01:00
Christian Kruse
701cf043fd fix: seems as if I misread -hackers 2014-01-23 16:46:49 +01:00
Christian Kruse
bbb67c55f6 simple past of set is set 2014-01-23 10:50:37 +01:00
Christian Kruse
c2c48a9fe6 removed already finished TODO tasks 2014-01-23 10:48:04 +01:00
Christian Kruse
9d6ac2ebf9 fixed documentation and line endings 2014-01-23 10:39:21 +01:00
Christian Kruse
680f23fb1d copyright push 2014-01-23 10:37:49 +01:00
Christian Kruse
1159113c58 ignore the dynamic shared memory directory, too 2014-01-23 10:02:32 +01:00
Christian Kruse
f25a709454 added an explicit type cast to avoid compiler warnings 2014-01-22 15:17:47 +01:00
Christian Kruse
897daddcc7 removed not needed arguments to avoid compiler warnings 2014-01-22 15:17:28 +01:00
Christian Kruse
0fdcce0477 use if instead of switch and avoid a warning 2014-01-22 15:12:29 +01:00
Christian Kruse
de58eff7c1 added a chdir() for proper daemonizing 2014-01-22 14:30:38 +01:00
Christian Kruse
f2a0b31a20 more log format fixes 2014-01-22 14:30:24 +01:00
Christian Kruse
e007a55967 fix: do not use fsync()
We do not need fsync(), the fflush() is enough to avoid concurrent
logs.
2014-01-22 11:47:50 +01:00
Christian Kruse
d235c696af fix: do not newline at the start of a log line
This breaks the log file format since it will have a line break directly
after the timestamp
2014-01-22 11:47:02 +01:00
Christian Kruse
4ef6fbb5fe do not close stderr but reopen it to /dev/null
We want stderr to be always a valid file descriptor
2014-01-21 16:25:57 +01:00
Christian Kruse
2e61d7b156 refactoring: daemonizing is now a function 2014-01-21 16:19:49 +01:00
Christian Kruse
4496a0761e we now use a function and are more sophisticated
Refactoring part: we now use a function to generate the PID
file. Sophistication: we now check if the PID contained in the file is a
valid PID. We ignore the file if it doesn't.
2014-01-21 16:18:15 +01:00
Christian Kruse
3978ead184 use a second fork to avoid a terminal
after the setsid() we are the process leader. And as a process leader we
are able to open a new terminal, even if we currently don't own one. So
we do another fork and do not call setsid() and not become a process
leader to avoid that.
2014-01-21 15:51:33 +01:00
Christian Kruse
b36dbf61fe reopening stdin and stdout to /dev/null now
stdin, stdout and stderr should always be valid file handles. Thus we
don't close them but reopen them to /dev/null
2014-01-21 15:31:38 +01:00
Christian Kruse
84466ecca5 log_crit() is more appropriate 2014-01-21 15:23:20 +01:00
Christian Kruse
649086e5e4 use unlink() instead of remove()
`remove()` will do a rmdir if necessary - we don't want that. So we use `unlink()`
2014-01-21 15:22:31 +01:00
Christian Kruse
7cf2eb440d renamed config options to a much more descriptive name 2014-01-21 15:19:50 +01:00
Christian Kruse
388bbfb773 split install target into install_prog and install_ext
Patch by Marco Nenciarini <mnencia@debian.org>
2014-01-21 14:23:33 +01:00
Christian Kruse
a89aa02c68 fix: make pg_config be settable from outside the makefile
Patch by Marco Nenciarini <mnencia@debian.org>
2014-01-21 14:22:59 +01:00
Christian Kruse
c81793b63f fix: added forgotten options.priority value
Patch by Marco Nenciarini <mnencia@debian.org>
2014-01-21 14:18:12 +01:00
Christian Kruse
b4e83cf188 Add format attribute checking for printf() like functions
Patch by Marco Nenciarini <mnencia@debian.org>
2014-01-21 14:14:36 +01:00
Christian Kruse
1db61ce277 fix: fail when repmgr_funcs is not pre-loaded
when repmgr_funcs is not pre-loaded `repmgr_update_standby_location()`
will return false and `repmgr_get_last_standby_location()` will return
an empty string. Thus we may end in an endless loop. To avoid that we fail.
2014-01-21 13:54:10 +01:00
Christian Kruse
41abf9a7ef fix: flushing and fsync()ing the log file
When not flushing and fsync()ing it the output may be garbled due to
concurrent writes to the file (system() spawns a child process with
stdin/stdout/stderr inherited from its parent)
2014-01-21 13:52:27 +01:00
Christian Kruse
abebc53ddc fix: sscanf() does not set variables to 0 on error 2014-01-21 13:48:41 +01:00
Christian Kruse
5fc4a0382f added config options sleep_delay and sleep_monitor
sleep_monitor replaces the old SLEEP_MONITOR define and makes it
configurable; this is the interval in which we monitor

sleep_delay replaces the old sleep(300) when waiting for the master to
recover.
2014-01-17 14:35:50 +01:00
Christian Kruse
a7d3c9b93a fix: also close stderr when using syslog logging 2014-01-17 12:14:26 +01:00
Christian Kruse
ee9dc9e247 do not use exit()
We avoid using exit() to be able to clean up when we have to
terminate. This includes removal of the PID file as well as closing
database connections.
2014-01-17 11:28:55 +01:00
Christian Kruse
94cb5b94e7 fix: reopen log file on SIGHUP 2014-01-16 17:16:45 +01:00
Christian Kruse
a08aa50f92 fix: close stdin and stdout only in repmgrd
closing stdin and stdout might cause problems when using system(), so we
avoid it.
2014-01-16 16:01:58 +01:00
Christian Kruse
9563877fbb new config option, stdout/stdin closed
Now stdin and stdout get closed. Additionally stderr gets closed and
reopened to the new config option „logfile“ if specified
2014-01-16 15:22:34 +01:00
Christian Kruse
4f3bd6612c do not exit in getMasterConnection() 2014-01-16 15:07:15 +01:00
Christian Kruse
192ee3cdb0 do not exit in get_cluster_size 2014-01-16 15:07:06 +01:00
Christian Kruse
6f149ead8f do not exit in guc_setted and guc_setted_typed 2014-01-16 14:48:46 +01:00
Christian Kruse
77aa6aa326 do not exit in pg_version 2014-01-16 14:48:42 +01:00
Christian Kruse
18206b3a64 do not exit() in is_witness 2014-01-16 14:28:56 +01:00
Christian Kruse
91446bcf93 fix: do not try to reconnect infinitely 2014-01-10 17:26:02 +01:00
Christian Kruse
dcdf8788ae fix: handle connection loss to standby
We do basically the same as we do for the master since connections drop
from time to time
2014-01-10 17:12:03 +01:00
Christian Kruse
4fabfbbbd0 fix: do not exit in is_standby()
Instead we now return an int with 0 meaning „not a standby,“ 1 meaning
„is a standby“ and -1 meaning „connection dropped“
2014-01-10 17:11:16 +01:00
Christian Kruse
c41030b40e Merge branch 'REL2_0_STABLE'
Conflicts:
	HISTORY
	dbutils.h
	repmgr.c
	repmgrd.c
	version.h
2014-01-10 16:07:33 +01:00
Christian Kruse
a0fdadd5d2 this way it is much cleaner 2014-01-09 15:35:44 +01:00
Christian Kruse
4c3d7f80ed now code compiles with -ansi -pedantic and has less warnings 2014-01-09 14:45:07 +01:00
Christian Kruse
6e3fe059d8 added config options pg_bindir and pg_ctl_options 2014-01-09 14:44:34 +01:00
Christian Kruse
9f26254ac3 fix: added some missing initializers to avoid compiler warning 2014-01-09 13:33:22 +01:00
Christian Kruse
0e8ff1730e added handling of a PID file 2014-01-09 13:04:40 +01:00
Christian Kruse
634fdff303 fix: do not call setup_event_handlers() on WIN32
If we put setup_event_handlers() in #ifdef WIN32, we have to do it for
the call and the declaration, too
2014-01-09 12:57:16 +01:00
Christian Kruse
cbce29f009 fixed typos 2014-01-08 11:55:03 +01:00
Christian Kruse
920f925e4b added a new cli option --daemonize
This option forks the process and generates a new session. This
effectively detaches it from the shell. Don't forget to redirect stderr
or use syslog for logging!
2014-01-08 11:53:15 +01:00
Christian Kruse
9fe2d6886e white space cleanup 2014-01-07 16:42:06 +01:00
Christian Kruse
0068dd573a fix: do not compare pointers but the strings 2014-01-07 15:52:29 +01:00
Christian Kruse
d0f3cb59c7 fix: create data directory after sanity check 2014-01-07 14:42:55 +01:00
Christian Kruse
7428e92e10 fix: correctly check the return value of PQexec()
not only check if return value is not NULL but also check that the
returned result is a PGRES_COMMAND_OK (e.g. the INSERT was successful)
2014-01-07 14:27:31 +01:00
Christian Kruse
a97065113d fix: remove own node earlier if force is set
We have to remove our own node before we check for a new master if force
is set; else master register would fail on the second time since there
already is a master (ourselves), even if we specify -F
2014-01-07 14:16:58 +01:00
Christian Kruse
9e2f276fcf fix: do not exit after pg_start_backup() w/o pg_stop_backup() 2014-01-07 14:02:29 +01:00
Christian Kruse
b0cd2b5e43 fix: do not exit() in create_pgdir()
This could leave the database in a locked state (pg_start_backup()).
And since all calls to create_pgdir() handle the return value correctly
we simply replace the exit() by a return false
2014-01-07 14:01:46 +01:00
Jaime Casanova
9209248420 Fix oversight in the header of guc_setted_typed() 2013-12-19 11:09:08 -05:00
Jaime Casanova
6693b99288 Files to create the debian package
Patch by: Christian Kruse
2013-12-19 01:43:12 -05:00
Jaime Casanova
8e7b487838 Update debian control file 2013-12-19 01:41:24 -05:00
Jaime Casanova
7f796e2d15 Update history and credit files 2013-12-19 01:40:00 -05:00
Jaime Casanova
5e04ab6eae Add a ssh_options parameter to allow ssh checking
to consider non-default values (ie: a different port)

Patch by Jay Taylor
2013-12-19 01:22:55 -05:00
Jaime Casanova
a1f4285e2b Add guc_setted_typed() function to allow
wal_keep_segments to be checked as an integer instead
of text

Patch by Jay Taylor
2013-12-19 01:22:42 -05:00
Jaime Casanova
493133986d Add timestamps to log line in stderr
Patch by Christian Kruse
2013-12-19 01:15:28 -05:00
Jaime Casanova
8b370dc581 Fix some typos
Patch by Krzysztof Gajdemski
2013-12-07 13:25:46 -05:00
Jaime Casanova
43af00aa12 Ignore pg_log when cloning, just like we ignore pg_xlog 2013-12-04 01:23:48 -05:00
Jaime Casanova
3c8df59eb9 Make repmgr compile in 9.3.
Patch provided by Shawn Ellis with some fixes by me.
2013-11-14 00:43:35 -05:00
Jaime Casanova
b410772627 Rework algorithm to coordinate voting
Make this by waiting for all nodes to finish a step, before starting
a new one. So everyone starts promoting or following in a coordinated
fashion.
Also make a few fixes.
2013-09-26 13:24:31 -05:00
Jaime Casanova
d99024ba11 Make repmgrd survive to the failover
To do this it needs to reconnect to the new master
2013-09-26 11:58:59 -05:00
Jaime Casanova
1afaa3a26f Rearrange the logic in do_failover() for further improvements.
Specially, make this a more coordinated process by making all
nodes waiting for the others before going to the next step.

This is one step further in following Andres Freund advices
but there is still a lot to do in order to complete that,
specially it could be needed to add more fields to repl_nodes
and to the shm area.
2013-09-23 18:28:58 -05:00
Jaime Casanova
079a7c9f16 In a failover situation get the nodes in a well defined order.
When deciding which node will be the new master, we should get the
nodes in a well defined order otherwise two standbys could process
nodes with the same priority in different order and end up with
a two master situation.
2013-07-26 00:59:50 -05:00
Jaime Casanova
3b66a31ac9 In a failover situation get the nodes in a well defined order.
When deciding which node will be the new master, we should get the
nodes in a well defined order otherwise two standbys could process
nodes with the same priority in different order and end up with
a two master situation.
2013-07-26 00:52:31 -05:00
Jaime Casanova
bdf957ca52 Add a missing ')'. This is a typo introduced in commit
2bc8044fda

Per complaint from Carlos Chapi when compiling for a customer.
2013-07-13 12:39:13 -05:00
Jaime Casanova
ad3630e7a9 Add a missing ')'. This is a typo introduced in commit
2bc8044fda

Per complaint from Carlos Chapi when compiling for a customer.
2013-07-13 12:37:15 -05:00
Jaime Casanova
67b451aa45 If PQgetCancel() returns NULL we should also return false.
Noted by Andres Freund.
2013-07-12 08:03:36 -05:00
Jaime Casanova
0a70d907ae Improve messages in wait_connection_availability, so we know what
error makes the failover procedure to start

By gripe from Andres Freund
2013-07-12 08:03:25 -05:00
Jaime Casanova
2e7acf03c4 If PQgetCancel() returns NULL we should also return false.
Noted by Andres Freund.
2013-07-12 08:01:01 -05:00
Jaime Casanova
2bc8044fda Improve messages in wait_connection_availability, so we know what
error makes the failover procedure to start

By gripe from Andres Freund
2013-07-10 19:25:58 -05:00
Jaime Casanova
ab1d380843 If PQcancel() fails, consider it as if the master is failing.
Because PQcancel() establishes a new synchronous connection to the
database, if it fails it means something wrong has happened with
master. So instead of just ignore the failure, CancelQuery() now
reports a failure condition so we can detect master's death in
that situation.

This is very important specially when only postmaster crashes but
other children/backend connections are still there. Because the
children connection won't fail and CancelQuery() failure is our
only indication of something wrong happening.
Currently we just ignore the PQcancel() failure which leads us to
a situation in which we just loop forever
trying to cancel the async query.

Reported by: Martin Euser <martin.euser@nl.abnamro.com>
Problem analyzed and bug spotted by: Andres Freund <andres@2ndquadrant.com>
Patch by: Jaime Casanova <jaime@2ndquadrant.com>
2013-07-10 10:21:51 -05:00
Jaime Casanova
b0b44a157f If PQcancel() fails, consider it as if the master is failing.
Because PQcancel() establishes a new synchronous connection to the
database, if it fails it means something wrong has happened with
master. So instead of just ignore the failure, CancelQuery() now
reports a failure condition so we can detect master's death in
that situation.

This is very important specially when only postmaster crashes but
other children/backend connections are still there. Because the
children connection won't fail and CancelQuery() failure is our
only indication of something wrong happening.
Currently we just ignore the PQcancel() failure which leads us to
a situation in which we just loop forever
trying to cancel the async query.

Reported by: Martin Euser <martin.euser@nl.abnamro.com>
Problem analyzed and bug spotted by: Andres Freund <andres@2ndquadrant.com>
Patch by: Jaime Casanova <jaime@2ndquadrant.com>
2013-07-10 09:53:45 -05:00
Jaime Casanova
49a2531930 Options -F -W -I -v doesn't accept arguments, which means that on
getopt_long shouldn't be marked with the colon (:) character.

This has been wrong since day one, so backpatching all the way until
1.1
2013-01-13 16:37:39 -05:00
Jaime Casanova
672b237c4e Options -F -W -I -v doesn't accept arguments, which means that on
getopt_long shouldn't be marked with the colon (:) character.

This has been wrong since day one, so backpatching all the way until
1.1
2013-01-13 16:32:56 -05:00
Jaime Casanova
7d94151494 If the node is a witness don't bother asking its position, it always
will be 0/0. We just need to check that we can connect to it to determine
if we are in the majority.
2013-01-11 03:44:50 -05:00
Jaime Casanova
4191b77e70 If the node is a witness don't bother asking its position, it always
will be 0/0. We just need to check that we can connect to it to determine
if we are in the majority.
2013-01-11 03:42:08 -05:00
Jaime Casanova
2a5d431481 Fix a problem that caused a standby to promote itself without going to
voting procedure.

This is because of a race condition inside CheckPrimaryConnection().

This has been independently reported by Alex Railean and Dumitru, and Frank Jördens.
Analyzed and fixed by Cédric Villemain.

The fix has been verified to work by Frank
2012-12-19 12:01:27 -05:00
Jaime Casanova
81b8a944de Fix a problem that caused a standby to promote itself without going to
voting procedure.

This is because of a race condition inside CheckPrimaryConnection().

This has been independently reported by Alex Railean and Dumitru, and Frank Jördens.
Analyzed and fixed by Cédric Villemain.

The fix has been verified to work by Frank
2012-12-19 11:45:58 -05:00
Jaime Casanova
93a999adc7 Formatting code using astyle 2012-12-11 11:49:07 -05:00
Jaime Casanova
1b69282df9 Formatting code using astyle 2012-12-11 11:47:59 -05:00
Jaime Casanova
06dd252f69 To select new master it needs to know which standby has received more
xlog records from master, so it standby should use pg_last_xlog_receive_location()
to report their positions. This solves a possible situation in which
a standby that is considered as new master when promoted is no longer
the best option.
2012-12-03 09:27:12 -05:00
Jaime Casanova
088ca29fe3 To select new master it needs to know which standby has received more
xlog records from master, so it standby should use pg_last_xlog_receive_location()
to report their positions. This solves a possible situation in which
a standby that is considered as new master when promoted is no longer
the best option.
2012-12-03 09:18:08 -05:00
Jaime Casanova
30e9d06172 Add an option for STANDBY FOLLOW to wait for a master to appear.
This is important for autofailover to do the right thing when
standbys detected master death at different times.

While this is a new option, seems important for the autofailover
to work properly so i will consider the lack of it a bug and
will backpatch to 2.0 where autofailover was introduced.

For gripe from Alex Railean, about a standby not finding the new
master because the new master hasn't finish promoting.
2012-11-14 15:09:26 -05:00
Jaime Casanova
d6bd5aa381 Add an option for STANDBY FOLLOW to wait for a master to appear.
This is important for autofailover to do the right thing when
standbys detected master death at different times.

While this is a new option, seems important for the autofailover
to work properly so i will consider the lack of it a bug and
will backpatch to 2.0 where autofailover was introduced.

For gripe from Alex Railean, about a standby not finding the new
master because the new master hasn't finish promoting.
2012-11-14 15:07:59 -05:00
Gabriele Bartolini
bbdcffa813 Fixed typos notified by lintian 2012-11-09 18:09:43 +01:00
Jaime Casanova
cd1a84252e Fix node decision logic when priorities are involved. Currently if
two nodes with different priorities are equally good to be promoted
the second one (with a lower priority, considering them
in descending order) will win.

Per report from Brailean Dumitru
2012-09-16 02:47:02 -05:00
Jaime Casanova
5f33d9d715 Fix node decision logic when priorities are involved. Currently if
two nodes with different priorities are equally good to be promoted
the second one (with a lower priority, considering them
in descending order) will win.

Per report from Brailean Dumitru
2012-09-16 02:38:28 -05:00
Jaime Casanova
2e19b3688b Add a comment 2012-09-16 02:26:18 -05:00
Jaime Casanova
877f4cf82e Add a comment 2012-09-16 02:23:16 -05:00
Jaime Casanova
de883a4c84 Keep compiler quiet. Noted when compiling in FreeBSD in which i
get a warning for an uninitialized variable.

Also, define InvalidXLogRecPtr. We don't really need it but using
it make the initialization future proof (considering that in 9.3
XLogRecPtr will change its structure).
2012-09-16 02:21:18 -05:00
Jaime Casanova
949f5ee498 Keep compiler quiet. Noted when compiling in FreeBSD in which i
get a warning for an uninitialized variable.

Also, define InvalidXLogRecPtr. We don't really need it but using
it make the initialization future proof (considering that in 9.3
XLogRecPtr will change its structure).
2012-09-16 02:10:02 -05:00
Jaime Casanova
eb2f7efb4a When we have more command-line arguments than we should have we
need to show that last value and we should use only optind for that
instead of optind+1
2012-09-15 17:39:10 -05:00
Jaime Casanova
85ff3ec286 Fix documentation to always use -h syntax to refer to the node we
want to clone or connect to, instead of relying on the fact that
for some time putting that argument at last worked.
2012-09-15 17:38:42 -05:00
Jaime Casanova
499a501afd Make repmgr compatible with FreeBSD.
We need to add an #include and make it use a different path for the
"true" binary.

Maybe we need to make this changes for all BSD systems but having no
evidence of that i prefer to make this only for systems with __FreeBSD__
2012-09-15 17:37:59 -05:00
Jaime Casanova
0a9107d76d Improve sample of commands for promote and follow 2012-09-15 17:37:43 -05:00
Jaime Casanova
2803bb92a8 Make repmgr compatible with FreeBSD.
We need to add an #include and make it use a different path for the
"true" binary.

Maybe we need to make this changes for all BSD systems but having no
evidence of that i prefer to make this only for systems with __FreeBSD__
2012-09-15 17:32:38 -05:00
Jaime Casanova
16fe41eecf Improve sample of commands for promote and follow 2012-09-11 15:53:57 -05:00
Jaime Casanova
95ec0450da When we have more command-line arguments than we should have we
need to show that last value and we should use only optind for that
instead of optind+1
2012-08-30 02:11:48 -05:00
Jaime Casanova
57aa95f674 Fix documentation to always use -h syntax to refer to the node we
want to clone or connect to, instead of relying on the fact that
for some time putting that argument at last worked.
2012-08-30 02:10:10 -05:00
Jaime Casanova
d365a309fc Fix HISTORY to show from newest to oldest 2012-07-27 11:29:07 -05:00
Jaime Casanova
d5a41bb587 Fix tabs in HISTORY 2012-07-27 11:22:04 -05:00
Jaime Casanova
474d3217b4 Fix typos in RELEASE NOTES 2012-07-27 11:21:49 -05:00
Jaime Casanova
7a00d5a9a4 Now that we can have no monitoring we need to check all nodes at failover
not only those in repl_monitor
2012-07-21 17:53:15 -05:00
Jaime Casanova
5683b905dd New development branch is 2.1dev 2012-07-21 12:22:04 -05:00
32 changed files with 1849 additions and 818 deletions

View File

@@ -1,4 +1,4 @@
Copyright (c) 2010-2012, 2ndQuadrant Limited Copyright (c) 2010-2014, 2ndQuadrant Limited
All rights reserved. All rights reserved.
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify

View File

@@ -10,3 +10,7 @@ Hannu Krosing <hannu@2ndQuadrant.com>
Cédric Villemain <cedric@2ndquadrant.com> Cédric Villemain <cedric@2ndquadrant.com>
Charles Duffy <charles@dyfis.net> Charles Duffy <charles@dyfis.net>
Daniel Farina <daniel@heroku.com> Daniel Farina <daniel@heroku.com>
Shawn Ellis <shawn.ellis17@gmail.com>
Jay Taylor <jay@jaytaylor.com>
Christian Kruse <christian@2ndQuadrant.com>
Krzysztof Gajdemski <songo@debian.org.pl>

20
HISTORY
View File

@@ -1,4 +1,22 @@
2.0beta 2012-07-27 2.0stable 2014-01-30
Documentation fixes (Christian)
General refactoring, code quality improvements and stabilization work (Christian)
Added proper daemonizing (-d/--daemonize) (Christian)
Added PID file handling (-p/--pid-file) (Christian)
New config option: monitor_interval_secs (Christian)
New config option: retry_promote_interval (Christian)
New config option: logfile (Christian)
New config option: pg_bindir (Christian)
New config option: pgctl_options (Christian)
2.0beta2 2013-12-19
Improve autofailover logic and algorithms (Jaime, Andres)
Ignore pg_log when cloning (Jaime)
Add timestamps to log line in stderr (Christian)
Correctly check wal_keep_segments (Jay Taylor)
Add a ssh_options parameter (Jay Taylor)
2.0beta1 2012-07-27
Make CLONE command try to make an exact copy including $PGDATA location (Cedric) Make CLONE command try to make an exact copy including $PGDATA location (Cedric)
Add detection of master failure (Jaime) Add detection of master failure (Jaime)
Add the notion of a witness server (Jaime) Add the notion of a witness server (Jaime)

View File

@@ -1,6 +1,6 @@
# #
# Makefile # Makefile
# Copyright (c) 2ndQuadrant, 2010-2012 # Copyright (c) 2ndQuadrant, 2010-2014
repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o repmgrd_OBJS = dbutils.o config.o repmgrd.o log.o strutil.o
repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o repmgr_OBJS = dbutils.o check_dir.o config.o repmgr.o log.o strutil.o
@@ -21,7 +21,8 @@ repmgr: $(repmgr_OBJS)
$(CC) $(CFLAGS) $(repmgr_OBJS) $(PG_LIBS) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o repmgr $(CC) $(CFLAGS) $(repmgr_OBJS) $(PG_LIBS) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o repmgr
ifdef USE_PGXS ifdef USE_PGXS
PGXS := $(shell pg_config --pgxs) PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS) include $(PGXS)
else else
subdir = contrib/repmgr subdir = contrib/repmgr
@@ -32,9 +33,13 @@ endif
# XXX: Try to use PROGRAM construct (see pgxs.mk) someday. Right now # XXX: Try to use PROGRAM construct (see pgxs.mk) someday. Right now
# is overriding pgxs install. # is overriding pgxs install.
install: install: install_prog install_ext
install_prog:
$(INSTALL_PROGRAM) repmgrd$(X) '$(DESTDIR)$(bindir)' $(INSTALL_PROGRAM) repmgrd$(X) '$(DESTDIR)$(bindir)'
$(INSTALL_PROGRAM) repmgr$(X) '$(DESTDIR)$(bindir)' $(INSTALL_PROGRAM) repmgr$(X) '$(DESTDIR)$(bindir)'
install_ext:
$(MAKE) -C sql install $(MAKE) -C sql install
ifneq (,$(DATA)$(DATA_built)) ifneq (,$(DATA)$(DATA_built))

View File

@@ -625,18 +625,18 @@ Now restore to the original configuration by stopping
primary server, then bringing up "node2" as a standby with a valid primary server, then bringing up "node2" as a standby with a valid
``recovery.conf`` file. ``recovery.conf`` file.
Stop the "node2" server:: Stop the "node2" server and type the following on "node1" server::
repmgr -f /var/lib/pgsql/repmgr/repmgr.conf standby promote repmgr -f /var/lib/pgsql/repmgr/repmgr.conf standby promote
Now the original primary, "node1" is acting again as primary. Now the original primary, "node1", is acting again as primary.
Start the "node2" server and type this on "node1":: Start the "node2" server and type this on "node2"::
repmgr standby clone --force -h node2 -p 5432 -U postgres -R postgres --verbose repmgr standby clone --force -h node2 -p 5432 -U postgres -R postgres --verbose
Verify the roles have reversed by attempting to insert a record on "node" Verify the roles have reversed by attempting to insert a record on "node1"
and on "node1". and on "node2".
The servers are now again acting as primary on "node1" and standby on "node2". The servers are now again acting as primary on "node1" and standby on "node2".
@@ -1085,7 +1085,7 @@ License and Contributions
========================= =========================
repmgr is licensed under the GPL v3. All of its code and documentation is repmgr is licensed under the GPL v3. All of its code and documentation is
Copyright 2010-2012, 2ndQuadrant Limited. See the files COPYRIGHT and LICENSE for Copyright 2010-2014, 2ndQuadrant Limited. See the files COPYRIGHT and LICENSE for
details. details.
Main sponsorship of repmgr has been from 2ndQuadrant customers. Main sponsorship of repmgr has been from 2ndQuadrant customers.

19
TODO
View File

@@ -1,21 +1,18 @@
Known issues in repmgr Known issues in repmgr
====================== ======================
* The check for whether ``wal_keep_segments`` is considered large enough
does a string comparison rather than an integer one. It can give both
false positive (setting is large enough but flagged as too small) and
false negative (setting is too small but not noted as such) errors.
* When running repmgr against a remote machine, operations that start * When running repmgr against a remote machine, operations that start
the database server using the ``pg_ctl`` command may accidentally the database server using the ``pg_ctl`` command may accidentally
terminate after their associated ssh session ends. terminate after their associated ssh session ends.
* After running repmgrd as a regular foreground application, hitting
control-C causes the program to crash.
Planned feature improvements Planned feature improvements
============================ ============================
* Before running ``pg_start_backup()``, a sanity check that there is a * Timeline increases when promoting a standby
a working ssh connection to the destination would help find
configuration errors before disturbing the database. * A better check which standby did receive most of the data
* Make the fact that a standby may be delayed a factor in the voting
algorithm
* include support for delayed standbys

View File

@@ -1,213 +1,225 @@
===================================================== =====================================================
PostgreSQL Automatic Fail-Over - User Documentation PostgreSQL Automatic Fail-Over - User Documentation
===================================================== =====================================================
Automatic Failover Automatic Failover
================== ==================
repmgr allows setups for automatic failover when it detects the failure of the master node. repmgr allows setups for automatic failover when it detects the failure of the master node.
Following is a quick setup for this. Following is a quick setup for this.
Installation Installation
============ ============
For convenience, we define: For convenience, we define:
* node1 is the hostname fully qualified of the Master server, IP 192.168.1.10 **node1**
* node2 is the hostname fully qualified of the Standby server, IP 192.168.1.11 is the hostname fully qualified of the Master server, IP 192.168.1.10
* witness is the hostname fully qualified of the server used for witness, IP 192.168.1.12 **node2**
is the hostname fully qualified of the Standby server, IP 192.168.1.11
:Note: It is not recommended to use a name defining the status of a server like «masterserver», **witness**
this is a name leading to confusion once a failover takes place and the Master is is the hostname fully qualified of the server used for witness, IP 192.168.1.12
now on the «standbyserver».
**Note:** It is not recommended to use a name defining the status of a server like «masterserver»,
Summary this is a name leading to confusion once a failover takes place and the Master is
------- now on the «standbyserver».
2 PostgreSQL servers are involved in the replication. Automatic fail-over need Summary
to vote to decide what server it should promote, thus an odd number is required -------
and a witness-repmgrd is installed in a third server where it uses a PostgreSQL
cluster to communicate with other repmgrd daemons. 2 PostgreSQL servers are involved in the replication. Automatic fail-over need
to vote to decide what server it should promote, thus an odd number is required
1. Install PostgreSQL in all the servers involved (including the server used for and a witness-repmgrd is installed in a third server where it uses a PostgreSQL
witness) cluster to communicate with other repmgrd daemons.
2. Install repmgr in all the servers involved (including the server used for witness)
3. Configure the Master PostgreSQL 1. Install PostgreSQL in all the servers involved (including the server used for
4. Clone the Master to the Standby using "repmgr standby clone" command witness)
5. Configure repmgr in all the servers involved (including the server used for witness)
6. Register Master and Standby nodes 2. Install repmgr in all the servers involved (including the server used for witness)
7. Initiate witness server
8. Start the repmgrd daemons in all nodes 3. Configure the Master PostgreSQL
:Note: A complete High-Availability design needs at least 3 servers to still have 4. Clone the Master to the Standby using "repmgr standby clone" command
a backup node after a first failure.
5. Configure repmgr in all the servers involved (including the server used for witness)
Install PostgreSQL
------------------ 6. Register Master and Standby nodes
You can install PostgreSQL using any of the recommended methods. You should ensure 7. Initiate witness server
it's 9.0 or superior.
8. Start the repmgrd daemons in all nodes
Install repmgr
**Note** A complete High-Availability design needs at least 3 servers to still have
a backup node after a first failure.
Install repmgr following the steps in the README.
Install PostgreSQL
Configure PostgreSQL ------------------
--------------------
You can install PostgreSQL using any of the recommended methods. You should ensure
Log in node1. it's 9.0 or superior.
Edit the file postgresql.conf and modify the parameters:: Install repmgr
--------------
listen_addresses='*'
wal_level = 'hot_standby' Install repmgr following the steps in the README.
archive_mode = on
archive_command = 'cd .' # we can also use exit 0, anything that Configure PostgreSQL
# just does nothing --------------------
max_wal_senders = 10
wal_keep_segments = 5000 # 80 GB required on pg_xlog Log in node1.
hot_standby = on
shared_preload_libraries = 'repmgr_funcs' Edit the file postgresql.conf and modify the parameters::
Edit the file pg_hba.conf and add lines for the replication:: listen_addresses='*'
wal_level = 'hot_standby'
host repmgr repmgr 127.0.0.1/32 trust archive_mode = on
host repmgr repmgr 192.168.1.10/30 trust archive_command = 'cd .' # we can also use exit 0, anything that
host replication all 192.168.1.10/30 trust # just does nothing
max_wal_senders = 10
:Note: It is also possible to use a password authentication (md5), .pgpass file wal_keep_segments = 5000 # 80 GB required on pg_xlog
should be edited to allow connection between each node. hot_standby = on
shared_preload_libraries = 'repmgr_funcs'
Create the user and database to manage replication::
Edit the file pg_hba.conf and add lines for the replication::
su - postgres
createuser -s repmgr host repmgr repmgr 127.0.0.1/32 trust
createdb -O repmgr repmgr host repmgr repmgr 192.168.1.10/30 trust
psql -f /usr/share/postgresql/9.0/contrib/repmgr_funcs.sql repmgr host replication all 192.168.1.10/30 trust
Restart the PostgreSQL server:: **Note:** It is also possible to use a password authentication (md5), .pgpass file
should be edited to allow connection between each node.
pg_ctl -D $PGDATA restart
Create the user and database to manage replication::
And check everything is fine in the server log.
su - postgres
Create the ssh-key for the postgres user and copy it to other servers:: createuser -s repmgr
createdb -O repmgr repmgr
su - postgres psql -f /usr/share/postgresql/9.0/contrib/repmgr_funcs.sql repmgr
ssh-keygen # /!\ do not use a passphrase /!\
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys Restart the PostgreSQL server::
chmod 600 ~/.ssh/authorized_keys
exit pg_ctl -D $PGDATA restart
rsync -avz ~postgres/.ssh/authorized_keys node2:~postgres/.ssh/
rsync -avz ~postgres/.ssh/authorized_keys witness:~postgres/.ssh/ And check everything is fine in the server log.
rsync -avz ~postgres/.ssh/id_rsa* node2:~postgres/.ssh/
rsync -avz ~postgres/.ssh/id_rsa* witness:~postgres/.ssh/ Create the ssh-key for the postgres user and copy it to other servers::
Clone Master su - postgres
------------ ssh-keygen # /!\ do not use a passphrase /!\
cat ~/.ssh/id_rsa.pub > ~/.ssh/authorized_keys
Log in node2. chmod 600 ~/.ssh/authorized_keys
exit
Clone the node1 (the current Master):: rsync -avz ~postgres/.ssh/authorized_keys node2:~postgres/.ssh/
rsync -avz ~postgres/.ssh/authorized_keys witness:~postgres/.ssh/
su - postgres rsync -avz ~postgres/.ssh/id_rsa* node2:~postgres/.ssh/
repmgr -d repmgr -U repmgr standby clone node1 rsync -avz ~postgres/.ssh/id_rsa* witness:~postgres/.ssh/
Start the PostgreSQL server:: Clone Master
------------
pg_ctl -D $PGDATA start
Log in node2.
And check everything is fine in the server log.
Clone the node1 (the current Master)::
Configure repmgr
---------------- su - postgres
repmgr -d repmgr -U repmgr -h node1 standby clone
Log in each server and configure repmgr by editing the file
/etc/repmgr/repmgr.conf:: Start the PostgreSQL server::
cluster=my_cluster pg_ctl -D $PGDATA start
node=1
node_name=earth And check everything is fine in the server log.
conninfo='host=192.168.1.10 dbname=repmgr user=repmgr'
master_response_timeout=60 Configure repmgr
reconnect_attempts=6 ----------------
reconnect_interval=10
failover=automatic Log in each server and configure repmgr by editing the file
promote_command='promote_command.sh' /etc/repmgr/repmgr.conf::
follow_command='repmgr standby follow -f /etc/repmgr/repmgr.conf'
cluster=my_cluster
* *cluster* is the name of the current replication. node=1
* *node* is the number of the current node (1, 2 or 3 in the current example). node_name=earth
* *node_name* is an identifier for every node. conninfo='host=192.168.1.10 dbname=repmgr user=repmgr'
* *conninfo* is used to connect to the local PostgreSQL server (where the configuration file is) from any node. In the witness server configuration it is needed to add a 'port=5499' to the conninfo. master_response_timeout=60
* *master_response_timeout* is the maximum amount of time we are going to wait before deciding the master has died and start failover procedure. reconnect_attempts=6
* *reconnect_attempts* is the number of times we will try to reconnect to master after a failure has been detected and before start failover procedure. reconnect_interval=10
* *reconnect_interval* is the amount of time between retries to reconnect to master after a failure has been detected and before start failover procedure. failover=automatic
* *failover* configure behavior : *manual* or *automatic*. promote_command='promote_command.sh'
* *promote_command* the command executed to do the failover (including the PostgreSQL failover itself). The command must return 0 on success. follow_command='repmgr standby follow -f /etc/repmgr/repmgr.conf'
* *follow_command* the command executed to address the current standby to another Master. The command must return 0 on success.
**cluster**
Register Master and Standby is the name of the current replication.
--------------------------- **node**
is the number of the current node (1, 2 or 3 in the current example).
Log in node1. **node_name**
is an identifier for every node.
Register the node as Master:: **conninfo**
is used to connect to the local PostgreSQL server (where the configuration file is) from any node. In the witness server configuration it is needed to add a 'port=5499' to the conninfo.
su - postgres **master_response_timeout**
repmgr -f /etc/repmgr/repmgr.conf master register is the maximum amount of time we are going to wait before deciding the master has died and start failover procedure.
**reconnect_attempts**
Log in node2. is the number of times we will try to reconnect to master after a failure has been detected and before start failover procedure.
**reconnect_interval**
Register the node as Standby:: is the amount of time between retries to reconnect to master after a failure has been detected and before start failover procedure.
**failover**
su - postgres configure behavior: *manual* or *automatic*.
repmgr -f /etc/repmgr/repmgr.conf standby register **promote_command**
the command executed to do the failover (including the PostgreSQL failover itself). The command must return 0 on success.
Initialize witness server **follow_command**
------------------------- the command executed to address the current standby to another Master. The command must return 0 on success.
Log in witness. Register Master and Standby
---------------------------
Initialize the witness server::
Log in node1.
su - postgres
repmgr -d repmgr -U repmgr -h 192.168.1.10 -D $WITNESS_PGDATA -f /etc/repmgr/repmgr.conf witness create node1 Register the node as Master::
It needs information to connect to the master to copy the configuration of the cluster, also it needs to know where it should initialize its own $PGDATA. su - postgres
As part of the process it also asks for the superuser password so it can connect when needed. repmgr -f /etc/repmgr/repmgr.conf master register
Start the repmgrd daemons Log in node2. Register it as a standby::
-------------------------
su - postgres
Log in node2 and witness. repmgr -f /etc/repmgr/repmgr.conf standby register
su - postgres Initialize witness server
repmgrd -f /etc/repmgr/repmgr.conf > /var/log/postgresql/repmgr.log 2>&1 -------------------------
:Note: The Master does not need a repmgrd daemon. Log in witness.
Initialize the witness server::
Suspend Automatic behavior
========================== su - postgres
repmgr -d repmgr -U repmgr -h 192.168.1.10 -D $WITNESS_PGDATA -f /etc/repmgr/repmgr.conf witness create
Edit the repmgr.conf of the node to remove from automatic processing and change::
It needs information to connect to the master to copy the configuration of the cluster, also it needs to know where it should initialize its own $PGDATA.
failover=manual As part of the process it also asks for the superuser password so it can connect when needed.
Then, signal repmgrd daemon:: Start the repmgrd daemons
-------------------------
su - postgres
kill -HUP `pidof repmgrd` Log in node2 and witness.
TODO : -HUP configuration update is not implemented and it should check its su - postgres
configuration file against its configuration in DB, updating repmgrd -f /etc/repmgr/repmgr.conf > /var/log/postgresql/repmgr.log 2>&1
accordingly the SQL conf (especially the failover manual or auto)
this allow witness-standby and standby-not-promotable features **Note:** The Master does not need a repmgrd daemon.
and simpler usage of the tool ;)
Usage Suspend Automatic behavior
===== ==========================
The repmgr documentation is in the README file (how to build, options, etc.) Edit the repmgr.conf of the node to remove from automatic processing and change::
failover=manual
Then, signal repmgrd daemon::
su - postgres
kill -HUP `pidof repmgrd`
Usage
=====
The repmgr documentation is in the README file (how to build, options, etc.)

View File

@@ -1,6 +1,6 @@
/* /*
* check_dir.c - Directories management functions * check_dir.c - Directories management functions
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -127,10 +127,10 @@ mkdir_p(char *path, mode_t omode)
{ {
struct stat sb; struct stat sb;
mode_t numask, mode_t numask,
oumask; oumask;
int first, int first,
last, last,
retval; retval;
char *p; char *p;
p = path; p = path;
@@ -225,12 +225,12 @@ is_pg_dir(char *dir)
struct stat sb; struct stat sb;
int r; int r;
// test pgdata /* test pgdata */
xsnprintf(path, buf_sz, "%s/PG_VERSION", dir); xsnprintf(path, buf_sz, "%s/PG_VERSION", dir);
if (stat(path, &sb) == 0) if (stat(path, &sb) == 0)
return true; return true;
// test tablespace dir /* test tablespace dir */
sprintf(path, "ls %s/PG_*/ -I*", dir); sprintf(path, "ls %s/PG_*/ -I*", dir);
r = system(path); r = system(path);
if (r == 0) if (r == 0)
@@ -256,7 +256,7 @@ create_pgdir(char *dir, bool force)
{ {
log_err(_("couldn't create directory \"%s\"...\n"), log_err(_("couldn't create directory \"%s\"...\n"),
dir); dir);
exit(ERR_BAD_CONFIG); return false;
} }
break; break;
case 1: case 1:
@@ -268,7 +268,7 @@ create_pgdir(char *dir, bool force)
{ {
log_err(_("could not change permissions of directory \"%s\": %s\n"), log_err(_("could not change permissions of directory \"%s\": %s\n"),
dir, strerror(errno)); dir, strerror(errno));
exit(ERR_BAD_CONFIG); return false;
} }
break; break;
case 2: case 2:
@@ -293,7 +293,7 @@ create_pgdir(char *dir, bool force)
"If you are sure you want to clone here, " "If you are sure you want to clone here, "
"please check there is no PostgreSQL server " "please check there is no PostgreSQL server "
"running and use the --force option\n")); "running and use the --force option\n"));
exit(ERR_BAD_CONFIG); return false;
} }
return false; return false;
@@ -301,7 +301,7 @@ create_pgdir(char *dir, bool force)
/* Trouble accessing directory */ /* Trouble accessing directory */
log_err(_("could not access directory \"%s\": %s\n"), log_err(_("could not access directory \"%s\": %s\n"),
dir, strerror(errno)); dir, strerror(errno));
exit(ERR_BAD_CONFIG); return false;
} }
return true; return true;
} }

View File

@@ -1,6 +1,6 @@
/* /*
* check_dir.h * check_dir.h
* Copyright (c) 2ndQuadrant, 2010-2012 * Copyright (c) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by

View File

@@ -1,6 +1,6 @@
/* /*
* config.c - Functions to parse the config file * config.c - Functions to parse the config file
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -41,6 +41,9 @@ parse_config(const char *config_file, t_configuration_options *options)
memset(options->promote_command, 0, sizeof(options->promote_command)); memset(options->promote_command, 0, sizeof(options->promote_command));
memset(options->follow_command, 0, sizeof(options->follow_command)); memset(options->follow_command, 0, sizeof(options->follow_command));
memset(options->rsync_options, 0, sizeof(options->rsync_options)); memset(options->rsync_options, 0, sizeof(options->rsync_options));
memset(options->ssh_options, 0, sizeof(options->ssh_options));
memset(options->pg_bindir, 0, sizeof(options->pg_bindir));
memset(options->pgctl_options, 0, sizeof(options->pgctl_options));
/* if nothing has been provided defaults to 60 */ /* if nothing has been provided defaults to 60 */
options->master_response_timeout = 60; options->master_response_timeout = 60;
@@ -49,6 +52,9 @@ parse_config(const char *config_file, t_configuration_options *options)
options->reconnect_attempts = 6; options->reconnect_attempts = 6;
options->reconnect_intvl = 10; options->reconnect_intvl = 10;
options->monitor_interval_secs = 2;
options->retry_promote_interval_secs = 300;
/* /*
* Since some commands don't require a config file at all, not * Since some commands don't require a config file at all, not
* having one isn't necessarily a problem. * having one isn't necessarily a problem.
@@ -78,6 +84,8 @@ parse_config(const char *config_file, t_configuration_options *options)
strncpy (options->conninfo, value, MAXLEN); strncpy (options->conninfo, value, MAXLEN);
else if (strcmp(name, "rsync_options") == 0) else if (strcmp(name, "rsync_options") == 0)
strncpy (options->rsync_options, value, QUERY_STR_LEN); strncpy (options->rsync_options, value, QUERY_STR_LEN);
else if (strcmp(name, "ssh_options") == 0)
strncpy (options->ssh_options, value, QUERY_STR_LEN);
else if (strcmp(name, "loglevel") == 0) else if (strcmp(name, "loglevel") == 0)
strncpy (options->loglevel, value, MAXLEN); strncpy (options->loglevel, value, MAXLEN);
else if (strcmp(name, "logfacility") == 0) else if (strcmp(name, "logfacility") == 0)
@@ -111,6 +119,16 @@ parse_config(const char *config_file, t_configuration_options *options)
options->reconnect_attempts = atoi(value); options->reconnect_attempts = atoi(value);
else if (strcmp(name, "reconnect_interval") == 0) else if (strcmp(name, "reconnect_interval") == 0)
options->reconnect_intvl = atoi(value); options->reconnect_intvl = atoi(value);
else if (strcmp(name, "pg_bindir") == 0)
strncpy (options->pg_bindir, value, MAXLEN);
else if (strcmp(name, "pg_ctl_options") == 0)
strncpy (options->pgctl_options, value, MAXLEN);
else if (strcmp(name, "logfile") == 0)
strncpy(options->logfile, value, MAXLEN);
else if (strcmp(name, "monitor_interval_secs") == 0)
options->monitor_interval_secs = atoi(value);
else if (strcmp(name, "retry_promote_interval_secs") == 0)
options->retry_promote_interval_secs = atoi(value);
else else
log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value); log_warning(_("%s/%s: Unknown name/value pair!\n"), name, value);
} }
@@ -148,6 +166,12 @@ parse_config(const char *config_file, t_configuration_options *options)
log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n")); log_err(_("Reconnect intervals must be zero or greater. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG); exit(ERR_BAD_CONFIG);
} }
if (*options->pg_bindir == '\0')
{
log_err(_("pg_bindir config value not found. Check the configuration file.\n"));
exit(ERR_BAD_CONFIG);
}
} }
@@ -167,7 +191,9 @@ trim (char *s)
++s1; ++s1;
/* Copy finished string */ /* Copy finished string */
strcpy (s, s1); memmove (s, s1, s2 - s1);
s[s2 - s1 + 1] = '\0';
return s; return s;
} }
@@ -218,49 +244,49 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
parse_config(config_file, &new_options); parse_config(config_file, &new_options);
if (new_options.node == -1) if (new_options.node == -1)
{ {
log_warning(_("\nCannot load new configuration, will keep current one.\n")); log_warning(_("Cannot load new configuration, will keep current one.\n"));
return false; return false;
} }
if (strcmp(new_options.cluster_name, orig_options->cluster_name) != 0) if (strcmp(new_options.cluster_name, orig_options->cluster_name) != 0)
{ {
log_warning(_("\nCannot change cluster name, will keep current configuration.\n")); log_warning(_("Cannot change cluster name, will keep current configuration.\n"));
return false; return false;
} }
if (new_options.node != orig_options->node) if (new_options.node != orig_options->node)
{ {
log_warning(_("\nCannot change node number, will keep current configuration.\n")); log_warning(_("Cannot change node number, will keep current configuration.\n"));
return false; return false;
} }
if (new_options.node_name != orig_options->node_name) if (strcmp(new_options.node_name, orig_options->node_name) != 0)
{ {
log_warning(_("\nCannot change standby name, will keep current configuration.\n")); log_warning(_("Cannot change standby name, will keep current configuration.\n"));
return false; return false;
} }
if (new_options.failover != MANUAL_FAILOVER && new_options.failover != AUTOMATIC_FAILOVER) if (new_options.failover != MANUAL_FAILOVER && new_options.failover != AUTOMATIC_FAILOVER)
{ {
log_warning(_("\nNew value for failover is not valid. Should be MANUAL or AUTOMATIC.\n")); log_warning(_("New value for failover is not valid. Should be MANUAL or AUTOMATIC.\n"));
return false; return false;
} }
if (new_options.master_response_timeout <= 0) if (new_options.master_response_timeout <= 0)
{ {
log_warning(_("\nNew value for master_response_timeout is not valid. Should be greater than zero.\n")); log_warning(_("New value for master_response_timeout is not valid. Should be greater than zero.\n"));
return false; return false;
} }
if (new_options.reconnect_attempts < 0) if (new_options.reconnect_attempts < 0)
{ {
log_warning(_("\nNew value for reconnect_attempts is not valid. Should be greater or equal than zero.\n")); log_warning(_("New value for reconnect_attempts is not valid. Should be greater or equal than zero.\n"));
return false; return false;
} }
if (new_options.reconnect_intvl < 0) if (new_options.reconnect_intvl < 0)
{ {
log_warning(_("\nNew value for reconnect_interval is not valid. Should be greater or equal than zero.\n")); log_warning(_("New value for reconnect_interval is not valid. Should be greater or equal than zero.\n"));
return false; return false;
} }
@@ -268,7 +294,7 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
conn = establishDBConnection(new_options.conninfo, false); conn = establishDBConnection(new_options.conninfo, false);
if (!conn || (PQstatus(conn) != CONNECTION_OK)) if (!conn || (PQstatus(conn) != CONNECTION_OK))
{ {
log_warning(_("\nconninfo string is not valid, will keep current configuration.\n")); log_warning(_("conninfo string is not valid, will keep current configuration.\n"));
return false; return false;
} }
PQfinish(conn); PQfinish(conn);
@@ -283,6 +309,7 @@ reload_configuration(char *config_file, t_configuration_options *orig_options)
strcpy(orig_options->promote_command, new_options.promote_command); strcpy(orig_options->promote_command, new_options.promote_command);
strcpy(orig_options->follow_command, new_options.follow_command); strcpy(orig_options->follow_command, new_options.follow_command);
strcpy(orig_options->rsync_options, new_options.rsync_options); strcpy(orig_options->rsync_options, new_options.rsync_options);
strcpy(orig_options->ssh_options, new_options.ssh_options);
orig_options->master_response_timeout = new_options.master_response_timeout; orig_options->master_response_timeout = new_options.master_response_timeout;
orig_options->reconnect_attempts = new_options.reconnect_attempts; orig_options->reconnect_attempts = new_options.reconnect_attempts;
orig_options->reconnect_intvl = new_options.reconnect_intvl; orig_options->reconnect_intvl = new_options.reconnect_intvl;

View File

@@ -1,6 +1,6 @@
/* /*
* config.h * config.h
* Copyright (c) 2ndQuadrant, 2010-2012 * Copyright (c) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -36,11 +36,19 @@ typedef struct
char loglevel[MAXLEN]; char loglevel[MAXLEN];
char logfacility[MAXLEN]; char logfacility[MAXLEN];
char rsync_options[QUERY_STR_LEN]; char rsync_options[QUERY_STR_LEN];
char ssh_options[QUERY_STR_LEN];
int master_response_timeout; int master_response_timeout;
int reconnect_attempts; int reconnect_attempts;
int reconnect_intvl; int reconnect_intvl;
char pg_bindir[MAXLEN];
char pgctl_options[MAXLEN];
char logfile[MAXLEN];
int monitor_interval_secs;
int retry_promote_interval_secs;
} t_configuration_options; } t_configuration_options;
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", -1, -1, -1, "", "", "", 0, 0 }
void parse_config(const char *config_file, t_configuration_options *options); void parse_config(const char *config_file, t_configuration_options *options);
void parse_line(char *buff, char *name, char *value); void parse_line(char *buff, char *name, char *value);
char *trim(char *s); char *trim(char *s);

192
dbutils.c
View File

@@ -1,6 +1,6 @@
/* /*
* dbutils.c - Database connection/management functions * dbutils.c - Database connection/management functions
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -18,6 +18,8 @@
*/ */
#include <unistd.h> #include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include "repmgr.h" #include "repmgr.h"
#include "strutil.h" #include "strutil.h"
@@ -71,25 +73,22 @@ establishDBConnectionByParams(const char *keywords[], const char *values[],const
return conn; return conn;
} }
bool int
is_standby(PGconn *conn) is_standby(PGconn *conn)
{ {
PGresult *res; PGresult *res;
bool result = false; int result = 0;
res = PQexec(conn, "SELECT pg_is_in_recovery()"); res = PQexec(conn, "SELECT pg_is_in_recovery()");
if (PQresultStatus(res) != PGRES_TUPLES_OK) if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
{ {
log_err(_("Can't query server mode: %s"), log_err(_("Can't query server mode: %s"),
PQerrorMessage(conn)); PQerrorMessage(conn));
PQclear(res); result = -1;
PQfinish(conn);
exit(ERR_DB_QUERY);
} }
else if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0)
if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0) result = 1;
result = true;
PQclear(res); PQclear(res);
return result; return result;
@@ -97,11 +96,11 @@ is_standby(PGconn *conn)
bool int
is_witness(PGconn *conn, char *schema, char *cluster, int node_id) is_witness(PGconn *conn, char *schema, char *cluster, int node_id)
{ {
PGresult *res; PGresult *res;
bool result = false; int result = 0;
char sqlquery[QUERY_STR_LEN]; char sqlquery[QUERY_STR_LEN];
sqlquery_snprintf(sqlquery, "SELECT witness from %s.repl_nodes where cluster = '%s' and id = %d", sqlquery_snprintf(sqlquery, "SELECT witness from %s.repl_nodes where cluster = '%s' and id = %d",
@@ -110,13 +109,10 @@ is_witness(PGconn *conn, char *schema, char *cluster, int node_id)
if (PQresultStatus(res) != PGRES_TUPLES_OK) if (PQresultStatus(res) != PGRES_TUPLES_OK)
{ {
log_err(_("Can't query server mode: %s"), PQerrorMessage(conn)); log_err(_("Can't query server mode: %s"), PQerrorMessage(conn));
PQclear(res); result = -1;
PQfinish(conn);
exit(ERR_DB_QUERY);
} }
else if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0)
if (PQntuples(res) == 1 && strcmp(PQgetvalue(res, 0, 0), "t") == 0) result = 1;
result = true;
PQclear(res); PQclear(res);
return result; return result;
@@ -138,7 +134,7 @@ is_pgup(PGconn *conn, int timeout)
{ {
if (twice) if (twice)
return false; return false;
PQreset(conn); // reconnect PQreset(conn); /* reconnect */
twice = true; twice = true;
} }
else else
@@ -146,15 +142,16 @@ is_pgup(PGconn *conn, int timeout)
/* /*
* Send a SELECT 1 just to check if the connection is OK * Send a SELECT 1 just to check if the connection is OK
*/ */
CancelQuery(conn, timeout); if (!CancelQuery(conn, timeout))
goto failed;
if (wait_connection_availability(conn, timeout) != 1) if (wait_connection_availability(conn, timeout) != 1)
goto failed; goto failed;
sqlquery_snprintf(sqlquery, "SELECT 1"); sqlquery_snprintf(sqlquery, "SELECT 1");
if (PQsendQuery(conn, sqlquery) == 0) if (PQsendQuery(conn, sqlquery) == 0)
{ {
log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"), log_warning(_("PQsendQuery: Query could not be sent to primary. %s\n"),
PQerrorMessage(conn)); PQerrorMessage(conn));
goto failed; goto failed;
} }
if (wait_connection_availability(conn, timeout) != 1) if (wait_connection_availability(conn, timeout) != 1)
@@ -163,10 +160,10 @@ is_pgup(PGconn *conn, int timeout)
break; break;
failed: failed:
// we need to retry, because we might just have loose the connection once /* we need to retry, because we might just have loose the connection once */
if (twice) if (twice)
return false; return false;
PQreset(conn); // reconnect PQreset(conn); /* reconnect */
twice = true; twice = true;
} }
} }
@@ -197,8 +194,7 @@ pg_version(PGconn *conn, char* major_version)
log_err(_("Version check PQexec failed: %s"), log_err(_("Version check PQexec failed: %s"),
PQerrorMessage(conn)); PQerrorMessage(conn));
PQclear(res); PQclear(res);
PQfinish(conn); return NULL;
exit(ERR_DB_QUERY);
} }
major_version1 = atoi(PQgetvalue(res, 0, 0)); major_version1 = atoi(PQgetvalue(res, 0, 0));
@@ -219,12 +215,13 @@ pg_version(PGconn *conn, char* major_version)
} }
bool int
guc_setted(PGconn *conn, const char *parameter, const char *op, guc_set(PGconn *conn, const char *parameter, const char *op,
const char *value) const char *value)
{ {
PGresult *res; PGresult *res;
char sqlquery[QUERY_STR_LEN]; char sqlquery[QUERY_STR_LEN];
int retval = 1;
sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings " sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings "
" WHERE name = '%s' AND setting %s '%s'", " WHERE name = '%s' AND setting %s '%s'",
@@ -235,18 +232,49 @@ guc_setted(PGconn *conn, const char *parameter, const char *op,
{ {
log_err(_("GUC setting check PQexec failed: %s"), log_err(_("GUC setting check PQexec failed: %s"),
PQerrorMessage(conn)); PQerrorMessage(conn));
PQclear(res); retval = -1;
PQfinish(conn);
exit(ERR_DB_QUERY);
} }
if (PQntuples(res) == 0) else if (PQntuples(res) == 0)
{ {
PQclear(res); retval = 0;
return false;
} }
PQclear(res); PQclear(res);
return true; return retval;
}
/**
* Just like guc_set except with an extra parameter containing the name of
* the pg datatype so that the comparison can be done properly.
*
* Builds a query against pg_settings that selects the row whose name equals
* `parameter` and whose setting, cast to `datatype`, satisfies
* `setting <op> value` with `value` cast to the same `datatype`.
*
* `parameter`, `op`, `value` and `datatype` are formatted directly into the
* query text; no escaping is visible in this function, so callers are
* presumably expected to pass trusted strings only (NOTE(review): confirm).
*
* Returns:
*    1 if the query succeeded and returned at least one row,
*    0 if the query succeeded but returned no rows,
*   -1 if the query did not return PGRES_TUPLES_OK (the error is logged
*      with log_err).
*/
int
guc_set_typed(PGconn *conn, const char *parameter, const char *op,
const char *value, const char *datatype)
{
PGresult *res;
char sqlquery[QUERY_STR_LEN];
/* start from "condition holds"; downgraded below on no rows or on error */
int retval = 1;
sqlquery_snprintf(sqlquery, "SELECT true FROM pg_settings "
" WHERE name = '%s' AND setting::%s %s '%s'::%s",
parameter, datatype, op, value, datatype);
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("GUC setting check PQexec failed: %s"),
PQerrorMessage(conn));
retval = -1;
}
else if (PQntuples(res) == 0)
{
retval = 0;
}
/* the result is released on every path, including the error path */
PQclear(res);
return retval;
} }
@@ -254,7 +282,7 @@ const char *
get_cluster_size(PGconn *conn) get_cluster_size(PGconn *conn)
{ {
PGresult *res; PGresult *res;
const char *size; const char *size = NULL;
char sqlquery[QUERY_STR_LEN]; char sqlquery[QUERY_STR_LEN];
sqlquery_snprintf( sqlquery_snprintf(
@@ -267,11 +295,12 @@ get_cluster_size(PGconn *conn)
{ {
log_err(_("Get cluster size PQexec failed: %s"), log_err(_("Get cluster size PQexec failed: %s"),
PQerrorMessage(conn)); PQerrorMessage(conn));
PQclear(res);
PQfinish(conn);
exit(ERR_DB_QUERY);
} }
size = PQgetvalue(res, 0, 0); else
{
size = PQgetvalue(res, 0, 0);
}
PQclear(res); PQclear(res);
return size; return size;
} }
@@ -332,8 +361,7 @@ getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
log_err(_("Can't get nodes info: %s\n"), log_err(_("Can't get nodes info: %s\n"),
PQerrorMessage(standby_conn)); PQerrorMessage(standby_conn));
PQclear(res1); PQclear(res1);
PQfinish(standby_conn); return NULL;
exit(ERR_DB_QUERY);
} }
for (i = 0; i < PQntuples(res1); i++) for (i = 0; i < PQntuples(res1); i++)
@@ -396,51 +424,99 @@ getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
/* /*
* wait until current query finishes ignoring any results, this could be an async command * wait until current query finishes ignoring any results, this could be an async command
* or a cancelation of a query * or a cancelation of a query
* return 1 if Ok; 0 if any error ocurred; -1 if timeout reached * return 1 if Ok; 0 if any error ocurred; -1 if timeout reached
*/ */
int int
wait_connection_availability(PGconn *conn, int timeout) wait_connection_availability(PGconn *conn, long long timeout)
{ {
PGresult *res; PGresult *res;
fd_set read_set;
int sock = PQsocket(conn);
struct timeval tmout, before, after;
struct timezone tz;
while(timeout-- >= 0) /* recalc to microseconds */
timeout *= 1000000;
while (timeout > 0)
{ {
if (PQconsumeInput(conn) == 0) if (PQconsumeInput(conn) == 0)
{ {
log_warning(_("PQconsumeInput: Query could not be sent to primary. %s\n"), log_warning(_("wait_connection_availability: could not receive data from connection. %s\n"),
PQerrorMessage(conn)); PQerrorMessage(conn));
return 0; return 0;
} }
if (PQisBusy(conn) == 0) if (PQisBusy(conn) == 0)
{ {
res = PQgetResult(conn); do {
if (res == NULL) res = PQgetResult(conn);
break; PQclear(res);
PQclear(res); } while(res != NULL);
break;
} }
sleep(1);
tmout.tv_sec = 0;
tmout.tv_usec = 250000;
FD_ZERO(&read_set);
FD_SET(sock, &read_set);
gettimeofday(&before, &tz);
if (select(sock, &read_set, NULL, NULL, &tmout) == -1)
{
log_warning(
_("wait_connection_availability: select() returned with error: %s"),
strerror(errno));
return -1;
}
gettimeofday(&after, &tz);
timeout -= (after.tv_sec * 1000000 + after.tv_usec) -
(before.tv_sec * 1000000 + before.tv_usec);
} }
if (timeout >= 0) if (timeout >= 0)
{
return 1; return 1;
else }
return -1;
log_warning(_("wait_connection_availability: timeout reached"));
return -1;
} }
void bool
CancelQuery(PGconn *conn, int timeout) CancelQuery(PGconn *conn, int timeout)
{ {
char errbuf[ERRBUFF_SIZE]; char errbuf[ERRBUFF_SIZE];
PGcancel *pgcancel; PGcancel *pgcancel;
wait_connection_availability(conn, timeout); if (wait_connection_availability(conn, timeout) != 1)
return false;
pgcancel = PQgetCancel(conn); pgcancel = PQgetCancel(conn);
if (!pgcancel || PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0) if (pgcancel == NULL)
return false;
/*
* PQcancel can only return 0 if socket()/connect()/send()
* fails, in any of those cases we can assume something
* bad happened to the connection
*/
if (PQcancel(pgcancel, errbuf, ERRBUFF_SIZE) == 0)
{
log_warning(_("Can't stop current query: %s\n"), errbuf); log_warning(_("Can't stop current query: %s\n"), errbuf);
PQfreeCancel(pgcancel);
return false;
}
PQfreeCancel(pgcancel); PQfreeCancel(pgcancel);
return true;
} }

View File

@@ -1,6 +1,6 @@
/* /*
* dbutils.h * dbutils.h
* Copyright (c) 2ndQuadrant, 2010-2012 * Copyright (c) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -26,16 +26,19 @@ PGconn *establishDBConnection(const char *conninfo, const bool exit_on_error);
PGconn *establishDBConnectionByParams(const char *keywords[], PGconn *establishDBConnectionByParams(const char *keywords[],
const char *values[], const char *values[],
const bool exit_on_error); const bool exit_on_error);
bool is_standby(PGconn *conn); int is_standby(PGconn *conn);
bool is_witness(PGconn *conn, char *schema, char *cluster, int node_id); int is_witness(PGconn *conn, char *schema, char *cluster, int node_id);
bool is_pgup(PGconn *conn, int timeout); bool is_pgup(PGconn *conn, int timeout);
char *pg_version(PGconn *conn, char* major_version); char *pg_version(PGconn *conn, char* major_version);
bool guc_setted(PGconn *conn, const char *parameter, const char *op, int guc_set(PGconn *conn, const char *parameter, const char *op,
const char *value); const char *value);
int guc_set_typed(PGconn *conn, const char *parameter, const char *op,
const char *value, const char *datatype);
const char *get_cluster_size(PGconn *conn); const char *get_cluster_size(PGconn *conn);
PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster, PGconn *getMasterConnection(PGconn *standby_conn, char *schema, char *cluster,
int *master_id, char *master_conninfo_out); int *master_id, char *master_conninfo_out);
int wait_connection_availability(PGconn *conn, int timeout); int wait_connection_availability(PGconn *conn, long long timeout);
void CancelQuery(PGconn *conn, int timeout); bool CancelQuery(PGconn *conn, int timeout);
#endif #endif

View File

@@ -1,9 +1,9 @@
Package: repmgr-auto Package: repmgr-auto
Version: 1.0-1 Version: 2.0beta2
Section: database Section: database
Priority: optional Priority: optional
Architecture: all Architecture: all
Depends: rsync, postgresql-9.0 Depends: rsync, postgresql-9.0 | postgresql-9.1 | postgresql-9.2 | postgresql-9.3
Maintainer: Greg Smith <greg@2ndQuadrant.com> Maintainer: Jaime Casanova <jaime@2ndQuadrant.com>
Description: PostgreSQL replication setup, magament and monitoring Description: PostgreSQL replication setup, magament and monitoring
has two main executables has two main executables

18
debian/repmgr.repmgrd.default vendored Normal file
View File

@@ -0,0 +1,18 @@
# default settings for repmgrd. This file is sourced by /bin/sh from
# /etc/init.d/repmgrd
# disable repmgrd by default so it won't get started upon installation
# valid values: yes/no
REPMGRD_ENABLED=no
# Options for repmgrd (required)
#REPMGRD_OPTS="--config_file /path/to/repmgr.conf"
# User to run repmgrd as
#REPMGRD_USER=postgres
# repmgrd binary
#REPMGRD_BIN=/usr/bin/repmgrd
# pid file
#REPMGRD_PIDFILE=/var/run/repmgrd.pid

101
debian/repmgr.repmgrd.init vendored Normal file
View File

@@ -0,0 +1,101 @@
#!/bin/sh
### BEGIN INIT INFO
# Provides: repmgrd
# Required-Start: $local_fs $remote_fs $network $syslog postgresql
# Required-Stop: $local_fs $remote_fs $network $syslog postgresql
# Should-Start: $syslog postgresql
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Start/stop repmgrd
# Description: Enable repmgrd replication management and monitoring daemon for PostgreSQL
### END INIT INFO

set -e

DESC="PostgreSQL replication management and monitoring daemon"
NAME=repmgrd
SCRIPTNAME=/etc/init.d/$NAME

# Built-in defaults; each may be overridden in /etc/default/$NAME.
REPMGRD_ENABLED=no
REPMGRD_OPTS=
REPMGRD_USER=postgres
REPMGRD_BIN=/usr/bin/repmgrd
REPMGRD_PIDFILE=/var/run/repmgrd.pid

# Read configuration variable file if it is present
[ -r /etc/default/$NAME ] && . /etc/default/$NAME

# Nothing to do if the daemon binary is not installed.
test -x "$REPMGRD_BIN" || exit 0

# Only continue when the administrator explicitly enabled the daemon.
case "$REPMGRD_ENABLED" in
	[Yy]*)
		# "break" is only valid inside a loop; ":" is the portable no-op.
		:
		;;
	*)
		exit 0
		;;
esac

# Define LSB log_* functions.
. /lib/lsb/init-functions

if [ -z "$REPMGRD_OPTS" ]
then
	log_warning_msg "Not starting $NAME, REPMGRD_OPTS not set in /etc/default/$NAME"
	exit 0
fi

do_start()
{
	# Return
	# 0 if daemon has been started
	# 1 if daemon was already running
	# other if daemon could not be started or a failure occurred
	#
	# $REPMGRD_OPTS is intentionally unquoted so that it is split into
	# separate command line arguments.
	start-stop-daemon --start --quiet --chuid "$REPMGRD_USER" --make-pidfile --pidfile "$REPMGRD_PIDFILE" --exec "$REPMGRD_BIN" -- $REPMGRD_OPTS
}

do_stop()
{
	# Return
	# 0 if daemon has been stopped
	# 1 if daemon was already stopped
	# other if daemon could not be stopped or a failure occurred
	start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 --pidfile "$REPMGRD_PIDFILE" --exec "$REPMGRD_BIN"
}

case "$1" in
	start)
		log_daemon_msg "Starting $DESC" "$NAME"
		# Capture the status with "||": under "set -e" a bare failing
		# do_start would end the script before the status is inspected.
		rc=0
		do_start || rc=$?
		case "$rc" in
			0) log_end_msg 0 ;;
			1) log_progress_msg "already started"
			   log_end_msg 0 ;;
			*) log_end_msg 1 || true
			   exit 1 ;;
		esac
		;;
	stop)
		log_daemon_msg "Stopping $DESC" "$NAME"
		rc=0
		do_stop || rc=$?
		case "$rc" in
			0) log_end_msg 0 ;;
			1) log_progress_msg "already stopped"
			   log_end_msg 0 ;;
			*) log_end_msg 1 || true
			   exit 1 ;;
		esac
		;;
	restart|force-reload)
		"$0" stop
		"$0" start
		;;
	status)
		status_of_proc -p "$REPMGRD_PIDFILE" "$REPMGRD_BIN" "$NAME" && exit 0 || exit $?
		;;
	*)
		echo "Usage: $SCRIPTNAME {start|stop|restart|force-reload|status}" >&2
		exit 3
		;;
esac

exit 0

View File

@@ -1,6 +1,6 @@
/* /*
* errcode.h * errcode.h
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -35,5 +35,6 @@
#define ERR_STR_OVERFLOW 10 #define ERR_STR_OVERFLOW 10
#define ERR_FAILOVER_FAIL 11 #define ERR_FAILOVER_FAIL 11
#define ERR_BAD_SSH 12 #define ERR_BAD_SSH 12
#define ERR_SYS_FAILURE 13
#endif /* _ERRCODE_H_ */ #endif /* _ERRCODE_H_ */

45
log.c
View File

@@ -1,6 +1,6 @@
/* /*
* log.c - Logging methods * log.c - Logging methods
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
* This module is a set of methods for logging (currently only syslog) * This module is a set of methods for logging (currently only syslog)
* *
@@ -25,9 +25,11 @@
#ifdef HAVE_SYSLOG #ifdef HAVE_SYSLOG
#include <syslog.h> #include <syslog.h>
#include <stdarg.h>
#endif #endif
#include <stdarg.h>
#include <time.h>
#include "log.h" #include "log.h"
#define DEFAULT_IDENT "repmgr" #define DEFAULT_IDENT "repmgr"
@@ -37,13 +39,38 @@
/* #define REPMGR_DEBUG */ /* #define REPMGR_DEBUG */
/*
 * Write a printf-style message to stderr, prefixed with a local timestamp
 * and the given level name ("[YYYY-MM-DD HH:MM:SS] [LEVEL] message").
 *
 * level_name: label printed between brackets after the timestamp
 * level:      the message is emitted only if log_level >= level
 * fmt, ...:   printf-style format string and its arguments
 *
 * The prefix and the caller's format are merged into one format string so
 * the whole line goes out through a single vfprintf() call; stderr is
 * flushed afterwards.
 */
void
stderr_log_with_level(const char *level_name, int level, const char *fmt, ...)
{
	time_t		now;
	struct tm  *tm;
	char		stamp[100];
	va_list		ap;

	/* Filter first so that suppressed messages do no work at all. */
	if (log_level < level)
		return;

	time(&now);
	tm = localtime(&now);

	/*
	 * localtime() may return NULL and strftime() may return 0 (leaving the
	 * buffer contents unspecified); fall back to a fixed placeholder
	 * instead of using an invalid timestamp.
	 */
	if (tm == NULL ||
		strftime(stamp, sizeof(stamp), "[%Y-%m-%d %H:%M:%S]", tm) == 0)
		snprintf(stamp, sizeof(stamp), "[unknown time]");

	{
		/* room for the caller's format plus timestamp and level prefix */
		size_t		fmt1_len = strlen(fmt) + 150;
		char		fmt1[fmt1_len];

		snprintf(fmt1, fmt1_len, "%s [%s] %s", stamp, level_name, fmt);

		va_start(ap, fmt);
		vfprintf(stderr, fmt1, ap);
		va_end(ap);
	}

	fflush(stderr);
}
static int detect_log_level(const char* level); static int detect_log_level(const char* level);
static int detect_log_facility(const char* facility); static int detect_log_facility(const char* facility);
int log_type = REPMGR_STDERR; int log_type = REPMGR_STDERR;
int log_level = LOG_NOTICE; int log_level = LOG_NOTICE;
bool logger_init(const char* ident, const char* level, const char* facility) bool logger_init(t_configuration_options *opts, const char* ident, const char* level, const char* facility)
{ {
int l; int l;
@@ -115,6 +142,18 @@ bool logger_init(const char* ident, const char* level, const char* facility)
#endif #endif
if (*opts->logfile)
{
FILE *fd;
fd = freopen(opts->logfile, "a", stderr);
if (fd == NULL)
{
fprintf(stderr, "error reopening stderr to '%s': %s",
opts->logfile, strerror(errno));
}
}
return true; return true;
} }

22
log.h
View File

@@ -1,6 +1,6 @@
/* /*
* log.h * log.h
* Copyright (c) 2ndQuadrant, 2010-2012 * Copyright (c) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -25,15 +25,17 @@
#define REPMGR_SYSLOG 1 #define REPMGR_SYSLOG 1
#define REPMGR_STDERR 2 #define REPMGR_STDERR 2
void stderr_log_with_level(const char *level_name, int level, const char *fmt, ...) __attribute__ ((format (PG_PRINTF_ATTRIBUTE, 3, 4)));
/* Standard error logging */ /* Standard error logging */
#define stderr_log_debug(...) if (log_level >= LOG_DEBUG) fprintf(stderr, __VA_ARGS__) #define stderr_log_debug(...) stderr_log_with_level("DEBUG", LOG_DEBUG, __VA_ARGS__)
#define stderr_log_info(...) if (log_level >= LOG_INFO) fprintf(stderr, __VA_ARGS__) #define stderr_log_info(...) stderr_log_with_level("INFO", LOG_INFO, __VA_ARGS__)
#define stderr_log_notice(...) if (log_level >= LOG_NOTICE) fprintf(stderr, __VA_ARGS__) #define stderr_log_notice(...) stderr_log_with_level("NOTICE", LOG_NOTICE, __VA_ARGS__)
#define stderr_log_warning(...) if (log_level >= LOG_WARNING) fprintf(stderr, __VA_ARGS__) #define stderr_log_warning(...) stderr_log_with_level("WARNING", LOG_WARNING, __VA_ARGS__)
#define stderr_log_err(...) if (log_level >= LOG_ERR) fprintf(stderr, __VA_ARGS__) #define stderr_log_err(...) stderr_log_with_level("ERROR", LOG_ERR, __VA_ARGS__)
#define stderr_log_crit(...) if (log_level >= LOG_CRIT) fprintf(stderr, __VA_ARGS__) #define stderr_log_crit(...) stderr_log_with_level("CRITICAL", LOG_CRIT, __VA_ARGS__)
#define stderr_log_alert(...) if (log_level >= LOG_ALERT) fprintf(stderr, __VA_ARGS__) #define stderr_log_alert(...) stderr_log_with_level("ALERT", LOG_ALERT, __VA_ARGS__)
#define stderr_log_emerg(...) if (log_level >= LOG_EMERG) fprintf(stderr, __VA_ARGS__) #define stderr_log_emerg(...) stderr_log_with_level("EMERGENCY", LOG_EMERG, __VA_ARGS__)
#ifdef HAVE_SYSLOG #ifdef HAVE_SYSLOG
@@ -112,7 +114,7 @@
/* Logger initialisation and shutdown */ /* Logger initialisation and shutdown */
bool logger_shutdown(void); bool logger_shutdown(void);
bool logger_init(const char* ident, const char* level, const char* facility); bool logger_init(t_configuration_options *opts, const char* ident, const char* level, const char* facility);
void logger_min_verbose(int minimum); void logger_min_verbose(int minimum);
extern int log_type; extern int log_type;

550
repmgr.c

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,8 @@ node_name=standby2
# Connection information # Connection information
conninfo='host=192.168.204.104' conninfo='host=192.168.204.104'
rsync_options=--archive --checksum --compress --progress --rsh=ssh rsync_options=--archive --checksum --compress --progress --rsh="ssh -o \"StrictHostKeyChecking no\""
ssh_options=-o "StrictHostKeyChecking no"
# How many seconds we wait for master response before declaring master failure # How many seconds we wait for master response before declaring master failure
master_response_timeout=60 master_response_timeout=60
@@ -21,10 +22,10 @@ reconnect_attempts=6
reconnect_interval=10 reconnect_interval=10
# Autofailover options # Autofailover options
failover=automatic failover=manual
priority=-1 priority=-1
promote_command='repmgr promote' promote_command='repmgr standby promote -f /path/to/repmgr.conf'
follow_command='repmgr follow' follow_command='repmgr standby follow -f /path/to/repmgr.conf -W'
# Log level: possible values are DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG # Log level: possible values are DEBUG, INFO, NOTICE, WARNING, ERR, ALERT, CRIT or EMERG
# Default: NOTICE # Default: NOTICE
@@ -33,3 +34,29 @@ loglevel=NOTICE
# Logging facility: possible values are STDERR or - for Syslog integration - one of LOCAL0, LOCAL1, ..., LOCAL7, USER # Logging facility: possible values are STDERR or - for Syslog integration - one of LOCAL0, LOCAL1, ..., LOCAL7, USER
# Default: STDERR # Default: STDERR
logfacility=STDERR logfacility=STDERR
# path to pg_ctl executable
pg_bindir=/usr/bin/
#
# you may add command line arguments for pg_ctl
#
# pg_ctl_options='-s'
#
# redirect stderr to a logfile
#
# logfile='/var/log/repmgr.log'
#
# change monitoring interval; default is 2s
#
# monitor_interval_secs=2
#
# change wait time for master; before we bail out and exit when the
# master disappears, we wait 6 * retry_promote_interval_secs seconds;
# by default this would be half an hour (since the default value of
# retry_promote_interval_secs is 300)
#
# retry_promote_interval_secs=300

View File

@@ -1,6 +1,6 @@
/* /*
* repmgr.h * repmgr.h
* Copyright (c) 2ndQuadrant, 2010-2012 * Copyright (c) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -59,6 +59,7 @@ typedef struct
char wal_keep_segments[MAXLEN]; char wal_keep_segments[MAXLEN];
bool verbose; bool verbose;
bool force; bool force;
bool wait_for_master;
bool ignore_rsync_warn; bool ignore_rsync_warn;
char masterport[MAXLEN]; char masterport[MAXLEN];
@@ -68,6 +69,6 @@ typedef struct
int keep_history; int keep_history;
} t_runtime_options; } t_runtime_options;
#define SLEEP_MONITOR 2 #define T_RUNTIME_OPTIONS_INITIALIZER { "", "", "", "", "", "", DEFAULT_WAL_KEEP_SEGMENTS, false, false, false, false, "", "", 0 }
#endif #endif

View File

@@ -1,7 +1,7 @@
/* /*
* repmgr.sql * repmgr.sql
* *
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
*/ */

998
repmgrd.c

File diff suppressed because it is too large Load Diff

View File

@@ -9,7 +9,8 @@ DATA=uninstall_repmgr_funcs.sql
OBJS=repmgr_funcs.o OBJS=repmgr_funcs.o
ifdef USE_PGXS ifdef USE_PGXS
PGXS := $(shell pg_config --pgxs) PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS) include $(PGXS)
else else
subdir = contrib/repmgr/sql subdir = contrib/repmgr/sql

View File

@@ -15,6 +15,7 @@
#include "storage/shmem.h" #include "storage/shmem.h"
#include "storage/spin.h" #include "storage/spin.h"
#include "utils/builtins.h" #include "utils/builtins.h"
#include "utils/timestamp.h"
/* same definition as the one in xlog_internal.h */ /* same definition as the one in xlog_internal.h */
#define MAXFNAMELEN 64 #define MAXFNAMELEN 64
@@ -28,6 +29,7 @@ typedef struct repmgrSharedState
{ {
LWLockId lock; /* protects search/modification */ LWLockId lock; /* protects search/modification */
char location[MAXFNAMELEN]; /* last known xlog location */ char location[MAXFNAMELEN]; /* last known xlog location */
TimestampTz last_updated;
} repmgrSharedState; } repmgrSharedState;
/* Links to shared memory state */ /* Links to shared memory state */
@@ -49,6 +51,12 @@ Datum repmgr_get_last_standby_location(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(repmgr_update_standby_location); PG_FUNCTION_INFO_V1(repmgr_update_standby_location);
PG_FUNCTION_INFO_V1(repmgr_get_last_standby_location); PG_FUNCTION_INFO_V1(repmgr_get_last_standby_location);
Datum repmgr_update_last_updated(PG_FUNCTION_ARGS);
Datum repmgr_get_last_updated(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(repmgr_update_last_updated);
PG_FUNCTION_INFO_V1(repmgr_get_last_updated);
/* /*
* Module load callback * Module load callback
@@ -187,3 +195,38 @@ repmgr_update_standby_location(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(repmgr_set_standby_location(locationstr)); PG_RETURN_BOOL(repmgr_set_standby_location(locationstr));
} }
/* update and return last updated with current timestamp */
Datum
repmgr_update_last_updated(PG_FUNCTION_ARGS)
{
TimestampTz last_updated = GetCurrentTimestamp();
/* Safety check... */
if (!shared_state)
PG_RETURN_NULL();
LWLockAcquire(shared_state->lock, LW_SHARED);
shared_state->last_updated = last_updated;
LWLockRelease(shared_state->lock);
PG_RETURN_TIMESTAMPTZ(last_updated);
}
/* get last updated timestamp */
Datum
repmgr_get_last_updated(PG_FUNCTION_ARGS)
{
TimestampTz last_updated;
/* Safety check... */
if (!shared_state)
PG_RETURN_NULL();
LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
last_updated = shared_state->last_updated;
LWLockRelease(shared_state->lock);
PG_RETURN_TIMESTAMPTZ(last_updated);
}

View File

@@ -1,6 +1,6 @@
/* /*
* repmgr_function.sql * repmgr_function.sql
* Copyright (c) 2ndQuadrant, 2010 * Copyright (c) 2ndQuadrant, 2010-2014
* *
*/ */
@@ -13,3 +13,11 @@ LANGUAGE C STRICT;
CREATE FUNCTION repmgr_get_last_standby_location() RETURNS text CREATE FUNCTION repmgr_get_last_standby_location() RETURNS text
AS 'MODULE_PATHNAME', 'repmgr_get_last_standby_location' AS 'MODULE_PATHNAME', 'repmgr_get_last_standby_location'
LANGUAGE C STRICT; LANGUAGE C STRICT;
CREATE FUNCTION repmgr_update_last_updated() RETURNS TIMESTAMP WITH TIME ZONE
AS 'MODULE_PATHNAME', 'repmgr_update_last_updated'
LANGUAGE C STRICT;
CREATE FUNCTION repmgr_get_last_updated() RETURNS TIMESTAMP WITH TIME ZONE
AS 'MODULE_PATHNAME', 'repmgr_get_last_updated'
LANGUAGE C STRICT;

View File

@@ -1,2 +1,11 @@
/*
* uninstall_repmgr_funcs.sql
* Copyright (c) 2ndQuadrant, 2010-2014
*
*/
DROP FUNCTION repmgr_update_standby_location(text); DROP FUNCTION repmgr_update_standby_location(text);
DROP FUNCTION repmgr_get_last_standby_location(); DROP FUNCTION repmgr_get_last_standby_location();
DROP FUNCTION repmgr_update_last_updated();
DROP FUNCTION repmgr_get_last_updated();

View File

@@ -1,7 +1,7 @@
/* /*
* strutil.c * strutil.c
* *
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
@@ -25,7 +25,7 @@
#include "log.h" #include "log.h"
#include "strutil.h" #include "strutil.h"
static int xvsnprintf(char *str, size_t size, const char *format, va_list ap); static int xvsnprintf(char *str, size_t size, const char *format, va_list ap) __attribute__ ((format (PG_PRINTF_ATTRIBUTE, 3, 0)));
/* Add strnlen on platforms that don't have it, like OS X */ /* Add strnlen on platforms that don't have it, like OS X */
#ifndef strnlen #ifndef strnlen
@@ -44,7 +44,7 @@ xvsnprintf(char *str, size_t size, const char *format, va_list ap)
retval = vsnprintf(str, size, format, ap); retval = vsnprintf(str, size, format, ap);
if (retval >= size) if (retval >= (int)size)
{ {
log_err(_("Buffer of size not large enough to format entire string '%s'\n"), log_err(_("Buffer of size not large enough to format entire string '%s'\n"),
str); str);

View File

@@ -1,6 +1,6 @@
/* /*
* strutil.h * strutil.h
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
* *
* This program is free software: you can redistribute it and/or modify * This program is free software: you can redistribute it and/or modify
@@ -31,9 +31,9 @@
#define MAXCONNINFO 1024 #define MAXCONNINFO 1024
extern int xsnprintf(char *str, size_t size, const char *format, ...); extern int xsnprintf(char *str, size_t size, const char *format, ...) __attribute__ ((format (PG_PRINTF_ATTRIBUTE, 3, 4)));
extern int sqlquery_snprintf(char *str, const char *format, ...); extern int sqlquery_snprintf(char *str, const char *format, ...) __attribute__ ((format (PG_PRINTF_ATTRIBUTE, 2, 3)));
extern int maxlen_snprintf(char *str, const char *format, ...); extern int maxlen_snprintf(char *str, const char *format, ...) __attribute__ ((format (PG_PRINTF_ATTRIBUTE, 2, 3)));
/* Add strnlen on platforms that don't have it, like OS X */ /* Add strnlen on platforms that don't have it, like OS X */
#ifndef strnlen #ifndef strnlen

View File

@@ -1,7 +1,7 @@
/* /*
* uninstall_repmgr.sql * uninstall_repmgr.sql
* *
* Copyright (C) 2ndQuadrant, 2010-2012 * Copyright (C) 2ndQuadrant, 2010-2014
* *
*/ */

View File

@@ -1,4 +1,5 @@
#ifndef _VERSION_H_ #ifndef _VERSION_H_
#define _VERSION_H_ #define _VERSION_H_
#define REPMGR_VERSION "2.0beta1"
#define REPMGR_VERSION "2.0"
#endif #endif