mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-23 15:16:29 +00:00
Compare commits
16 Commits
dev/gianni
...
v4.2.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0cafeb3828 | ||
|
|
79e79bd5f2 | ||
|
|
a4e21fd8fe | ||
|
|
e826f72312 | ||
|
|
1e8b3313ee | ||
|
|
b5772d88dd | ||
|
|
22614573b9 | ||
|
|
77c9092794 | ||
|
|
15bbe04a6f | ||
|
|
0842560a88 | ||
|
|
8bec4946bc | ||
|
|
3ab22f9442 | ||
|
|
3a9c36a36c | ||
|
|
2ded8987ac | ||
|
|
6311f3f30a | ||
|
|
12ec6c7abc |
3
HISTORY
3
HISTORY
@@ -5,8 +5,9 @@
|
|||||||
repmgr: report unreachable nodes when running "repmgr cluster (matrix|crosscheck);
|
repmgr: report unreachable nodes when running "repmgr cluster (matrix|crosscheck);
|
||||||
GitHub #246 (Ian)
|
GitHub #246 (Ian)
|
||||||
repmgr: add configuration file parameter "repmgr_bindir"; GitHub #246 (Ian)
|
repmgr: add configuration file parameter "repmgr_bindir"; GitHub #246 (Ian)
|
||||||
repmgr: fix "Missing replication slots" label in "node check"; GitHub #507 (Ian)
|
repmgr: fix "Missing replication slots" label in "node check"; GitHub #507 (Ian)
|
||||||
repmgrd: fix parsing of -d/--daemonize option (Ian)
|
repmgrd: fix parsing of -d/--daemonize option (Ian)
|
||||||
|
repmgrd: support "pausing" of repmgrd (Ian)
|
||||||
|
|
||||||
4.1.1 2018-09-05
|
4.1.1 2018-09-05
|
||||||
logging: explicitly log the text of failed queries as ERRORs to
|
logging: explicitly log the text of failed queries as ERRORs to
|
||||||
|
|||||||
@@ -30,13 +30,18 @@ all: \
|
|||||||
PG_CPPFLAGS = -std=gnu89 -I$(includedir_internal) -I$(libpq_srcdir) -Wall -Wmissing-prototypes -Wmissing-declarations $(EXTRA_CFLAGS)
|
PG_CPPFLAGS = -std=gnu89 -I$(includedir_internal) -I$(libpq_srcdir) -Wall -Wmissing-prototypes -Wmissing-declarations $(EXTRA_CFLAGS)
|
||||||
SHLIB_LINK = $(libpq)
|
SHLIB_LINK = $(libpq)
|
||||||
|
|
||||||
HEADERS = $(wildcard *.h)
|
|
||||||
|
|
||||||
OBJS = \
|
OBJS = \
|
||||||
repmgr.o
|
repmgr.o
|
||||||
|
|
||||||
include Makefile.global
|
include Makefile.global
|
||||||
|
|
||||||
|
ifeq ($(vpath_build),yes)
|
||||||
|
HEADERS = $(wildcard *.h)
|
||||||
|
else
|
||||||
|
HEADERS_built = $(wildcard *.h)
|
||||||
|
endif
|
||||||
|
|
||||||
$(info Building against PostgreSQL $(MAJORVERSION))
|
$(info Building against PostgreSQL $(MAJORVERSION))
|
||||||
|
|
||||||
|
|||||||
@@ -1531,6 +1531,9 @@ parse_bool(const char *s, const char *config_item, ItemList *error_list)
|
|||||||
{
|
{
|
||||||
PQExpBufferData errors;
|
PQExpBufferData errors;
|
||||||
|
|
||||||
|
if (s == NULL)
|
||||||
|
return true;
|
||||||
|
|
||||||
if (strcasecmp(s, "0") == 0)
|
if (strcasecmp(s, "0") == 0)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
|||||||
@@ -96,10 +96,44 @@
|
|||||||
</para>
|
</para>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|
||||||
|
|
||||||
|
<sect2>
|
||||||
|
<title>repmgrd enhancements</title>
|
||||||
|
<para>
|
||||||
|
<itemizedlist>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
<application>repmgrd</application> can now be "paused", i.e. instructed
|
||||||
|
not to take any action such as a failover, even if the prerequisites for such an
|
||||||
|
action are detected.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
This removes the need to stop <application>repmgrd</application> on all nodes when
|
||||||
|
performing a planned operation such as a switchover.
|
||||||
|
</para>
|
||||||
|
<para>
|
||||||
|
For further details, see <link linkend="repmgrd-pausing">Pausing repmgrd</link>.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
</sect2>
|
||||||
|
|
||||||
<sect2>
|
<sect2>
|
||||||
<title>Bug fixes</title>
|
<title>Bug fixes</title>
|
||||||
<para>
|
<para>
|
||||||
<itemizedlist>
|
<itemizedlist>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
&repmgr;: fix "Missing replication slots" label in
|
||||||
|
<command><link linkend="repmgr-node-check">repmgr node check</link></command>. (GitHub #507)
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<para>
|
<para>
|
||||||
<application>repmgrd</application>: fix parsing of <option>-d/--daemonize</option> option.
|
<application>repmgrd</application>: fix parsing of <option>-d/--daemonize</option> option.
|
||||||
|
|||||||
@@ -17,15 +17,15 @@
|
|||||||
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
|
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
<para>
|
||||||
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> to control the PostgreSQL
|
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> utility to control the PostgreSQL
|
||||||
server. However this can lead to various problems, particularly when PostgreSQL has been
|
server. However this can lead to various problems, particularly when PostgreSQL has been
|
||||||
installed from packages, and expecially so if <application>systemd</application> is in use.
|
installed from packages, and especially so if <application>systemd</application> is in use.
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
|
||||||
<note>
|
<note>
|
||||||
<para>
|
<para>
|
||||||
If using <application>systemd</application>, ensure you have <varname>RemoteIPC</varname> set to <literal>off</literal>.
|
If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
|
||||||
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
|
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
|
||||||
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
|
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
|
||||||
</para>
|
</para>
|
||||||
|
|||||||
@@ -7,9 +7,9 @@
|
|||||||
<title>Upgrading repmgr</title>
|
<title>Upgrading repmgr</title>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
&repmgr; is updated regularly with point releases (e.g. 4.0.1 to 4.0.2)
|
&repmgr; is updated regularly with minor releases (e.g. 4.0.1 to 4.0.2)
|
||||||
containing bugfixes and other minor improvements. Any substantial new
|
containing bugfixes and other minor improvements. Any substantial new
|
||||||
functionality will be included in a feature release (e.g. 4.0.x to 4.1.x).
|
functionality will be included in a major release (e.g. 4.0 to 4.1).
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<sect1 id="upgrading-repmgr-extension" xreflabel="Upgrading repmgr 4.x and later">
|
<sect1 id="upgrading-repmgr-extension" xreflabel="Upgrading repmgr 4.x and later">
|
||||||
@@ -19,83 +19,110 @@
|
|||||||
</indexterm>
|
</indexterm>
|
||||||
<title>Upgrading repmgr 4.x and later</title>
|
<title>Upgrading repmgr 4.x and later</title>
|
||||||
<para>
|
<para>
|
||||||
&repmgr; 4.x is implemented as a PostgreSQL extension; normally the upgrade consists
|
From version 4, &repmgr; consists of three elements:
|
||||||
of the following steps:
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
<orderedlist>
|
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
Stop <application>repmgrd</application> (if in use) on all nodes where it is running
|
the <application>repmgr</application> and <application>repmgrd</application> executables
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
Install the updated package (or compile the updated source)
|
the objects for the &repmgr; PostgreSQL extension (SQL files for creating/updating
|
||||||
</simpara>
|
repmgr metadata, and the extension control file)
|
||||||
</listitem>
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
For major releases, e.g. from <literal>4.0.x</literal> to <literal>4.1</literal>,
|
the shared library module used by <application>repmgrd</application> which
|
||||||
where the <literal>repmgr</literal> shared object library has been updated,
|
is resident in the PostgreSQL backend
|
||||||
restart PostgreSQL.
|
</simpara>
|
||||||
</simpara>
|
</listitem>
|
||||||
</listitem>
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
<listitem>
|
<para>
|
||||||
<simpara>
|
With <emphasis>minor releases</emphasis>, usually changes are only made to the <application>repmgr</application>
|
||||||
For major releases, e.g. from <literal>4.0.x</literal> to <literal>4.1</literal>,
|
and <application>repmgrd</application> executables. In this case, the upgrade is quite straightforward,
|
||||||
execute <command>ALTER EXTENSION repmgr UPDATE</command>
|
and is simply a case of installing the new version, and restarting <application>repmgrd</application>
|
||||||
on the primary node in the database where the &repmgr; extension is installed.
|
(if running).
|
||||||
</simpara>
|
|
||||||
<simpara>
|
|
||||||
This will update the extension metadata and, if necessary, apply
|
|
||||||
changes to the &repmgr; extension objects.
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
<listitem>
|
|
||||||
<simpara>
|
|
||||||
Start <application>repmgrd</application> (if in use).
|
|
||||||
</simpara>
|
|
||||||
</listitem>
|
|
||||||
|
|
||||||
</orderedlist>
|
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
Always check the <link linkend="appendix-release-notes">release notes</link> for every
|
For <emphasis>major releases</emphasis>, the &repmgr; PostgreSQL extension will need to be updated
|
||||||
release as they may contain upgrade instructions particular to individual versions.
|
to the latest version. Additionally, if the shared library module has been updated (this is sometimes,
|
||||||
|
but not always the case), PostgreSQL itself will need to be restarted on each node.
|
||||||
</para>
|
</para>
|
||||||
|
<important>
|
||||||
|
<para>
|
||||||
|
Always check the <link linkend="appendix-release-notes">release notes</link> for every
|
||||||
|
release as they may contain upgrade instructions particular to individual versions.
|
||||||
|
</para>
|
||||||
|
</important>
|
||||||
|
|
||||||
<para>
|
<sect2 id="upgrading-minor-version" xreflabel="Upgrading a minor version release">
|
||||||
Note that it may be necessary to restart the PostgreSQL server if the upgrade contains
|
|
||||||
changes to the shared object file used by <application>repmgrd</application>; check the
|
|
||||||
<link linkend="appendix-release-notes">release notes</link> for details.
|
|
||||||
</para>
|
|
||||||
|
|
||||||
<sect2 id="upgrading-replication-cluster" xreflabel="Upgrading a replication cluster">
|
|
||||||
<indexterm>
|
<indexterm>
|
||||||
<primary>upgrading</primary>
|
<primary>upgrading</primary>
|
||||||
<secondary>repmgr 4.x and later</secondary>
|
<secondary>minor release</secondary>
|
||||||
</indexterm>
|
</indexterm>
|
||||||
<title>Upgrading a replication cluster</title>
|
<title>Upgrading a minor version release</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
The process for installing minor version upgrades is quite straightforward:
|
||||||
|
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
install the new &repmgr; version
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
restart <application>repmgrd</application> on all nodes where it is running
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
</itemizedlist>
|
||||||
|
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
|
<para>
|
||||||
|
Some packaging systems (e.g. <link linkend="packages-debian-ubuntu">Debian/Ubuntu</link>)
|
||||||
|
may restart <application>repmgrd</application> as part of the package upgrade process.
|
||||||
|
</para>
|
||||||
|
</note>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
The same &repmgr; "major version" (e.g. <literal>4.2</literal>) must be
|
Minor version upgrades can be performed in any order on the nodes in the replication
|
||||||
installed on all nodes in the replication cluster. While it's possible to have differing
|
cluster.
|
||||||
&repmgr; "minor versions" (e.g. <literal>4.2.1</literal>) on different nodes,
|
|
||||||
we strongly recommend updating all nodes to the latest minor version.
|
|
||||||
</para>
|
</para>
|
||||||
<note>
|
|
||||||
|
<para>
|
||||||
|
A PostgreSQL restart is <emphasis>not</emphasis> required for minor version upgrades.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<note>
|
||||||
<para>
|
<para>
|
||||||
Minor version upgrades can be performed in any order on the nodes in the replicaiton
|
The same &repmgr; "major version" (e.g. <literal>4.2</literal>) must be
|
||||||
cluster. In general it makes sense to start on the primary.
|
installed on all nodes in the replication cluster. While it's possible to have differing
|
||||||
|
&repmgr; "minor versions" (e.g. <literal>4.2.1</literal>) on different nodes,
|
||||||
|
we strongly recommend updating all nodes to the latest minor version.
|
||||||
</para>
|
</para>
|
||||||
<para>
|
</note>
|
||||||
A PostgreSQL restart is <emphasis>not</emphasis> required for minor version upgrades.
|
|
||||||
</para>
|
</sect2>
|
||||||
</note>
|
|
||||||
|
<sect2 id="upgrading-major-version" xreflabel="Upgrading a major version release">
|
||||||
|
<indexterm>
|
||||||
|
<primary>upgrading</primary>
|
||||||
|
<secondary>major release</secondary>
|
||||||
|
</indexterm>
|
||||||
|
<title>Upgrading a major version release</title>
|
||||||
<para>
|
<para>
|
||||||
"major version" upgrades need to be planned more carefully, as they may include
|
"major version" upgrades need to be planned more carefully, as they may include
|
||||||
changes to the &repmgr; metadata (which need to be propagated from the primary to all
|
changes to the &repmgr; metadata (which need to be propagated from the primary to all
|
||||||
@@ -111,7 +138,14 @@
|
|||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
Stop <application>repmgrd</application> (if in use) on all nodes where it is running
|
Stop <application>repmgrd</application> (if in use) on all nodes where it is running.
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
Disable the <application>repmgrd</application> service on all nodes where it is in use;
|
||||||
|
this is to prevent packages from prematurely restarting <application>repmgrd</application>.
|
||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
@@ -121,12 +155,21 @@
|
|||||||
</simpara>
|
</simpara>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
If running a <literal>systemd</literal>-based Linux distribution, execute (as <literal>root</literal>,
|
||||||
|
or with appropriate <literal>sudo</literal> permissions):
|
||||||
|
<programlisting>
|
||||||
|
systemctl daemon-reload</programlisting>
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
<listitem>
|
<listitem>
|
||||||
<simpara>
|
<simpara>
|
||||||
If necessary, restart PostgreSQL, then <application>repmgrd</application> (if in use)
|
If the &repmgr; shared library module has been updated (check the <link linkend="appendix-release-notes">release notes</link>!),
|
||||||
on each node. The order in which this is applied to individual nodes is not critical,
|
restart PostgreSQL, then <application>repmgrd</application> (if in use) on each node.
|
||||||
and it's also fine to restart on all nodes first before starting <application>repmgrd</application>.
|
The order in which this is applied to individual nodes is not critical,
|
||||||
|
and it's also fine to restart PostgreSQL on all nodes first before starting <application>repmgrd</application>.
|
||||||
</simpara>
|
</simpara>
|
||||||
<simpara>
|
<simpara>
|
||||||
Note that if the upgrade requires a PostgreSQL restart, <application>repmgrd</application>
|
Note that if the upgrade requires a PostgreSQL restart, <application>repmgrd</application>
|
||||||
@@ -138,11 +181,17 @@
|
|||||||
<para>
|
<para>
|
||||||
On the primary node, execute
|
On the primary node, execute
|
||||||
<programlisting>
|
<programlisting>
|
||||||
ALTER EXTENSION repmgr UPDATE</programlisting>
|
ALTER EXTENSION repmgr UPDATE</programlisting>
|
||||||
in the database where &repmgr; is installed.
|
in the database where &repmgr; is installed.
|
||||||
</para>
|
</para>
|
||||||
</listitem>
|
</listitem>
|
||||||
|
|
||||||
|
<listitem>
|
||||||
|
<simpara>
|
||||||
|
Reenable the <application>repmgrd</application> service on all nodes where it is in use.
|
||||||
|
</simpara>
|
||||||
|
</listitem>
|
||||||
|
|
||||||
</orderedlist>
|
</orderedlist>
|
||||||
</para>
|
</para>
|
||||||
<tip>
|
<tip>
|
||||||
@@ -154,6 +203,17 @@
|
|||||||
</tip>
|
</tip>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|
||||||
|
<sect2 id="upgrading-check-repmgrd" xreflabel="Checking repmgrd status after an upgrade">
|
||||||
|
<indexterm>
|
||||||
|
<primary>upgrading</primary>
|
||||||
|
<secondary>checking repmgrd status</secondary>
|
||||||
|
</indexterm>
|
||||||
|
<title>Checking repmgrd status after an upgrade</title>
|
||||||
|
<para>
|
||||||
|
From &repmgr; 4.2, once the upgrade is complete, execute the <command><link linkend="repmgr-daemon-status">repmgr daemon status</link></command>
|
||||||
|
command (on any node) to show an overview of the status of <application>repmgrd</application> on all nodes.
|
||||||
|
</para>
|
||||||
|
</sect2>
|
||||||
</sect1>
|
</sect1>
|
||||||
|
|
||||||
<sect1 id="upgrading-and-pg-upgrade" xreflabel="pg_upgrade and repmgr">
|
<sect1 id="upgrading-and-pg-upgrade" xreflabel="pg_upgrade and repmgr">
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
<!ENTITY repmgrversion "4.2dev">
|
<!ENTITY repmgrversion "4.2">
|
||||||
|
|||||||
@@ -3718,9 +3718,18 @@ do_standby_switchover(void)
|
|||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* close all connections - we'll reestablish later */
|
||||||
|
for (cell = all_nodes.head; cell; cell = cell->next)
|
||||||
|
{
|
||||||
|
PQfinish(cell->node_info->conn);
|
||||||
|
cell->node_info->conn = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Sanity checks completed - prepare for the switchover
|
* Sanity checks completed - prepare for the switchover
|
||||||
*/
|
*/
|
||||||
@@ -3801,7 +3810,7 @@ do_standby_switchover(void)
|
|||||||
shutdown_command);
|
shutdown_command);
|
||||||
|
|
||||||
clear_node_info_list(&sibling_nodes);
|
clear_node_info_list(&sibling_nodes);
|
||||||
clear_node_info_list(&all_nodes);
|
|
||||||
key_value_list_free(&remote_config_files);
|
key_value_list_free(&remote_config_files);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
@@ -4087,9 +4096,25 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
if (sibling_node_record.type == WITNESS)
|
if (sibling_node_record.type == WITNESS)
|
||||||
{
|
{
|
||||||
|
/* TODO: create "repmgr witness resync" or similar */
|
||||||
appendPQExpBuffer(&remote_command_str,
|
appendPQExpBuffer(&remote_command_str,
|
||||||
"witness register -d \\'%s\\' --force 2>/dev/null && echo \"1\" || echo \"0\"",
|
"witness register -d \\'%s\\' --force 2>/dev/null && echo \"1\" || echo \"0\"",
|
||||||
local_node_record.conninfo);
|
local_node_record.conninfo);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Notify the witness repmgrd about the new primary, as at this point it will be assuming
|
||||||
|
* a failover situation is in place. It will detect the new primary at some point, this
|
||||||
|
* just speeds up the process.
|
||||||
|
*
|
||||||
|
* In the unlikely event repmgrd is not running or not in use, this will have no effect.
|
||||||
|
*/
|
||||||
|
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
|
||||||
|
|
||||||
|
if (PQstatus(cell->node_info->conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
notify_follow_primary(cell->node_info->conn, local_node_record.node_id);
|
||||||
|
}
|
||||||
|
PQfinish(cell->node_info->conn);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -4144,8 +4169,6 @@ do_standby_switchover(void)
|
|||||||
|
|
||||||
clear_node_info_list(&sibling_nodes);
|
clear_node_info_list(&sibling_nodes);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PQfinish(local_conn);
|
PQfinish(local_conn);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
#define REPMGR_VERSION_DATE ""
|
#define REPMGR_VERSION_DATE ""
|
||||||
#define REPMGR_VERSION "4.2dev"
|
#define REPMGR_VERSION "4.2"
|
||||||
|
|||||||
@@ -71,6 +71,8 @@ static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
|
|||||||
|
|
||||||
static void check_connection(t_node_info *node_info, PGconn **conn);
|
static void check_connection(t_node_info *node_info, PGconn **conn);
|
||||||
|
|
||||||
|
static bool check_primary_status(int degraded_monitoring_elapsed);
|
||||||
|
|
||||||
static bool wait_primary_notification(int *new_primary_id);
|
static bool wait_primary_notification(int *new_primary_id);
|
||||||
static FailoverState follow_new_primary(int new_primary_id);
|
static FailoverState follow_new_primary(int new_primary_id);
|
||||||
static FailoverState witness_follow_new_primary(int new_primary_id);
|
static FailoverState witness_follow_new_primary(int new_primary_id);
|
||||||
@@ -341,6 +343,13 @@ monitor_streaming_primary(void)
|
|||||||
repmgrd_set_pid(local_conn, getpid(), pid_file);
|
repmgrd_set_pid(local_conn, getpid(), pid_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* check that the local node is still primary, otherwise switch
|
||||||
|
* to standby monitoring
|
||||||
|
*/
|
||||||
|
if (check_primary_status(-1) == false)
|
||||||
|
return;
|
||||||
|
|
||||||
goto loop;
|
goto loop;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -393,143 +402,10 @@ monitor_streaming_primary(void)
|
|||||||
{
|
{
|
||||||
local_node_info.node_status = NODE_STATUS_UP;
|
local_node_info.node_status = NODE_STATUS_UP;
|
||||||
|
|
||||||
/* check to see if the node has been restored as a standby */
|
if (check_primary_status(degraded_monitoring_elapsed) == false)
|
||||||
if (get_recovery_type(local_conn) == RECTYPE_STANDBY)
|
return;
|
||||||
{
|
|
||||||
PGconn *new_primary_conn;
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
goto loop;
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
|
||||||
_("reconnected to node after %i seconds, node is now a standby, switching to standby monitoring"),
|
|
||||||
degraded_monitoring_elapsed);
|
|
||||||
log_notice("%s", event_details.data);
|
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
primary_node_id = UNKNOWN_NODE_ID;
|
|
||||||
|
|
||||||
new_primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
|
|
||||||
|
|
||||||
if (PQstatus(new_primary_conn) != CONNECTION_OK)
|
|
||||||
{
|
|
||||||
close_connection(&new_primary_conn);
|
|
||||||
log_warning(_("unable to connect to new primary node %i"), primary_node_id);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
RecordStatus record_status;
|
|
||||||
|
|
||||||
log_debug("primary node id is now %i", primary_node_id);
|
|
||||||
|
|
||||||
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
|
|
||||||
|
|
||||||
if (record_status == RECORD_FOUND)
|
|
||||||
{
|
|
||||||
bool resume_monitoring = true;
|
|
||||||
|
|
||||||
log_debug("node %i is registered with type = %s",
|
|
||||||
config_file_options.node_id,
|
|
||||||
get_node_type_string(local_node_info.type));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* node has recovered but metadata not updated - we can do that ourselves,
|
|
||||||
*/
|
|
||||||
if (local_node_info.type == PRIMARY)
|
|
||||||
{
|
|
||||||
log_notice(_("node \"%s\" (ID: %i) still registered as primary, setting to standby"),
|
|
||||||
config_file_options.node_name,
|
|
||||||
config_file_options.node_id);
|
|
||||||
|
|
||||||
if (update_node_record_set_active_standby(new_primary_conn, config_file_options.node_id) == false)
|
|
||||||
{
|
|
||||||
resume_monitoring = false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
|
|
||||||
|
|
||||||
if (record_status != RECORD_FOUND)
|
|
||||||
{
|
|
||||||
resume_monitoring = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (resume_monitoring == true)
|
|
||||||
{
|
|
||||||
monitoring_state = MS_NORMAL;
|
|
||||||
log_notice(_("former primary has been restored as standby after %i seconds, updating node record and resuming monitoring"),
|
|
||||||
degraded_monitoring_elapsed);
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
|
||||||
_("node restored as standby after %i seconds, monitoring connection to upstream node %i"),
|
|
||||||
degraded_monitoring_elapsed,
|
|
||||||
local_node_info.upstream_node_id);
|
|
||||||
|
|
||||||
create_event_notification(new_primary_conn,
|
|
||||||
&config_file_options,
|
|
||||||
config_file_options.node_id,
|
|
||||||
"repmgrd_standby_reconnect",
|
|
||||||
true,
|
|
||||||
event_details.data);
|
|
||||||
|
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
close_connection(&new_primary_conn);
|
|
||||||
|
|
||||||
/* restart monitoring as standby */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (record_status == RECORD_NOT_FOUND)
|
|
||||||
{
|
|
||||||
PQExpBufferData event_details;
|
|
||||||
initPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
|
||||||
_("no metadata record found for this node on current primary %i"),
|
|
||||||
primary_node_id);
|
|
||||||
|
|
||||||
log_error("%s", event_details.data);
|
|
||||||
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
|
|
||||||
|
|
||||||
close_connection(&new_primary_conn);
|
|
||||||
|
|
||||||
create_event_notification(NULL,
|
|
||||||
&config_file_options,
|
|
||||||
config_file_options.node_id,
|
|
||||||
"repmgrd_shutdown",
|
|
||||||
false,
|
|
||||||
event_details.data);
|
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
|
|
||||||
terminate(ERR_BAD_CONFIG);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
monitoring_state = MS_NORMAL;
|
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
|
||||||
appendPQExpBuffer(&event_details,
|
|
||||||
_("reconnected to primary node after %i seconds, resuming monitoring"),
|
|
||||||
degraded_monitoring_elapsed);
|
|
||||||
|
|
||||||
create_event_notification(local_conn,
|
|
||||||
&config_file_options,
|
|
||||||
config_file_options.node_id,
|
|
||||||
"repmgrd_local_reconnect",
|
|
||||||
true,
|
|
||||||
event_details.data);
|
|
||||||
|
|
||||||
log_notice("%s", event_details.data);
|
|
||||||
termPQExpBuffer(&event_details);
|
|
||||||
goto loop;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -541,6 +417,11 @@ monitor_streaming_primary(void)
|
|||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
loop:
|
loop:
|
||||||
|
|
||||||
|
/* check node is still primary, if not restart monitoring */
|
||||||
|
if (check_primary_status(-1) == false)
|
||||||
|
return;
|
||||||
|
|
||||||
/* emit "still alive" log message at regular intervals, if requested */
|
/* emit "still alive" log message at regular intervals, if requested */
|
||||||
if (config_file_options.log_status_interval > 0)
|
if (config_file_options.log_status_interval > 0)
|
||||||
{
|
{
|
||||||
@@ -575,6 +456,181 @@ loop:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If monitoring a primary, it's possible that after an outage of the local node
|
||||||
|
* (due to e.g. a switchover), the node has come back as a standby. We therefore
|
||||||
|
* need to verify its status and if everything looks OK, restart monitoring in
|
||||||
|
* standby mode.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
check_primary_status(int degraded_monitoring_elapsed)
|
||||||
|
{
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
|
/* check to see if the node has been restored as a standby */
|
||||||
|
if (get_recovery_type(local_conn) == RECTYPE_STANDBY)
|
||||||
|
{
|
||||||
|
PGconn *new_primary_conn;
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
if (degraded_monitoring_elapsed > 0)
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("reconnected to node after %i seconds, node is now a standby, switching to standby monitoring"),
|
||||||
|
degraded_monitoring_elapsed);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBufferStr(&event_details,
|
||||||
|
_("node is now a standby, switching to standby monitoring"));
|
||||||
|
}
|
||||||
|
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
primary_node_id = UNKNOWN_NODE_ID;
|
||||||
|
|
||||||
|
new_primary_conn = get_primary_connection_quiet(local_conn, &primary_node_id, NULL);
|
||||||
|
|
||||||
|
if (PQstatus(new_primary_conn) != CONNECTION_OK)
|
||||||
|
{
|
||||||
|
close_connection(&new_primary_conn);
|
||||||
|
log_warning(_("unable to connect to new primary node %i"), primary_node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
RecordStatus record_status;
|
||||||
|
|
||||||
|
log_debug("primary node id is now %i", primary_node_id);
|
||||||
|
|
||||||
|
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
|
||||||
|
|
||||||
|
if (record_status == RECORD_FOUND)
|
||||||
|
{
|
||||||
|
bool resume_monitoring = true;
|
||||||
|
|
||||||
|
log_debug("node %i is registered with type = %s",
|
||||||
|
config_file_options.node_id,
|
||||||
|
get_node_type_string(local_node_info.type));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* node has recovered but metadata not updated - we can do that ourselves,
|
||||||
|
*/
|
||||||
|
if (local_node_info.type == PRIMARY)
|
||||||
|
{
|
||||||
|
log_notice(_("node \"%s\" (ID: %i) still registered as primary, setting to standby"),
|
||||||
|
config_file_options.node_name,
|
||||||
|
config_file_options.node_id);
|
||||||
|
|
||||||
|
if (update_node_record_set_active_standby(new_primary_conn, config_file_options.node_id) == false)
|
||||||
|
{
|
||||||
|
resume_monitoring = false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
record_status = get_node_record(new_primary_conn, config_file_options.node_id, &local_node_info);
|
||||||
|
|
||||||
|
if (record_status != RECORD_FOUND)
|
||||||
|
{
|
||||||
|
resume_monitoring = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (resume_monitoring == true)
|
||||||
|
{
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
if (degraded_monitoring_elapsed > 0)
|
||||||
|
{
|
||||||
|
monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
|
log_notice(_("former primary has been restored as standby after %i seconds, updating node record and resuming monitoring"),
|
||||||
|
degraded_monitoring_elapsed);
|
||||||
|
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("node restored as standby after %i seconds, monitoring connection to upstream node %i"),
|
||||||
|
degraded_monitoring_elapsed,
|
||||||
|
local_node_info.upstream_node_id);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("node has become a standby, monitoring connection to upstream node %i"),
|
||||||
|
local_node_info.upstream_node_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
create_event_notification(new_primary_conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"repmgrd_standby_reconnect",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
close_connection(&new_primary_conn);
|
||||||
|
|
||||||
|
/* restart monitoring as standby */
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (record_status == RECORD_NOT_FOUND)
|
||||||
|
{
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("no metadata record found for this node on current primary %i"),
|
||||||
|
primary_node_id);
|
||||||
|
|
||||||
|
log_error("%s", event_details.data);
|
||||||
|
log_hint(_("check that 'repmgr (primary|standby) register' was executed for this node"));
|
||||||
|
|
||||||
|
close_connection(&new_primary_conn);
|
||||||
|
|
||||||
|
create_event_notification(NULL,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"repmgrd_shutdown",
|
||||||
|
false,
|
||||||
|
event_details.data);
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
terminate(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (degraded_monitoring_elapsed > 0)
|
||||||
|
{
|
||||||
|
monitoring_state = MS_NORMAL;
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
appendPQExpBuffer(&event_details,
|
||||||
|
_("reconnected to primary node after %i seconds, resuming monitoring"),
|
||||||
|
degraded_monitoring_elapsed);
|
||||||
|
|
||||||
|
create_event_notification(local_conn,
|
||||||
|
&config_file_options,
|
||||||
|
config_file_options.node_id,
|
||||||
|
"repmgrd_local_reconnect",
|
||||||
|
true,
|
||||||
|
event_details.data);
|
||||||
|
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
monitor_streaming_standby(void)
|
monitor_streaming_standby(void)
|
||||||
{
|
{
|
||||||
@@ -595,7 +651,7 @@ monitor_streaming_standby(void)
|
|||||||
*/
|
*/
|
||||||
if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
|
if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
|
||||||
{
|
{
|
||||||
local_node_info.upstream_node_id = get_primary_node_id(local_conn);
|
upstream_conn = get_primary_connection(local_conn, &local_node_info.upstream_node_id, NULL);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Terminate if there doesn't appear to be an active cluster primary.
|
* Terminate if there doesn't appear to be an active cluster primary.
|
||||||
@@ -608,34 +664,40 @@ monitor_streaming_standby(void)
|
|||||||
log_error(_("unable to determine an active primary for this cluster, terminating"));
|
log_error(_("unable to determine an active primary for this cluster, terminating"));
|
||||||
terminate(ERR_BAD_CONFIG);
|
terminate(ERR_BAD_CONFIG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
(void) get_node_record(upstream_conn, local_node_info.upstream_node_id, &upstream_node_info);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
else
|
||||||
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Terminate if we can't find the record for the node we're supposed to
|
|
||||||
* monitor. This is a "fix-the-config" situation, not a lot else we can
|
|
||||||
* do.
|
|
||||||
*/
|
|
||||||
if (record_status == RECORD_NOT_FOUND)
|
|
||||||
{
|
{
|
||||||
log_error(_("no record found for upstream node (ID: %i), terminating"),
|
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);
|
||||||
local_node_info.upstream_node_id);
|
|
||||||
log_hint(_("ensure the upstream node is registered correctly"));
|
|
||||||
|
|
||||||
terminate(ERR_DB_CONN);
|
/*
|
||||||
}
|
* Terminate if we can't find the record for the node we're supposed to
|
||||||
else if (record_status == RECORD_ERROR)
|
* monitor. This is a "fix-the-config" situation, not a lot else we can
|
||||||
{
|
* do.
|
||||||
log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
|
*/
|
||||||
local_node_info.upstream_node_id);
|
if (record_status == RECORD_NOT_FOUND)
|
||||||
|
{
|
||||||
|
log_error(_("no record found for upstream node (ID: %i), terminating"),
|
||||||
|
local_node_info.upstream_node_id);
|
||||||
|
log_hint(_("ensure the upstream node is registered correctly"));
|
||||||
|
|
||||||
terminate(ERR_DB_CONN);
|
terminate(ERR_DB_CONN);
|
||||||
|
}
|
||||||
|
else if (record_status == RECORD_ERROR)
|
||||||
|
{
|
||||||
|
log_error(_("unable to retrieve record for upstream node (ID: %i), terminating"),
|
||||||
|
local_node_info.upstream_node_id);
|
||||||
|
|
||||||
|
terminate(ERR_DB_CONN);
|
||||||
|
}
|
||||||
|
|
||||||
|
log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);
|
||||||
|
|
||||||
|
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);
|
|
||||||
|
|
||||||
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Upstream node must be running at repmgrd startup.
|
* Upstream node must be running at repmgrd startup.
|
||||||
@@ -652,6 +714,15 @@ monitor_streaming_standby(void)
|
|||||||
terminate(ERR_DB_CONN);
|
terminate(ERR_DB_CONN);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
|
||||||
|
|
||||||
|
if (upstream_node_info.node_id == local_node_info.node_id)
|
||||||
|
{
|
||||||
|
PQfinish(upstream_conn);
|
||||||
|
upstream_conn = NULL;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* refresh upstream node record from upstream node, so it's as up-to-date
|
* refresh upstream node record from upstream node, so it's as up-to-date
|
||||||
* as possible
|
* as possible
|
||||||
@@ -682,6 +753,23 @@ monitor_streaming_standby(void)
|
|||||||
primary_conn = upstream_conn;
|
primary_conn = upstream_conn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* It's possible monitoring has been restarted after some outage which
|
||||||
|
* resulted in the local node being marked as inactive; if so mark it
|
||||||
|
* as active again.
|
||||||
|
*/
|
||||||
|
if (local_node_info.active == false)
|
||||||
|
{
|
||||||
|
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
|
||||||
|
{
|
||||||
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
local_node_info.active = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
primary_node_id = get_primary_node_id(primary_conn);
|
primary_node_id = get_primary_node_id(primary_conn);
|
||||||
|
|
||||||
/* Log startup event */
|
/* Log startup event */
|
||||||
@@ -766,6 +854,7 @@ monitor_streaming_standby(void)
|
|||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
check_connection(&local_node_info, &local_conn);
|
check_connection(&local_node_info, &local_conn);
|
||||||
|
log_debug("YYY here");
|
||||||
}
|
}
|
||||||
|
|
||||||
try_reconnect(&upstream_conn, &upstream_node_info);
|
try_reconnect(&upstream_conn, &upstream_node_info);
|
||||||
@@ -778,6 +867,43 @@ monitor_streaming_standby(void)
|
|||||||
if (upstream_node_info.type == PRIMARY)
|
if (upstream_node_info.type == PRIMARY)
|
||||||
{
|
{
|
||||||
primary_conn = upstream_conn;
|
primary_conn = upstream_conn;
|
||||||
|
|
||||||
|
if (get_recovery_type(primary_conn) == RECTYPE_STANDBY)
|
||||||
|
{
|
||||||
|
ExecStatusType ping_result;
|
||||||
|
|
||||||
|
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
PQfinish(upstream_conn);
|
||||||
|
upstream_conn = NULL;
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
local_node_info.upstream_node_id = UNKNOWN_NODE_ID;
|
||||||
|
|
||||||
|
/* check local connection */
|
||||||
|
ping_result = connection_ping(local_conn);
|
||||||
|
|
||||||
|
if (ping_result != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
PQfinish(local_conn);
|
||||||
|
|
||||||
|
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
|
||||||
|
{
|
||||||
|
local_conn = establish_db_connection(local_node_info.conninfo, false);
|
||||||
|
|
||||||
|
if (PQstatus(local_conn) == CONNECTION_OK)
|
||||||
|
break;
|
||||||
|
|
||||||
|
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
|
||||||
|
i + 1,
|
||||||
|
config_file_options.repmgrd_standby_startup_timeout);
|
||||||
|
sleep(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
@@ -1140,6 +1266,7 @@ loop:
|
|||||||
|
|
||||||
check_connection(&local_node_info, &local_conn);
|
check_connection(&local_node_info, &local_conn);
|
||||||
|
|
||||||
|
|
||||||
if (PQstatus(local_conn) != CONNECTION_OK)
|
if (PQstatus(local_conn) != CONNECTION_OK)
|
||||||
{
|
{
|
||||||
if (local_node_info.active == true)
|
if (local_node_info.active == true)
|
||||||
@@ -1180,11 +1307,37 @@ loop:
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
int stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the local node was restarted, we'll need to reinitialise values
|
||||||
|
* stored in shared memory.
|
||||||
|
*/
|
||||||
|
|
||||||
|
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
||||||
|
{
|
||||||
|
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
||||||
|
repmgrd_set_pid(local_conn, getpid(), pid_file);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||||
|
{
|
||||||
|
if (get_recovery_type(primary_conn) == RECTYPE_STANDBY)
|
||||||
|
{
|
||||||
|
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
primary_conn = NULL;
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
|
local_node_info.upstream_node_id = UNKNOWN_NODE_ID;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* we've reconnected to the local node after an outage */
|
/* we've reconnected to the local node after an outage */
|
||||||
if (local_node_info.active == false)
|
if (local_node_info.active == false)
|
||||||
{
|
{
|
||||||
int stored_local_node_id = UNKNOWN_NODE_ID;
|
|
||||||
|
|
||||||
if (PQstatus(primary_conn) == CONNECTION_OK)
|
if (PQstatus(primary_conn) == CONNECTION_OK)
|
||||||
{
|
{
|
||||||
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
|
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
|
||||||
@@ -1194,7 +1347,6 @@ loop:
|
|||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
local_node_info.active = true;
|
local_node_info.active = true;
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
_("reconnected to local node \"%s\" (ID: %i), marking active"),
|
_("reconnected to local node \"%s\" (ID: %i), marking active"),
|
||||||
local_node_info.node_name,
|
local_node_info.node_name,
|
||||||
@@ -1212,18 +1364,6 @@ loop:
|
|||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* If the local node was restarted, we'll need to reinitialise values
|
|
||||||
* stored in shared memory.
|
|
||||||
*/
|
|
||||||
|
|
||||||
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
|
|
||||||
if (stored_local_node_id == UNKNOWN_NODE_ID)
|
|
||||||
{
|
|
||||||
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
|
|
||||||
repmgrd_set_pid(local_conn, getpid(), pid_file);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1299,12 +1439,13 @@ monitor_streaming_witness(void)
|
|||||||
log_warning(_("unable to retrieve node record from primary"));
|
log_warning(_("unable to retrieve node record from primary"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Log startup event */
|
|
||||||
if (startup_event_logged == false)
|
|
||||||
{
|
{
|
||||||
PQExpBufferData event_details;
|
PQExpBufferData event_details;
|
||||||
|
|
||||||
|
char *event_type = startup_event_logged == false
|
||||||
|
? "repmgrd_start"
|
||||||
|
: "repmgrd_upstream_reconnect";
|
||||||
|
|
||||||
initPQExpBuffer(&event_details);
|
initPQExpBuffer(&event_details);
|
||||||
|
|
||||||
appendPQExpBuffer(&event_details,
|
appendPQExpBuffer(&event_details,
|
||||||
@@ -1312,16 +1453,17 @@ monitor_streaming_witness(void)
|
|||||||
upstream_node_info.node_name,
|
upstream_node_info.node_name,
|
||||||
upstream_node_info.node_id);
|
upstream_node_info.node_id);
|
||||||
|
|
||||||
|
log_info("%s", event_details.data);
|
||||||
|
|
||||||
create_event_notification(primary_conn,
|
create_event_notification(primary_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"repmgrd_start",
|
event_type,
|
||||||
true,
|
true,
|
||||||
event_details.data);
|
event_details.data);
|
||||||
|
|
||||||
startup_event_logged = true;
|
if (startup_event_logged == false)
|
||||||
|
startup_event_logged = true;
|
||||||
log_info("%s", event_details.data);
|
|
||||||
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
}
|
}
|
||||||
@@ -1371,6 +1513,17 @@ monitor_streaming_witness(void)
|
|||||||
upstream_node_unreachable_elapsed);
|
upstream_node_unreachable_elapsed);
|
||||||
log_notice("%s", event_details.data);
|
log_notice("%s", event_details.data);
|
||||||
|
|
||||||
|
/* check upstream is still primary */
|
||||||
|
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
|
||||||
|
{
|
||||||
|
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
primary_conn = NULL;
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
create_event_notification(primary_conn,
|
create_event_notification(primary_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
@@ -1428,14 +1581,25 @@ monitor_streaming_witness(void)
|
|||||||
upstream_node_info.node_id,
|
upstream_node_info.node_id,
|
||||||
degraded_monitoring_elapsed);
|
degraded_monitoring_elapsed);
|
||||||
|
|
||||||
|
log_notice("%s", event_details.data);
|
||||||
|
|
||||||
|
/* check upstream is still primary */
|
||||||
|
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
|
||||||
|
{
|
||||||
|
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
primary_conn = NULL;
|
||||||
|
termPQExpBuffer(&event_details);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
create_event_notification(primary_conn,
|
create_event_notification(primary_conn,
|
||||||
&config_file_options,
|
&config_file_options,
|
||||||
config_file_options.node_id,
|
config_file_options.node_id,
|
||||||
"repmgrd_upstream_reconnect",
|
"repmgrd_upstream_reconnect",
|
||||||
true,
|
true,
|
||||||
event_details.data);
|
event_details.data);
|
||||||
|
|
||||||
log_notice("%s", event_details.data);
|
|
||||||
termPQExpBuffer(&event_details);
|
termPQExpBuffer(&event_details);
|
||||||
|
|
||||||
goto loop;
|
goto loop;
|
||||||
@@ -1467,6 +1631,12 @@ monitor_streaming_witness(void)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* skip node if configured as a witness node - we can't possibly "follow" that */
|
||||||
|
if (cell->node_info->type == WITNESS)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
|
||||||
|
|
||||||
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
|
||||||
@@ -1594,14 +1764,28 @@ loop:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* refresh repmgr.nodes after "witness_sync_interval" seconds */
|
/*
|
||||||
|
* Refresh repmgr.nodes after "witness_sync_interval" seconds, and check if primary
|
||||||
|
* has changed
|
||||||
|
*/
|
||||||
|
|
||||||
{
|
{
|
||||||
int witness_sync_interval_elapsed = calculate_elapsed(witness_sync_interval_start);
|
int witness_sync_interval_elapsed = calculate_elapsed(witness_sync_interval_start);
|
||||||
|
|
||||||
if (witness_sync_interval_elapsed >= config_file_options.witness_sync_interval)
|
if (witness_sync_interval_elapsed >= config_file_options.witness_sync_interval)
|
||||||
{
|
{
|
||||||
|
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
|
||||||
|
{
|
||||||
|
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
|
||||||
|
upstream_node_info.node_name, upstream_node_info.node_id);
|
||||||
|
PQfinish(primary_conn);
|
||||||
|
primary_conn = NULL;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
log_debug("synchronising witness node records");
|
log_debug("synchronising witness node records");
|
||||||
witness_copy_node_records(primary_conn, local_conn);
|
witness_copy_node_records(primary_conn, local_conn);
|
||||||
|
|
||||||
INSTR_TIME_SET_CURRENT(witness_sync_interval_start);
|
INSTR_TIME_SET_CURRENT(witness_sync_interval_start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3092,6 +3276,8 @@ check_connection(t_node_info *node_info, PGconn **conn)
|
|||||||
if (is_server_available(node_info->conninfo) == false)
|
if (is_server_available(node_info->conninfo) == false)
|
||||||
{
|
{
|
||||||
log_warning(_("connection to node %i lost"), node_info->node_id);
|
log_warning(_("connection to node %i lost"), node_info->node_id);
|
||||||
|
PQfinish(*conn);
|
||||||
|
*conn = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (PQstatus(*conn) != CONNECTION_OK)
|
if (PQstatus(*conn) != CONNECTION_OK)
|
||||||
|
|||||||
@@ -175,7 +175,6 @@ main(int argc, char **argv)
|
|||||||
|
|
||||||
/* daemon options */
|
/* daemon options */
|
||||||
|
|
||||||
|
|
||||||
case 'd':
|
case 'd':
|
||||||
daemonize = true;
|
daemonize = true;
|
||||||
break;
|
break;
|
||||||
@@ -184,7 +183,6 @@ main(int argc, char **argv)
|
|||||||
daemonize = parse_bool(optarg, "-d/--daemonize", &cli_errors);
|
daemonize = parse_bool(optarg, "-d/--daemonize", &cli_errors);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
case 'p':
|
case 'p':
|
||||||
strncpy(pid_file, optarg, MAXPGPATH);
|
strncpy(pid_file, optarg, MAXPGPATH);
|
||||||
break;
|
break;
|
||||||
|
|||||||
Reference in New Issue
Block a user