Compare commits

...

16 Commits

Author SHA1 Message Date
Ian Barwick
0cafeb3828 repmgrd: fix upstream role check
Only take action if it's confirmed as a standby.
2018-10-23 12:50:04 +09:00
Ian Barwick
79e79bd5f2 "standby switchover": close all connections used to check repmgrd status
The connections used to check repmgrd status on all nodes were not being
closed if repmgrd was not running. Normally this wouldn't be a huge
problem as they will go away when repmgr terminates or the PostgreSQL
server restarted. However, if shutdown mode is "smart", the open
connection on the demotion candidate will cause the shutdown operation
to fail until repmgr times out.
2018-10-23 10:59:24 +09:00
Ian Barwick
a4e21fd8fe Update doc version 2018-10-23 09:28:46 +09:00
Ian Barwick
e826f72312 Bump version number
4.2
2018-10-23 09:24:17 +09:00
Ian Barwick
1e8b3313ee doc: fix typos 2018-10-23 09:22:04 +09:00
Ian Barwick
b5772d88dd doc: fix typo
Per user report on mailing list.
2018-10-23 09:00:09 +09:00
Ian Barwick
22614573b9 Fix Makefile for VPATH builds under PostgreSQL 11 2018-10-22 20:05:09 +09:00
Ian Barwick
77c9092794 repmgrd: improve node role change detection 2018-10-19 11:33:08 +09:00
Ian Barwick
15bbe04a6f Speed up witness "failover" during a switchover 2018-10-18 18:35:23 +09:00
Ian Barwick
0842560a88 repmgrd: handle case where upstream is no longer primary
If the upstream comes back on line (e.g. after a switchover), and its
status is no longer primary, restart monitoring to ensure the correct
primary (potentially the current node) is being monitored.
2018-10-18 17:04:14 +09:00
Ian Barwick
8bec4946bc Ensure witness repmgrd detects change in upstream's role
This ensures that e.g. after a switchover, repmgrd running on a witness
node will automatically detect the new primary and monitor that.
2018-10-18 16:15:52 +09:00
Ian Barwick
3ab22f9442 repmgrd: ensure witness node doesn't try and follow another witness
Theoretically there should never be more than one witness node
visible here, but it's not possible to rule it out, so add a
check just in case.
2018-10-18 12:20:04 +09:00
Ian Barwick
3a9c36a36c doc: improve upgrade instructions
Note requirement to execute "systemctl daemon-reload" for systemd
systems...
2018-10-17 17:08:54 +09:00
Ian Barwick
2ded8987ac doc: improve upgrade instructions 2018-10-17 14:35:36 +09:00
Ian Barwick
6311f3f30a Handle NULL strings when parsing boolean arguments 2018-10-17 11:46:29 +09:00
Ian Barwick
12ec6c7abc Doc: update HISTORY and 4.2 release notes 2018-10-17 09:50:36 +09:00
11 changed files with 576 additions and 266 deletions

View File

@@ -7,6 +7,7 @@
repmgr: add configuration file parameter "repmgr_bindir"; GitHub #246 (Ian)
repmgr: fix "Missing replication slots" label in "node check"; GitHub #507 (Ian)
repmgrd: fix parsing of -d/--daemonize option (Ian)
repmgrd: support "pausing" of repmgrd (Ian)
4.1.1 2018-09-05
logging: explicitly log the text of failed queries as ERRORs to

View File

@@ -30,13 +30,18 @@ all: \
PG_CPPFLAGS = -std=gnu89 -I$(includedir_internal) -I$(libpq_srcdir) -Wall -Wmissing-prototypes -Wmissing-declarations $(EXTRA_CFLAGS)
SHLIB_LINK = $(libpq)
HEADERS = $(wildcard *.h)
OBJS = \
repmgr.o
include Makefile.global
ifeq ($(vpath_build),yes)
HEADERS = $(wildcard *.h)
else
HEADERS_built = $(wildcard *.h)
endif
$(info Building against PostgreSQL $(MAJORVERSION))

View File

@@ -1531,6 +1531,9 @@ parse_bool(const char *s, const char *config_item, ItemList *error_list)
{
PQExpBufferData errors;
if (s == NULL)
return true;
if (strcasecmp(s, "0") == 0)
return false;

View File

@@ -96,10 +96,44 @@
</para>
</sect2>
<sect2>
<title>repmgrd enhancements</title>
<para>
<itemizedlist>
<listitem>
<para>
<application>repmgrd</application> can now be &quot;paused&quot;, i.e. instructed
not to take any action such as a failover, even if the prerequisites for such an
action are detected.
</para>
<para>
This removes the need to stop <application>repmgrd</application> on all nodes when
performing a planned operation such as a switchover.
</para>
<para>
For further details, see <link linkend="repmgrd-pausing">Pausing repmgrd</link>.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
<sect2>
<title>Bug fixes</title>
<para>
<itemizedlist>
<listitem>
<para>
&repmgr;: fix &quot;Missing replication slots&quot; label in
<command><link linkend="repmgr-node-check">repmgr node check</link></command>. (GitHub #507)
</para>
</listitem>
<listitem>
<para>
<application>repmgrd</application>: fix parsing of <option>-d/--daemonize</option> option.

View File

@@ -17,15 +17,15 @@
<link linkend="repmgr-node-rejoin"><command>repmgr node rejoin</command></link>.
</para>
<para>
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> to control the PostgreSQL
By default, &repmgr; will use PostgreSQL's <command>pg_ctl</command> utility to control the PostgreSQL
server. However this can lead to various problems, particularly when PostgreSQL has been
installed from packages, and expecially so if <application>systemd</application> is in use.
installed from packages, and especially so if <application>systemd</application> is in use.
</para>
<note>
<para>
If using <application>systemd</application>, ensure you have <varname>RemoteIPC</varname> set to <literal>off</literal>.
If using <application>systemd</application>, ensure you have <varname>RemoveIPC</varname> set to <literal>off</literal>.
See the <ulink url="https://wiki.postgresql.org/wiki/Systemd">systemd</ulink>
entry in the <ulink url="https://wiki.postgresql.org/wiki/Main_Page">PostgreSQL wiki</ulink> for details.
</para>

View File

@@ -7,9 +7,9 @@
<title>Upgrading repmgr</title>
<para>
&repmgr; is updated regularly with point releases (e.g. 4.0.1 to 4.0.2)
&repmgr; is updated regularly with minor releases (e.g. 4.0.1 to 4.0.2)
containing bugfixes and other minor improvements. Any substantial new
functionality will be included in a feature release (e.g. 4.0.x to 4.1.x).
functionality will be included in a major release (e.g. 4.0 to 4.1).
</para>
<sect1 id="upgrading-repmgr-extension" xreflabel="Upgrading repmgr 4.x and later">
@@ -19,83 +19,110 @@
</indexterm>
<title>Upgrading repmgr 4.x and later</title>
<para>
&repmgr; 4.x is implemented as a PostgreSQL extension; normally the upgrade consists
of the following steps:
<orderedlist>
From version 4, &repmgr; consists of three elements:
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara>
Stop <application>repmgrd</application> (if in use) on all nodes where it is running
the <application>repmgr</application> and <application>repmgrd</application> executables
</simpara>
</listitem>
<listitem>
<simpara>
Install the updated package (or compile the updated source)
the objects for the &repmgr; PostgreSQL extension (SQL files for creating/updating
repmgr metadata, and the extension control file)
</simpara>
</listitem>
<listitem>
<simpara>
For major releases, e.g. from <literal>4.0.x</literal> to <literal>4.1</literal>,
where the <literal>repmgr</literal> shared object library has been updated,
restart PostgreSQL.
the shared library module used by <application>repmgrd</application> which
is resident in the PostgreSQL backend
</simpara>
</listitem>
<listitem>
<simpara>
For major releases, e.g. from <literal>4.0.x</literal> to <literal>4.1</literal>,
execute <command>ALTER EXTENSION repmgr UPDATE</command>
on the primary node in the database where the &repmgr; extension is installed.
</simpara>
<simpara>
This will update the extension metadata and, if necessary, apply
changes to the &repmgr; extension objects.
</simpara>
</listitem>
<listitem>
<simpara>
Start <application>repmgrd</application> (if in use).
</simpara>
</listitem>
</orderedlist>
</itemizedlist>
</para>
<para>
With <emphasis>minor releases</emphasis>, usually changes are only made to the <application>repmgr</application>
and <application>repmgrd</application> executables. In this case, the upgrade is quite straightforward,
and is simply a case of installing the new version, and restarting <application>repmgrd</application>
(if running).
</para>
<para>
For <emphasis>major releases</emphasis>, the &repmgr; PostgreSQL extension will need to be updated
to the latest version. Additionally, if the shared library module has been updated (this is sometimes,
but not always the case), PostgreSQL itself will need to be restarted on each node.
</para>
<important>
<para>
Always check the <link linkend="appendix-release-notes">release notes</link> for every
release as they may contain upgrade instructions particular to individual versions.
</para>
</important>
<para>
Note that it may be necessary to restart the PostgreSQL server if the upgrade contains
changes to the shared object file used by <application>repmgrd</application>; check the
<link linkend="appendix-release-notes">release notes</link> for details.
</para>
<sect2 id="upgrading-replication-cluster" xreflabel="Upgrading a replication cluster">
<sect2 id="upgrading-minor-version" xreflabel="Upgrading a minor version release">
<indexterm>
<primary>upgrading</primary>
<secondary>repmgr 4.x and later</secondary>
<secondary>minor release</secondary>
</indexterm>
<title>Upgrading a replication cluster</title>
<title>Upgrading a minor version release</title>
<para>
The process for installing minor version upgrades is quite straightforward:
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara>
install the new &repmgr; version
</simpara>
</listitem>
<listitem>
<simpara>
restart <application>repmgrd</application> on all nodes where it is running
</simpara>
</listitem>
</itemizedlist>
</para>
<note>
<para>
Some packaging systems (e.g. <link linkend="packages-debian-ubuntu">Debian/Ubuntu</link>)
may restart <application>repmgrd</application> as part of the package upgrade process.
</para>
</note>
<para>
Minor version upgrades can be performed in any order on the nodes in the replication
cluster.
</para>
<para>
A PostgreSQL restart is <emphasis>not</emphasis> required for minor version upgrades.
</para>
<note>
<para>
The same &repmgr; &quot;major version&quot; (e.g. <literal>4.2</literal>) must be
installed on all nodes in the replication cluster. While it's possible to have differing
&repmgr; &quot;minor versions&quot; (e.g. <literal>4.2.1</literal>) on different nodes,
we strongly recommend updating all nodes to the latest minor version.
</para>
<note>
<para>
Minor version upgrades can be performed in any order on the nodes in the replicaiton
cluster. In general it makes sense to start on the primary.
</para>
<para>
A PostgreSQL restart is <emphasis>not</emphasis> required for minor version upgrades.
</para>
</note>
</sect2>
<sect2 id="upgrading-major-version" xreflabel="Upgrading a major version release">
<indexterm>
<primary>upgrading</primary>
<secondary>major release</secondary>
</indexterm>
<title>Upgrading a major version release</title>
<para>
&quot;major version&quot; upgrades need to be planned more carefully, as they may include
changes to the &repmgr; metadata (which need to be propagated from the primary to all
@@ -111,7 +138,14 @@
<listitem>
<simpara>
Stop <application>repmgrd</application> (if in use) on all nodes where it is running
Stop <application>repmgrd</application> (if in use) on all nodes where it is running.
</simpara>
</listitem>
<listitem>
<simpara>
Disable the <application>repmgrd</application> service on all nodes where it is in use;
this is to prevent packages from prematurely restarting <application>repmgrd</application>.
</simpara>
</listitem>
@@ -121,12 +155,21 @@
</simpara>
</listitem>
<listitem>
<para>
If running a <literal>systemd</literal>-based Linux distribution, execute (as <literal>root</literal>,
or with appropriate <literal>sudo</literal> permissions):
<programlisting>
systemctl daemon-reload</programlisting>
</para>
</listitem>
<listitem>
<simpara>
If necessary, restart PostgreSQL, then <application>repmgrd</application> (if in use)
on each node. The order in which this is applied to individual nodes is not critical,
and it's also fine to restart on all nodes first before starting <application>repmgrd</application>.
If the &repmgr; shared library module has been updated (check the <link linkend="appendix-release-notes">release notes</link>!),
restart PostgreSQL, then <application>repmgrd</application> (if in use) on each node.
The order in which this is applied to individual nodes is not critical,
and it's also fine to restart PostgreSQL on all nodes first before starting <application>repmgrd</application>.
</simpara>
<simpara>
Note that if the upgrade requires a PostgreSQL restart, <application>repmgrd</application>
@@ -138,11 +181,17 @@
<para>
On the primary node, execute
<programlisting>
ALTER EXTENSION repmgr UPDATE</programlisting>
ALTER EXTENSION repmgr UPDATE</programlisting>
in the database where &repmgr; is installed.
</para>
</listitem>
<listitem>
<simpara>
Re-enable the <application>repmgrd</application> service on all nodes where it is in use.
</simpara>
</listitem>
</orderedlist>
</para>
<tip>
@@ -154,6 +203,17 @@
</tip>
</sect2>
<sect2 id="upgrading-check-repmgrd" xreflabel="Checking repmgrd status after an upgrade">
<indexterm>
<primary>upgrading</primary>
<secondary>checking repmgrd status</secondary>
</indexterm>
<title>Checking repmgrd status after an upgrade</title>
<para>
From &repmgr; 4.2, once the upgrade is complete, execute the <command><link linkend="repmgr-daemon-status">repmgr daemon status</link></command>
command (on any node) to show an overview of the status of <application>repmgrd</application> on all nodes.
</para>
</sect2>
</sect1>
<sect1 id="upgrading-and-pg-upgrade" xreflabel="pg_upgrade and repmgr">

View File

@@ -1 +1 @@
<!ENTITY repmgrversion "4.2dev">
<!ENTITY repmgrversion "4.2">

View File

@@ -3718,8 +3718,17 @@ do_standby_switchover(void)
i++;
}
}
else
{
/* close all connections - we'll reestablish later */
for (cell = all_nodes.head; cell; cell = cell->next)
{
PQfinish(cell->node_info->conn);
cell->node_info->conn = NULL;
}
}
}
/*
* Sanity checks completed - prepare for the switchover
@@ -3801,7 +3810,7 @@ do_standby_switchover(void)
shutdown_command);
clear_node_info_list(&sibling_nodes);
clear_node_info_list(&all_nodes);
key_value_list_free(&remote_config_files);
return;
@@ -4087,9 +4096,25 @@ do_standby_switchover(void)
if (sibling_node_record.type == WITNESS)
{
/* TODO: create "repmgr witness resync" or similar */
appendPQExpBuffer(&remote_command_str,
"witness register -d \\'%s\\' --force 2>/dev/null && echo \"1\" || echo \"0\"",
local_node_record.conninfo);
/*
* Notify the witness repmgrd about the new primary, as at this point it will be assuming
* a failover situation is in place. It will detect the new primary at some point, this
* just speeds up the process.
*
* In the unlikely event repmgrd is not running or not in use, this will have no effect.
*/
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
if (PQstatus(cell->node_info->conn) == CONNECTION_OK)
{
notify_follow_primary(cell->node_info->conn, local_node_record.node_id);
}
PQfinish(cell->node_info->conn);
}
else
{
@@ -4144,8 +4169,6 @@ do_standby_switchover(void)
clear_node_info_list(&sibling_nodes);
PQfinish(local_conn);
/*

View File

@@ -1,2 +1,2 @@
#define REPMGR_VERSION_DATE ""
#define REPMGR_VERSION "4.2dev"
#define REPMGR_VERSION "4.2"

View File

@@ -71,6 +71,8 @@ static void notify_followers(NodeInfoList *standby_nodes, int follow_node_id);
static void check_connection(t_node_info *node_info, PGconn **conn);
static bool check_primary_status(int degraded_monitoring_elapsed);
static bool wait_primary_notification(int *new_primary_id);
static FailoverState follow_new_primary(int new_primary_id);
static FailoverState witness_follow_new_primary(int new_primary_id);
@@ -341,6 +343,13 @@ monitor_streaming_primary(void)
repmgrd_set_pid(local_conn, getpid(), pid_file);
}
/*
* check that the local node is still primary, otherwise switch
* to standby monitoring
*/
if (check_primary_status(-1) == false)
return;
goto loop;
}
@@ -393,6 +402,71 @@ monitor_streaming_primary(void)
{
local_node_info.node_status = NODE_STATUS_UP;
if (check_primary_status(degraded_monitoring_elapsed) == false)
return;
goto loop;
}
}
/*
* possibly attempt to find another node from cached list check if
* there's a new primary - if so add hook for fencing? loop, if
* starts up check status, switch monitoring mode
*/
}
loop:
/* check node is still primary, if not restart monitoring */
if (check_primary_status(-1) == false)
return;
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
if (log_status_interval_elapsed >= config_file_options.log_status_interval)
{
log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
local_node_info.node_name,
local_node_info.node_id,
print_monitoring_state(monitoring_state));
if (monitoring_state == MS_DEGRADED)
{
log_detail(_("waiting for the node to become available"));
}
INSTR_TIME_SET_CURRENT(log_status_interval_start);
}
}
if (got_SIGHUP)
{
handle_sighup(&local_conn, PRIMARY);
}
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
config_file_options.monitor_interval_secs);
sleep(config_file_options.monitor_interval_secs);
}
}
/*
* If monitoring a primary, it's possible that after an outage of the local node
* (due to e.g. a switchover), the node has come back as a standby. We therefore
* need to verify its status and if everything looks OK, restart monitoring in
* standby mode.
*/
bool
check_primary_status(int degraded_monitoring_elapsed)
{
PQExpBufferData event_details;
/* check to see if the node has been restored as a standby */
if (get_recovery_type(local_conn) == RECTYPE_STANDBY)
{
@@ -400,9 +474,18 @@ monitor_streaming_primary(void)
initPQExpBuffer(&event_details);
if (degraded_monitoring_elapsed > 0)
{
appendPQExpBuffer(&event_details,
_("reconnected to node after %i seconds, node is now a standby, switching to standby monitoring"),
degraded_monitoring_elapsed);
}
else
{
appendPQExpBufferStr(&event_details,
_("node is now a standby, switching to standby monitoring"));
}
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
@@ -456,17 +539,28 @@ monitor_streaming_primary(void)
}
if (resume_monitoring == true)
{
initPQExpBuffer(&event_details);
if (degraded_monitoring_elapsed > 0)
{
monitoring_state = MS_NORMAL;
log_notice(_("former primary has been restored as standby after %i seconds, updating node record and resuming monitoring"),
degraded_monitoring_elapsed);
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
_("node restored as standby after %i seconds, monitoring connection to upstream node %i"),
degraded_monitoring_elapsed,
local_node_info.upstream_node_id);
}
else
{
appendPQExpBuffer(&event_details,
_("node has become a standby, monitoring connection to upstream node %i"),
local_node_info.upstream_node_id);
}
create_event_notification(new_primary_conn,
&config_file_options,
@@ -481,12 +575,11 @@ monitor_streaming_primary(void)
close_connection(&new_primary_conn);
/* restart monitoring as standby */
return;
return false;
}
}
else if (record_status == RECORD_NOT_FOUND)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
@@ -511,6 +604,8 @@ monitor_streaming_primary(void)
}
}
else
{
if (degraded_monitoring_elapsed > 0)
{
monitoring_state = MS_NORMAL;
@@ -528,53 +623,14 @@ monitor_streaming_primary(void)
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
goto loop;
}
}
}
/*
* possibly attempt to find another node from cached list check if
* there's a new primary - if so add hook for fencing? loop, if
* starts up check status, switch monitoring mode
*/
}
loop:
/* emit "still alive" log message at regular intervals, if requested */
if (config_file_options.log_status_interval > 0)
{
int log_status_interval_elapsed = calculate_elapsed(log_status_interval_start);
if (log_status_interval_elapsed >= config_file_options.log_status_interval)
{
log_info(_("monitoring primary node \"%s\" (node ID: %i) in %s state"),
local_node_info.node_name,
local_node_info.node_id,
print_monitoring_state(monitoring_state));
if (monitoring_state == MS_DEGRADED)
{
log_detail(_("waiting for the node to become available"));
}
INSTR_TIME_SET_CURRENT(log_status_interval_start);
}
}
if (got_SIGHUP)
{
handle_sighup(&local_conn, PRIMARY);
}
log_verbose(LOG_DEBUG, "sleeping %i seconds (parameter \"monitor_interval_secs\")",
config_file_options.monitor_interval_secs);
sleep(config_file_options.monitor_interval_secs);
}
return true;
}
void
monitor_streaming_standby(void)
{
@@ -595,7 +651,7 @@ monitor_streaming_standby(void)
*/
if (local_node_info.upstream_node_id == UNKNOWN_NODE_ID)
{
local_node_info.upstream_node_id = get_primary_node_id(local_conn);
upstream_conn = get_primary_connection(local_conn, &local_node_info.upstream_node_id, NULL);
/*
* Terminate if there doesn't appear to be an active cluster primary.
@@ -608,8 +664,12 @@ monitor_streaming_standby(void)
log_error(_("unable to determine an active primary for this cluster, terminating"));
terminate(ERR_BAD_CONFIG);
}
}
(void) get_node_record(upstream_conn, local_node_info.upstream_node_id, &upstream_node_info);
}
else
{
record_status = get_node_record(local_conn, local_node_info.upstream_node_id, &upstream_node_info);
/*
@@ -636,6 +696,8 @@ monitor_streaming_standby(void)
log_debug("connecting to upstream node %i: \"%s\"", upstream_node_info.node_id, upstream_node_info.conninfo);
upstream_conn = establish_db_connection(upstream_node_info.conninfo, false);
}
/*
* Upstream node must be running at repmgrd startup.
@@ -652,6 +714,15 @@ monitor_streaming_standby(void)
terminate(ERR_DB_CONN);
}
record_status = get_node_record(upstream_conn, local_node_info.node_id, &local_node_info);
if (upstream_node_info.node_id == local_node_info.node_id)
{
PQfinish(upstream_conn);
upstream_conn = NULL;
return;
}
/*
* refresh upstream node record from upstream node, so it's as up-to-date
* as possible
@@ -682,6 +753,23 @@ monitor_streaming_standby(void)
primary_conn = upstream_conn;
}
/*
* It's possible monitoring has been restarted after some outage which
* resulted in the local node being marked as inactive; if so mark it
* as active again.
*/
if (local_node_info.active == false)
{
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
{
PQExpBufferData event_details;
initPQExpBuffer(&event_details);
local_node_info.active = true;
}
}
primary_node_id = get_primary_node_id(primary_conn);
/* Log startup event */
@@ -766,6 +854,7 @@ monitor_streaming_standby(void)
if (PQstatus(local_conn) != CONNECTION_OK)
{
check_connection(&local_node_info, &local_conn);
log_debug("YYY here");
}
try_reconnect(&upstream_conn, &upstream_node_info);
@@ -778,6 +867,43 @@ monitor_streaming_standby(void)
if (upstream_node_info.type == PRIMARY)
{
primary_conn = upstream_conn;
if (get_recovery_type(primary_conn) == RECTYPE_STANDBY)
{
ExecStatusType ping_result;
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
PQfinish(upstream_conn);
upstream_conn = NULL;
termPQExpBuffer(&event_details);
local_node_info.upstream_node_id = UNKNOWN_NODE_ID;
/* check local connection */
ping_result = connection_ping(local_conn);
if (ping_result != PGRES_TUPLES_OK)
{
int i;
PQfinish(local_conn);
for (i = 0; i < config_file_options.repmgrd_standby_startup_timeout; i++)
{
local_conn = establish_db_connection(local_node_info.conninfo, false);
if (PQstatus(local_conn) == CONNECTION_OK)
break;
log_debug("sleeping 1 second; %i of %i attempts to reconnect to local node",
i + 1,
config_file_options.repmgrd_standby_startup_timeout);
sleep(1);
}
}
return;
}
}
initPQExpBuffer(&event_details);
@@ -1140,6 +1266,7 @@ loop:
check_connection(&local_node_info, &local_conn);
if (PQstatus(local_conn) != CONNECTION_OK)
{
if (local_node_info.active == true)
@@ -1180,11 +1307,37 @@ loop:
}
else
{
int stored_local_node_id = repmgrd_get_local_node_id(local_conn);
/*
* If the local node was restarted, we'll need to reinitialise values
* stored in shared memory.
*/
if (stored_local_node_id == UNKNOWN_NODE_ID)
{
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
}
if (PQstatus(primary_conn) == CONNECTION_OK)
{
if (get_recovery_type(primary_conn) == RECTYPE_STANDBY)
{
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
PQfinish(primary_conn);
primary_conn = NULL;
termPQExpBuffer(&event_details);
local_node_info.upstream_node_id = UNKNOWN_NODE_ID;
return;
}
}
/* we've reconnected to the local node after an outage */
if (local_node_info.active == false)
{
int stored_local_node_id = UNKNOWN_NODE_ID;
if (PQstatus(primary_conn) == CONNECTION_OK)
{
if (update_node_record_set_active(primary_conn, local_node_info.node_id, true) == true)
@@ -1194,7 +1347,6 @@ loop:
initPQExpBuffer(&event_details);
local_node_info.active = true;
appendPQExpBuffer(&event_details,
_("reconnected to local node \"%s\" (ID: %i), marking active"),
local_node_info.node_name,
@@ -1212,18 +1364,6 @@ loop:
termPQExpBuffer(&event_details);
}
}
/*
* If the local node was restarted, we'll need to reinitialise values
* stored in shared memory.
*/
stored_local_node_id = repmgrd_get_local_node_id(local_conn);
if (stored_local_node_id == UNKNOWN_NODE_ID)
{
repmgrd_set_local_node_id(local_conn, config_file_options.node_id);
repmgrd_set_pid(local_conn, getpid(), pid_file);
}
}
}
@@ -1299,12 +1439,13 @@ monitor_streaming_witness(void)
log_warning(_("unable to retrieve node record from primary"));
}
/* Log startup event */
if (startup_event_logged == false)
{
PQExpBufferData event_details;
char *event_type = startup_event_logged == false
? "repmgrd_start"
: "repmgrd_upstream_reconnect";
initPQExpBuffer(&event_details);
appendPQExpBuffer(&event_details,
@@ -1312,17 +1453,18 @@ monitor_streaming_witness(void)
upstream_node_info.node_name,
upstream_node_info.node_id);
log_info("%s", event_details.data);
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_start",
event_type,
true,
event_details.data);
if (startup_event_logged == false)
startup_event_logged = true;
log_info("%s", event_details.data);
termPQExpBuffer(&event_details);
}
@@ -1371,6 +1513,17 @@ monitor_streaming_witness(void)
upstream_node_unreachable_elapsed);
log_notice("%s", event_details.data);
/* check upstream is still primary */
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
{
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
PQfinish(primary_conn);
primary_conn = NULL;
termPQExpBuffer(&event_details);
return;
}
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
@@ -1428,14 +1581,25 @@ monitor_streaming_witness(void)
upstream_node_info.node_id,
degraded_monitoring_elapsed);
log_notice("%s", event_details.data);
/* check upstream is still primary */
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
{
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
PQfinish(primary_conn);
primary_conn = NULL;
termPQExpBuffer(&event_details);
return;
}
create_event_notification(primary_conn,
&config_file_options,
config_file_options.node_id,
"repmgrd_upstream_reconnect",
true,
event_details.data);
log_notice("%s", event_details.data);
termPQExpBuffer(&event_details);
goto loop;
@@ -1467,6 +1631,12 @@ monitor_streaming_witness(void)
continue;
}
/* skip node if configured as a witness node - we can't possibly "follow" that */
if (cell->node_info->type == WITNESS)
{
continue;
}
cell->node_info->conn = establish_db_connection(cell->node_info->conninfo, false);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
@@ -1594,14 +1764,28 @@ loop:
}
/* refresh repmgr.nodes after "witness_sync_interval" seconds */
/*
* Refresh repmgr.nodes after "witness_sync_interval" seconds, and check if primary
* has changed
*/
{
int witness_sync_interval_elapsed = calculate_elapsed(witness_sync_interval_start);
if (witness_sync_interval_elapsed >= config_file_options.witness_sync_interval)
{
if (get_recovery_type(primary_conn) != RECTYPE_PRIMARY)
{
log_notice(_("current upstream node \"%s\" (node ID: %i) is not primary, restarting monitoring"),
upstream_node_info.node_name, upstream_node_info.node_id);
PQfinish(primary_conn);
primary_conn = NULL;
return;
}
log_debug("synchronising witness node records");
witness_copy_node_records(primary_conn, local_conn);
INSTR_TIME_SET_CURRENT(witness_sync_interval_start);
}
}
@@ -3092,6 +3276,8 @@ check_connection(t_node_info *node_info, PGconn **conn)
if (is_server_available(node_info->conninfo) == false)
{
log_warning(_("connection to node %i lost"), node_info->node_id);
PQfinish(*conn);
*conn = NULL;
}
if (PQstatus(*conn) != CONNECTION_OK)

View File

@@ -175,7 +175,6 @@ main(int argc, char **argv)
/* daemon options */
case 'd':
daemonize = true;
break;
@@ -184,7 +183,6 @@ main(int argc, char **argv)
daemonize = parse_bool(optarg, "-d/--daemonize", &cli_errors);
break;
case 'p':
strncpy(pid_file, optarg, MAXPGPATH);
break;