repmgrd: activate inactive node record at startup

If a PostgreSQL instance was shut down while repmgrd was running, and
repmgrd was subsequently restarted (this chain of events could occur
during e.g. a server reboot), the node record will have been set to
"inactive". Previously, in this case repmgrd would refuse to start up.
However, as we can determine the node is running, it should normally
be no problem to automatically set the node record to "active".

The old behaviour can be restored by setting the new parameter
"repmgrd_exit_on_inactive_node" to "true".

RM19604.
This commit is contained in:
Ian Barwick
2021-07-12 17:38:40 +09:00
parent f64f498afb
commit 79d1f005db
10 changed files with 90 additions and 3 deletions

View File

@@ -1,6 +1,8 @@
5.3.0 2021-??-??
repmgrd: prefix all shared library functions with "repmgr_" to
minimize the risk of clashes with other shared libraries (Ian)
repmgrd: at startup, if node record is marked as "inactive", attempt
to set it to "active" (Ian)
5.2.2. 2021-??-??
standby clone: set "slot_name" in node record if required (Ian)

View File

@@ -581,6 +581,16 @@ struct ConfigFileSetting config_file_settings[] =
{ .strmaxlen = sizeof(config_file_options.repmgrd_pid_file) },
{ .postprocess_func = &repmgr_canonicalize_path }
},
/* repmgrd_exit_on_inactive_node */
{
"repmgrd_exit_on_inactive_node",
CONFIG_BOOL,
{ .boolptr = &config_file_options.repmgrd_exit_on_inactive_node},
{ .booldefault = DEFAULT_REPMGRD_EXIT_ON_INACTIVE_NODE },
{},
{},
{}
},
/* standby_disconnect_on_failover */
{
"standby_disconnect_on_failover",

View File

@@ -206,6 +206,7 @@ typedef struct
int primary_notification_timeout;
int repmgrd_standby_startup_timeout;
char repmgrd_pid_file[MAXPGPATH];
bool repmgrd_exit_on_inactive_node;
bool standby_disconnect_on_failover;
int sibling_nodes_disconnect_timeout;
ConnectionCheckType connection_check_type;

View File

@@ -26,6 +26,23 @@
This release provides support for <ulink url="https://www.postgresql.org/docs/14/release-14.html">PostgreSQL 14</ulink>,
to be released later in 2021.
</para>
<sect2>
<title>Improvements</title>
<para>
<itemizedlist>
<listitem>
<para>
&repmgrd;: at startup, if node record is marked as "inactive", attempt
to set it to "active".
</para>
<para>
This behaviour can be overridden by setting the configuration parameter
<varname>repmgrd_exit_on_inactive_node</varname> to <literal>true</literal>.
</para>
</listitem>
</itemizedlist>
</para>
</sect2>
<sect2>
<title>Bug fixes</title>

View File

@@ -485,6 +485,32 @@
</listitem>
</varlistentry>
<varlistentry>
<term><option>repmgrd_exit_on_inactive_node</option></term>
<listitem>
<indexterm>
<primary>repmgrd_exit_on_inactive_node</primary>
</indexterm>
<para>
This parameter is available in &repmgr; 5.3 and later.
</para>
<para>
If a node was marked as inactive but is running, and this option is set to
<literal>true</literal>, &repmgrd; will abort on startup.
</para>
<para>
By default, <option>repmgrd_exit_on_inactive_node</option> is set
to <literal>false</literal>, in which case &repmgrd; will set the
node record to active on startup.
</para>
<para>
Setting this parameter to <literal>true</literal> causes &repmgrd;
to behave in the same way it did in &repmgr; 5.2 and earlier.
</para>
</listitem>
</varlistentry>
</variablelist>
<para>

View File

@@ -337,6 +337,7 @@ ssh_options='-q -o ConnectTimeout=10' # Options to append to "ssh"
# "--no-pid-file" will force PID file creation to be skipped.
# Note: there is normally no need to set this, particularly if
# repmgr was installed from packages.
#repmgrd_exit_on_inactive_node=false # If "true", and the node record is marked as "inactive", abort repmgrd startup
#standby_disconnect_on_failover=false # If "true", in a failover situation wait for all standbys to
# disconnect their WAL receivers before electing a new primary
# (PostgreSQL 9.5 and later only; repmgr user must be a superuser for this)

View File

@@ -135,6 +135,7 @@
#define DEFAULT_ASYNC_QUERY_TIMEOUT 60 /* seconds */
#define DEFAULT_PRIMARY_NOTIFICATION_TIMEOUT 60 /* seconds */
#define DEFAULT_REPMGRD_STANDBY_STARTUP_TIMEOUT -1 /*seconds */
#define DEFAULT_REPMGRD_EXIT_ON_INACTIVE_NODE false,
#define DEFAULT_STANDBY_DISCONNECT_ON_FAILOVER false
#define DEFAULT_SIBLING_NODES_DISCONNECT_TIMEOUT 30 /* seconds */
#define DEFAULT_CONNECTION_CHECK_TYPE CHECK_PING

View File

@@ -169,7 +169,7 @@ handle_sigint_physical(SIGNAL_ARGS)
/* perform some sanity checks on the node's configuration */
void
do_physical_node_check(void)
do_physical_node_check(PGconn *conn)
{
/*
* Check if node record is active - if not, and `failover=automatic`, the
@@ -186,8 +186,37 @@ do_physical_node_check(void)
{
char *hint = "Check that \"repmgr (primary|standby) register\" was executed for this node";
/*
* Attempt to set node record active (unless explicitly configured not to)
*/
if (config_file_options.repmgrd_exit_on_inactive_node == false)
{
PGconn *primary_conn = get_primary_connection_quiet(conn, NULL, NULL);
bool success = true;
if (PQstatus(primary_conn) != CONNECTION_OK)
{
success = false;
}
else
{
log_notice(_("setting node record for node \"%s\" (ID: %i) to \"active\""),
local_node_info.node_name,
local_node_info.node_id);
success = update_node_record_set_active(primary_conn, local_node_info.node_id, true);
PQfinish(primary_conn);
}
if (success == true)
{
local_node_info.active = true;
return;
}
}
switch (config_file_options.failover)
{
/* "failover" is an enum, all values should be covered here */
case FAILOVER_AUTOMATIC:

View File

@@ -19,7 +19,7 @@
#ifndef _REPMGRD_PHYSICAL_H_
#define _REPMGRD_PHYSICAL_H_
void do_physical_node_check(void);
void do_physical_node_check(PGconn *conn);
void monitor_streaming_primary(void);
void monitor_streaming_standby(void);

View File

@@ -512,7 +512,7 @@ main(int argc, char **argv)
log_debug("node id is %i, upstream node id is %i",
local_node_info.node_id,
local_node_info.upstream_node_id);
do_physical_node_check();
do_physical_node_check(local_conn);
}
if (daemonize == true)