daemon (start|stop): verify that repmgrd starts/stops.

Note this may not always be possible for "daemon stop" if we are unable
to determine the repmgrd PID.
This commit is contained in:
Ian Barwick
2019-01-30 14:36:52 +09:00
parent 70e4243a1d
commit d7420d7274
8 changed files with 249 additions and 19 deletions

View File

@@ -23,7 +23,11 @@
This command starts the <application>repmgrd</application> daemon on the
local node.
</para>
<para>
By default, &repmgr; will wait for up to 15 seconds to confirm that <application>repmgrd</application>
started. This behaviour can be overridden by specifying a different value using the <option>--wait</option>
option, or disabled altogether with the <option>--no-wait</option> option.
</para>
</refsect1>
<refsect1>
@@ -65,6 +69,33 @@
</listitem>
</varlistentry>
<varlistentry>
<term><option>--wait</option></term>
<listitem>
<para>
Wait for the specified number of seconds to confirm that <application>repmgrd</application>
started successfully.
</para>
<para>
Note that providing <option>--wait=0</option> is the equivalent of <option>--no-wait</option>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--no-wait</option></term>
<listitem>
<para>
Don't wait to confirm that <application>repmgrd</application>
started successfully.
</para>
<para>
This is equivalent to providing <option>--wait=0</option>.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
@@ -79,7 +110,12 @@
<term><option>SUCCESS (0)</option></term>
<listitem>
<para>
<application>repmgrd</application> could be started.
The <application>repmgrd</application> start command (defined in
<varname>repmgrd_service_start_command</varname>) was successfully executed.
</para>
<para>
Unless the <option>--no-wait</option> option (or <option>--wait=0</option>) was provided,
&repmgr; will also confirm that <application>repmgrd</application> has actually started up.
</para>
</listitem>
</varlistentry>
@@ -94,11 +130,33 @@
</listitem>
</varlistentry>
<varlistentry>
<term><option>ERR_DB_CONN (6)</option></term>
<listitem>
<para>
&repmgr; was unable to connect to the local PostgreSQL node.
</para>
<para>
PostgreSQL must be running before <application>repmgrd</application>
can be started. Additionally, unless the <option>--no-wait</option> option was
provided, &repmgr; needs to be able to connect to the local PostgreSQL node
to determine the state of <application>repmgrd</application>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>ERR_REPMGRD_SERVICE (27)</option></term>
<listitem>
<para>
<application>repmgrd</application> could not be started.
The <application>repmgrd</application> start command (defined in
<varname>repmgrd_service_start_command</varname>) was not successfully executed.
</para>
<para>
This can also mean that &repmgr; was unable to confirm whether <application>repmgrd</application>
successfully started (unless the <option>--no-wait</option> option was provided).
</para>
</listitem>
</varlistentry>
@@ -109,7 +167,7 @@
<refsect1>
<title>See also</title>
<para>
<xref linkend="repmgr-daemon-stop">, <xref linkend="repmgr-daemon-status">
<xref linkend="repmgr-daemon-stop">, <xref linkend="repmgr-daemon-status">, <xref linkend="repmgrd-daemon">
</para>
</refsect1>

View File

@@ -24,6 +24,18 @@
local node.
</para>
<para>
By default, &repmgr; will wait for up to 15 seconds to confirm that <application>repmgrd</application>
stopped. This behaviour can be overridden by specifying a different value using the <option>--wait</option>
option, or disabled altogether with the <option>--no-wait</option> option.
</para>
<note>
<para>
If PostgreSQL is not running on the local node, under some circumstances &repmgr; may not
be able to confirm if <application>repmgrd</application> has actually stopped.
</para>
</note>
</refsect1>
<refsect1>
@@ -65,6 +77,32 @@
</listitem>
</varlistentry>
<varlistentry>
<term><option>-w/--wait</option></term>
<listitem>
<para>
Wait for the specified number of seconds to confirm that <application>repmgrd</application>
stopped successfully.
</para>
<para>
Note that providing <option>--wait=0</option> is the equivalent of <option>--no-wait</option>.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>--no-wait</option></term>
<listitem>
<para>
Don't wait to confirm that <application>repmgrd</application>
stopped successfully.
</para>
<para>
This is equivalent to providing <option>--wait=0</option>.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@@ -348,7 +348,7 @@
</sect1>
<sect1 id="repmgrd-daemon">
<sect1 id="repmgrd-daemon" xreflabel="repmgrd daemon">
<indexterm>
<primary>repmgrd</primary>
<secondary>starting and stopping</secondary>

View File

@@ -25,7 +25,9 @@
#include "repmgr-client-global.h"
#include "repmgr-action-daemon.h"
/*
 * Default number of seconds "daemon start" / "daemon stop" will wait to
 * confirm that repmgrd started/stopped; used as the timeout when no
 * explicit --wait value was provided.
 */
#define REPMGR_DAEMON_STOP_START_WAIT 15
/*
 * Hints emitted when the start/stop outcome is not (or could not be)
 * confirmed, e.g. when waiting is disabled or the wait timed out.
 */
#define REPMGR_DAEMON_STATUS_START_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully started")
#define REPMGR_DAEMON_STATUS_STOP_HINT _("use \"repmgr daemon status\" to confirm that repmgrd was successfully stopped")
/*
* Possibly also show:
@@ -410,7 +412,7 @@ do_daemon_start(void)
/* TODO: if PostgreSQL is not available, have repmgrd loop and retry connection */
log_error(_("unable to connect to local node"));
log_detail(_("PostgreSQL must be running before \"repmgrd\" can be started"));
exit(ERR_REPMGRD_SERVICE);
exit(ERR_DB_CONN);
}
/*
@@ -421,7 +423,17 @@ do_daemon_start(void)
if (is_repmgrd_running(conn) == true)
{
pid_t pid = UNKNOWN_PID;
log_error(_("repmgrd appears to be running already"));
pid = repmgrd_get_pid(conn);
if (pid != UNKNOWN_PID)
log_detail(_("repmgrd PID is %i"), pid);
else
log_warning(_("unable to determine repmgrd PID"));
PQfinish(conn);
exit(ERR_REPMGRD_SERVICE);
}
@@ -457,6 +469,52 @@ do_daemon_start(void)
}
termPQExpBuffer(&output_buf);
if (runtime_options.no_wait == true || runtime_options.wait == 0)
{
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
}
else
{
int i = 0;
int timeout = REPMGR_DAEMON_STOP_START_WAIT;
if (runtime_options.wait_provided)
timeout = runtime_options.wait;
conn = establish_db_connection(config_file_options.conninfo, false);
if (PQstatus(conn) != CONNECTION_OK)
{
log_notice(_("unable to connect to local node"));
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
exit(ERR_DB_CONN);
}
for (;;)
{
if (is_repmgrd_running(conn) == true)
{
log_notice(_("repmgrd was successfully started"));
PQfinish(conn);
break;
}
if (i == timeout)
{
PQfinish(conn);
log_error(_("repmgrd does not appear to have started after %i seconds"),
timeout);
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
exit(ERR_REPMGRD_SERVICE);
}
log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd is running",
i, runtime_options.wait);
sleep(1);
i++;
}
}
}
@@ -468,7 +526,7 @@ void do_daemon_stop(void)
bool success;
pid_t pid = UNKNOWN_PID;
if (config_file_options.repmgrd_service_start_command[0] == '\0')
if (config_file_options.repmgrd_service_stop_command[0] == '\0')
{
log_error(_("\"repmgrd_service_stop_command\" is not set"));
log_hint(_("set \"repmgrd_service_stop_command\" in \"repmgr.conf\""));
@@ -485,6 +543,9 @@ void do_daemon_stop(void)
if (PQstatus(conn) != CONNECTION_OK)
{
/*
* a PostgreSQL connection is not required to stop repmgrd,
*/
log_warning(_("unable to connect to local node"));
}
else
@@ -534,6 +595,66 @@ void do_daemon_stop(void)
}
termPQExpBuffer(&output_buf);
if (runtime_options.no_wait == true || runtime_options.wait == 0)
{
log_hint(REPMGR_DAEMON_STATUS_STOP_HINT);
}
else
{
int i = 0;
int timeout = REPMGR_DAEMON_STOP_START_WAIT;
/*
*
*/
if (pid == UNKNOWN_PID)
{
/*
* XXX attempt to get pidfile from config
* and get contents
* ( see check_and_create_pid_file() )
* if PID still unknown, exit here
*/
log_warning(_("unable to determine repmgrd PID"));
log_hint(REPMGR_DAEMON_STATUS_STOP_HINT);
exit(ERR_REPMGRD_SERVICE);
}
if (runtime_options.wait_provided)
timeout = runtime_options.wait;
for (;;)
{
if (kill(pid, 0) == -1)
{
if (errno == ESRCH)
{
log_notice(_("repmgrd was successfully stopped"));
exit(SUCCESS);
}
else
{
log_error(_("unable to determine status of process with PID %i"), pid);
log_detail("%s", strerror(errno));
exit(ERR_REPMGRD_SERVICE);
}
}
if (i == timeout)
{
log_error(_("repmgrd does not appear to have stopped after %i seconds"),
timeout);
log_hint(REPMGR_DAEMON_STATUS_START_HINT);
exit(ERR_REPMGRD_SERVICE);
}
log_debug("sleeping 1 second; %i of %i attempts to determine if repmgrd with PID %i is running",
i, timeout, pid);
sleep(1);
i++;
}
}
}
@@ -559,16 +680,20 @@ void do_daemon_help(void)
printf(_("DAEMON START\n"));
puts("");
printf(_(" \"daemon start\" attempts to start repmgrd"));
printf(_(" \"daemon start\" attempts to start repmgrd\n"));
puts("");
printf(_(" --dry-run check prerequisites but don't start repmgrd\n"));
printf(_(" -w/--wait wait for repmgrd to start (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT);
printf(_(" --no-wait don't wait for repmgrd to start\n"));
puts("");
printf(_("DAEMON STOP\n"));
puts("");
printf(_(" \"daemon stop\" attempts to stop repmgrd"));
printf(_(" \"daemon stop\" attempts to stop repmgrd\n"));
puts("");
printf(_(" --dry-run check prerequisites but don't stop repmgrd\n"));
printf(_(" -w/--wait wait for repmgrd to stop (default: %i seconds)\n"), REPMGR_DAEMON_STOP_START_WAIT);
printf(_(" --no-wait don't wait for repmgrd to stop\n"));
puts("");
printf(_("DAEMON PAUSE\n"));

View File

@@ -2298,7 +2298,7 @@ do_standby_follow(void)
NULL);
}
if (PQstatus(follow_target_conn) == CONNECTION_OK || runtime_options.wait == false)
if (PQstatus(follow_target_conn) == CONNECTION_OK || runtime_options.wait_provided == false)
{
break;
}
@@ -2317,7 +2317,7 @@ do_standby_follow(void)
log_error(_("unable to connect to target node %i"), follow_target_node_id);
}
if (runtime_options.wait == true)
if (runtime_options.wait_provided == true)
{
if (follow_target_node_id == UNKNOWN_NODE_ID)
{

View File

@@ -35,13 +35,14 @@ typedef struct
bool connection_param_provided;
bool host_param_provided;
bool limit_provided;
bool wait_provided;
/* general configuration options */
char config_file[MAXPGPATH];
bool dry_run;
bool force;
char pg_bindir[MAXLEN]; /* overrides setting in repmgr.conf */
bool wait;
int wait;
bool no_wait;
/* logging options */
@@ -137,9 +138,9 @@ typedef struct
#define T_RUNTIME_OPTIONS_INITIALIZER { \
/* configuration metadata */ \
false, false, false, false, \
false, false, false, false, false, \
/* general configuration options */ \
"", false, false, "", false, false, \
"", false, false, "", -1, false, \
/* logging options */ \
"", false, false, false, false, \
/* output options */ \

View File

@@ -254,7 +254,11 @@ main(int argc, char **argv)
/* -w/--wait */
case 'w':
runtime_options.wait = true;
runtime_options.wait_provided = true;
if (optarg != NULL)
{
runtime_options.wait = repmgr_atoi(optarg, "--wait", &cli_errors, 0);
}
break;
/* -W/--no-wait */
@@ -1713,17 +1717,19 @@ check_cli_parameters(const int action)
/* --wait/--no-wait */
if (runtime_options.wait == true && runtime_options.no_wait == true)
if (runtime_options.wait_provided == true && runtime_options.no_wait == true)
{
item_list_append_format(&cli_errors,
_("both --wait and --no-wait options provided"));
}
else
{
if (runtime_options.wait)
if (runtime_options.wait_provided)
{
switch (action)
{
case DAEMON_START:
case DAEMON_STOP:
case STANDBY_FOLLOW:
break;
default:
@@ -1736,6 +1742,8 @@ check_cli_parameters(const int action)
{
switch (action)
{
case DAEMON_START:
case DAEMON_STOP:
case NODE_REJOIN:
break;
default:

View File

@@ -116,7 +116,7 @@ static struct option long_options[] =
{"dry-run", no_argument, NULL, OPT_DRY_RUN},
{"force", no_argument, NULL, 'F'},
{"pg_bindir", required_argument, NULL, 'b'},
{"wait", no_argument, NULL, 'w'},
{"wait", optional_argument, NULL, 'w'},
{"no-wait", no_argument, NULL, 'W'},
/* connection options */