repmgr: make "node check" and "node status" return ERR_NODE_STATUS when appropriate

If any issue is detected (and "node check" is not being executed with a specific
individual check), "ERR_NODE_STATUS" is returned.
This commit is contained in:
Ian Barwick
2018-07-05 14:31:06 +09:00
parent 92d0e6809b
commit ae60caacdd
6 changed files with 147 additions and 26 deletions

View File

@@ -4,6 +4,8 @@
repmgr: fix "standby register --wait-sync" when no timeout provided (Ian) repmgr: fix "standby register --wait-sync" when no timeout provided (Ian)
repmgr: "cluster show" returns non-zero value if an issue encountered; repmgr: "cluster show" returns non-zero value if an issue encountered;
GitHub #456 (Ian) GitHub #456 (Ian)
repmgr: "node check" and "node status" return non-zero value if an issue
encountered (Ian)
repmgr: "node status" returns non-zero value if an issue encountered (Ian) repmgr: "node status" returns non-zero value if an issue encountered (Ian)
repmgrd: create a PID file by default; GitHub #457 (Ian) repmgrd: create a PID file by default; GitHub #457 (Ian)
repmgrd: daemonize process by default; GitHub #458 (Ian) repmgrd: daemonize process by default; GitHub #458 (Ian)

View File

@@ -34,7 +34,7 @@
</para> </para>
<sect2> <sect2>
<title>repmgrd enhancements</title> <title>repmgr enhancements</title>
<para> <para>
<itemizedlist> <itemizedlist>
@@ -48,10 +48,20 @@
<listitem> <listitem>
<para> <para>
<command><link linkend="repmgr-cluster-show">repmgr cluster-show</link></command> <command><link linkend="repmgr-cluster-show">repmgr cluster-show</link></command>,
returns non-zero exit code if node status issues detected (GitHub #456). <command><link linkend="repmgr-node-check">repmgr node check</link></command> and
<command><link linkend="repmgr-node-status">repmgr node status</link></command>
return non-zero exit code if node status issues detected (GitHub #456).
</para> </para>
</listitem> </listitem>
</itemizedlist>
</para>
</sect2>
<sect2>
<title>repmgrd enhancements</title>
<para>
<itemizedlist>
<listitem> <listitem>
<para> <para>

View File

@@ -142,4 +142,11 @@
</variablelist> </variablelist>
</refsect1> </refsect1>
<refsect1>
<title>See also</title>
<para>
<xref linkend="repmgr-node-status">, <xref linkend="repmgr-node-check">
</para>
</refsect1>
</refentry> </refentry>

View File

@@ -61,7 +61,9 @@
<listitem> <listitem>
<simpara> <simpara>
<literal>--archive-ready</literal>: checks for WAL files which have not yet been archived <literal>--archive-ready</literal>: checks for WAL files which have not yet been archived,
and returns <literal>WARNING</literal> or <literal>CRITICAL</literal> if the number
exceeds <varname>archive_ready_warning</varname> or <varname>archive_ready_critical</varname> respectively.
</simpara> </simpara>
</listitem> </listitem>
@@ -107,4 +109,80 @@
</itemizedlist> </itemizedlist>
</para> </para>
</refsect1> </refsect1>
<refsect1>
<title>Exit codes</title>
<para>
When executing <command>repmgr node check</command> with one of the individual
checks listed above, &repmgr; will emit one of the following Nagios-style exit codes
(even if <literal>--nagios</literal> is not supplied):
<itemizedlist spacing="compact" mark="bullet">
<listitem>
<simpara>
<literal>0</literal>: OK
</simpara>
</listitem>
<listitem>
<simpara>
<literal>1</literal>: WARNING
</simpara>
</listitem>
<listitem>
<simpara>
<literal>2</literal>: CRITICAL
</simpara>
</listitem>
<listitem>
<simpara>
<literal>3</literal>: UNKNOWN
</simpara>
</listitem>
</itemizedlist>
</para>
<para>
The following exit codes can be emitted by <command>repmgr node check</command>
if no individual check was specified.
</para>
<variablelist>
<varlistentry>
<term><option>SUCCESS (0)</option></term>
<listitem>
<para>
No issues were detected.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>ERR_NODE_STATUS (25)</option></term>
<listitem>
<para>
One or more issues were detected.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>See also</title>
<para>
<xref linkend="repmgr-node-status">, <xref linkend="repmgr-cluster-show">
</para>
</refsect1>
</refentry> </refentry>

View File

@@ -84,7 +84,8 @@
<refsect1> <refsect1>
<title>See also</title> <title>See also</title>
<para> <para>
See <xref linkend="repmgr-node-check"> to diagnose issues. See <xref linkend="repmgr-node-check"> to diagnose issues and <xref linkend="repmgr-cluster-show">
for an overview of all nodes in the cluster.
</para> </para>
</refsect1> </refsect1>
</refentry> </refentry>

View File

@@ -170,11 +170,17 @@ do_node_status(void)
} }
else else
{ {
/* "archive_mode" is not "off", i.e. one of "on", "always" */
bool enabled = true; bool enabled = true;
PQExpBufferData archiving_status; PQExpBufferData archiving_status;
char archive_command[MAXLEN] = ""; char archive_command[MAXLEN] = "";
initPQExpBuffer(&archiving_status); initPQExpBuffer(&archiving_status);
/*
* if the node is a standby, and "archive_mode" is "on", archiving will
* actually be disabled.
*/
if (recovery_type == RECTYPE_STANDBY) if (recovery_type == RECTYPE_STANDBY)
{ {
if (guc_set(conn, "archive_mode", "=", "on")) if (guc_set(conn, "archive_mode", "=", "on"))
@@ -642,6 +648,7 @@ do_node_check(void)
CheckStatusList status_list = {NULL, NULL}; CheckStatusList status_list = {NULL, NULL};
CheckStatusListCell *cell = NULL; CheckStatusListCell *cell = NULL;
bool issue_detected = false;
/* for internal use */ /* for internal use */
if (runtime_options.has_passfile == true) if (runtime_options.has_passfile == true)
@@ -750,12 +757,23 @@ do_node_check(void)
initPQExpBuffer(&output); initPQExpBuffer(&output);
/* order functions are called is also output order */ /* order functions are called is also output order */
(void) do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list); if (do_node_check_role(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
(void) do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list); issue_detected = true;
(void) do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list);
(void) do_node_check_downstream(conn, runtime_options.output_mode, &status_list); if (do_node_check_replication_lag(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
(void) do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list); issue_detected = true;
(void) do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list);
if (do_node_check_archive_ready(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK)
issue_detected = true;
if (do_node_check_downstream(conn, runtime_options.output_mode, &status_list) != CHECK_STATUS_OK)
issue_detected = true;
if (do_node_check_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
issue_detected = true;
if (do_node_check_missing_slots(conn, runtime_options.output_mode, &node_info, &status_list) != CHECK_STATUS_OK)
issue_detected = true;
if (runtime_options.output_mode == OM_CSV) if (runtime_options.output_mode == OM_CSV)
{ {
@@ -812,6 +830,11 @@ do_node_check(void)
check_status_list_free(&status_list); check_status_list_free(&status_list);
PQfinish(conn); PQfinish(conn);
if (issue_detected == true)
{
exit(ERR_NODE_STATUS);
}
} }