daemon status: add column "upstream last seen"

This displays the interval (in seconds) since the repmgrd instance on
each node last confirmed its upstream node is available.
This commit is contained in:
Ian Barwick
2019-02-23 12:16:34 +09:00
parent 71d151ca87
commit 07097575b1
6 changed files with 91 additions and 31 deletions

View File

@@ -21,6 +21,7 @@
repmgr: fix long node ID display in "cluster show" (Ian) repmgr: fix long node ID display in "cluster show" (Ian)
repmgr: check for primary server before executing "witness register"; repmgr: check for primary server before executing "witness register";
GitHub #538 (Ian) GitHub #538 (Ian)
repmgr: show "upstream last seen" interval in "daemon status" output (Ian)
repmgr: "node check" will only consider physical replication slots (Ian) repmgr: "node check" will only consider physical replication slots (Ian)
repmgrd: check binary and extension major versions match; GitHub #515 (Ian) repmgrd: check binary and extension major versions match; GitHub #515 (Ian)
repmgrd: on a cascaded standby, don't fail over if "failover=manual"; repmgrd: on a cascaded standby, don't fail over if "failover=manual";

View File

@@ -5086,7 +5086,10 @@ get_primary_last_seen(PGconn *conn)
initPQExpBuffer(&query); initPQExpBuffer(&query);
appendPQExpBufferStr(&query, appendPQExpBufferStr(&query,
"SELECT repmgr.get_primary_last_seen()"); "SELECT CASE WHEN pg_catalog.pg_is_in_recovery() IS FALSE "
" THEN -1 "
" ELSE repmgr.get_primary_last_seen() "
" END AS primary_last_seen ");
res = PQexec(conn, query.data); res = PQexec(conn, query.data);

View File

@@ -354,6 +354,8 @@ typedef struct RepmgrdInfo {
char repmgrd_running[MAXLEN]; char repmgrd_running[MAXLEN];
bool paused; bool paused;
bool wal_paused_pending_wal; bool wal_paused_pending_wal;
int upstream_last_seen;
char upstream_last_seen_text[MAXLEN];
} RepmgrdInfo; } RepmgrdInfo;

View File

@@ -79,6 +79,14 @@ REPMGRD_OPTS="--daemonize=false"</programlisting>
</note> </note>
</listitem> </listitem>
<listitem>
<para>
<link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>
displays the interval (in seconds) since the <application>repmgrd</application> instance
last verified its upstream node was available.
</para>
</listitem>
<listitem> <listitem>
<para> <para>
Add <option>--compact</option> option to <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command> (GitHub #521). Add <option>--compact</option> option to <command><link linkend="repmgr-cluster-show">repmgr cluster show</link></command> (GitHub #521).

View File

@@ -49,33 +49,34 @@
<para> <para>
<application>repmgrd</application> running normally on all nodes: <application>repmgrd</application> running normally on all nodes:
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
ID | Name | Role | Priority | Status | repmgrd | PID | Paused? ID | Name | Role | Priority | Status | repmgrd | PID | Paused? | Upstream last seen
----+-------+---------+----------+---------+---------+------+--------- ----+-------+---------+----------+---------+---------+-------+---------+--------------------
1 | node1 | primary | 100 | running | running | 5722 | no 1 | node1 | primary | 100 | running | running | 71987 | no | n/a
2 | node2 | standby | 100 | running | running | 5731 | no 2 | node2 | standby | 100 | running | running | 71996 | no | 1 second(s) ago
3 | node3 | standby | 100 | running | running | 5779 | no</programlisting> 3 | node3 | standby | 100 | running | running | 72042 | no | 1 second(s) ago
</programlisting>
</para> </para>
<para> <para>
<application>repmgrd</application> paused on all nodes (using <xref linkend="repmgr-daemon-pause">): <application>repmgrd</application> paused on all nodes (using <xref linkend="repmgr-daemon-pause">):
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
ID | Name | Role | Priority | Status | repmgrd | PID | Paused? ID | Name | Role | Priority | Status | repmgrd | PID | Paused? | Upstream last seen
----+-------+---------+----------+---------+---------+------+--------- ----+-------+---------+----------+---------+---------+-------+---------+--------------------
1 | node1 | primary | 100 | running | running | 5722 | yes 1 | node1 | primary | 100 | running | running | 71987 | yes | n/a
2 | node2 | standby | 100 | running | running | 5731 | yes 2 | node2 | standby | 100 | running | running | 71996 | yes | 0 second(s) ago
3 | node3 | standby | 100 | running | running | 5779 | yes</programlisting> 3 | node3 | standby | 100 | running | running | 72042 | yes | 0 second(s) ago
</programlisting>
</para> </para>
<para> <para>
<application>repmgrd</application> not running on one node: <application>repmgrd</application> not running on one node:
<programlisting>$ repmgr -f /etc/repmgr.conf daemon status <programlisting>$ repmgr -f /etc/repmgr.conf daemon status
ID | Name | Role | Priority | Status | repmgrd | PID | Paused? ID | Name | Role | Priority | Status | repmgrd | PID | Paused? | Upstream last seen
----+-------+---------+----------+---------+-------------+------+--------- ----+-------+---------+----------+---------+-------------+-------+---------+--------------------
1 | node1 | primary | 100 | running | running | 5722 | yes 1 | node1 | primary | 100 | running | running | 71987 | yes | n/a
2 | node2 | standby | 100 | running | not running | n/a | n/a 2 | node2 | standby | 100 | running | not running | n/a | n/a | n/a
3 | node3 | standby | 100 | running | running | 5779 | yes</programlisting> 3 | node3 | standby | 100 | running | running | 72042 | yes | 0 second(s) ago</programlisting>
</para> </para>
</refsect1> </refsect1>
<refsect1> <refsect1>
@@ -92,9 +93,9 @@
parsing by scripts, e.g.: parsing by scripts, e.g.:
<programlisting> <programlisting>
$ repmgr -f /etc/repmgr.conf daemon status --csv $ repmgr -f /etc/repmgr.conf daemon status --csv
1,node1,primary,1,1,5722,1,100 1,node1,primary,1,1,5722,1,100,-1
2,node2,standby,1,0,-1,1,100 2,node2,standby,1,0,-1,1,100,-1
3,node3,standby,1,1,5779,1,100</programlisting> 3,node3,standby,1,1,5779,1,100,1</programlisting>
</para> </para>
<para> <para>
The columns have the following meanings: The columns have the following meanings:
@@ -141,11 +142,17 @@
</simpara> </simpara>
</listitem> </listitem>
<listitem> <listitem>
<simpara> <simpara>
<application>repmgrd</application> priority <application>repmgrd</application> node priority
</simpara> </simpara>
</listitem> </listitem>
<listitem>
<simpara>
interval in seconds since the node's upstream was last seen (-1 if not applicable, e.g. on the primary node or when repmgrd is not running)
</simpara>
</listitem>
</itemizedlist> </itemizedlist>
</para> </para>

View File

@@ -47,10 +47,11 @@ typedef enum
STATUS_PG, STATUS_PG,
STATUS_RUNNING, STATUS_RUNNING,
STATUS_PID, STATUS_PID,
STATUS_PAUSED STATUS_PAUSED,
STATUS_UPSTREAM_LAST_SEEN
} StatusHeader; } StatusHeader;
#define STATUS_HEADER_COUNT 8 #define STATUS_HEADER_COUNT 9
struct ColHeader headers_status[STATUS_HEADER_COUNT]; struct ColHeader headers_status[STATUS_HEADER_COUNT];
@@ -101,6 +102,12 @@ do_daemon_status(void)
strncpy(headers_status[STATUS_PID].title, _("PID"), MAXLEN); strncpy(headers_status[STATUS_PID].title, _("PID"), MAXLEN);
strncpy(headers_status[STATUS_PAUSED].title, _("Paused?"), MAXLEN); strncpy(headers_status[STATUS_PAUSED].title, _("Paused?"), MAXLEN);
if (runtime_options.compact == true)
strncpy(headers_status[STATUS_UPSTREAM_LAST_SEEN].title, _("Upstr. last"), MAXLEN);
else
strncpy(headers_status[STATUS_UPSTREAM_LAST_SEEN].title, _("Upstream last seen"), MAXLEN);
for (i = 0; i < STATUS_HEADER_COUNT; i++) for (i = 0; i < STATUS_HEADER_COUNT; i++)
{ {
headers_status[i].max_length = strlen(headers_status[i].title); headers_status[i].max_length = strlen(headers_status[i].title);
@@ -122,6 +129,7 @@ do_daemon_status(void)
repmgrd_info[i]->running = false; repmgrd_info[i]->running = false;
repmgrd_info[i]->pg_running = true; repmgrd_info[i]->pg_running = true;
repmgrd_info[i]->wal_paused_pending_wal = false; repmgrd_info[i]->wal_paused_pending_wal = false;
repmgrd_info[i]->upstream_last_seen = -1;
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo); cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
@@ -193,6 +201,24 @@ do_daemon_status(void)
} }
} }
repmgrd_info[i]->upstream_last_seen = get_primary_last_seen(cell->node_info->conn);
if (repmgrd_info[i]->upstream_last_seen < 0)
{
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, "%s", _("n/a"));
}
else
{
if (runtime_options.compact == true)
{
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, _("%i sec(s) ago"), repmgrd_info[i]->upstream_last_seen);
}
else
{
maxlen_snprintf(repmgrd_info[i]->upstream_last_seen_text, _("%i second(s) ago"), repmgrd_info[i]->upstream_last_seen);
}
}
PQfinish(cell->node_info->conn); PQfinish(cell->node_info->conn);
} }
@@ -209,6 +235,8 @@ do_daemon_status(void)
headers_status[STATUS_RUNNING].cur_length = strlen(repmgrd_info[i]->repmgrd_running); headers_status[STATUS_RUNNING].cur_length = strlen(repmgrd_info[i]->repmgrd_running);
headers_status[STATUS_PG].cur_length = strlen(repmgrd_info[i]->pg_running_text); headers_status[STATUS_PG].cur_length = strlen(repmgrd_info[i]->pg_running_text);
headers_status[STATUS_UPSTREAM_LAST_SEEN].cur_length = strlen(repmgrd_info[i]->upstream_last_seen_text);
for (j = 0; j < STATUS_HEADER_COUNT; j++) for (j = 0; j < STATUS_HEADER_COUNT; j++)
{ {
if (headers_status[j].cur_length > headers_status[j].max_length) if (headers_status[j].cur_length > headers_status[j].max_length)
@@ -232,7 +260,7 @@ do_daemon_status(void)
{ {
if (runtime_options.output_mode == OM_CSV) if (runtime_options.output_mode == OM_CSV)
{ {
printf("%i,%s,%s,%i,%i,%i,%i,%i\n", printf("%i,%s,%s,%i,%i,%i,%i,%i,%i\n",
cell->node_info->node_id, cell->node_info->node_id,
cell->node_info->node_name, cell->node_info->node_name,
get_node_type_string(cell->node_info->type), get_node_type_string(cell->node_info->type),
@@ -240,7 +268,10 @@ do_daemon_status(void)
repmgrd_info[i]->running ? 1 : 0, repmgrd_info[i]->running ? 1 : 0,
repmgrd_info[i]->pid, repmgrd_info[i]->pid,
repmgrd_info[i]->paused ? 1 : 0, repmgrd_info[i]->paused ? 1 : 0,
cell->node_info->priority); cell->node_info->priority,
repmgrd_info[i]->pid == UNKNOWN_PID
? -1
: repmgrd_info[i]->upstream_last_seen);
} }
else else
{ {
@@ -254,9 +285,17 @@ do_daemon_status(void)
printf("| %-*s ", headers_status[STATUS_PID].max_length, repmgrd_info[i]->pid_text); printf("| %-*s ", headers_status[STATUS_PID].max_length, repmgrd_info[i]->pid_text);
if (repmgrd_info[i]->pid == UNKNOWN_PID) if (repmgrd_info[i]->pid == UNKNOWN_PID)
printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, "n/a"); {
printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, _("n/a"));
printf("| %-*s ", headers_status[STATUS_UPSTREAM_LAST_SEEN].max_length, _("n/a"));
}
else else
printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, repmgrd_info[i]->paused ? "yes" : "no"); {
printf("| %-*s ", headers_status[STATUS_PAUSED].max_length, repmgrd_info[i]->paused ? _("yes") : _("no"));
printf("| %-*s ", headers_status[STATUS_UPSTREAM_LAST_SEEN].max_length, repmgrd_info[i]->upstream_last_seen_text);
}
printf("\n"); printf("\n");
} }