cluster show: display timeline ID

This helps provide a better picture of the state of the cluster, i.e.
making it more obvious whether there's been a timeline divergence.

This also provides infrastructure for further improvements in cluster
status display and diagnosis.

Note this is only available in PostgreSQL 9.6 and later as it relies
on the SQL functions for interrogating pg_control, which can be executed
remotely. As PostgreSQL 9.5 will shortly be the only community-supported
version without these functions, it's not worth the effort of trying
to duplicate their functionality.
This commit is contained in:
Ian Barwick
2019-05-27 09:15:12 +09:00
parent df6d160d2e
commit c560dfbbce
6 changed files with 133 additions and 23 deletions

View File

@@ -4,6 +4,7 @@
repmgr: add "--repmgrd-force-unpause" option to "standby switchover" (Ian)
repmgr: improve "--dry-run" behaviour for "standby promote" and
"standby switchover" (Ian)
repmgr: display node timeline ID in "cluster show" output (Ian)
repmgr: in "cluster show" and "daemon status", show upstream node name
as reported by each individual node (Ian)
repmgr: in "cluster show" and "daemon status", check if a node is attached

View File

@@ -1600,7 +1600,7 @@ system_identifier(PGconn *conn)
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_db_error(conn, NULL, _("get_system_identifier(): unable to query pg_control_system()"));
log_db_error(conn, NULL, _("system_identifier(): unable to query pg_control_system()"));
}
else
{
@@ -3407,6 +3407,10 @@ clear_node_info_list(NodeInfoList *nodes)
while (cell != NULL)
{
next_cell = cell->next;
if (cell->node_info->replication_info != NULL)
pfree(cell->node_info->replication_info);
pfree(cell->node_info);
pfree(cell);
cell = next_cell;
@@ -5073,6 +5077,7 @@ init_replication_info(ReplInfo *replication_info)
{
memset(replication_info->current_timestamp, 0, sizeof(replication_info->current_timestamp));
replication_info->in_recovery = false;
replication_info->timeline_id = UNKNOWN_TIMELINE_ID;
replication_info->last_wal_receive_lsn = InvalidXLogRecPtr;
replication_info->last_wal_replay_lsn = InvalidXLogRecPtr;
memset(replication_info->last_xact_replay_timestamp, 0, sizeof(replication_info->last_xact_replay_timestamp));
@@ -5259,6 +5264,38 @@ get_replication_lag_seconds(PGconn *conn)
}
/*
 * get_node_timeline()
 *
 * Return the timeline ID reported by pg_control_checkpoint() on the server
 * "conn" is connected to.
 *
 * Returns UNKNOWN_TIMELINE_ID if the server is older than PostgreSQL 9.6,
 * if the query fails, or if the returned value is not a valid unsigned
 * integer.
 */
TimeLineID
get_node_timeline(PGconn *conn)
{
	TimeLineID	timeline_id = UNKNOWN_TIMELINE_ID;
	PGresult   *res = NULL;

	/*
	 * pg_control_checkpoint() was introduced in PostgreSQL 9.6
	 */
	if (PQserverVersion(conn) < 90600)
	{
		return UNKNOWN_TIMELINE_ID;
	}

	res = PQexec(conn, "SELECT timeline_id FROM pg_catalog.pg_control_checkpoint()");

	if (PQresultStatus(res) != PGRES_TUPLES_OK)
	{
		log_db_error(conn, NULL, _("get_node_timeline(): unable to query pg_control_checkpoint()"));
	}
	else
	{
		/*
		 * Timeline IDs are unsigned, so parse with strtoul() rather than
		 * atoi(); only accept the value if the whole string was consumed.
		 */
		const char *value = PQgetvalue(res, 0, 0);
		char	   *end = NULL;
		unsigned long parsed = strtoul(value, &end, 10);

		if (end != value && *end == '\0')
		{
			timeline_id = (TimeLineID) parsed;
		}
	}

	PQclear(res);

	return timeline_id;
}
void
get_node_replication_stats(PGconn *conn, t_node_info *node_info)
{

View File

@@ -164,8 +164,28 @@ typedef struct s_extension_versions {
UNKNOWN_SERVER_VERSION_NUM \
}
/*
 * Replication status details for a single node.
 *
 * init_replication_info() resets (at least) the two timestamp buffers to
 * empty strings, "in_recovery" to false, "timeline_id" to
 * UNKNOWN_TIMELINE_ID and both LSN fields to InvalidXLogRecPtr.
 */
typedef struct
{
char current_timestamp[MAXLEN];
bool in_recovery;
/* UNKNOWN_TIMELINE_ID until populated, e.g. with get_node_timeline() */
TimeLineID timeline_id;
XLogRecPtr last_wal_receive_lsn;
XLogRecPtr last_wal_replay_lsn;
char last_xact_replay_timestamp[MAXLEN];
/* NOTE(review): presumably expressed in seconds - confirm against get_replication_lag_seconds() */
int replication_lag_time;
bool receiving_streamed_wal;
bool wal_replay_paused;
int upstream_last_seen;
int upstream_node_id;
} ReplInfo;
/*
* Struct to store node information
* Struct to store node information.
*
* The first section represents the contents of the "repmgr.nodes"
* table; subsequent sections contain information collated in
* various contexts.
*/
typedef struct s_node_info
{
@@ -199,6 +219,8 @@ typedef struct s_node_info
int total_replication_slots;
int active_replication_slots;
int inactive_replication_slots;
/* replication info */
ReplInfo *replication_info;
} t_node_info;
@@ -225,7 +247,8 @@ typedef struct s_node_info
/* for ad-hoc use e.g. when working with a list of nodes */ \
"", true, true, \
/* various statistics */ \
-1, -1, -1, -1, -1, -1 \
-1, -1, -1, -1, -1, -1, \
NULL \
}
@@ -338,19 +361,7 @@ typedef struct BdrNodeInfoList
0 \
}
typedef struct
{
char current_timestamp[MAXLEN];
bool in_recovery;
XLogRecPtr last_wal_receive_lsn;
XLogRecPtr last_wal_replay_lsn;
char last_xact_replay_timestamp[MAXLEN];
int replication_lag_time;
bool receiving_streamed_wal;
bool wal_replay_paused;
int upstream_last_seen;
int upstream_node_id;
} ReplInfo;
typedef struct
{
@@ -602,6 +613,7 @@ XLogRecPtr get_last_wal_receive_location(PGconn *conn);
void init_replication_info(ReplInfo *replication_info);
bool get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info);
int get_replication_lag_seconds(PGconn *conn);
TimeLineID get_node_timeline(PGconn *conn);
void get_node_replication_stats(PGconn *conn, t_node_info *node_info);
NodeAttached is_downstream_node_attached(PGconn *conn, char *node_name);
void set_upstream_last_seen(PGconn *conn, int upstream_node_id);

View File

@@ -97,6 +97,13 @@
<listitem>
<para>
<link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>:
display each node's timeline ID (PostgreSQL 9.6 and later only).
</para>
</listitem>
<listitem>
<para>
<link linkend="repmgr-cluster-show"><command>repmgr cluster show</command></link>
and <link linkend="repmgr-daemon-status"><command>repmgr daemon status</command></link>:
show the upstream node name as reported by each individual node - this helps visualise

View File

@@ -22,7 +22,9 @@
directly and can be run on any node in the cluster; this is also useful when analyzing
connectivity from a particular node.
</para>
<para>
For PostgreSQL 9.6 and later, the output will also contain the node's current timeline ID.
</para>
<para>
Node availability is tested by connecting from the node where
<command>repmgr cluster show</command> is executed, and does not necessarily imply the node
@@ -52,11 +54,11 @@
<programlisting>
$ repmgr -f /etc/repmgr.conf cluster show
ID | Name | Role | Status | Upstream | Location | Priority | Connection string
ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string
----+-------+---------+-----------+----------+----------+----------+----------+-----------------------------------------
1 | node1 | primary | * running | | default | 100 | host=db_node1 dbname=repmgr user=repmgr
2 | node2 | standby | running | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr
3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr</programlisting>
1 | node1 | primary | * running | | default | 100 | 1 | host=db_node1 dbname=repmgr user=repmgr
2 | node2 | standby | running | node1 | default | 100 | 1 | host=db_node2 dbname=repmgr user=repmgr
3 | node3 | standby | running | node1 | default | 100 | 1 | host=db_node3 dbname=repmgr user=repmgr</programlisting>
</para>
</refsect1>
<refsect1>

View File

@@ -24,7 +24,7 @@
#include "repmgr-client-global.h"
#include "repmgr-action-cluster.h"
#define SHOW_HEADER_COUNT 8
#define SHOW_HEADER_COUNT 9
typedef enum
{
@@ -35,6 +35,7 @@ typedef enum
SHOW_UPSTREAM_NAME,
SHOW_LOCATION,
SHOW_PRIORITY,
SHOW_TIMELINE_ID,
SHOW_CONNINFO
} ShowHeader;
@@ -113,9 +114,15 @@ do_cluster_show(void)
strncpy(headers_show[SHOW_LOCATION].title, _("Location"), MAXLEN);
if (runtime_options.compact == true)
{
strncpy(headers_show[SHOW_PRIORITY].title, _("Prio."), MAXLEN);
strncpy(headers_show[SHOW_TIMELINE_ID].title, _("TLI"), MAXLEN);
}
else
{
strncpy(headers_show[SHOW_PRIORITY].title, _("Priority"), MAXLEN);
strncpy(headers_show[SHOW_TIMELINE_ID].title, _("Timeline"), MAXLEN);
}
strncpy(headers_show[SHOW_CONNINFO].title, _("Connection string"), MAXLEN);
@@ -128,6 +135,16 @@ do_cluster_show(void)
{
headers_show[i].display = true;
/* Don't display timeline on pre-9.6 clusters */
if (i == SHOW_TIMELINE_ID)
{
if (PQserverVersion(conn) < 90600)
{
headers_show[i].display = false;
}
}
/* if --compact provided, don't display conninfo */
if (runtime_options.compact == true)
{
if (i == SHOW_CONNINFO)
@@ -136,6 +153,7 @@ do_cluster_show(void)
}
}
if (headers_show[i].display == true)
{
headers_show[i].max_length = strlen(headers_show[i].title);
@@ -154,6 +172,15 @@ do_cluster_show(void)
PQExpBufferData upstream;
PQExpBufferData buf;
cell->node_info->replication_info = palloc0(sizeof(ReplInfo));
if (cell->node_info->replication_info == NULL)
{
log_error(_("unable to allocate memory"));
exit(ERR_INTERNAL);
}
init_replication_info(cell->node_info->replication_info);
cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo);
if (PQstatus(cell->node_info->conn) != CONNECTION_OK)
@@ -176,6 +203,11 @@ do_cluster_show(void)
cell->node_info->node_name, cell->node_info->node_id);
}
}
else
{
/* NOP on pre-9.6 servers */
cell->node_info->replication_info->timeline_id = get_node_timeline(cell->node_info->conn);
}
initPQExpBuffer(&node_status);
initPQExpBuffer(&upstream);
@@ -212,7 +244,18 @@ do_cluster_show(void)
headers_show[SHOW_LOCATION].cur_length = strlen(cell->node_info->location);
if (cell->node_info->replication_info->timeline_id == UNKNOWN_TIMELINE_ID)
{
/* display "?" */
headers_show[SHOW_PRIORITY].cur_length = 1;
}
else
{
initPQExpBuffer(&buf);
appendPQExpBuffer(&buf, "%i", cell->node_info->replication_info->timeline_id);
headers_show[SHOW_PRIORITY].cur_length = strlen(buf.data);
termPQExpBuffer(&buf);
}
headers_show[SHOW_CONNINFO].cur_length = strlen(cell->node_info->conninfo);
@@ -277,6 +320,14 @@ do_cluster_show(void)
printf("| %-*s ", headers_show[SHOW_LOCATION].max_length, cell->node_info->location);
printf("| %-*i ", headers_show[SHOW_PRIORITY].max_length, cell->node_info->priority);
if (headers_show[SHOW_TIMELINE_ID].display == true)
{
if (cell->node_info->replication_info->timeline_id == UNKNOWN_TIMELINE_ID)
printf("| %-*c ", headers_show[SHOW_TIMELINE_ID].max_length, '?');
else
printf("| %-*i ", headers_show[SHOW_TIMELINE_ID].max_length, (int)cell->node_info->replication_info->timeline_id);
}
if (headers_show[SHOW_CONNINFO].display == true)
{
printf("| %-*s", headers_show[SHOW_CONNINFO].max_length, cell->node_info->conninfo);