From c560dfbbce16a71ff4d3387fde878dd4604a5994 Mon Sep 17 00:00:00 2001 From: Ian Barwick Date: Mon, 27 May 2019 09:15:12 +0900 Subject: [PATCH] cluster show: display timeline ID This helps provide a better picture of the state of the cluster, i.e. making it more obvious whether there's been a timeline divergence. This also provides infrastructure for further improvements in cluster status display and diagnosis. Note this is only available in PostgreSQL 9.6 and later as it relies on the SQL functions for interrogating pg_control, which can be executed remotely. As PostgreSQL 9.5 will shortly be the only community-supported version without these functions, it's not worth the effort of trying to duplicate their functionality. --- HISTORY | 1 + dbutils.c | 39 +++++++++++++++++++++++- dbutils.h | 42 ++++++++++++++++---------- doc/appendix-release-notes.xml | 7 +++++ doc/repmgr-cluster-show.xml | 12 ++++---- repmgr-action-cluster.c | 55 ++++++++++++++++++++++++++++++++-- 6 files changed, 133 insertions(+), 23 deletions(-) diff --git a/HISTORY b/HISTORY index fe6d96a6..9d088ed9 100644 --- a/HISTORY +++ b/HISTORY @@ -4,6 +4,7 @@ repmgr: add "--repmgrd-force-unpause" option to "standby switchover" (Ian) repmgr: improve "--dry-run" behaviour for "standby promote" and "standby switchover" (Ian) + repmgr: display node timeline ID in "cluster show" output (Ian) repmgr: in "cluster show" and "daemon status", show upstream node name as reported by each individual node (Ian) repmgr: in "cluster show" and "daemon status", check if a node is attached diff --git a/dbutils.c b/dbutils.c index a29d5d50..790bee28 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1600,7 +1600,7 @@ system_identifier(PGconn *conn) if (PQresultStatus(res) != PGRES_TUPLES_OK) { - log_db_error(conn, NULL, _("get_system_identifier(): unable to query pg_control_system()")); + log_db_error(conn, NULL, _("system_identifier(): unable to query pg_control_system()")); } else { @@ -3407,6 +3407,10 @@ 
clear_node_info_list(NodeInfoList *nodes) while (cell != NULL) { next_cell = cell->next; + + if (cell->node_info->replication_info != NULL) + pfree(cell->node_info->replication_info); + pfree(cell->node_info); pfree(cell); cell = next_cell; @@ -5073,6 +5077,7 @@ init_replication_info(ReplInfo *replication_info) { memset(replication_info->current_timestamp, 0, sizeof(replication_info->current_timestamp)); replication_info->in_recovery = false; + replication_info->timeline_id = UNKNOWN_TIMELINE_ID; replication_info->last_wal_receive_lsn = InvalidXLogRecPtr; replication_info->last_wal_replay_lsn = InvalidXLogRecPtr; memset(replication_info->last_xact_replay_timestamp, 0, sizeof(replication_info->last_xact_replay_timestamp)); @@ -5259,6 +5264,38 @@ get_replication_lag_seconds(PGconn *conn) } + +TimeLineID +get_node_timeline(PGconn *conn) +{ + TimeLineID timeline_id = UNKNOWN_TIMELINE_ID; + PGresult *res = NULL; + + /* + * pg_control_checkpoint() was introduced in PostgreSQL 9.6; earlier versions report UNKNOWN_TIMELINE_ID + */ + if (PQserverVersion(conn) < 90600) + { + return UNKNOWN_TIMELINE_ID; + } + + res = PQexec(conn, "SELECT timeline_id FROM pg_catalog.pg_control_checkpoint()"); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + log_db_error(conn, NULL, _("get_node_timeline(): unable to query pg_control_checkpoint()")); + } + else + { + timeline_id = atoi(PQgetvalue(res, 0, 0)); + } + + PQclear(res); + + return timeline_id; +} + + void get_node_replication_stats(PGconn *conn, t_node_info *node_info) { diff --git a/dbutils.h b/dbutils.h index 36adfd5b..f193e1b5 100644 --- a/dbutils.h +++ b/dbutils.h @@ -164,8 +164,28 @@ typedef struct s_extension_versions { UNKNOWN_SERVER_VERSION_NUM \ } + +typedef struct +{ + char current_timestamp[MAXLEN]; + bool in_recovery; + TimeLineID timeline_id; + XLogRecPtr last_wal_receive_lsn; + XLogRecPtr last_wal_replay_lsn; + char last_xact_replay_timestamp[MAXLEN]; + int replication_lag_time; + bool receiving_streamed_wal; + bool wal_replay_paused; + int upstream_last_seen; +
int upstream_node_id; +} ReplInfo; + /* - * Struct to store node information + * Struct to store node information. + * + * The first section represents the contents of the "repmgr.nodes" + * table; subsequent sections contain information collated in + * various contexts. */ typedef struct s_node_info { @@ -199,6 +219,8 @@ typedef struct s_node_info int total_replication_slots; int active_replication_slots; int inactive_replication_slots; + /* replication info (NULL unless allocated; freed by clear_node_info_list()) */ + ReplInfo *replication_info; } t_node_info; @@ -225,7 +247,8 @@ typedef struct s_node_info /* for ad-hoc use e.g. when working with a list of nodes */ \ "", true, true, \ /* various statistics */ \ - -1, -1, -1, -1, -1, -1 \ + -1, -1, -1, -1, -1, -1, \ + NULL \ } @@ -338,19 +361,7 @@ typedef struct BdrNodeInfoList 0 \ } -typedef struct -{ - char current_timestamp[MAXLEN]; - bool in_recovery; - XLogRecPtr last_wal_receive_lsn; - XLogRecPtr last_wal_replay_lsn; - char last_xact_replay_timestamp[MAXLEN]; - int replication_lag_time; - bool receiving_streamed_wal; - bool wal_replay_paused; - int upstream_last_seen; - int upstream_node_id; -} ReplInfo; + typedef struct { @@ -602,6 +613,7 @@ XLogRecPtr get_last_wal_receive_location(PGconn *conn); void init_replication_info(ReplInfo *replication_info); bool get_replication_info(PGconn *conn, t_server_type node_type, ReplInfo *replication_info); int get_replication_lag_seconds(PGconn *conn); +TimeLineID get_node_timeline(PGconn *conn); void get_node_replication_stats(PGconn *conn, t_node_info *node_info); NodeAttached is_downstream_node_attached(PGconn *conn, char *node_name); void set_upstream_last_seen(PGconn *conn, int upstream_node_id); diff --git a/doc/appendix-release-notes.xml b/doc/appendix-release-notes.xml index 3f30b0f0..05187a07 100644 --- a/doc/appendix-release-notes.xml +++ b/doc/appendix-release-notes.xml @@ -97,6 +97,13 @@ + repmgr cluster show: + display each node's timeline ID (PostgreSQL 9.6 and later only).
+ + + + + repmgr cluster show and repmgr daemon status: show the upstream node name as reported by each individual node - this helps visualise diff --git a/doc/repmgr-cluster-show.xml b/doc/repmgr-cluster-show.xml index 196d9b87..b06d326e 100644 --- a/doc/repmgr-cluster-show.xml +++ b/doc/repmgr-cluster-show.xml @@ -22,7 +22,9 @@ directly and can be run on any node in the cluster; this is also useful when analyzing connectivity from a particular node. - + + For PostgreSQL 9.6 and later, the output will also contain the node's current timeline ID. + Node availability is tested by connecting from the node where repmgr cluster show is executed, and does not necessarily imply the node @@ -52,11 +54,11 @@ $ repmgr -f /etc/repmgr.conf cluster show - ID | Name | Role | Status | Upstream | Location | Priority | Connection string + ID | Name | Role | Status | Upstream | Location | Priority | Timeline | Connection string ----+-------+---------+-----------+----------+----------+----------+----------------------------------------- - 1 | node1 | primary | * running | | default | 100 | host=db_node1 dbname=repmgr user=repmgr - 2 | node2 | standby | running | node1 | default | 100 | host=db_node2 dbname=repmgr user=repmgr - 3 | node3 | standby | running | node1 | default | 100 | host=db_node3 dbname=repmgr user=repmgr + 1 | node1 | primary | * running | | default | 100 | 1 | host=db_node1 dbname=repmgr user=repmgr + 2 | node2 | standby | running | node1 | default | 100 | 1 | host=db_node2 dbname=repmgr user=repmgr + 3 | node3 | standby | running | node1 | default | 100 | 1 | host=db_node3 dbname=repmgr user=repmgr diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c index 99201e80..53ff8e56 100644 --- a/repmgr-action-cluster.c +++ b/repmgr-action-cluster.c @@ -24,7 +24,7 @@ #include "repmgr-client-global.h" #include "repmgr-action-cluster.h" -#define SHOW_HEADER_COUNT 8 +#define SHOW_HEADER_COUNT 9 typedef enum { @@ -35,6 +35,7 @@ typedef enum SHOW_UPSTREAM_NAME, 
SHOW_LOCATION, SHOW_PRIORITY, + SHOW_TIMELINE_ID, SHOW_CONNINFO } ShowHeader; @@ -113,9 +114,15 @@ do_cluster_show(void) strncpy(headers_show[SHOW_LOCATION].title, _("Location"), MAXLEN); if (runtime_options.compact == true) + { strncpy(headers_show[SHOW_PRIORITY].title, _("Prio."), MAXLEN); + strncpy(headers_show[SHOW_TIMELINE_ID].title, _("TLI"), MAXLEN); + } else + { strncpy(headers_show[SHOW_PRIORITY].title, _("Priority"), MAXLEN); + strncpy(headers_show[SHOW_TIMELINE_ID].title, _("Timeline"), MAXLEN); + } strncpy(headers_show[SHOW_CONNINFO].title, _("Connection string"), MAXLEN); @@ -128,6 +135,16 @@ do_cluster_show(void) { headers_show[i].display = true; + /* Don't display timeline on pre-9.6 clusters */ + if (i == SHOW_TIMELINE_ID) + { + if (PQserverVersion(conn) < 90600) + { + headers_show[i].display = false; + } + } + + /* if --compact provided, don't display conninfo */ if (runtime_options.compact == true) { if (i == SHOW_CONNINFO) @@ -136,6 +153,7 @@ do_cluster_show(void) } } + if (headers_show[i].display == true) { headers_show[i].max_length = strlen(headers_show[i].title); @@ -154,6 +172,15 @@ do_cluster_show(void) PQExpBufferData upstream; PQExpBufferData buf; + cell->node_info->replication_info = palloc0(sizeof(ReplInfo)); + if (cell->node_info->replication_info == NULL) + { + log_error(_("unable to allocate memory")); + exit(ERR_INTERNAL); + } + + init_replication_info(cell->node_info->replication_info); + cell->node_info->conn = establish_db_connection_quiet(cell->node_info->conninfo); if (PQstatus(cell->node_info->conn) != CONNECTION_OK) @@ -176,6 +203,11 @@ do_cluster_show(void) cell->node_info->node_name, cell->node_info->node_id); } } + else + { + /* NOP on pre-9.6 servers */ + cell->node_info->replication_info->timeline_id = get_node_timeline(cell->node_info->conn); + } initPQExpBuffer(&node_status); initPQExpBuffer(&upstream); @@ -212,7 +244,18 @@ do_cluster_show(void) headers_show[SHOW_LOCATION].cur_length = 
strlen(cell->node_info->location); - + if (cell->node_info->replication_info->timeline_id == UNKNOWN_TIMELINE_ID) + { + /* an unknown timeline is displayed as "?", so the width is 1 */ + headers_show[SHOW_TIMELINE_ID].cur_length = 1; + } + else + { + initPQExpBuffer(&buf); + appendPQExpBuffer(&buf, "%i", (int)cell->node_info->replication_info->timeline_id); + headers_show[SHOW_TIMELINE_ID].cur_length = strlen(buf.data); + termPQExpBuffer(&buf); + } headers_show[SHOW_CONNINFO].cur_length = strlen(cell->node_info->conninfo); @@ -277,6 +320,14 @@ do_cluster_show(void) printf("| %-*s ", headers_show[SHOW_LOCATION].max_length, cell->node_info->location); printf("| %-*i ", headers_show[SHOW_PRIORITY].max_length, cell->node_info->priority); + if (headers_show[SHOW_TIMELINE_ID].display == true) + { + if (cell->node_info->replication_info->timeline_id == UNKNOWN_TIMELINE_ID) + printf("| %-*c ", headers_show[SHOW_TIMELINE_ID].max_length, '?'); + else + printf("| %-*i ", headers_show[SHOW_TIMELINE_ID].max_length, (int)cell->node_info->replication_info->timeline_id); + } + if (headers_show[SHOW_CONNINFO].display == true) { printf("| %-*s", headers_show[SHOW_CONNINFO].max_length, cell->node_info->conninfo);