mirror of
https://github.com/EnterpriseDB/repmgr.git
synced 2026-03-28 01:16:29 +00:00
Add "cluster diagnose" mode
This mode merges the output of "cluster matrix" from each node to improve node state knowledge.
This commit is contained in:
committed by
Ian Barwick
parent
263128a740
commit
5189488b92
57
README.md
57
README.md
@@ -1601,13 +1601,19 @@ which contains connection details for the local database.
|
|||||||
The first column is the node's ID, and the second column represents the
|
The first column is the node's ID, and the second column represents the
|
||||||
node's status (0 = master, 1 = standby, -1 = failed).
|
node's status (0 = master, 1 = standby, -1 = failed).
|
||||||
|
|
||||||
* `cluster matrix`
|
* `cluster matrix` and `cluster diagnose`
|
||||||
|
|
||||||
Displays connection information for each pair of nodes in the
|
These commands display connection information for each pair of
|
||||||
replication cluster. This command polls each registered server and
|
nodes in the replication cluster.
|
||||||
asks it to connect to each other node.
|
|
||||||
|
|
||||||
This command requires a valid `repmgr.conf` file on each node.
|
- `cluster matrix` polls each registered server and asks it to
|
||||||
|
connect to each other node;
|
||||||
|
|
||||||
|
- `cluster diagnose` runs a `cluster matrix` on each node and
|
||||||
|
combines the results in a single matrix.
|
||||||
|
|
||||||
|
These commands require a valid `repmgr.conf` file on each node, and
|
||||||
|
the optional `ssh_hostname` parameter must be set.
|
||||||
|
|
||||||
Example 1 (all nodes up):
|
Example 1 (all nodes up):
|
||||||
|
|
||||||
@@ -1619,6 +1625,10 @@ which contains connection details for the local database.
|
|||||||
node2 | 2 | * | * | *
|
node2 | 2 | * | * | *
|
||||||
node3 | 3 | * | * | *
|
node3 | 3 | * | * | *
|
||||||
|
|
||||||
|
Here `cluster matrix` is sufficient to establish the state of each
|
||||||
|
possible connection.
|
||||||
|
|
||||||
|
|
||||||
Example 2 (node1 and node2 up, node3 down):
|
Example 2 (node1 and node2 up, node3 down):
|
||||||
|
|
||||||
$ repmgr -f /etc/repmgr.conf cluster matrix
|
$ repmgr -f /etc/repmgr.conf cluster matrix
|
||||||
@@ -1641,20 +1651,45 @@ which contains connection details for the local database.
|
|||||||
node1 and node2, meaning that inbound connections to these nodes
|
node1 and node2, meaning that inbound connections to these nodes
|
||||||
have succeeded.
|
have succeeded.
|
||||||
|
|
||||||
Example 3 (all nodes up, firewall dropping packets originating
|
In this case, `cluster diagnose` gives the same result as `cluster
|
||||||
from node2 and directed to port 5432 on node3)
|
matrix`, because from any functioning node we can observe the same
|
||||||
|
state: node1 and node2 are up, node3 is down.
|
||||||
|
|
||||||
After a long wait (same as before plus two timeouts, by default
|
|
||||||
one minute each), you will see the following output:
|
Example 3 (all nodes up, firewall dropping packets originating
|
||||||
|
from node1 and directed to port 5432 on node3)
|
||||||
|
|
||||||
|
Running `cluster matrix` from node1 gives the following output,
|
||||||
|
after a long wait (two timeouts, by default one minute each):
|
||||||
|
|
||||||
$ repmgr -f /etc/repmgr.conf cluster matrix
|
$ repmgr -f /etc/repmgr.conf cluster matrix
|
||||||
|
|
||||||
Name | Id | 1 | 2 | 3
|
Name | Id | 1 | 2 | 3
|
||||||
-------+----+----+----+----
|
-------+----+----+----+----
|
||||||
node1 | 1 | * | * | *
|
node1 | 1 | * | * | x
|
||||||
node2 | 2 | * | * | x
|
node2 | 2 | * | * | *
|
||||||
|
node3 | 3 | ? | ? | ?
|
||||||
|
|
||||||
|
The matrix tells us that we cannot connect from node1 to node3,
|
||||||
|
and that (therefore) we don't know the state of any outbound
|
||||||
|
connection from node3.
|
||||||
|
|
||||||
|
In this case, the `cluster diagnose` command is more informative:
|
||||||
|
|
||||||
|
$ repmgr -f /etc/repmgr.conf cluster diagnose
|
||||||
|
|
||||||
|
Name | Id | 1 | 2 | 3
|
||||||
|
-------+----+----+----+----
|
||||||
|
node1 | 1 | * | * | x
|
||||||
|
node2 | 2 | * | * | *
|
||||||
node3 | 3 | * | * | *
|
node3 | 3 | * | * | *
|
||||||
|
|
||||||
|
What happened is that `cluster diagnose` merged its own `cluster
|
||||||
|
matrix` with the `cluster matrix` output from node2; the latter is
|
||||||
|
able to connect to node3 and therefore determine the state of
|
||||||
|
outbound connections from that node.
|
||||||
|
|
||||||
|
|
||||||
* `cluster cleanup`
|
* `cluster cleanup`
|
||||||
|
|
||||||
Purges monitoring history from the `repl_monitor` table to prevent excessive
|
Purges monitoring history from the `repl_monitor` table to prevent excessive
|
||||||
|
|||||||
180
repmgr.c
180
repmgr.c
@@ -21,6 +21,7 @@
|
|||||||
* WITNESS REGISTER
|
* WITNESS REGISTER
|
||||||
* WITNESS UNREGISTER
|
* WITNESS UNREGISTER
|
||||||
*
|
*
|
||||||
|
* CLUSTER DIAGNOSE
|
||||||
* CLUSTER MATRIX
|
* CLUSTER MATRIX
|
||||||
* CLUSTER SHOW
|
* CLUSTER SHOW
|
||||||
* CLUSTER CLEANUP
|
* CLUSTER CLEANUP
|
||||||
@@ -90,6 +91,7 @@
|
|||||||
#define CLUSTER_SHOW 13
|
#define CLUSTER_SHOW 13
|
||||||
#define CLUSTER_CLEANUP 14
|
#define CLUSTER_CLEANUP 14
|
||||||
#define CLUSTER_MATRIX 15
|
#define CLUSTER_MATRIX 15
|
||||||
|
#define CLUSTER_DIAGNOSE 16
|
||||||
|
|
||||||
static int test_ssh_connection(char *host, char *remote_user);
|
static int test_ssh_connection(char *host, char *remote_user);
|
||||||
static int copy_remote_files(char *host, char *remote_user, char *remote_path,
|
static int copy_remote_files(char *host, char *remote_user, char *remote_path,
|
||||||
@@ -132,6 +134,7 @@ static void do_witness_unregister(void);
|
|||||||
|
|
||||||
static void do_cluster_show(void);
|
static void do_cluster_show(void);
|
||||||
static void do_cluster_matrix(void);
|
static void do_cluster_matrix(void);
|
||||||
|
static void do_cluster_diagnose(void);
|
||||||
static void do_cluster_cleanup(void);
|
static void do_cluster_cleanup(void);
|
||||||
static void do_check_upstream_config(void);
|
static void do_check_upstream_config(void);
|
||||||
static void do_help(void);
|
static void do_help(void);
|
||||||
@@ -713,6 +716,8 @@ main(int argc, char **argv)
|
|||||||
action = CLUSTER_SHOW;
|
action = CLUSTER_SHOW;
|
||||||
else if (strcasecmp(server_cmd, "CLEANUP") == 0)
|
else if (strcasecmp(server_cmd, "CLEANUP") == 0)
|
||||||
action = CLUSTER_CLEANUP;
|
action = CLUSTER_CLEANUP;
|
||||||
|
else if (strcasecmp(server_cmd, "DIAGNOSE") == 0)
|
||||||
|
action = CLUSTER_DIAGNOSE;
|
||||||
else if (strcasecmp(server_cmd, "MATRIX") == 0)
|
else if (strcasecmp(server_cmd, "MATRIX") == 0)
|
||||||
action = CLUSTER_MATRIX;
|
action = CLUSTER_MATRIX;
|
||||||
}
|
}
|
||||||
@@ -954,6 +959,9 @@ main(int argc, char **argv)
|
|||||||
case WITNESS_UNREGISTER:
|
case WITNESS_UNREGISTER:
|
||||||
do_witness_unregister();
|
do_witness_unregister();
|
||||||
break;
|
break;
|
||||||
|
case CLUSTER_DIAGNOSE:
|
||||||
|
do_cluster_diagnose();
|
||||||
|
break;
|
||||||
case CLUSTER_MATRIX:
|
case CLUSTER_MATRIX:
|
||||||
do_cluster_matrix();
|
do_cluster_matrix();
|
||||||
break;
|
break;
|
||||||
@@ -1282,6 +1290,178 @@ do_cluster_matrix(void)
|
|||||||
PQclear(res);
|
PQclear(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
do_cluster_diagnose(void)
|
||||||
|
{
|
||||||
|
PGconn *conn;
|
||||||
|
PGresult *res;
|
||||||
|
char sqlquery[QUERY_STR_LEN];
|
||||||
|
int i, j, k;
|
||||||
|
const char *node_header = "Name";
|
||||||
|
int name_length = strlen(node_header);
|
||||||
|
|
||||||
|
int x, y, z, u, v;
|
||||||
|
int n = 0; /* number of nodes */
|
||||||
|
int *cube;
|
||||||
|
char *p;
|
||||||
|
char c;
|
||||||
|
|
||||||
|
char command[MAXLEN];
|
||||||
|
PQExpBufferData command_output;
|
||||||
|
|
||||||
|
/* We need to connect to get the list of nodes */
|
||||||
|
log_info(_("connecting to database\n"));
|
||||||
|
conn = establish_db_connection(options.conninfo, true);
|
||||||
|
|
||||||
|
sqlquery_snprintf(sqlquery,
|
||||||
|
"SELECT conninfo, ssh_hostname, type, name, upstream_node_name, id"
|
||||||
|
" FROM %s.repl_show_nodes ORDER BY id",
|
||||||
|
get_repmgr_schema_quoted(conn));
|
||||||
|
|
||||||
|
log_verbose(LOG_DEBUG, "do_cluster_show(): \n%s\n",sqlquery );
|
||||||
|
|
||||||
|
res = PQexec(conn, sqlquery);
|
||||||
|
|
||||||
|
if (PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||||
|
{
|
||||||
|
log_err(_("Unable to retrieve node information from the database\n%s\n"),
|
||||||
|
PQerrorMessage(conn));
|
||||||
|
log_hint(_("Please check that all nodes have been registered\n"));
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
PQfinish(conn);
|
||||||
|
exit(ERR_BAD_CONFIG);
|
||||||
|
}
|
||||||
|
PQfinish(conn);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Allocate an empty cube matrix
|
||||||
|
*
|
||||||
|
* -2 == NULL
|
||||||
|
* -1 == Error
|
||||||
|
* 0 == OK
|
||||||
|
*/
|
||||||
|
n = PQntuples(res);
|
||||||
|
cube = (int *) pg_malloc(sizeof(int) * n * n * n);
|
||||||
|
for (i = 0; i < n * n * n; i++)
|
||||||
|
cube[i] = -2;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find the maximum length of a node name
|
||||||
|
*/
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
{
|
||||||
|
int name_length_cur;
|
||||||
|
|
||||||
|
name_length_cur = strlen(PQgetvalue(res, i, 3));
|
||||||
|
if (name_length_cur > name_length)
|
||||||
|
name_length = name_length_cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
{
|
||||||
|
maxlen_snprintf(command,
|
||||||
|
"repmgr cluster matrix --csv");
|
||||||
|
|
||||||
|
initPQExpBuffer(&command_output);
|
||||||
|
|
||||||
|
if (i + 1 == options.node)
|
||||||
|
{
|
||||||
|
(void)local_command(
|
||||||
|
command,
|
||||||
|
&command_output);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
(void)remote_command(
|
||||||
|
PQgetvalue(res, i, 1),
|
||||||
|
"postgres",
|
||||||
|
command,
|
||||||
|
&command_output);
|
||||||
|
}
|
||||||
|
|
||||||
|
p = command_output.data;
|
||||||
|
|
||||||
|
for (j = 0; j < n * n; j++)
|
||||||
|
{
|
||||||
|
if (sscanf(p, "%d,%d,%d", &x, &y, &z) != 3)
|
||||||
|
{
|
||||||
|
fprintf(stderr, _("cannot parse --csv output: %s\n"), p);
|
||||||
|
PQfinish(conn);
|
||||||
|
exit(ERR_INTERNAL);
|
||||||
|
}
|
||||||
|
cube[i * n * n + (x - 1) * n + (y - 1)] =
|
||||||
|
(z == -1) ? -1 : 0;
|
||||||
|
while (*p && (*p != '\n'))
|
||||||
|
p++;
|
||||||
|
if (*p == '\n')
|
||||||
|
p++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%*s | Id ", name_length, node_header);
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
printf("| %2d ", i+1);
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
for (i = 0; i < name_length; i++)
|
||||||
|
printf("-");
|
||||||
|
printf("-+----");
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
printf("+----");
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
for (i = 0; i < n; i++)
|
||||||
|
{
|
||||||
|
printf("%*s | %2d ", name_length,
|
||||||
|
PQgetvalue(res, i, 3), i + 1);
|
||||||
|
for (j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
u = cube[i * n + j];
|
||||||
|
for (k = 1; k < n; k++)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* The value of entry (i,j) is equal to the
|
||||||
|
* maximum value of all the (i,j,k). Indeed:
|
||||||
|
*
|
||||||
|
* - if one of the (i,j,k) is 0 (node up), then 0
|
||||||
|
* (the node is up);
|
||||||
|
*
|
||||||
|
* - if the (i,j,k) are either -1 (down) or -2
|
||||||
|
* (unknown), then -1 (the node is down);
|
||||||
|
*
|
||||||
|
* - if all the (i,j,k) are -2 (unknown), then -2
|
||||||
|
* (the node is in an unknown state).
|
||||||
|
*/
|
||||||
|
|
||||||
|
v = cube[k * n * n + i * n + j];
|
||||||
|
|
||||||
|
if (v > u) u = v;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (u)
|
||||||
|
{
|
||||||
|
case -2:
|
||||||
|
c = '?';
|
||||||
|
break;
|
||||||
|
case -1:
|
||||||
|
c = 'x';
|
||||||
|
break;
|
||||||
|
case 0:
|
||||||
|
c = '*';
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
exit(ERR_INTERNAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("| %c ", c);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
PQclear(res);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
do_cluster_cleanup(void)
|
do_cluster_cleanup(void)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user