Add "cluster diagnose" mode

This mode merges the output of "cluster matrix" from each node to
improve node state knowledge.
This commit is contained in:
Gianni Ciolli
2016-06-21 01:02:44 +02:00
committed by Ian Barwick
parent 263128a740
commit 5189488b92
2 changed files with 226 additions and 11 deletions

180
repmgr.c
View File

@@ -21,6 +21,7 @@
* WITNESS REGISTER
* WITNESS UNREGISTER
*
* CLUSTER DIAGNOSE
* CLUSTER MATRIX
* CLUSTER SHOW
* CLUSTER CLEANUP
@@ -90,6 +91,7 @@
#define CLUSTER_SHOW 13
#define CLUSTER_CLEANUP 14
#define CLUSTER_MATRIX 15
#define CLUSTER_DIAGNOSE 16
static int test_ssh_connection(char *host, char *remote_user);
static int copy_remote_files(char *host, char *remote_user, char *remote_path,
@@ -132,6 +134,7 @@ static void do_witness_unregister(void);
static void do_cluster_show(void);
static void do_cluster_matrix(void);
static void do_cluster_diagnose(void);
static void do_cluster_cleanup(void);
static void do_check_upstream_config(void);
static void do_help(void);
@@ -713,6 +716,8 @@ main(int argc, char **argv)
action = CLUSTER_SHOW;
else if (strcasecmp(server_cmd, "CLEANUP") == 0)
action = CLUSTER_CLEANUP;
else if (strcasecmp(server_cmd, "DIAGNOSE") == 0)
action = CLUSTER_DIAGNOSE;
else if (strcasecmp(server_cmd, "MATRIX") == 0)
action = CLUSTER_MATRIX;
}
@@ -954,6 +959,9 @@ main(int argc, char **argv)
case WITNESS_UNREGISTER:
do_witness_unregister();
break;
case CLUSTER_DIAGNOSE:
do_cluster_diagnose();
break;
case CLUSTER_MATRIX:
do_cluster_matrix();
break;
@@ -1282,6 +1290,178 @@ do_cluster_matrix(void)
PQclear(res);
}
static void
do_cluster_diagnose(void)
{
PGconn *conn;
PGresult *res;
char sqlquery[QUERY_STR_LEN];
int i, j, k;
const char *node_header = "Name";
int name_length = strlen(node_header);
int x, y, z, u, v;
int n = 0; /* number of nodes */
int *cube;
char *p;
char c;
char command[MAXLEN];
PQExpBufferData command_output;
/* We need to connect to get the list of nodes */
log_info(_("connecting to database\n"));
conn = establish_db_connection(options.conninfo, true);
sqlquery_snprintf(sqlquery,
"SELECT conninfo, ssh_hostname, type, name, upstream_node_name, id"
" FROM %s.repl_show_nodes ORDER BY id",
get_repmgr_schema_quoted(conn));
log_verbose(LOG_DEBUG, "do_cluster_show(): \n%s\n",sqlquery );
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("Unable to retrieve node information from the database\n%s\n"),
PQerrorMessage(conn));
log_hint(_("Please check that all nodes have been registered\n"));
PQclear(res);
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
PQfinish(conn);
/*
* Allocate an empty cube matrix
*
* -2 == NULL
* -1 == Error
* 0 == OK
*/
n = PQntuples(res);
cube = (int *) pg_malloc(sizeof(int) * n * n * n);
for (i = 0; i < n * n * n; i++)
cube[i] = -2;
/*
* Find the maximum length of a node name
*/
for (i = 0; i < n; i++)
{
int name_length_cur;
name_length_cur = strlen(PQgetvalue(res, i, 3));
if (name_length_cur > name_length)
name_length = name_length_cur;
}
for (i = 0; i < n; i++)
{
maxlen_snprintf(command,
"repmgr cluster matrix --csv");
initPQExpBuffer(&command_output);
if (i + 1 == options.node)
{
(void)local_command(
command,
&command_output);
}
else
{
(void)remote_command(
PQgetvalue(res, i, 1),
"postgres",
command,
&command_output);
}
p = command_output.data;
for (j = 0; j < n * n; j++)
{
if (sscanf(p, "%d,%d,%d", &x, &y, &z) != 3)
{
fprintf(stderr, _("cannot parse --csv output: %s\n"), p);
PQfinish(conn);
exit(ERR_INTERNAL);
}
cube[i * n * n + (x - 1) * n + (y - 1)] =
(z == -1) ? -1 : 0;
while (*p && (*p != '\n'))
p++;
if (*p == '\n')
p++;
}
}
printf("%*s | Id ", name_length, node_header);
for (i = 0; i < n; i++)
printf("| %2d ", i+1);
printf("\n");
for (i = 0; i < name_length; i++)
printf("-");
printf("-+----");
for (i = 0; i < n; i++)
printf("+----");
printf("\n");
for (i = 0; i < n; i++)
{
printf("%*s | %2d ", name_length,
PQgetvalue(res, i, 3), i + 1);
for (j = 0; j < n; j++)
{
u = cube[i * n + j];
for (k = 1; k < n; k++)
{
/*
* The value of entry (i,j) is equal to the
* maximum value of all the (i,j,k). Indeed:
*
* - if one of the (i,j,k) is 0 (node up), then 0
* (the node is up);
*
* - if the (i,j,k) are either -1 (down) or -2
* (unknown), then -1 (the node is down);
*
* - if all the (i,j,k) are -2 (unknown), then -2
* (the node is in an unknown state).
*/
v = cube[k * n * n + i * n + j];
if (v > u) u = v;
}
switch (u)
{
case -2:
c = '?';
break;
case -1:
c = 'x';
break;
case 0:
c = '*';
break;
default:
exit(ERR_INTERNAL);
}
printf("| %c ", c);
}
printf("\n");
}
PQclear(res);
}
static void
do_cluster_cleanup(void)
{