Add "cluster matrix" mode and "ssh_hostname" parameter

- The "cluster matrix" command supports CSV mode via the --csv
  switch.
- Add the optional ssh_hostname configuration parameter, which is
  required by "cluster matrix".
- A corresponding ssh_hostname column has been added to the repl_nodes
  table and to the repl_show_nodes view.
This commit is contained in:
Gianni Ciolli
2016-05-24 00:07:03 +02:00
committed by Ian Barwick
parent 77de5dbeeb
commit 9b5b9acb82
8 changed files with 295 additions and 23 deletions

View File

@@ -1601,6 +1601,61 @@ which contains connection details for the local database.
The first column is the node's ID, and the second column represents the
node's status (0 = master, 1 = standby, -1 = failed).
* `cluster matrix`
Displays connection information for each pair of nodes in the
replication cluster. This command polls each registered server and
asks it to connect to each other node.
This command requires a valid `repmgr.conf` file on each node,
with the optional `ssh_hostname` parameter set.
Example 1 (all nodes up):
$ repmgr -f /etc/repmgr.conf cluster matrix
Name | Id | 1 | 2 | 3
-------+----+----+----+----
node1 | 1 | * | * | *
node2 | 2 | * | * | *
node3 | 3 | * | * | *
Example 2 (node1 and node2 up, node3 down):
$ repmgr -f /etc/repmgr.conf cluster matrix
Name | Id | 1 | 2 | 3
-------+----+----+----+----
node1 | 1 | * | * | x
node2 | 2 | * | * | x
node3 | 3 | ? | ? | ?
Each row corresponds to one server, and indicates the result of
testing an outbound connection from that server.
Since node3 is down, all the entries in its row are filled with
"?", meaning that there we cannot test outbound connections.
The other two nodes are up; the corresponding rows have "x" in the
column corresponding to node3, meaning that inbound connections to
that node have failed, and "*" in the columns corresponding to
node1 and node2, meaning that inbound connections to these nodes
have succeeded.
Example 3 (all nodes up, firewall dropping packets originating
from node2 and directed to port 5432 on node3)
After a long wait (same as before plus two timeouts, by default
one minute each), you will see the following output:
$ repmgr -f /etc/repmgr.conf cluster matrix
Name | Id | 1 | 2 | 3
-------+----+----+----+----
node1 | 1 | * | * | *
node2 | 2 | * | * | x
node3 | 3 | * | * | *
* `cluster cleanup`
Purges monitoring history from the `repl_monitor` table to prevent excessive

View File

@@ -217,6 +217,7 @@ parse_config(t_configuration_options *options)
memset(options->conninfo, 0, sizeof(options->conninfo));
memset(options->barman_server, 0, sizeof(options->barman_server));
memset(options->barman_config, 0, sizeof(options->barman_config));
memset(options->ssh_hostname, 0, sizeof(options->ssh_hostname));
options->failover = MANUAL_FAILOVER;
options->priority = DEFAULT_PRIORITY;
memset(options->node_name, 0, sizeof(options->node_name));
@@ -316,6 +317,8 @@ parse_config(t_configuration_options *options)
strncpy(options->barman_server, value, MAXLEN);
else if (strcmp(name, "barman_config") == 0)
strncpy(options->barman_config, value, MAXLEN);
else if (strcmp(name, "ssh_hostname") == 0)
strncpy(options->ssh_hostname, value, MAXLEN);
else if (strcmp(name, "rsync_options") == 0)
strncpy(options->rsync_options, value, QUERY_STR_LEN);
else if (strcmp(name, "ssh_options") == 0)
@@ -452,6 +455,10 @@ parse_config(t_configuration_options *options)
PQconninfoFree(conninfo_options);
}
/*
* TODO: Sanity check ssh_hostname
*/
if (config_errors.head != NULL)
{
exit_with_errors(&config_errors);
@@ -620,6 +627,10 @@ reload_config(t_configuration_options *orig_options)
PQfinish(conn);
}
/*
* TODO: Sanity check the new ssh_hostname
*/
/*
* No configuration problems detected - copy any changed values
*
@@ -648,6 +659,13 @@ reload_config(t_configuration_options *orig_options)
config_changed = true;
}
/* ssh_hostname */
if (strcmp(orig_options->ssh_hostname, new_options.ssh_hostname) != 0)
{
strcpy(orig_options->ssh_hostname, new_options.ssh_hostname);
config_changed = true;
}
/* node */
if (orig_options->node != new_options.node)
{

View File

@@ -60,6 +60,7 @@ typedef struct
char conninfo[MAXLEN];
char barman_server[MAXLEN];
char barman_config[MAXLEN];
char ssh_hostname[MAXLEN];
int failover;
int priority;
char node_name[MAXLEN];
@@ -93,7 +94,7 @@ typedef struct
* The following will initialize the structure with a minimal set of options;
* actual defaults are set in parse_config() before parsing the configuration file
*/
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", "", "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} }
#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", "", "", "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, { NULL, NULL } }
typedef struct ItemListCell
{

View File

@@ -1227,7 +1227,7 @@ witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster
/* Get current records from primary */
sqlquery_snprintf(sqlquery,
"SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name, active FROM %s.repl_nodes",
"SELECT id, type, upstream_node_id, name, conninfo, ssh_command, priority, slot_name, active FROM %s.repl_nodes",
get_repmgr_schema_quoted(masterconn));
log_verbose(LOG_DEBUG, "witness_copy_node_records():\n%s\n", sqlquery);
@@ -1263,11 +1263,12 @@ witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster
cluster_name,
PQgetvalue(res, i, 3),
PQgetvalue(res, i, 4),
atoi(PQgetvalue(res, i, 5)),
strlen(PQgetvalue(res, i, 6))
? PQgetvalue(res, i, 6)
PQgetvalue(res, i, 5),
atoi(PQgetvalue(res, i, 6)),
strlen(PQgetvalue(res, i, 7))
? PQgetvalue(res, i, 7)
: NULL,
(strcmp(PQgetvalue(res, i, 7), "t") == 0)
(strcmp(PQgetvalue(res, i, 8), "t") == 0)
? true
: false
);
@@ -1300,7 +1301,7 @@ witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster
* XXX we should pass the record parameters as a struct.
*/
bool
create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active)
create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, char *ssh_hostname, int priority, char *slot_name, bool active)
{
char sqlquery[QUERY_STR_LEN];
char upstream_node_id[MAXLEN];
@@ -1341,8 +1342,9 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
sqlquery_snprintf(sqlquery,
"INSERT INTO %s.repl_nodes "
" (id, type, upstream_node_id, cluster, "
" name, conninfo, slot_name, priority, active) "
"VALUES (%i, '%s', %s, '%s', '%s', '%s', %s, %i, %s) ",
" name, conninfo, ssh_hostname, slot_name, "
" priority, active) "
"VALUES (%i, '%s', %s, '%s', '%s', '%s', '%s', %s, %i, %s) ",
get_repmgr_schema_quoted(conn),
node,
type,
@@ -1350,6 +1352,7 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea
cluster_name,
node_name,
conninfo,
ssh_hostname,
slot_name_buf,
priority,
active == true ? "TRUE" : "FALSE");
@@ -1716,7 +1719,8 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info
sqlquery_snprintf(
sqlquery,
"SELECT id, type, upstream_node_id, name, conninfo, slot_name, priority, active"
"SELECT id, type, upstream_node_id, name, conninfo, ssh_hostname, "
" slot_name, priority, active"
" FROM %s.repl_nodes "
" WHERE cluster = '%s' "
" AND id = %i",
@@ -1789,9 +1793,10 @@ _get_node_record(PGconn *conn, char *cluster, char *sqlquery, t_node_info *node_
node_info->upstream_node_id = atoi(PQgetvalue(res, 0, 2));
strncpy(node_info->name, PQgetvalue(res, 0, 3), MAXLEN);
strncpy(node_info->conninfo_str, PQgetvalue(res, 0, 4), MAXLEN);
strncpy(node_info->slot_name, PQgetvalue(res, 0, 5), MAXLEN);
node_info->priority = atoi(PQgetvalue(res, 0, 6));
node_info->active = (strcmp(PQgetvalue(res, 0, 7), "t") == 0)
strncpy(node_info->ssh_hostname_str, PQgetvalue(res, 0, 5), MAXLEN);
strncpy(node_info->slot_name, PQgetvalue(res, 0, 6), MAXLEN);
node_info->priority = atoi(PQgetvalue(res, 0, 7));
node_info->active = (strcmp(PQgetvalue(res, 0, 8), "t") == 0)
? true
: false;

View File

@@ -45,6 +45,7 @@ typedef struct s_node_info
t_server_type type;
char name[MAXLEN];
char conninfo_str[MAXLEN];
char ssh_hostname_str[MAXLEN];
char slot_name[MAXLEN];
int priority;
bool active;
@@ -61,6 +62,7 @@ typedef struct s_node_info
"", \
"", \
"", \
"", \
DEFAULT_PRIORITY, \
true, \
false, \
@@ -126,7 +128,7 @@ bool start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint);
bool stop_backup(PGconn *conn, char *last_wal_segment);
bool set_config_bool(PGconn *conn, const char *config_param, bool state);
bool witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster_name);
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active);
bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, char *ssh_hostname, int priority, char *slot_name, bool active);
bool delete_node_record(PGconn *conn, int node, char *action);
int get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info);
int get_node_record_by_name(PGconn *conn, char *cluster, const char *node_name, t_node_info *node_info);

200
repmgr.c
View File

@@ -21,6 +21,7 @@
* WITNESS REGISTER
* WITNESS UNREGISTER
*
* CLUSTER MATRIX
* CLUSTER SHOW
* CLUSTER CLEANUP
*
@@ -88,7 +89,7 @@
#define WITNESS_UNREGISTER 12
#define CLUSTER_SHOW 13
#define CLUSTER_CLEANUP 14
#define CLUSTER_MATRIX 15
static int test_ssh_connection(char *host, char *remote_user);
static int copy_remote_files(char *host, char *remote_user, char *remote_path,
@@ -130,6 +131,7 @@ static void do_witness_register(PGconn *masterconn);
static void do_witness_unregister(void);
static void do_cluster_show(void);
static void do_cluster_matrix(void);
static void do_cluster_cleanup(void);
static void do_check_upstream_config(void);
static void do_help(void);
@@ -704,6 +706,8 @@ main(int argc, char **argv)
action = CLUSTER_SHOW;
else if (strcasecmp(server_cmd, "CLEANUP") == 0)
action = CLUSTER_CLEANUP;
else if (strcasecmp(server_cmd, "MATRIX") == 0)
action = CLUSTER_MATRIX;
}
else if (strcasecmp(server_mode, "WITNESS") == 0)
{
@@ -939,6 +943,9 @@ main(int argc, char **argv)
case WITNESS_UNREGISTER:
do_witness_unregister();
break;
case CLUSTER_MATRIX:
do_cluster_matrix();
break;
case CLUSTER_SHOW:
do_cluster_show();
break;
@@ -1059,8 +1066,7 @@ do_cluster_show(void)
if (runtime_options.csv_mode)
{
int connection_status =
(PQstatus(conn) == CONNECTION_OK) ?
(is_standby(conn) ? 1 : 0) : -1;
(PQstatus(conn) == CONNECTION_OK) ? 0 : -1;
printf("%s,%d\n", PQgetvalue(res, i, 4), connection_status);
}
else
@@ -1076,6 +1082,177 @@ do_cluster_show(void)
PQclear(res);
}
static void
do_cluster_matrix(void)
{
PGconn *conn;
PGresult *res;
char sqlquery[QUERY_STR_LEN];
int i, j;
const char *node_header = "Name";
int name_length = strlen(node_header);
int x, y;
int n = 0; /* number of nodes */
int *matrix;
char *p;
char command[MAXLEN];
PQExpBufferData command_output;
/* We need to connect to get the list of nodes */
log_info(_("connecting to database\n"));
conn = establish_db_connection(options.conninfo, true);
sqlquery_snprintf(sqlquery,
"SELECT conninfo, ssh_hostname, type, name, upstream_node_name, id"
" FROM %s.repl_show_nodes",
get_repmgr_schema_quoted(conn));
log_verbose(LOG_DEBUG, "do_cluster_show(): \n%s\n",sqlquery );
res = PQexec(conn, sqlquery);
if (PQresultStatus(res) != PGRES_TUPLES_OK)
{
log_err(_("Unable to retrieve node information from the database\n%s\n"),
PQerrorMessage(conn));
log_hint(_("Please check that all nodes have been registered\n"));
PQclear(res);
PQfinish(conn);
exit(ERR_BAD_CONFIG);
}
PQfinish(conn);
/*
* Allocate an empty matrix
*
* -2 == NULL
* -1 == Error
* 0 == OK
*/
n = PQntuples(res);
matrix = (int *) pg_malloc(sizeof(int) * n * n);
for (i = 0; i < n * n; i++)
matrix[i] = -2;
/*
* Find the maximum length of a node name
*/
for (i = 0; i < n; i++)
{
int name_length_cur;
name_length_cur = strlen(PQgetvalue(res, i, 3));
if (name_length_cur > name_length)
name_length = name_length_cur;
}
for (i = 0; i < n; i++)
{
int connection_status;
conn = establish_db_connection(PQgetvalue(res, i, 0), false);
connection_status =
(PQstatus(conn) == CONNECTION_OK) ? 0 : -1;
matrix[(options.node - 1) * n + i] =
connection_status;
if (connection_status)
continue;
if (i + 1 == options.node)
continue;
maxlen_snprintf(command,
"repmgr cluster show --csv");
initPQExpBuffer(&command_output);
(void)remote_command(
PQgetvalue(res, i, 1),
"postgres",
command,
&command_output);
p = command_output.data;
for (j = 0; j < n; j++)
{
if (sscanf(p, "%d,%d", &x, &y) != 2)
{
fprintf(stderr, _("cannot parse --csv output: %s\n"), p);
PQfinish(conn);
exit(ERR_INTERNAL);
}
matrix[i * n + (x - 1)] =
(y == -1) ? -1 : 0;
while (*p && (*p != '\n'))
p++;
if (*p == '\n')
p++;
}
PQfinish(conn);
}
if (runtime_options.csv_mode)
{
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
printf("%d,%d,%d\n",
i + 1, j + 1,
matrix[i * n + j]);
}
else
{
char c;
printf("%*s | Id ", name_length, node_header);
for (i = 0; i < n; i++)
printf("| %2d ", i+1);
printf("\n");
for (i = 0; i < name_length; i++)
printf("-");
printf("-+----");
for (i = 0; i < n; i++)
printf("+----");
printf("\n");
for (i = 0; i < n; i++)
{
printf("%*s | %2d ", name_length,
PQgetvalue(res, i, 3), i + 1);
for (j = 0; j < n; j++)
{
switch (matrix[i * n + j])
{
case -2:
c = '?';
break;
case -1:
c = 'x';
break;
case 0:
c = '*';
break;
default:
exit(ERR_INTERNAL);
}
printf("| %c ", c);
}
printf("\n");
}
}
PQclear(res);
}
static void
do_cluster_cleanup(void)
{
@@ -1298,6 +1475,7 @@ do_master_register(void)
options.cluster_name,
options.node_name,
options.conninfo,
options.ssh_hostname,
options.priority,
repmgr_slot_name_ptr,
true);
@@ -1423,6 +1601,7 @@ do_standby_register(void)
options.cluster_name,
options.node_name,
options.conninfo,
options.ssh_hostname,
options.priority,
repmgr_slot_name_ptr,
true);
@@ -5354,6 +5533,7 @@ do_witness_register(PGconn *masterconn)
options.cluster_name,
options.node_name,
options.conninfo,
options.ssh_hostname,
options.priority,
NULL,
true);
@@ -5583,7 +5763,8 @@ do_help(void)
printf(_(" --pg_rewind[=VALUE] (standby switchover) 9.3/9.4 only - use pg_rewind if available,\n" \
" optionally providing a path to the binary\n"));
printf(_(" -k, --keep-history=VALUE (cluster cleanup) retain indicated number of days of history (default: 0)\n"));
printf(_(" --csv (cluster show) output in CSV mode (0 = master, 1 = standby, -1 = down)\n"));
printf(_(" --csv (cluster show, cluster matrix) output in CSV mode:\n" \
" 0 = OK, -1 = down, -2 = unknown\n"));
printf(_(" -P, --pwprompt (witness server) prompt for password when creating users\n"));
printf(_(" -S, --superuser=USERNAME (witness server) superuser username for witness database\n" \
" (default: postgres)\n"));
@@ -5602,6 +5783,7 @@ do_help(void)
printf(_(" witness register - registers a witness server\n"));
printf(_(" witness unregister - unregisters a witness server\n"));
printf(_(" cluster show - displays information about cluster nodes\n"));
printf(_(" cluster matrix - displays the cluster's connection matrix\n"));
printf(_(" cluster cleanup - prunes or truncates monitoring history\n" \
" (monitoring history creation requires repmgrd\n" \
" with --monitoring-history option)\n"));
@@ -6252,12 +6434,12 @@ check_parameters_for_action(const int action)
}
}
/* Warn about parameters which apply to CLUSTER SHOW only */
if (action != CLUSTER_SHOW)
/* Warn about parameters which apply only to CLUSTER SHOW and CLUSTER MATRIX */
if (action != CLUSTER_SHOW && action != CLUSTER_MATRIX)
{
if (runtime_options.csv_mode)
{
item_list_append(&cli_warnings, _("--csv can only be used when executing CLUSTER SHOW"));
item_list_append(&cli_warnings, _("--csv can only be used when executing CLUSTER SHOW or CLUSTER MATRIX"));
}
}
@@ -6349,6 +6531,7 @@ create_schema(PGconn *conn)
" cluster TEXT NOT NULL, "
" name TEXT NOT NULL, "
" conninfo TEXT NOT NULL, "
" ssh_hostname TEXT NULL, "
" slot_name TEXT NULL, "
" priority INTEGER NOT NULL, "
" active BOOLEAN NOT NULL DEFAULT TRUE )",
@@ -6484,7 +6667,8 @@ create_schema(PGconn *conn)
/* CREATE VIEW repl_show_nodes */
sqlquery_snprintf(sqlquery,
"CREATE VIEW %s.repl_show_nodes AS "
"SELECT rn.id, rn.conninfo, rn.type, rn.name, rn.cluster,"
"SELECT rn.id, rn.conninfo, rn.ssh_hostname, "
" rn.type, rn.name, rn.cluster,"
" rn.priority, rn.active, sq.name AS upstream_node_name"
" FROM %s.repl_nodes as rn"
" LEFT JOIN %s.repl_nodes AS sq"

View File

@@ -42,6 +42,12 @@
# Optional configuration items
# ============================
# SSH connection information
# We recommend using the "postgres" user, so it is enough to indicate the hostname.
# If extra parameters such as port, etc. are needed, they can be
# specified in the .ssh/config file.
#ssh_hostname='192.168.204.104'
# Replication settings
# ---------------------

View File

@@ -17,6 +17,7 @@ CREATE TABLE repl_nodes (
cluster text not null, -- Name to identify the cluster
name text not null,
conninfo text not null,
ssh_command text,
priority integer not null,
witness boolean not null default false
);
@@ -65,6 +66,6 @@ CREATE INDEX idx_repl_status_sort ON repl_monitor(last_monitor_time, standby_nod
* in each case (when appliable)
*/
CREATE VIEW repl_show_nodes AS
SELECT rn.id, rn.conninfo, rn.type, rn.name, rn.cluster,
SELECT rn.id, rn.conninfo, rn.ssh_command, rn.type, rn.name, rn.cluster,
rn.priority, rn.active, sq.name AS upstream_node_name
FROM repl_nodes as rn LEFT JOIN repl_nodes AS sq ON sq.id=rn.upstream_node_id;