diff --git a/README.md b/README.md index 9003f7ed..ca8fca83 100644 --- a/README.md +++ b/README.md @@ -1601,6 +1601,61 @@ which contains connection details for the local database. The first column is the node's ID, and the second column represents the node's status (0 = master, 1 = standby, -1 = failed). +* `cluster matrix` + + Displays connection information for each pair of nodes in the + replication cluster. This command polls each registered server and + asks it to connect to each other node. + + This command requires a valid `repmgr.conf` file on each node, + with the optional `ssh_hostname` parameter set. + + Example 1 (all nodes up): + + $ repmgr -f /etc/repmgr.conf cluster matrix + + Name | Id | 1 | 2 | 3 + -------+----+----+----+---- + node1 | 1 | * | * | * + node2 | 2 | * | * | * + node3 | 3 | * | * | * + + Example 2 (node1 and node2 up, node3 down): + + $ repmgr -f /etc/repmgr.conf cluster matrix + + Name | Id | 1 | 2 | 3 + -------+----+----+----+---- + node1 | 1 | * | * | x + node2 | 2 | * | * | x + node3 | 3 | ? | ? | ? + + Each row corresponds to one server, and indicates the result of + testing an outbound connection from that server. + + Since node3 is down, all the entries in its row are filled with + "?", meaning that there we cannot test outbound connections. + + The other two nodes are up; the corresponding rows have "x" in the + column corresponding to node3, meaning that inbound connections to + that node have failed, and "*" in the columns corresponding to + node1 and node2, meaning that inbound connections to these nodes + have succeeded. + + Example 3 (all nodes up, firewall dropping packets originating + from node2 and directed to port 5432 on node3) + + After a long wait (same as before plus two timeouts, by default + one minute each), you will see the following output: + + $ repmgr -f /etc/repmgr.conf cluster matrix + + Name | Id | 1 | 2 | 3 + -------+----+----+----+---- + node1 | 1 | * | * | * + node2 | 2 | * | * | x + node3 | 3 | * | * | * + * `cluster cleanup` Purges monitoring history from the `repl_monitor` table to prevent excessive diff --git a/config.c b/config.c index 5df5872d..62eb8779 100644 --- a/config.c +++ b/config.c @@ -217,6 +217,7 @@ parse_config(t_configuration_options *options) memset(options->conninfo, 0, sizeof(options->conninfo)); memset(options->barman_server, 0, sizeof(options->barman_server)); memset(options->barman_config, 0, sizeof(options->barman_config)); + memset(options->ssh_hostname, 0, sizeof(options->ssh_hostname)); options->failover = MANUAL_FAILOVER; options->priority = DEFAULT_PRIORITY; memset(options->node_name, 0, sizeof(options->node_name)); @@ -316,6 +317,8 @@ parse_config(t_configuration_options *options) strncpy(options->barman_server, value, MAXLEN); else if (strcmp(name, "barman_config") == 0) strncpy(options->barman_config, value, MAXLEN); + else if (strcmp(name, "ssh_hostname") == 0) + strncpy(options->ssh_hostname, value, MAXLEN); else if (strcmp(name, "rsync_options") == 0) strncpy(options->rsync_options, value, QUERY_STR_LEN); else if (strcmp(name, "ssh_options") == 0) @@ -452,6 +455,10 @@ parse_config(t_configuration_options *options) PQconninfoFree(conninfo_options); } + /* + * TODO: Sanity check ssh_hostname + */ + if (config_errors.head != NULL) { exit_with_errors(&config_errors); @@ -620,6 +627,10 @@ reload_config(t_configuration_options *orig_options) PQfinish(conn); } + /* + * TODO: Sanity check the new ssh_hostname + */ + /* * No configuration problems detected - copy any changed values * @@ -648,6 +659,13 @@ reload_config(t_configuration_options *orig_options) config_changed = true; } + /* ssh_hostname */ + if (strcmp(orig_options->ssh_hostname, new_options.ssh_hostname) != 0) + { + strcpy(orig_options->ssh_hostname, new_options.ssh_hostname); + config_changed = true; + } + /* node */ if (orig_options->node != new_options.node) { diff --git a/config.h b/config.h index 5b21fce3..6ad27dc7 100644 --- a/config.h +++ b/config.h @@ -60,6 +60,7 @@ typedef struct char conninfo[MAXLEN]; char barman_server[MAXLEN]; char barman_config[MAXLEN]; + char ssh_hostname[MAXLEN]; int failover; int priority; char node_name[MAXLEN]; @@ -93,7 +94,7 @@ typedef struct * The following will initialize the structure with a minimal set of options; * actual defaults are set in parse_config() before parsing the configuration file */ -#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", "", "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, {NULL, NULL} } +#define T_CONFIGURATION_OPTIONS_INITIALIZER { "", -1, NO_UPSTREAM_NODE, "", "", "", "", MANUAL_FAILOVER, -1, "", "", "", "", "", "", "", "", "", "", -1, -1, -1, "", "", "", "", "", 0, 0, 0, 0, "", { NULL, NULL }, { NULL, NULL } } typedef struct ItemListCell { diff --git a/dbutils.c b/dbutils.c index fd1080b8..969ca23f 100644 --- a/dbutils.c +++ b/dbutils.c @@ -1227,7 +1227,7 @@ witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster /* Get current records from primary */ sqlquery_snprintf(sqlquery, - "SELECT id, type, upstream_node_id, name, conninfo, priority, slot_name, active FROM %s.repl_nodes", + "SELECT id, type, upstream_node_id, name, conninfo, ssh_command, priority, slot_name, active FROM %s.repl_nodes", get_repmgr_schema_quoted(masterconn)); log_verbose(LOG_DEBUG, "witness_copy_node_records():\n%s\n", sqlquery); @@ -1263,11 +1263,12 @@ witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster cluster_name, PQgetvalue(res, i, 3), PQgetvalue(res, i, 4), - atoi(PQgetvalue(res, i, 5)), - strlen(PQgetvalue(res, i, 6)) - ? PQgetvalue(res, i, 6) + PQgetvalue(res, i, 5), + atoi(PQgetvalue(res, i, 6)), + strlen(PQgetvalue(res, i, 7)) + ? PQgetvalue(res, i, 7) : NULL, - (strcmp(PQgetvalue(res, i, 7), "t") == 0) + (strcmp(PQgetvalue(res, i, 8), "t") == 0) ? true : false ); @@ -1300,7 +1301,7 @@ witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster * XXX we should pass the record parameters as a struct. */ bool -create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active) +create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, char *ssh_hostname, int priority, char *slot_name, bool active) { char sqlquery[QUERY_STR_LEN]; char upstream_node_id[MAXLEN]; @@ -1341,8 +1342,9 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea sqlquery_snprintf(sqlquery, "INSERT INTO %s.repl_nodes " " (id, type, upstream_node_id, cluster, " - " name, conninfo, slot_name, priority, active) " - "VALUES (%i, '%s', %s, '%s', '%s', '%s', %s, %i, %s) ", + " name, conninfo, ssh_hostname, slot_name, " + " priority, active) " + "VALUES (%i, '%s', %s, '%s', '%s', '%s', '%s', %s, %i, %s) ", get_repmgr_schema_quoted(conn), node, type, @@ -1350,6 +1352,7 @@ create_node_record(PGconn *conn, char *action, int node, char *type, int upstrea cluster_name, node_name, conninfo, + ssh_hostname, slot_name_buf, priority, active == true ? "TRUE" : "FALSE"); @@ -1716,7 +1719,8 @@ get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info sqlquery_snprintf( sqlquery, - "SELECT id, type, upstream_node_id, name, conninfo, slot_name, priority, active" + "SELECT id, type, upstream_node_id, name, conninfo, ssh_hostname, " + " slot_name, priority, active" " FROM %s.repl_nodes " " WHERE cluster = '%s' " " AND id = %i", @@ -1789,9 +1793,10 @@ _get_node_record(PGconn *conn, char *cluster, char *sqlquery, t_node_info *node_ node_info->upstream_node_id = atoi(PQgetvalue(res, 0, 2)); strncpy(node_info->name, PQgetvalue(res, 0, 3), MAXLEN); strncpy(node_info->conninfo_str, PQgetvalue(res, 0, 4), MAXLEN); - strncpy(node_info->slot_name, PQgetvalue(res, 0, 5), MAXLEN); - node_info->priority = atoi(PQgetvalue(res, 0, 6)); - node_info->active = (strcmp(PQgetvalue(res, 0, 7), "t") == 0) + strncpy(node_info->ssh_hostname_str, PQgetvalue(res, 0, 5), MAXLEN); + strncpy(node_info->slot_name, PQgetvalue(res, 0, 6), MAXLEN); + node_info->priority = atoi(PQgetvalue(res, 0, 7)); + node_info->active = (strcmp(PQgetvalue(res, 0, 8), "t") == 0) ? true : false; diff --git a/dbutils.h b/dbutils.h index e3aebd79..e7fb5b99 100644 --- a/dbutils.h +++ b/dbutils.h @@ -45,6 +45,7 @@ typedef struct s_node_info t_server_type type; char name[MAXLEN]; char conninfo_str[MAXLEN]; + char ssh_hostname_str[MAXLEN]; char slot_name[MAXLEN]; int priority; bool active; @@ -61,6 +62,7 @@ typedef struct s_node_info "", \ "", \ "", \ + "", \ DEFAULT_PRIORITY, \ true, \ false, \ @@ -126,7 +128,7 @@ bool start_backup(PGconn *conn, char *first_wal_segment, bool fast_checkpoint); bool stop_backup(PGconn *conn, char *last_wal_segment); bool set_config_bool(PGconn *conn, const char *config_param, bool state); bool witness_copy_node_records(PGconn *masterconn, PGconn *witnessconn, char *cluster_name); -bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, int priority, char *slot_name, bool active); +bool create_node_record(PGconn *conn, char *action, int node, char *type, int upstream_node, char *cluster_name, char *node_name, char *conninfo, char *ssh_hostname, int priority, char *slot_name, bool active); bool delete_node_record(PGconn *conn, int node, char *action); int get_node_record(PGconn *conn, char *cluster, int node_id, t_node_info *node_info); int get_node_record_by_name(PGconn *conn, char *cluster, const char *node_name, t_node_info *node_info); diff --git a/repmgr.c b/repmgr.c index 866dfef7..49f9edea 100644 --- a/repmgr.c +++ b/repmgr.c @@ -21,6 +21,7 @@ * WITNESS REGISTER * WITNESS UNREGISTER * + * CLUSTER MATRIX * CLUSTER SHOW * CLUSTER CLEANUP * @@ -88,7 +89,7 @@ #define WITNESS_UNREGISTER 12 #define CLUSTER_SHOW 13 #define CLUSTER_CLEANUP 14 - +#define CLUSTER_MATRIX 15 static int test_ssh_connection(char *host, char *remote_user); static int copy_remote_files(char *host, char *remote_user, char *remote_path, @@ -130,6 +131,7 @@ static void do_witness_register(PGconn *masterconn); static void do_witness_unregister(void); static void do_cluster_show(void); +static void do_cluster_matrix(void); static void do_cluster_cleanup(void); static void do_check_upstream_config(void); static void do_help(void); @@ -704,6 +706,8 @@ main(int argc, char **argv) action = CLUSTER_SHOW; else if (strcasecmp(server_cmd, "CLEANUP") == 0) action = CLUSTER_CLEANUP; + else if (strcasecmp(server_cmd, "MATRIX") == 0) + action = CLUSTER_MATRIX; } else if (strcasecmp(server_mode, "WITNESS") == 0) { @@ -939,6 +943,9 @@ main(int argc, char **argv) case WITNESS_UNREGISTER: do_witness_unregister(); break; + case CLUSTER_MATRIX: + do_cluster_matrix(); + break; case CLUSTER_SHOW: do_cluster_show(); break; @@ -1059,8 +1066,7 @@ do_cluster_show(void) if (runtime_options.csv_mode) { int connection_status = - (PQstatus(conn) == CONNECTION_OK) ? - (is_standby(conn) ? 1 : 0) : -1; + (PQstatus(conn) == CONNECTION_OK) ? 0 : -1; printf("%s,%d\n", PQgetvalue(res, i, 4), connection_status); } else @@ -1076,6 +1082,177 @@ do_cluster_show(void) PQclear(res); } +static void +do_cluster_matrix(void) +{ + PGconn *conn; + PGresult *res; + char sqlquery[QUERY_STR_LEN]; + int i, j; + const char *node_header = "Name"; + int name_length = strlen(node_header); + + int x, y; + int n = 0; /* number of nodes */ + int *matrix; + char *p; + + char command[MAXLEN]; + PQExpBufferData command_output; + + /* We need to connect to get the list of nodes */ + log_info(_("connecting to database\n")); + conn = establish_db_connection(options.conninfo, true); + + sqlquery_snprintf(sqlquery, + "SELECT conninfo, ssh_hostname, type, name, upstream_node_name, id" + " FROM %s.repl_show_nodes", + get_repmgr_schema_quoted(conn)); + + log_verbose(LOG_DEBUG, "do_cluster_show(): \n%s\n",sqlquery ); + + res = PQexec(conn, sqlquery); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + log_err(_("Unable to retrieve node information from the database\n%s\n"), + PQerrorMessage(conn)); + log_hint(_("Please check that all nodes have been registered\n")); + + PQclear(res); + PQfinish(conn); + exit(ERR_BAD_CONFIG); + } + PQfinish(conn); + + /* + * Allocate an empty matrix + * + * -2 == NULL + * -1 == Error + * 0 == OK + */ + n = PQntuples(res); + matrix = (int *) pg_malloc(sizeof(int) * n * n); + for (i = 0; i < n * n; i++) + matrix[i] = -2; + + /* + * Find the maximum length of a node name + */ + for (i = 0; i < n; i++) + { + int name_length_cur; + + name_length_cur = strlen(PQgetvalue(res, i, 3)); + if (name_length_cur > name_length) + name_length = name_length_cur; + } + + for (i = 0; i < n; i++) + { + int connection_status; + + conn = establish_db_connection(PQgetvalue(res, i, 0), false); + + connection_status = + (PQstatus(conn) == CONNECTION_OK) ? 0 : -1; + + matrix[(options.node - 1) * n + i] = + connection_status; + + if (connection_status) + continue; + + if (i + 1 == options.node) + continue; + + maxlen_snprintf(command, + "repmgr cluster show --csv"); + + initPQExpBuffer(&command_output); + + (void)remote_command( + PQgetvalue(res, i, 1), + "postgres", + command, + &command_output); + + p = command_output.data; + + for (j = 0; j < n; j++) + { + if (sscanf(p, "%d,%d", &x, &y) != 2) + { + fprintf(stderr, _("cannot parse --csv output: %s\n"), p); + PQfinish(conn); + exit(ERR_INTERNAL); + } + matrix[i * n + (x - 1)] = + (y == -1) ? -1 : 0; + while (*p && (*p != '\n')) + p++; + if (*p == '\n') + p++; + } + + PQfinish(conn); + } + + if (runtime_options.csv_mode) + { + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + printf("%d,%d,%d\n", + i + 1, j + 1, + matrix[i * n + j]); + } + else + { + char c; + + printf("%*s | Id ", name_length, node_header); + for (i = 0; i < n; i++) + printf("| %2d ", i+1); + printf("\n"); + + for (i = 0; i < name_length; i++) + printf("-"); + printf("-+----"); + for (i = 0; i < n; i++) + printf("+----"); + printf("\n"); + + for (i = 0; i < n; i++) + { + printf("%*s | %2d ", name_length, + PQgetvalue(res, i, 3), i + 1); + for (j = 0; j < n; j++) + { + switch (matrix[i * n + j]) + { + case -2: + c = '?'; + break; + case -1: + c = 'x'; + break; + case 0: + c = '*'; + break; + default: + exit(ERR_INTERNAL); + } + + printf("| %c ", c); + } + printf("\n"); + } + } + + PQclear(res); +} + static void do_cluster_cleanup(void) { @@ -1298,6 +1475,7 @@ do_master_register(void) options.cluster_name, options.node_name, options.conninfo, + options.ssh_hostname, options.priority, repmgr_slot_name_ptr, true); @@ -1423,6 +1601,7 @@ do_standby_register(void) options.cluster_name, options.node_name, options.conninfo, + options.ssh_hostname, options.priority, repmgr_slot_name_ptr, true); @@ -5354,6 +5533,7 @@ do_witness_register(PGconn *masterconn) options.cluster_name, options.node_name, options.conninfo, + options.ssh_hostname, options.priority, NULL, true); @@ -5583,7 +5763,8 @@ do_help(void) printf(_(" --pg_rewind[=VALUE] (standby switchover) 9.3/9.4 only - use pg_rewind if available,\n" \ " optionally providing a path to the binary\n")); printf(_(" -k, --keep-history=VALUE (cluster cleanup) retain indicated number of days of history (default: 0)\n")); - printf(_(" --csv (cluster show) output in CSV mode (0 = master, 1 = standby, -1 = down)\n")); + printf(_(" --csv (cluster show, cluster matrix) output in CSV mode:\n" \ + " 0 = OK, -1 = down, -2 = unknown\n")); printf(_(" -P, --pwprompt (witness server) prompt for password when creating users\n")); printf(_(" -S, --superuser=USERNAME (witness server) superuser username for witness database\n" \ " (default: postgres)\n")); @@ -5602,6 +5783,7 @@ do_help(void) printf(_(" witness register - registers a witness server\n")); printf(_(" witness unregister - unregisters a witness server\n")); printf(_(" cluster show - displays information about cluster nodes\n")); + printf(_(" cluster matrix - displays the cluster's connection matrix\n")); printf(_(" cluster cleanup - prunes or truncates monitoring history\n" \ " (monitoring history creation requires repmgrd\n" \ " with --monitoring-history option)\n")); @@ -6252,12 +6434,12 @@ check_parameters_for_action(const int action) } } - /* Warn about parameters which apply to CLUSTER SHOW only */ - if (action != CLUSTER_SHOW) + /* Warn about parameters which apply only to CLUSTER SHOW and CLUSTER MATRIX */ + if (action != CLUSTER_SHOW && action != CLUSTER_MATRIX) { if (runtime_options.csv_mode) { - item_list_append(&cli_warnings, _("--csv can only be used when executing CLUSTER SHOW")); + item_list_append(&cli_warnings, _("--csv can only be used when executing CLUSTER SHOW or CLUSTER MATRIX")); } } @@ -6349,6 +6531,7 @@ create_schema(PGconn *conn) " cluster TEXT NOT NULL, " " name TEXT NOT NULL, " " conninfo TEXT NOT NULL, " + " ssh_hostname TEXT NULL, " " slot_name TEXT NULL, " " priority INTEGER NOT NULL, " " active BOOLEAN NOT NULL DEFAULT TRUE )", @@ -6484,7 +6667,8 @@ create_schema(PGconn *conn) /* CREATE VIEW repl_show_nodes */ sqlquery_snprintf(sqlquery, "CREATE VIEW %s.repl_show_nodes AS " - "SELECT rn.id, rn.conninfo, rn.type, rn.name, rn.cluster," + "SELECT rn.id, rn.conninfo, rn.ssh_hostname, " + " rn.type, rn.name, rn.cluster," " rn.priority, rn.active, sq.name AS upstream_node_name" " FROM %s.repl_nodes as rn" " LEFT JOIN %s.repl_nodes AS sq" diff --git a/repmgr.conf.sample b/repmgr.conf.sample index 58a558db..5ffb840c 100644 --- a/repmgr.conf.sample +++ b/repmgr.conf.sample @@ -42,6 +42,12 @@ # Optional configuration items # ============================ +# SSH connection information +# We recommend using the "postgres" user, so it is enough to indicate the hostname. +# If extra parameters such as port, etc. are needed, they can be +# specified in the .ssh/config file. +#ssh_hostname='192.168.204.104' + # Replication settings # --------------------- diff --git a/repmgr.sql b/repmgr.sql index 9628d3f6..6361ad35 100644 --- a/repmgr.sql +++ b/repmgr.sql @@ -17,6 +17,7 @@ CREATE TABLE repl_nodes ( cluster text not null, -- Name to identify the cluster name text not null, conninfo text not null, + ssh_command text, priority integer not null, witness boolean not null default false ); @@ -65,6 +66,6 @@ CREATE INDEX idx_repl_status_sort ON repl_monitor(last_monitor_time, standby_nod * in each case (when appliable) */ CREATE VIEW repl_show_nodes AS -SELECT rn.id, rn.conninfo, rn.type, rn.name, rn.cluster, +SELECT rn.id, rn.conninfo, rn.ssh_command, rn.type, rn.name, rn.cluster, rn.priority, rn.active, sq.name AS upstream_node_name FROM repl_nodes as rn LEFT JOIN repl_nodes AS sq ON sq.id=rn.upstream_node_id;