diff --git a/HISTORY b/HISTORY index bad2bacd..59a5f7e4 100644 --- a/HISTORY +++ b/HISTORY @@ -2,6 +2,8 @@ repmgr: add parameter "shutdown_check_timeout" for use by "standby switchover"; GitHub #504 (Ian) repmgr: add "--node-id" option to "repmgr cluster cleanup"; GitHub #493 (Ian) + repmgr: report unreachable nodes when running "repmgr cluster (matrix|crosscheck); + GitHub #246 (Ian) repmgr: add configuration file parameter "repmgr_bindir"; GitHub #246 (Ian) 4.1.1 2018-09-05 diff --git a/doc/appendix-release-notes.sgml b/doc/appendix-release-notes.sgml index 6f582f98..842e40b4 100644 --- a/doc/appendix-release-notes.sgml +++ b/doc/appendix-release-notes.sgml @@ -71,6 +71,27 @@ + + + When running + repmgr cluster matrix and + repmgr cluster crosscheck, + &repmgr; will report nodes unreachable via SSH, and emit return code ERR_BAD_SSH. + (GitHub #246). + + + + Users relying on + repmgr cluster crosscheck + to return a non-zero return code as a way of detecting connectivity errors should be aware + that ERR_BAD_SSH will be returned if there is an SSH connection error + from the node where the command is executed, even if the command is able to establish + that PostgreSQL connectivity is fine. Therefore the exact return code should be checked + to determine what kind of connectivity error has been detected. + + + + diff --git a/doc/repmgr-cluster-crosscheck.sgml b/doc/repmgr-cluster-crosscheck.sgml index c3aaeb75..0a263a27 100644 --- a/doc/repmgr-cluster-crosscheck.sgml +++ b/doc/repmgr-cluster-crosscheck.sgml @@ -55,12 +55,37 @@ + + + + + One or more nodes could not be accessed via SSH. + + + + This only applies to nodes unreachable from the node where + this command is executed. + + + It's also possible that the crosscheck establishes that + connections between PostgreSQL on all nodes are functioning, + even if SSH access between some nodes is not possible. + + + + + - One or more nodes could not be reached. + PostgreSQL on one or more nodes could not be reached. + + + This error code overrides . + + diff --git a/doc/repmgr-cluster-matrix.sgml b/doc/repmgr-cluster-matrix.sgml index 90992199..a3bb1492 100644 --- a/doc/repmgr-cluster-matrix.sgml +++ b/doc/repmgr-cluster-matrix.sgml @@ -115,12 +115,26 @@ + + + + + One or more nodes could not be accessed via SSH. + + + + - One or more nodes could not be reached. + PostgreSQL on one or more nodes could not be reached. + + + This error code overrides . + + diff --git a/repmgr-action-cluster.c b/repmgr-action-cluster.c index 21470d64..64db3b86 100644 --- a/repmgr-action-cluster.c +++ b/repmgr-action-cluster.c @@ -55,8 +55,8 @@ struct ColHeader headers_event[EVENT_HEADER_COUNT]; -static int build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length); -static int build_cluster_crosscheck(t_node_status_cube ***cube_dest, int *name_length); +static int build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, ItemList *warnings, int *error_code); +static int build_cluster_crosscheck(t_node_status_cube ***cube_dest, int *name_length, ItemList *warnings, int *error_code); static void cube_set_node_status(t_node_status_cube **cube, int n, int node_id, int matrix_node_id, int connection_node_id, int connection_status); /* @@ -602,9 +602,12 @@ do_cluster_crosscheck(void) t_node_status_cube **cube; - bool error_found = false; + bool connection_error_found = false; + int error_code = SUCCESS; + ItemList warnings = {NULL, NULL}; + + n = build_cluster_crosscheck(&cube, &name_length, &warnings, &error_code); - n = build_cluster_crosscheck(&cube, &name_length); if (runtime_options.output_mode == OM_CSV) { for (i = 0; i < n; i++) @@ -626,6 +629,11 @@ do_cluster_crosscheck(void) cube[i]->node_id, cube[j]->node_id, max_node_status); + + if (max_node_status == -1) + { + connection_error_found = true; + } } } @@ -683,16 +691,16 @@ do_cluster_crosscheck(void) { case -2: c = '?'; - error_found = true; break; case -1: c = 'x'; - error_found = true; + connection_error_found = true; break; case 0: c = '*'; break; default: + log_error("unexpected node status value %i", max_node_status); exit(ERR_INTERNAL); } @@ -701,6 +709,13 @@ do_cluster_crosscheck(void) printf("\n"); } + + if (warnings.head != NULL && runtime_options.terse == false) + { + log_warning(_("following problems detected:")); + print_item_list(&warnings); + } + } /* clean up allocated cube array */ @@ -727,13 +742,23 @@ do_cluster_crosscheck(void) free(cube); } - if (error_found == true) + /* errors detected by build_cluster_crosscheck() have priority */ + if (connection_error_found == true) { - exit(ERR_NODE_STATUS); + error_code = ERR_NODE_STATUS; } + + exit(error_code); + } +/* + * CLUSTER MATRIX + * + * Parameters: + * --csv + */ void do_cluster_matrix() { @@ -746,18 +771,30 @@ do_cluster_matrix() t_node_matrix_rec **matrix_rec_list; - bool error_found = false; + bool connection_error_found = false; + int error_code = SUCCESS; + ItemList warnings = {NULL, NULL}; - n = build_cluster_matrix(&matrix_rec_list, &name_length); + n = build_cluster_matrix(&matrix_rec_list, &name_length, &warnings, &error_code); if (runtime_options.output_mode == OM_CSV) { for (i = 0; i < n; i++) + { for (j = 0; j < n; j++) + { printf("%d,%d,%d\n", matrix_rec_list[i]->node_id, matrix_rec_list[i]->node_status_list[j]->node_id, matrix_rec_list[i]->node_status_list[j]->node_status); + + if (matrix_rec_list[i]->node_status_list[j]->node_status == -2 + || matrix_rec_list[i]->node_status_list[j]->node_status == -1) + { + connection_error_found = true; + } + } + } } else { @@ -786,16 +823,16 @@ do_cluster_matrix() { case -2: c = '?'; - error_found = true; break; case -1: c = 'x'; - error_found = true; + connection_error_found = true; break; case 0: c = '*'; break; default: + log_error("unexpected node status value %i", matrix_rec_list[i]->node_status_list[j]->node_status); exit(ERR_INTERNAL); } @@ -803,6 +840,13 @@ do_cluster_matrix() } printf("\n"); } + + if (warnings.head != NULL && runtime_options.terse == false) + { + log_warning(_("following problems detected:")); + print_item_list(&warnings); + } + } for (i = 0; i < n; i++) @@ -817,10 +861,13 @@ do_cluster_matrix() free(matrix_rec_list); - if (error_found == true) + /* actual database connection errors have priority */ + if (connection_error_found == true) { - exit(ERR_NODE_STATUS); + error_code = ERR_NODE_STATUS; } + + exit(error_code); } @@ -849,7 +896,7 @@ matrix_set_node_status(t_node_matrix_rec **matrix_rec_list, int n, int node_id, static int -build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length) +build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length, ItemList *warnings, int *error_code) { PGconn *conn = NULL; int i = 0, @@ -897,7 +944,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length) /* * Allocate an empty matrix record list * - * -2 == NULL ? -1 == Error x 0 == OK * + * -2 == NULL ? -1 == Error x 0 == OK */ matrix_rec_list = (t_node_matrix_rec **) pg_malloc0(sizeof(t_node_matrix_rec) * nodes.node_count); @@ -1017,32 +1064,50 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length) termPQExpBuffer(&command); - for (j = 0; j < nodes.node_count; j++) + /* no output returned - probably SSH error */ + if (p[0] == '\0' || p[0] == '\n') { - if (sscanf(p, "%d,%d", &x, &y) != 2) + item_list_append_format(warnings, + "node %i inaccessible via SSH", + connection_node_id); + *error_code = ERR_BAD_SSH; + } + else + { + for (j = 0; j < nodes.node_count; j++) { - fprintf(stderr, _("cannot parse --csv output: %s\n"), p); - PQfinish(node_conn); - exit(ERR_INTERNAL); + if (sscanf(p, "%d,%d", &x, &y) != 2) + { + matrix_set_node_status(matrix_rec_list, + nodes.node_count, + connection_node_id, + x, + -2); + + item_list_append_format(warnings, + "unable to parse --csv output for node %i; output returned was:\n\"%s\"", + connection_node_id, p); + *error_code = ERR_INTERNAL; + } + else + { + matrix_set_node_status(matrix_rec_list, + nodes.node_count, + connection_node_id, + x, + (y == -1) ? -1 : 0); + } + + while (*p && (*p != '\n')) + p++; + if (*p == '\n') + p++; } - - matrix_set_node_status(matrix_rec_list, - nodes.node_count, - connection_node_id, - x, - (y == -1) ? -1 : 0); - - while (*p && (*p != '\n')) - p++; - if (*p == '\n') - p++; } termPQExpBuffer(&command_output); PQfinish(node_conn); free_conninfo_params(&remote_conninfo); - - node_conn = NULL; } *matrix_rec_dest = matrix_rec_list; @@ -1055,7 +1120,7 @@ build_cluster_matrix(t_node_matrix_rec ***matrix_rec_dest, int *name_length) static int -build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length) +build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length, ItemList *warnings, int *error_code) { PGconn *conn = NULL; int h, @@ -1175,7 +1240,6 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length) initPQExpBuffer(&command_output); - /* fix to work with --node-id */ if (cube[i]->node_id == config_file_options.node_id) { (void) local_command_simple(command.data, @@ -1216,9 +1280,13 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length) p = command_output.data; - if (!strlen(command_output.data)) + if (p[0] == '\0' || p[0] == '\n') { + item_list_append_format(warnings, + "node %i inaccessible via SSH", + remote_node_id); termPQExpBuffer(&command_output); + *error_code = ERR_BAD_SSH; continue; } @@ -1230,16 +1298,23 @@ build_cluster_crosscheck(t_node_status_cube ***dest_cube, int *name_length) if (sscanf(p, "%d,%d,%d", &matrix_rec_node_id, &node_status_node_id, &node_status) != 3) { - fprintf(stderr, _("cannot parse --csv output: %s\n"), p); - exit(ERR_INTERNAL); + cube_set_node_status(cube, + nodes.node_count, + remote_node_id, + matrix_rec_node_id, + node_status_node_id, + -2); + *error_code = ERR_INTERNAL; + } + else + { + cube_set_node_status(cube, + nodes.node_count, + remote_node_id, + matrix_rec_node_id, + node_status_node_id, + node_status); } - - cube_set_node_status(cube, - nodes.node_count, - remote_node_id, - matrix_rec_node_id, - node_status_node_id, - node_status); while (*p && (*p != '\n')) p++;