diff --git c/Makefile.am w/Makefile.am index 467ca4a..3488271 100644 --- c/Makefile.am +++ w/Makefile.am @@ -182,6 +182,7 @@ TESTS = \ tests/datamash-io-errors-cheap.sh \ tests/datamash-strbin.sh \ tests/datamash-valgrind.sh \ + tests/datamash-vnlog.pl \ tests/decorate-tests.pl \ tests/decorate-errors.pl \ tests/decorate-sort-tests.pl diff --git c/src/datamash.c w/src/datamash.c index 8c0d4ce..88b9292 100644 --- c/src/datamash.c +++ w/src/datamash.c @@ -76,12 +76,6 @@ static size_t line_number = 0 ; /* Lines in the current group */ static size_t lines_in_group = 0 ; -/* Print Output Header */ -static bool output_header = false; - -/* Input file has a header line */ -static bool input_header = false; - /* If true, print the entire input line. Otherwise, print only the key fields */ static bool print_full_line = false; @@ -115,6 +109,7 @@ enum OUTPUT_DELIMITER_OPTION, CUSTOM_FORMAT_OPTION, SORT_PROGRAM_OPTION, + VNLOG_OPTION, UNDOC_PRINT_INF_OPTION, UNDOC_PRINT_NAN_OPTION, UNDOC_PRINT_PROGNAME_OPTION, @@ -134,6 +129,8 @@ static struct option const long_options[] = {"header-in", no_argument, NULL, INPUT_HEADER_OPTION}, {"header-out", no_argument, NULL, OUTPUT_HEADER_OPTION}, {"headers", no_argument, NULL, 'H'}, + {"vnlog", no_argument, NULL, VNLOG_OPTION}, + {"vnl", no_argument, NULL, VNLOG_OPTION}, {"full", no_argument, NULL, 'f'}, {"filler", required_argument, NULL, 'F'}, {"format", required_argument, NULL, CUSTOM_FORMAT_OPTION}, @@ -253,6 +250,10 @@ which require a pair of fields (e.g. 
'pcov 2:6').\n"), stdout); "), stdout); fputs (_("\ -H, --headers same as '--header-in --header-out'\n\ +"), stdout); + fputs (_("\ + --vnl, --vnlog Reads and writes data in the vnlog format.\n\ + Implies -C -H -W\n\ "), stdout); fputs (_("\ -i, --ignore-case ignore upper/lower case when comparing text;\n\ @@ -469,6 +470,9 @@ print_input_line (const struct line_record_t* lb) static void print_column_headers () { + if ( vnlog ) + printf ("# "); + if (print_full_line) { /* Print the headers of all the input fields */ @@ -556,7 +560,8 @@ process_input_header (FILE *stream) struct line_record_t lr; line_record_init (&lr); - if (line_record_fread (&lr, stream, eolchar, skip_comments)) + + if (line_record_fread (&lr, stream, eolchar, skip_comments, vnlog)) { build_input_line_headers (&lr, true); line_number++; @@ -667,12 +672,10 @@ is deprecated and will be disabled in a future release.\n"), stderr); print_column_headers (); - while (true) + while (line_record_fread (thisline, input_stream, eolchar, skip_comments, false)) { bool new_group = false; - if (!line_record_fread (thisline, input_stream, eolchar, skip_comments)) - break; line_number++; /* If there's no input header line, and the user requested an output @@ -747,7 +750,7 @@ transpose_file () num_lines++; line_record_init (thisline); - if (!line_record_fread (thisline, input_stream, eolchar, skip_comments)) + if (!line_record_fread (thisline, input_stream, eolchar, skip_comments, false)) break; line_number++; @@ -802,10 +805,9 @@ reverse_fields_in_file () thisline = &lr; line_record_init (thisline); - while (true) + while (line_record_fread (thisline, input_stream, eolchar, skip_comments, + vnlog && line_number==0)) { - if (!line_record_fread (thisline, input_stream, eolchar, skip_comments)) - break; line_number++; const size_t num_fields = line_record_num_fields (thisline); @@ -823,6 +825,31 @@ reverse_fields_in_file () /* Special handling for header line */ if (line_number == 1) { + if (vnlog) + { + /* If using 
named-columns, find the column numbers after reading the + header line. */ + build_input_line_headers (&lr, true); + group_columns_find_named_columns (); + + fprintf(stdout, "# "); + const size_t num_fields = line_record_num_fields (thisline); + for (size_t i = num_fields ; i >= 1 ; --i) { + if (i1) @@ -1054,10 +1079,8 @@ remove_dups_in_file () /* TODO: handle (output_header && !input_header) by generating dummy headers after the first line is read, and the number of fields is known. */ - while (true) + while (line_record_fread (thisline, input_stream, eolchar, skip_comments, false)) { - if (!line_record_fread (thisline, input_stream, eolchar, skip_comments)) - break; line_number++; if (!line_record_get_field (thisline, key_col, &str, &len)) @@ -1264,6 +1287,15 @@ int main (int argc, char* argv[]) case_sensitive = false; break; + case VNLOG_OPTION: + skip_comments = true; + input_header = output_header = true; + missing_field_filler = "-"; + in_tab = TAB_WHITESPACE; + out_tab = ' '; + vnlog = true; + break; + case 'z': eolchar = 0; break; @@ -1390,6 +1422,34 @@ int main (int argc, char* argv[]) die (EXIT_FAILURE, 0, _("-H or --header-in must be used with named columns")); + if(vnlog) + { + if(!skip_comments) + die (EXIT_FAILURE, 0, + _("vnlog processing always skips comments")); + if(!input_header) + die (EXIT_FAILURE, 0, + _("vnlog processing always reads field labels")); + if(!output_header) + die (EXIT_FAILURE, 0, + _("vnlog processing always reads field labels")); + if(0 != strcmp(missing_field_filler, "-")) + die (EXIT_FAILURE, 0, + _("vnlog processing always uses '-' for empty fields")); + if(in_tab != TAB_WHITESPACE) + die (EXIT_FAILURE, 0, + _("vnlog processing always uses whitespace to separate input fields")); + if(out_tab != ' ') + die (EXIT_FAILURE, 0, + _("vnlog processing always uses ' ' to separate output fields")); + if(explicit_output_delimiter != -1) + die (EXIT_FAILURE, 0, + _("vnlog processing always uses the default output delimiter")); + 
if(eolchar != '\n') + die (EXIT_FAILURE, 0, + _("vnlog processing always uses '\\n' to terminate output lines")); + } + open_input (); switch (dm->mode) /* LCOV_EXCL_BR_LINE */ { diff --git c/src/op-parser.c w/src/op-parser.c index 0ebde2d..8a2c3cf 100644 --- c/src/op-parser.c +++ w/src/op-parser.c @@ -31,6 +31,7 @@ #include "op-parser.h" #include "utils.h" #include "field-ops.h" +#include "text-options.h" static struct datamash_ops *dm = NULL; @@ -262,6 +263,10 @@ parse_simple_operation_column (struct parser_field_t /*OUTPUT*/ *p, { assert (p); /* LCOV_EXCL_LINE */ enum TOKEN tok = scanner_get_token (); + + if (vnlog && tok == TOK_INTEGER) + tok = TOK_IDENTIFIER; + switch (tok) /* LCOV_EXCL_BR */ { case TOK_IDENTIFIER: @@ -541,6 +546,10 @@ static void parse_mode_column (enum processing_mode pm) { enum TOKEN tok = scanner_get_token (); + + if (vnlog && tok == TOK_INTEGER) + tok = TOK_IDENTIFIER; + switch (tok) /* LCOV_EXCL_BR */ { case TOK_IDENTIFIER: diff --git c/src/text-lines.c w/src/text-lines.c index 2d4544b..dbcb892 100644 --- c/src/text-lines.c +++ w/src/text-lines.c @@ -34,6 +34,7 @@ #include "text-options.h" #include "text-lines.h" +#include "die.h" void line_record_init (struct line_record_t* lr) @@ -91,21 +92,32 @@ line_record_reserve_fields (struct line_record_t* lr, const size_t n) } static void -line_record_parse_fields (struct line_record_t *lr, int field_delim) +line_record_parse_fields (/* The buffer. May or may not be the one in the + following argument */ + const struct linebuffer* lbuf, + + /* Used ONLY for the fields. 
The buffer is picked up + from the above argument */ + struct line_record_t *lr, + int field_delim, + bool ignore_trailing_comments, + bool ignore_trailing_whitespace) { size_t num_fields = 0; size_t pos = 0; - const size_t buflen = line_record_length (lr); - const char* fptr = line_record_buffer (lr); + const size_t buflen = lbuf->length; + const char* fptr = lbuf->buffer; + +#define IS_TRAILING_COMMENT (ignore_trailing_comments && (*fptr == '#' || *fptr == ';')) /* Move 'fptr' to point to the beginning of 'field' */ if (field_delim != TAB_WHITESPACE) { - while (buflen && pos<=buflen) + while (buflen && pos<=buflen && !IS_TRAILING_COMMENT) { /* scan buffer until next delimiter */ const char* field_beg = fptr; - while ( (posfields[num_fields].len = fptr - field_beg; ++num_fields; + if(IS_TRAILING_COMMENT) + pos = buflen; + /* Skip the delimiter */ ++pos; ++fptr; @@ -127,10 +142,10 @@ line_record_parse_fields (struct line_record_t *lr, int field_delim) { /* delimiter is white-space transition (multiple whitespaces are one delimiter) */ - while (posfields[num_fields].buf = field_beg; - lr->fields[num_fields].len = flen; - ++num_fields; + if(!ignore_trailing_whitespace || flen > 0) + { + line_record_reserve_fields (lr, num_fields); + lr->fields[num_fields].buf = field_beg; + lr->fields[num_fields].len = flen; + ++num_fields; + } } lr->num_fields = num_fields; } @@ -169,18 +187,104 @@ line_record_is_comment (const struct line_record_t* lr) return (c=='#' || c==';'); } +/* returns 0 if not a comment, 1 if a single comment, 2 if a double comment + ('##' or '#!') or if the line only contains whitespace or is empty. Used + only for vnlog processing */ +static int +line_leading_comment_count (const struct line_record_t* lr) +{ + const char* pch = line_record_buffer (lr); + + /* Skip white space at beginning of line */ + size_t s = strspn (pch, " \t"); + /* First non-whitespace character */ + const char* c = &pch[s]; + + /* empty line? 
*/ + if (c[0] == '\0') + return 2; + + /* not a comment? */ + if (c[0] != '#') + return 0; + + /* Have at least a single comment */ + if (c[1] == '#' || c[1] == '!') + return 2; + else + return 1; +} + bool -line_record_fread (struct /* in/out */ line_record_t* lr, - FILE *stream, char delimiter, bool skip_comments) +line_record_fread (struct /* in/out */ line_record_t *lr, + FILE *stream, char delimiter, + bool skip_comments, + bool vnlog_prologue) { - do { - if (readlinebuffer_delim (&lr->lbuf, stream, delimiter) == 0) - return false; - linebuffer_nullify (&lr->lbuf); - } while (skip_comments && line_record_is_comment (lr)); + while (1) + { + if (readlinebuffer_delim (&lr->lbuf, stream, delimiter) == 0) + return false; + linebuffer_nullify (&lr->lbuf); + if (vnlog) + { + if( vnlog_prologue ) + { + /* Validate and process single-commented vnlog header. + Skip double-comments and empty lines. */ + int leading_comment_count = line_leading_comment_count (lr); + if (leading_comment_count >= 2) + continue; + if (leading_comment_count == 1) + { + /* Strip the comment characters. + Skip leading regex '^\s*#\s*' */ + const char* pch = line_record_buffer (lr); + size_t s = strspn (pch, " \t#"); + struct linebuffer lbuf = lr->lbuf; + lbuf.buffer += s; + lbuf.length -= s; + if(lbuf.buffer[0] == '\0') + /* Ignore empty comment line. */ + continue; + line_record_parse_fields (&lbuf, lr, in_tab, - line_record_parse_fields (lr, in_tab); + // do NOT ignore comments. We're + // parsing the prologue + false, + + // ignore trailing whitespace + true + ); + return true; + } + + die (EXIT_FAILURE, 0, _("invalid vnlog data: received data line prior to the header: '%s'"), + line_record_buffer (lr)); + } + + /* vnlog data. 
Skip comments and empty lines */ + const char* pch = line_record_buffer (lr); + size_t s = strspn (pch, " \t"); + char c = pch[s]; + if (c=='#' || c=='\0') + continue; + break; + } + + if (skip_comments && line_record_is_comment (lr)) + continue; + + break; + } + + line_record_parse_fields (&lr->lbuf, lr, in_tab, + /* Ignore trailing comments only if --vnlog */ + vnlog && skip_comments, + + /* ignore trailing whitespace only if --vnlog */ + vnlog); return true; } diff --git c/src/text-lines.h w/src/text-lines.h index 2252a10..048f814 100644 --- c/src/text-lines.h +++ w/src/text-lines.h @@ -82,7 +82,8 @@ line_record_init (struct line_record_t* lr); bool line_record_fread (struct /* in/out */ line_record_t* lr, - FILE *stream, char delimiter, bool skip_comments); + FILE *stream, char delimiter, bool skip_comments, + bool vnlog_prologue); void line_record_free (struct line_record_t* lr); diff --git c/src/text-options.c w/src/text-options.c index 7739d8f..1ddb1a7 100644 --- c/src/text-options.c +++ w/src/text-options.c @@ -67,6 +67,14 @@ char* missing_field_filler = "N/A"; followed by '#' or ';'. See line_record_is_comment. */ bool skip_comments = false; +/* Print Output Header */ +bool output_header = false; + +/* Input file has a header line */ +bool input_header = false; + +bool vnlog = false; + #define UCHAR_LIM (UCHAR_MAX + 1) bool blanks[UCHAR_LIM]; diff --git c/src/text-options.h w/src/text-options.h index 4282016..46ac37d 100644 --- c/src/text-options.h +++ w/src/text-options.h @@ -68,6 +68,14 @@ extern char* missing_field_filler; followed by '#' or ';'. See line_record_is_comment. 
*/ extern bool skip_comments; +extern bool vnlog; + +/* Print Output Header */ +extern bool output_header; + +/* Input file has a header line */ +extern bool input_header; + #define UCHAR_LIM (UCHAR_MAX + 1) extern bool blanks[UCHAR_LIM]; diff --git c/tests/datamash-vnlog.pl w/tests/datamash-vnlog.pl new file mode 100755 index 0000000..09c680d --- /dev/null +++ w/tests/datamash-vnlog.pl @@ -0,0 +1,348 @@ +#!/usr/bin/env perl +=pod + Unit Tests for GNU Datamash - perform simple calculation on input data + + Copyright (C) 2013-2021 Assaf Gordon + Copyright (C) 2022 Dima Kogan + + This file is part of GNU Datamash. + + GNU Datamash is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + GNU Datamash is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GNU Datamash. If not, see <https://www.gnu.org/licenses/>. + + Written by Assaf Gordon. +=cut + +use strict; +use warnings; + +use lib '.'; +# Until a better way comes along to auto-use Coreutils Perl modules +# as in the coreutils' autotools system. +use Coreutils; +use CuSkip; +use CuTmpdir qw(datamash); +use MIME::Base64 ; + +(my $program_name = $0) =~ s|.*/||; +my $prog_bin = 'datamash'; + +## Cross-Compiling portability hack: +## under qemu/binfmt, argv[0] (which is used to report errors) will contain +## the full path of the binary, if the binary is on the $PATH. +## So we try to detect what is the actual returned value of the program +## in case of an error. 
+my $prog = `$prog_bin ---print-progname`; +$prog = $prog_bin unless $prog; + +# TODO: add localization tests with "grouping" +# Turn off localization of executable's output. +@ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; + + +my $in_basic = <<'EOF'; +#! comment +##comment +## comment + ## comment + + +#x y z + + +4 2 3 +4 - 6# comment +## comment +- 8 9 +EOF + +my $in_numeric_columns = <<'EOF'; +# 0 1 2 +1 2 3 +4 - 6 +- 8 9 +EOF + +# trailing whitespace +my $in_trailing_whitespace=<<'EOF'; +# x y +bar 5 +bbb 4 +EOF + + + + + + +my @Tests = (); # I will append to this list as I add tests +@Tests = + ( @Tests, + + ['basic-functionality', + '--vnlog sum z', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +# sum(z) +18 +EOF + }], + + ['basic-check', + '--vnlog check', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +3 lines, 3 fields +EOF + }], + + ['error-basic-unmatched-field', + '--vnlog sum zzz', + {IN_PIPE => $in_basic}, + {EXIT => 1}, + {ERR => "$prog: column name 'zzz' not found in input file\n"}], + + # Make sure trailing whitespace is ignored properly + ['trailing-whitespace', + '--vnlog check', + {IN_PIPE=>$in_trailing_whitespace}, + {OUT => <<'EOF' +2 lines, 2 fields +EOF + }], + + # I kinda think this trailing-whitespace logic should apply to non-vnlog + # runs too, but the mailing list consensus was that it shouldn't. If that + # changes, here's the (commented-out) test that flags the questionable + # logic. 
To make it work, tweak the call in line_record_fread() to change + # + # line_record_parse_fields (&lr->lbuf, lr, in_tab, + # // Ignore trailing comments only if --vnlog + # vnlog && skip_comments, + # // ignore trailing whitespace only if --vnlog + # vnlog); + # to + # line_record_parse_fields (&lr->lbuf, lr, in_tab, + # // Ignore trailing comments only if --vnlog + # vnlog && skip_comments, + # // ignore trailing whitespace + # true); + # The disabled test: + # ['trailing-whitespace-no-vnlog', + # '-W -C check', + # {IN_PIPE=>$in_trailing_whitespace}, + # {OUT => <<'EOF' + # 2 lines, 2 fields + # EOF + # }], + + ['sum-requires-numeric-data', + '--vnlog sum x', + {IN_PIPE => $in_basic}, + {EXIT => 1}, + {OUT => "# sum(x)\n" }, + {ERR => "$prog: invalid numeric value in line 4 field 1: '-'\n"} + ], + + ['unique', + '--vnlog unique x', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +# unique(x) +-,4 +EOF + }], + + ['collapse', + '--vnlog collapse x', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +# collapse(x) +4,4,- +EOF + }], + + ['need-data-before-legend', + '--vnlog sum z', + {IN_PIPE => "5\n" . 
$in_basic}, + {EXIT => 1}, + {ERR => "$prog: invalid vnlog data: received data line prior to the header: '5'\n" }], + + ['numeric-columns-not-allowed', + '--vnlog sum 1', + {IN_PIPE => $in_basic}, + {EXIT => 1}, + {ERR => "$prog: column name '1' not found in input file\n" + }], + + ['existing-numeric-columns', + '--vnlog sum 2', + {IN_PIPE => $in_numeric_columns}, + {OUT => <<'EOF' +# sum(2) +18 +EOF + }], + + ['groupby', + '--vnlog -g x sum z', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +# GroupBy(x) sum(z) +4 9 +- 9 +EOF +}], + + ['rmdup-x', + '--vnlog rmdup x', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +# x y z +4 2 3 +- 8 9 +EOF +}], + + ['rmdup-y', + '--vnlog rmdup y', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +# x y z +4 2 3 +4 - 6 +- 8 9 +EOF +}], + + ['reverse', + '--vnlog reverse', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +# z y x +3 2 4 +6 - 4 +9 8 - +EOF +}], + + # empty input = empty output + [ 'empty1', + '--vnlog count x', + {IN_PIPE=>""}, + {ERR=>""}], + [ 'empty2', + '--vnlog count x', + {IN_PIPE=>"# x"}, + {OUT=>"# count(x)\n"}], + + # various errors + [ 'error-groupby-unmatched-field', + '--vnlog -g zzz sum z', + {IN_PIPE => $in_basic}, + {EXIT => 1}, + {ERR => "$prog: column name 'zzz' not found in input file\n"} ], + + [ 'error-sum-empty-field', + '--vnlog sum ""', + {IN_PIPE => $in_basic}, + {EXIT => 1}, + {ERR => "$prog: missing field for operation 'sum'\n"}], + + [ 'error-groupby-missing-field', + '--vnlog -g x,,y sum z', + {IN_PIPE => $in_basic}, + {EXIT => 1}, + {ERR => "$prog: missing field for operation 'groupby'\n"}], + + # Commandline errors + ['option-parsing-error1', + '--vnlog -t: sum x', + {IN_PIPE => $in_basic}, + {EXIT => 1}, + {ERR => "$prog: vnlog processing always uses whitespace to separate input fields\n"} + ], + + ['basic-check-no-op-options', + '--vnlog -C --header-in --header-out check', + {IN_PIPE => $in_basic}, + {OUT => <<'EOF' +3 lines, 3 fields +EOF + }] +); + +my $in_xy = <<'EOF'; +# x y +1 0.5 +2 1 +3 1.5 +4 
2 +EOF + +@Tests = + ( @Tests, + + ['pcov', + '--vnlog pcov x:x pcov x:y pcov y:y', + {IN_PIPE => $in_xy}, + {OUT => <<'EOF' +# pcov(x,x) pcov(x,y) pcov(y,y) +1.25 0.625 0.3125 +EOF + }], + + # transpose isn't handled in any special way. The result is not a vnl + ['transpose', + '--vnlog transpose', + {IN_PIPE => $in_xy}, + {OUT => <<'EOF' +1 2 3 4 +0.5 1 1.5 2 +EOF + }], + + # crosstab isn't handled in any special way. The result is not a vnl + ['crosstab1', + '--vnlog crosstab x,y', + {IN_PIPE => $in_xy}, + {OUT => <<'EOF' +# GroupBy(x) GroupBy(y) count(x) + 0.5 1 1.5 2 +1 1 - - - +2 - 1 - - +3 - - 1 - +4 - - - 1 +EOF + }], + ['crosstab2', + '--vnlog crosstab x,y sum y', + {IN_PIPE => $in_xy}, + {OUT => <<'EOF' +# GroupBy(x) GroupBy(y) sum(y) + 0.5 1 1.5 2 +1 0.5 - - - +2 - 1 - - +3 - - 1.5 - +4 - - - 2 +EOF + }], + + ); + + +my $save_temps = $ENV{SAVE_TEMPS}; +my $verbose = $ENV{VERBOSE}; + +my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); +exit $fail;