diff --git i/NEWS w/NEWS index f769b18..b9292f2 100644 --- i/NEWS +++ w/NEWS @@ -1,5 +1,9 @@ * Noteworthy changes in release ?.? (????-??-??) [?] +** New Features + + New datamash operation dotprod for calculating the scalar product of two + columns. * Noteworthy changes in release 1.8 (2022-07-23) [stable] diff --git i/contrib/bash-completion/datamash w/contrib/bash-completion/datamash index 86b467d..f2c0e86 100644 --- i/contrib/bash-completion/datamash +++ w/contrib/bash-completion/datamash @@ -30,7 +30,7 @@ unique uniq collapse countunique \ mean geomean harmmean trimmean median q1 q3 iqr perc mode antimode \ pstdev sstdev pvar svar mad madraw \ pskew sskew pkurt skurt dpo jarque \ -pcov scov ppearson spearson" +pcov scov ppearson spearson dotprod" local groupby_ops_re=${groupby_ops// /|} local line_ops="base64 debase64 md5 sha1 sha224 sha256 sha384 sha512 \ diff --git i/doc/datamash.texi w/doc/datamash.texi index e5693a5..6b6d817 100644 --- i/doc/datamash.texi +++ w/doc/datamash.texi @@ -193,7 +193,8 @@ thousands separator. @code{antimode}, @code{pstdev}, @code{sstdev}, @code{pvar}, @code{svar}, @code{ms}, @code{rms}, @code{mad}, @code{madraw}, @code{sskew}, @code{pskew}, @code{skurt}, @code{pkurt}, @code{jarque}, @code{dpo}, -@code{scov}, @code{pcov}, @code{spearson}, @code{ppearson} +@code{scov}, @code{pcov}, @code{spearson}, @code{ppearson}, +@code{dotprod} @end table diff --git i/man/datamash.x w/man/datamash.x index 042e710..26ad2f9 100644 --- i/man/datamash.x +++ w/man/datamash.x @@ -277,6 +277,10 @@ covariance of fields X and Y Pearson product-moment correlation coefficient [Pearson's R] of fields X and Y +.TP +.B dotprod [X:Y] +Scalar product (aka dot product or Euclidean inner product) +of fields X and Y [=EXAMPLES] diff --git i/src/datamash.c w/src/datamash.c index 0bdb16b..735abe3 100644 --- i/src/datamash.c +++ w/src/datamash.c @@ -220,7 +220,7 @@ which require a pair of fields (e.g. 'pcov 2:6').\n"), stdout); mean, geomean, harmmean, trimmean, median, q1, q3, iqr, perc,\n\ mode, antimode, pstdev, sstdev, pvar, svar, ms, rms, mad, madraw,\n\ pskew, sskew, pkurt, skurt, dpo, jarque,\n\ - scov, pcov, spearson, ppearson\n\ + scov, pcov, spearson, ppearson, dotprod\n\ \n", stdout); fputs ("\n", stdout); diff --git i/src/field-ops.c w/src/field-ops.c index ea874fc..e1ce62d 100644 --- i/src/field-ops.c +++ w/src/field-ops.c @@ -153,6 +153,8 @@ struct operation_data operations[] = {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT}, /* OP_S_PEARSON_COR */ {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT}, + /* OP_DOT_PRODUCT */ + {NUMERIC_VECTOR, IGNORE_FIRST, NUMERIC_RESULT}, /* OP_BIN_BUCKETS */ {NUMERIC_SCALAR, IGNORE_FIRST, NUMERIC_RESULT}, /* OP_STRBIN */ @@ -553,6 +555,7 @@ field_op_collect (struct fieldop *op, case OP_S_COVARIANCE: case OP_P_PEARSON_COR: case OP_S_PEARSON_COR: + case OP_DOT_PRODUCT: case OP_TRIMMED_MEAN: field_op_add_value (op, num_value); break; @@ -750,6 +753,7 @@ field_op_summarize_empty (struct fieldop *op) case OP_S_COVARIANCE: case OP_P_PEARSON_COR: case OP_S_PEARSON_COR: + case OP_DOT_PRODUCT: case OP_BIN_BUCKETS: case OP_STRBIN: case OP_FLOOR: @@ -997,6 +1001,14 @@ field_op_summarize (struct fieldop *op) DF_POPULATION:DF_SAMPLE); break; + case OP_DOT_PRODUCT: + assert (!op->slave); /* LCOV_EXCL_LINE */ + assert (op->slave_op); /* LCOV_EXCL_LINE */ + verify_slave_num_values (op); + numeric_result = dot_product_value (op->values, op->slave_op->values, + op->num_values ); + break; + case OP_MODE: case OP_ANTIMODE: field_op_sort_values (op); diff --git i/src/op-defs.c w/src/op-defs.c index 4f4a77d..d4c5dbd 100644 --- i/src/op-defs.c +++ w/src/op-defs.c @@ -89,6 +89,7 @@ struct field_operation_definition field_operations[] = {"scov", OP_S_COVARIANCE, MODE_GROUPBY}, {"ppearson", OP_P_PEARSON_COR, MODE_GROUPBY}, {"spearson", OP_S_PEARSON_COR, MODE_GROUPBY}, + {"dotprod", OP_DOT_PRODUCT, MODE_GROUPBY}, {"bin", OP_BIN_BUCKETS, MODE_PER_LINE}, {"strbin", OP_STRBIN, MODE_PER_LINE}, {"floor", OP_FLOOR, MODE_PER_LINE}, diff --git i/src/op-defs.h w/src/op-defs.h index f96b080..f57d0b3 100644 --- i/src/op-defs.h +++ w/src/op-defs.h @@ -75,6 +75,7 @@ enum field_operation OP_S_COVARIANCE, /* Sample Covariance */ OP_P_PEARSON_COR, /* Pearson Correlation Coefficient (population) */ OP_S_PEARSON_COR, /* Pearson Correlation Coefficient (sample) */ + OP_DOT_PRODUCT, /* Scalar Product */ OP_BIN_BUCKETS, /* numeric binning operation */ OP_STRBIN, /* String hash/binning */ OP_FLOOR, /* Floor */ diff --git i/src/op-parser.c w/src/op-parser.c index 6b464e1..0ebde2d 100644 --- i/src/op-parser.c +++ w/src/op-parser.c @@ -112,7 +112,8 @@ alloc_next_field () #define OP_NEED_PAIR_PARAMS(x) (((x)==OP_P_COVARIANCE)||\ ((x)==OP_S_COVARIANCE)||\ ((x)==OP_P_PEARSON_COR)||\ - ((x)==OP_S_PEARSON_COR)) + ((x)==OP_S_PEARSON_COR)||\ + ((x)==OP_DOT_PRODUCT)) #define ADD_NAMED_GROUP(name) (add_group_col (true,0,(name))) #define ADD_NUMERIC_GROUP(num) (add_group_col (false,num,NULL)) diff --git i/src/utils.c w/src/utils.c index dbaeaf9..f8fd1a9 100644 --- i/src/utils.c +++ w/src/utils.c @@ -195,6 +195,18 @@ pearson_corr_value ( const long double * const valuesA, return cor; } +long double _GL_ATTRIBUTE_PURE +dot_product_value ( const long double * const valuesA, + const long double * const valuesB, size_t n ) +{ + long double sum=0; + + for (size_t i = 0; i < n; i++) + sum += valuesA[i] * valuesB[i]; + + return sum; +} + long double stdev_value (const long double * const values, size_t n, int df) diff --git i/src/utils.h w/src/utils.h index 5348086..da50d1e 100644 --- i/src/utils.h +++ w/src/utils.h @@ -117,6 +117,13 @@ long double pearson_corr_value ( const long double * const valuesA, const long double * const valuesB, size_t n, int df); +/* + Given two columns of doubles, return their scalar product value. + */ +long double +dot_product_value ( const long double * const valuesA, + const long double * const valuesB, size_t n ); + /* Given an array of doubles, return the standard-deviation value. 'df' is degrees-of-freedom. Use DF_POPULATION or DF_SAMPLE (see above). diff --git i/tests/datamash-error-msgs.pl w/tests/datamash-error-msgs.pl index 9f5442b..0c36c14 100644 --- i/tests/datamash-error-msgs.pl +++ w/tests/datamash-error-msgs.pl @@ -97,6 +97,16 @@ my @Tests = {ERR=>"$prog: -H or --header-in must be used with named columns\n"}], ['e47','sum 1:3', {IN_PIPE=>""}, {EXIT=>1}, {ERR=>"$prog: operation 'sum' cannot use pair of fields\n"}], + ['e50','dotprod 1', {IN_PIPE=>""}, {EXIT=>1}, + {ERR=>"$prog: operation 'dotprod' requires field pairs\n"}], + ['e51','dotprod 1:', {IN_PIPE=>""}, {EXIT=>1}, + {ERR=>"$prog: invalid field pair for operation 'dotprod'\n"}], + ['e52','dotprod :', {IN_PIPE=>""}, {EXIT=>1}, + {ERR=>"$prog: invalid field pair for operation 'dotprod'\n"}], + ['e53','dotprod :1', {IN_PIPE=>""}, {EXIT=>1}, + {ERR=>"$prog: invalid field pair for operation 'dotprod'\n"}], + ['e54','dotprod hello:world', {IN_PIPE=>""}, {EXIT=>1}, + {ERR=>"$prog: -H or --header-in must be used with named columns\n"}], # Test scanner edge-cases # Floating point value diff --git i/tests/datamash-pair-tests.pl w/tests/datamash-pair-tests.pl index 33d69bf..172ba69 100644 --- i/tests/datamash-pair-tests.pl +++ w/tests/datamash-pair-tests.pl @@ -115,6 +115,11 @@ pcov(field-1,field-2) 1.622 EOF +my $out1_dotprod_hdr=<<'EOF'; +dotprod(field-1,field-2) +34.896 +EOF + my $in2=<<'EOF'; 1.599 1 -1.011 2 @@ -188,10 +193,16 @@ spearson(x,y) 1 EOF +my $out6_dotprod_hdr=<<'EOF'; +dotprod(x,y) +15 +EOF + my @Tests = ( ['c1', 'scov 1:2', {IN_PIPE=>$in1}, {OUT=>$out1_scov}], ['c2', 'pcov 1:2', {IN_PIPE=>$in1}, {OUT=>$out1_pcov}], + ['dp1', 'dotprod 1:2', {IN_PIPE=>$in1}, {OUT=>"34.896\n"}], # Pair with output headers - only one field and header should be printed ['c3', '--header-out pcov 1:2', {IN_PIPE=>$in1}, {OUT=>$out1_pcov_hdr}], @@ -207,6 +218,10 @@ my @Tests = ['p2_hin', '-W --header-in --header-out spearson x:y', {IN_PIPE=>$in6}, {OUT=>$out6_spears_hdr}], + ['dp2', '--header-out dotprod 1:2', {IN_PIPE=>$in1}, {OUT=>$out1_dotprod_hdr}], + ['dp3', '-W --header-in --header-out dotprod x:y', + {IN_PIPE=>$in6}, {OUT=>$out6_dotprod_hdr}], + # Test operations on edge-cases of input (one items, no items, # different number of items) ['c4', 'scov 1:2', {IN_PIPE=>$in3}, {OUT=>"$nan\n"}], @@ -214,6 +229,7 @@ my @Tests = ['c5', '--narm scov 1:2', {IN_PIPE=>$in4}, {OUT=>"$nan\n"}], ['p5', '--narm spearson 1:2', {IN_PIPE=>$in4}, {OUT=>"$nan\n"}], + ['dp5', '--narm dotprod 1:2', {IN_PIPE=>$in4}, {OUT=>"$nan\n"}], ['c6', '--narm scov 1:2', {IN_PIPE=>$in5}, {EXIT=>1}, {ERR=>"$prog: input error for operation 'scov': " . @@ -221,6 +237,9 @@ my @Tests = ['p6', '--narm spearson 1:2', {IN_PIPE=>$in5}, {EXIT=>1}, {ERR=>"$prog: input error for operation 'spearson': " . "fields 1,2 have different number of items\n"}], + ['dp6', '--narm dotprod 1:2', {IN_PIPE=>$in5}, {EXIT=>1}, + {ERR=>"$prog: input error for operation 'dotprod': " . + "fields 1,2 have different number of items\n"}], ); my $save_temps = $ENV{SAVE_TEMPS}; diff --git i/tests/datamash-parser.pl w/tests/datamash-parser.pl index 7cda531..1fe4111 100755 --- i/tests/datamash-parser.pl +++ w/tests/datamash-parser.pl @@ -152,6 +152,13 @@ my @Tests = ['e46','pcov hello:world', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}], ['e47','sum 1:3', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}], + ['p50','dotprod 1:2', {IN_PIPE=>""}, {OUT=>""}], + ['e51','dotprod 1', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}], + ['e52','dotprod 1:', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}], + ['e53','dotprod :', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}], + ['e54','dotprod :1', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}], + ['e56','dotprod hello:world', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}], + # Test scanner edge-cases # Floating point value ['e60','sum 4.5', {IN_PIPE=>""}, {EXIT=>1}, {ERR_SUBST=>'s/.*//s'}],