From e1bb63da31c49b2dfff84b17b841b3e24671aef1 Mon Sep 17 00:00:00 2001 From: Steve Pike Date: Thu, 12 Dec 2019 22:40:40 -0500 Subject: [PATCH 1/4] Add a nullcount option to stats command --- Cargo.lock | 2 ++ src/cmd/stats.rs | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index e8714c13..f7207b6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,3 +1,5 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. [[package]] name = "aho-corasick" version = "0.6.6" diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 64572a7b..2ac4eecc 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -49,6 +49,7 @@ stats options: This requires storing all CSV data in memory. --nulls Include NULLs in the population size for computing mean and standard deviation. + --nullcount Include a count of the number of NULLs. -j, --jobs The number of jobs to run in parallel. This works better when the given CSV data has an index already created. Note that a file handle @@ -76,6 +77,7 @@ struct Args { flag_cardinality: bool, flag_median: bool, flag_nulls: bool, + flag_nullcount: bool, flag_jobs: usize, flag_output: Option, flag_no_headers: bool, @@ -209,6 +211,7 @@ impl Args { range: true, dist: true, cardinality: self.flag_cardinality || self.flag_everything, + nullcount: self.flag_nullcount || self.flag_everything, median: self.flag_median || self.flag_everything, mode: self.flag_mode || self.flag_everything, })).take(record_len).collect() @@ -223,6 +226,7 @@ impl Args { if self.flag_median || all { fields.push("median"); } if self.flag_mode || all { fields.push("mode"); } if self.flag_cardinality || all { fields.push("cardinality"); } + if self.flag_nullcount || all { fields.push("nullcount"); } csv::StringRecord::from(fields) } } @@ -234,6 +238,7 @@ struct WhichStats { range: bool, dist: bool, cardinality: bool, + nullcount: bool, median: bool, mode: bool, } @@ -252,6 +257,7 @@ struct Stats { online: Option, mode: Option>>, median: Option>, + nullcount: u64, which: WhichStats, } @@ -271,6 +277,7 @@ impl Stats { online: online, mode: mode, median: median, + nullcount: 0, which: which, } } @@ -283,9 +290,13 @@ impl Stats { self.sum.as_mut().map(|v| v.add(t, sample)); self.minmax.as_mut().map(|v| v.add(t, sample)); self.mode.as_mut().map(|v| v.add(sample.to_vec())); + + if sample_type.is_null() { self.nullcount += 1; } + match self.typ { TUnknown => {} TNull => { + if self.which.include_nulls { self.online.as_mut().map(|v| { v.add_null(); }); } @@ -365,6 +376,10 @@ impl Stats { } } } + if self.which.nullcount { + pieces.push(self.nullcount.to_string()); + } + csv::StringRecord::from(pieces) } } @@ -377,6 +392,7 @@ impl Commute for Stats { self.online.merge(other.online); self.mode.merge(other.mode); self.median.merge(other.median); + self.nullcount += other.nullcount; self.which.merge(other.which); } } @@ -508,6 +524,7 @@ impl Commute for TypedSum { } } + /// TypedMinMax keeps track of minimum/maximum values for each possible type /// where min/max makes sense. #[derive(Clone)] From 369a3f7f07e5a7a22b108b2e961b4804fd9ac38b Mon Sep 17 00:00:00 2001 From: Steve Pike Date: Fri, 13 Dec 2019 11:42:40 -0500 Subject: [PATCH 2/4] Add some nullcount tests --- tests/test_stats.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_stats.rs b/tests/test_stats.rs index b63b396e..dd248f90 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -90,6 +90,7 @@ fn setup(name: S, rows: &[&str], headers: bool, fn get_field_value(wrk: &Workdir, cmd: &mut process::Command, field: &str) -> String { + if field == "nullcount" { cmd.arg("--nullcount"); } if field == "median" { cmd.arg("--median"); } if field == "cardinality" { cmd.arg("--cardinality"); } if field == "mode" { cmd.arg("--mode"); } @@ -181,6 +182,12 @@ stats_tests!(stats_median_even_null, "median", &["", "1", "2", "3", "4"], "2.5"); stats_tests!(stats_median_mix, "median", &["1", "2.5", "3"], "2.5"); +stats_tests!(stats_nullcount, "nullcount", &["", "1", "2"], "1"); +stats_tests!(stats_nullcount_none, "nullcount", &["a", "1", "2"], "0"); +stats_tests!(stats_nullcount_spacenotnull, "nullcount", &[" ", "1", "2"], "0"); +stats_tests!(stats_nullcount_all, "nullcount", &["", "", ""], "3"); + + mod stats_infer_nothing { // Only test CSV data with headers. // Empty CSV data with no headers won't produce any statistical analysis. From e5c49fa0c8ae36a570bd2b843034ef9d540b5c20 Mon Sep 17 00:00:00 2001 From: Steve Pike Date: Fri, 13 Dec 2019 11:44:57 -0500 Subject: [PATCH 3/4] Rewrite usage for consistency --- src/cmd/stats.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 2ac4eecc..31eee8d3 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -47,9 +47,9 @@ stats options: This requires storing all CSV data in memory. --median Show the median. This requires storing all CSV data in memory. + --nullcount Show the number of NULLs. --nulls Include NULLs in the population size for computing mean and standard deviation. - --nullcount Include a count of the number of NULLs. -j, --jobs The number of jobs to run in parallel. This works better when the given CSV data has an index already created. Note that a file handle From 89639fbcb3e95a8354f53670899f17894df7292d Mon Sep 17 00:00:00 2001 From: Steve Pike Date: Fri, 13 Dec 2019 11:46:42 -0500 Subject: [PATCH 4/4] Clean up some formatting --- src/cmd/stats.rs | 5 ----- tests/test_stats.rs | 1 - 2 files changed, 6 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 31eee8d3..557be0f7 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -290,13 +290,10 @@ impl Stats { self.sum.as_mut().map(|v| v.add(t, sample)); self.minmax.as_mut().map(|v| v.add(t, sample)); self.mode.as_mut().map(|v| v.add(sample.to_vec())); - if sample_type.is_null() { self.nullcount += 1; } - match self.typ { TUnknown => {} TNull => { - if self.which.include_nulls { self.online.as_mut().map(|v| { v.add_null(); }); } @@ -379,7 +376,6 @@ impl Stats { if self.which.nullcount { pieces.push(self.nullcount.to_string()); } - csv::StringRecord::from(pieces) } } @@ -524,7 +520,6 @@ impl Commute for TypedSum { } } - /// TypedMinMax keeps track of minimum/maximum values for each possible type /// where min/max makes sense. #[derive(Clone)] diff --git a/tests/test_stats.rs b/tests/test_stats.rs index dd248f90..714eb2b1 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -187,7 +187,6 @@ stats_tests!(stats_nullcount_none, "nullcount", &["a", "1", "2"], "0"); stats_tests!(stats_nullcount_spacenotnull, "nullcount", &[" ", "1", "2"], "0"); stats_tests!(stats_nullcount_all, "nullcount", &["", "", ""], "3"); - mod stats_infer_nothing { // Only test CSV data with headers. // Empty CSV data with no headers won't produce any statistical analysis.