diff --git a/benchmark/benchmark.Makefile b/benchmark/benchmark.Makefile index b3f4202..9dd45a4 100644 --- a/benchmark/benchmark.Makefile +++ b/benchmark/benchmark.Makefile @@ -22,13 +22,10 @@ plot: # TPCH SF1 on delta table bench-run-tpch-sf1-delta: bench-output-dir - ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-delta.csv + ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-delta.csv # TPCH SF1 on parquet files bench-run-tpch-sf1-parquet: bench-output-dir - ./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-parquet.csv -# TPCH SF1 on duckdb file -bench-run-tpch-sf1-duckdb: bench-output-dir - ./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-duckdb.csv + ./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-parquet.csv # COMPARES TPCH SF1 on parquet file vs on delta files bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet @@ -38,10 +35,10 @@ bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet # TPCH on remote delta table (set BENCHMARK_DATA_S3_LINEITEM_SF1) bench-run-tpch-sf1-remote-delta: bench-output-dir - ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta-remote/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-remote-delta.csv + ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta-remote/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-remote-delta.csv # TPCH on remote parquet table (set BENCHMARK_DATA_S3_LINEITEM_SF1) bench-run-tpch-sf1-remote-parquet: bench-output-dir - ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-parquet-remote/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-remote-parquet.csv + ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-parquet-remote/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-remote-parquet.csv # COMPARES TPCH SF1 on parquet file vs on delta files bench-run-tpch-sf1-remote: bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1-remote-delta @@ -51,14 +48,10 @@ bench-run-tpch-sf1-remote: bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1- # TPCDS SF1 on delta table bench-run-tpcds-sf1-delta: bench-output-dir - ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-delta/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-delta.csv + ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta.csv # TPCDS SF1 on parquet files bench-run-tpcds-sf1-parquet: bench-output-dir - ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-parquet/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-parquet.csv -# TPCDS SF1 on duckdb files -bench-run-tpcds-sf1-duckdb: bench-output-dir - ./build/release/benchmark/benchmark_runner 'benchmark/tpcds/sf1/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-duckdb.csv - + ./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-parquet.csv # COMPARES TPCDS SF1 on parquet file vs on delta files bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet diff --git a/scripts/plot.py b/scripts/plot.py index c5ea201..9090f3f 100644 --- a/scripts/plot.py +++ b/scripts/plot.py @@ -1,13 +1,24 @@ import duckdb +import argparse + +### Parse script parameters +parser = argparse.ArgumentParser(description='Plot the results in ./benchmark_results') +parser.add_argument('-p','--pattern', help='Pattern to match result csv files to', required=False, default='*.csv') +parser.add_argument('-w','--width', help='Width of graph, adjust to fit data', required=False, default=20) +args = vars(parser.parse_args()) ### Parse Query Results -parse_benchmark_result_query = """ +parse_benchmark_result_query = f""" SELECT parse_filename(name, true) as benchmark, parse_filename(filename, true) as config, avg(timing) as timing FROM - read_csv('benchmark_results/*.csv', filename=1) + read_csv('benchmark_results/{args['pattern']}', filename=1, columns = {{ + 'name': 'VARCHAR', + 'run': 'BIGINT', + 'timing': 'double' + }}) GROUP BY config, benchmark @@ -22,6 +33,6 @@ import matplotlib.pyplot as plt import numpy as np -plt.rcParams["figure.figsize"] = [10, 5] +plt.rcParams["figure.figsize"] = [int(args['width']), 5] fig = benchmark_results.pivot(index='benchmark', columns='config', values='timing').plot(kind='bar', title='', ylabel='runtime [s]').get_figure() fig.savefig('benchmark_results/result.png') \ No newline at end of file diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 2a3dd18..a3e4f11 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -46,7 +46,7 @@ string url_decode(string input) { return result; } -static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats *, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) { +static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats * stats, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) { auto context = (DeltaSnapshot *) engine_context; auto path_string = context->GetPath(); StringUtil::RTrim(path_string, "/"); @@ -63,6 +63,7 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel // Initialize the file metadata context->metadata.back()->delta_snapshot_version = context->version; context->metadata.back()->file_number = context->resolved_files.size() - 1; + context->metadata.back()->cardinality = stats->num_records; // Fetch the deletion vector auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, context->extern_engine.get(), context->global_state.get()); @@ -493,6 +494,22 @@ idx_t DeltaSnapshot::GetTotalFileCount() { return resolved_files.size(); } +unique_ptr DeltaSnapshot::GetCardinality(ClientContext &context) { + // This also ensures all files are expanded + auto total_file_count = DeltaSnapshot::GetTotalFileCount(); + + if (total_file_count == 0) { + return make_uniq(0,0); + } + + idx_t total_tuple_count = 0; + for (auto &metadatum : metadata) { + total_tuple_count += metadatum->cardinality; + } + + return make_uniq(total_tuple_count,total_tuple_count); +} + unique_ptr DeltaMultiFileReader::CreateInstance() { return std::move(make_uniq()); } diff --git a/src/include/functions/delta_scan.hpp b/src/include/functions/delta_scan.hpp index 25e312f..aac35cc 100644 --- a/src/include/functions/delta_scan.hpp +++ b/src/include/functions/delta_scan.hpp @@ -28,6 +28,7 @@ struct DeltaFileMetaData { idx_t delta_snapshot_version = DConstants::INVALID_INDEX; idx_t file_number = DConstants::INVALID_INDEX; + idx_t cardinality = DConstants::INVALID_INDEX; ffi::KernelBoolSlice selection_vector = {nullptr, 0}; case_insensitive_map_t partition_map; }; @@ -49,6 +50,8 @@ struct DeltaSnapshot : public MultiFileList { FileExpandResult GetExpandResult() override; idx_t GetTotalFileCount() override; + unique_ptr GetCardinality(ClientContext &context) override; + protected: //! Get the i-th expanded file string GetFile(idx_t i) override; diff --git a/test/sql/dat/basic_append.test b/test/sql/dat/basic_append.test index 87930b8..a0b20a1 100644 --- a/test/sql/dat/basic_append.test +++ b/test/sql/dat/basic_append.test @@ -20,6 +20,12 @@ FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') ---- 5 +# Cardinality estimation should correctly show this +query II +EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') +---- +physical_plan :.*5 Rows.* + query I SELECT count(number) FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') @@ -78,3 +84,16 @@ SELECT a_float, number, letter FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') WHERE number > 6 ---- + +# Filters are reflected in cardinality estimation: filtering out all files shows 0 EC +query II +EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') +WHERE number > 6 +---- +physical_plan :.*0 Rows.* + +query II +EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') +WHERE number > 4 +---- +physical_plan :.*1 Rows.* diff --git a/test/sql/delta_kernel_rs/basic_partitioned.test b/test/sql/delta_kernel_rs/basic_partitioned.test index d66d012..1ee22c9 100644 --- a/test/sql/delta_kernel_rs/basic_partitioned.test +++ b/test/sql/delta_kernel_rs/basic_partitioned.test @@ -17,3 +17,8 @@ e 5 5.5 a 1 1.1 b 2 2.2 c 3 3.3 + +query II +EXPLAIN FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned') +---- +physical_plan :.*6 Rows.*