Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add stats #77

Merged
merged 6 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 6 additions & 13 deletions benchmark/benchmark.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,10 @@ plot:

# TPCH SF1 on delta table
bench-run-tpch-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-delta.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-delta.csv
# TPCH SF1 on parquet files
bench-run-tpch-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-parquet.csv
# TPCH SF1 on duckdb file
bench-run-tpch-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-duckdb.csv
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-parquet.csv
# COMPARES TPCH SF1 on parquet file vs on delta files
bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet

Expand All @@ -38,10 +35,10 @@ bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet

# TPCH on remote delta table (set BENCHMARK_DATA_S3_LINEITEM_SF1)
bench-run-tpch-sf1-remote-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta-remote/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-remote-delta.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-delta-remote/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-remote-delta.csv
# TPCH on remote parquet table (set BENCHMARK_DATA_S3_LINEITEM_SF1)
bench-run-tpch-sf1-remote-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-parquet-remote/$(BENCHMARK_PATTERN)' &> benchmark_results/tpch-sf1-remote-parquet.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1-parquet-remote/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-remote-parquet.csv
# COMPARES TPCH SF1 on parquet file vs on delta files
bench-run-tpch-sf1-remote: bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1-remote-delta

Expand All @@ -51,14 +48,10 @@ bench-run-tpch-sf1-remote: bench-run-tpch-sf1-remote-parquet bench-run-tpch-sf1-

# TPCDS SF1 on delta table
bench-run-tpcds-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-delta/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-delta.csv
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta.csv
# TPCDS SF1 on parquet files
bench-run-tpcds-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-parquet/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-parquet.csv
# TPCDS SF1 on duckdb files
bench-run-tpcds-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpcds/sf1/$(BENCHMARK_PATTERN)' &> benchmark_results/tpcds-sf1-duckdb.csv

./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-parquet.csv
# COMPARES TPCDS SF1 on parquet file vs on delta files
bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet

Expand Down
17 changes: 14 additions & 3 deletions scripts/plot.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,24 @@
import duckdb
import argparse

### Parse script parameters
parser = argparse.ArgumentParser(description='Plot the results in ./benchmark_results')
parser.add_argument('-p','--pattern', help='Pattern to match result csv files to', required=False, default='*.csv')
parser.add_argument('-w','--width', help='Width of graph, adjust to fit data', required=False, default=20)
args = vars(parser.parse_args())

### Parse Query Results
parse_benchmark_result_query = """
parse_benchmark_result_query = f"""
SELECT
parse_filename(name, true) as benchmark,
parse_filename(filename, true) as config,
avg(timing) as timing
FROM
read_csv('benchmark_results/*.csv', filename=1)
read_csv('benchmark_results/{args['pattern']}', filename=1, columns = {{
'name': 'VARCHAR',
'run': 'BIGINT',
'timing': 'double'
}})
GROUP BY
config,
benchmark
Expand All @@ -22,6 +33,6 @@
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams["figure.figsize"] = [10, 5]
plt.rcParams["figure.figsize"] = [int(args['width']), 5]
fig = benchmark_results.pivot(index='benchmark', columns='config', values='timing').plot(kind='bar', title='', ylabel='runtime [s]').get_figure()
fig.savefig('benchmark_results/result.png')
19 changes: 18 additions & 1 deletion src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ string url_decode(string input) {
return result;
}

static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats *, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats * stats, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) {
auto context = (DeltaSnapshot *) engine_context;
auto path_string = context->GetPath();
StringUtil::RTrim(path_string, "/");
Expand All @@ -63,6 +63,7 @@ static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::Kernel
// Initialize the file metadata
context->metadata.back()->delta_snapshot_version = context->version;
context->metadata.back()->file_number = context->resolved_files.size() - 1;
context->metadata.back()->cardinality = stats->num_records;

// Fetch the deletion vector
auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, context->extern_engine.get(), context->global_state.get());
Expand Down Expand Up @@ -493,6 +494,22 @@ idx_t DeltaSnapshot::GetTotalFileCount() {
return resolved_files.size();
}

unique_ptr<NodeStatistics> DeltaSnapshot::GetCardinality(ClientContext &context) {
// This also ensures all files are expanded
auto total_file_count = DeltaSnapshot::GetTotalFileCount();

if (total_file_count == 0) {
return make_uniq<NodeStatistics>(0,0);
}

idx_t total_tuple_count = 0;
for (auto &metadatum : metadata) {
total_tuple_count += metadatum->cardinality;
}

return make_uniq<NodeStatistics>(total_tuple_count,total_tuple_count);
}

unique_ptr<MultiFileReader> DeltaMultiFileReader::CreateInstance() {
return std::move(make_uniq<DeltaMultiFileReader>());
}
Expand Down
3 changes: 3 additions & 0 deletions src/include/functions/delta_scan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct DeltaFileMetaData {

idx_t delta_snapshot_version = DConstants::INVALID_INDEX;
idx_t file_number = DConstants::INVALID_INDEX;
idx_t cardinality = DConstants::INVALID_INDEX;
ffi::KernelBoolSlice selection_vector = {nullptr, 0};
case_insensitive_map_t<string> partition_map;
};
Expand All @@ -49,6 +50,8 @@ struct DeltaSnapshot : public MultiFileList {
FileExpandResult GetExpandResult() override;
idx_t GetTotalFileCount() override;

unique_ptr<NodeStatistics> GetCardinality(ClientContext &context) override;

protected:
//! Get the i-th expanded file
string GetFile(idx_t i) override;
Expand Down
19 changes: 19 additions & 0 deletions test/sql/dat/basic_append.test
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
----
5

# Cardinality estimation should correctly show this
query II
EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
----
physical_plan <REGEX>:.*5 Rows.*

query I
SELECT count(number)
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
Expand Down Expand Up @@ -78,3 +84,16 @@ SELECT a_float, number, letter
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 6
----

# Filters are reflected in cardinality estimation: filtering out all files shows 0 EC
query II
EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 6
----
physical_plan <REGEX>:.*0 Rows.*

query II
EXPLAIN FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 4
----
physical_plan <REGEX>:.*1 Rows.*
5 changes: 5 additions & 0 deletions test/sql/delta_kernel_rs/basic_partitioned.test
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,8 @@ e 5 5.5
a 1 1.1
b 2 2.2
c 3 3.3

query II
EXPLAIN FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned')
----
physical_plan <REGEX>:.*6 Rows.*
Loading