
Remove unneeded partitioning (#1417)
Fokko authored Dec 20, 2024
1 parent 85b2053 commit ab6b190
Showing 1 changed file with 11 additions and 9 deletions.

dev/provision.py
@@ -22,7 +22,17 @@
 from pyiceberg.schema import Schema
 from pyiceberg.types import FixedType, NestedField, UUIDType

-spark = SparkSession.builder.getOrCreate()
+# The configuration is important, otherwise we get many small
+# parquet files with a single row. When a positional delete
+# hits the Parquet file with one row, the parquet file gets
+# dropped instead of having a merge-on-read delete file.
+spark = (
+    SparkSession
+    .builder
+    .config("spark.sql.shuffle.partitions", "1")
+    .config("spark.default.parallelism", "1")
+    .getOrCreate()
+)

 catalogs = {
     'rest': load_catalog(
@@ -120,10 +130,6 @@
         """
     )

-    # Partitioning is not really needed, but there is a bug:
-    # https://github.com/apache/iceberg/pull/7685
-    spark.sql(f"ALTER TABLE {catalog_name}.default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years")
-
     spark.sql(
         f"""
         INSERT INTO {catalog_name}.default.test_positional_mor_deletes
@@ -168,10 +174,6 @@
         """
     )

-    # Partitioning is not really needed, but there is a bug:
-    # https://github.com/apache/iceberg/pull/7685
-    spark.sql(f"ALTER TABLE {catalog_name}.default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years")
-
     spark.sql(
         f"""
         INSERT INTO {catalog_name}.default.test_positional_mor_double_deletes
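For context, a minimal sketch of the behaviour the new comment describes. It assumes a Spark session configured as in the diff and an Iceberg catalog registered as `rest` (as provisioned above); the table name `demo` and its property value are illustrative, not part of this commit. With the write pinned to one task, all rows land in a single Parquet file, so a row-level DELETE on a merge-on-read table produces a positional delete file instead of dropping a one-row data file.

    # Hypothetical demo table; the catalog name `rest` matches provision.py.
    spark.sql(
        """
        CREATE TABLE rest.default.demo (number int)
        USING iceberg
        TBLPROPERTIES ('write.delete.mode' = 'merge-on-read')
        """
    )

    # With shuffle partitions and default parallelism set to 1,
    # all three rows are written into a single Parquet file.
    spark.sql("INSERT INTO rest.default.demo VALUES (1), (2), (3)")

    # Merge-on-read: this writes a positional delete file masking the
    # deleted row, rather than rewriting (or, for a single-row file,
    # simply dropping) the underlying data file.
    spark.sql("DELETE FROM rest.default.demo WHERE number = 2")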
