
Nit fixes to URL-encoding of partition field names (#1499)
* Revert "Add `make_name_compatible` suggestion so test passes"

This reverts commit 61cdd08.

* Nit fixes to URL-encoding of partition field names

* Fix tests

* Collapse

* Make lint

---------

Co-authored-by: Sreesh Maheshwar <[email protected]>
smaheshwar-pltr and Sreesh Maheshwar authored Jan 10, 2025
1 parent a95f9ee commit 19ad24e
Showing 3 changed files with 7 additions and 48 deletions.
1 change: 1 addition & 0 deletions mkdocs/docs/api.md
@@ -1077,6 +1077,7 @@ with table.update_schema() as update:
with table.update_schema() as update:
update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange")
```

+A complex type must exist before columns can be added to it. Fields in complex types are added in a tuple.

### Rename column
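The doc note above describes a two-step flow: create the complex type first, then add fields to it by tuple path. A minimal sketch of that flow, assuming the `table` handle used elsewhere in these docs (the `details` struct name is illustrative):

```python
from pyiceberg.types import StringType, StructType

# Step 1: the complex type must exist before fields can be added to it.
with table.update_schema() as update:
    update.add_column("details", StructType())

# Step 2: fields inside a complex type are addressed with a tuple path.
with table.update_schema() as update:
    update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange")
```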
7 changes: 2 additions & 5 deletions pyiceberg/partitioning.py
@@ -234,11 +234,8 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
partition_field = self.fields[pos]
value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos])

-value_str = quote_plus(value_str, safe="")
-value_strs.append(value_str)
-
-field_str = quote_plus(partition_field.name, safe="")
-field_strs.append(field_str)
+value_strs.append(quote_plus(value_str, safe=""))
+field_strs.append(quote_plus(partition_field.name, safe=""))

path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
return path
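The change above folds the quoting into the `append` calls without altering behavior: both the partition field name and the human-readable value are percent-encoded. A standard-library illustration of that encoding (the sample strings are made up; the first mirrors the test case below):

```python
from urllib.parse import quote_plus

# safe="" leaves no characters exempt, so "/" and "=" inside a name or value
# cannot be confused with the separators that partition_to_path emits.
field = quote_plus("special#string+field", safe="")  # 'special%23string%2Bfield'
value = quote_plus("a/b=c d", safe="")               # 'a%2Fb%3Dc+d'
print(f"{field}={value}")                            # special%23string%2Bfield=a%2Fb%3Dc+d
```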
47 changes: 4 additions & 43 deletions tests/integration/test_partitioning_key.py
@@ -18,15 +18,15 @@
import uuid
from datetime import date, datetime, timedelta, timezone
from decimal import Decimal
-from typing import Any, Callable, List, Optional
+from typing import Any, List

import pytest
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

from pyiceberg.catalog import Catalog
from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, make_compatible_name
from pyiceberg.transforms import (
BucketTransform,
DayTransform,
@@ -78,7 +78,7 @@


@pytest.mark.parametrize(
"partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
"partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
[
# # Identity Transform
(
@@ -99,7 +99,6 @@
VALUES
(false, 'Boolean field set to false');
""",
-None,
),
(
[PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -119,7 +118,6 @@
VALUES
('sample_string', 'Another string value')
""",
-None,
),
(
[PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -139,7 +137,6 @@
VALUES
(42, 'Associated string value for int 42')
""",
-None,
),
(
[PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -159,7 +156,6 @@
VALUES
(1234567890123456789, 'Associated string value for long 1234567890123456789')
""",
-None,
),
(
[PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -183,7 +179,6 @@
# VALUES
# (3.14, 'Associated string value for float 3.14')
# """
-None,
),
(
[PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -207,7 +202,6 @@
# VALUES
# (6.282, 'Associated string value for double 6.282')
# """
-None,
),
(
[PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -227,7 +221,6 @@
VALUES
(CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
""",
-None,
),
(
[PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -247,7 +240,6 @@
VALUES
(CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
""",
-None,
),
(
[PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -272,7 +264,6 @@
# VALUES
# (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
# """
-None,
),
(
[PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -297,7 +288,6 @@
# VALUES
# (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
# """
-None,
),
(
[PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -317,7 +307,6 @@
VALUES
(CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
""",
-None,
),
(
[PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -337,7 +326,6 @@
VALUES
('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
""",
-None,
),
(
[PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -357,7 +345,6 @@
VALUES
(CAST('example' AS BINARY), 'Associated string value for binary `example`')
""",
-None,
),
(
[PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -377,7 +364,6 @@
VALUES
(123.45, 'Associated string value for decimal 123.45')
""",
-None,
),
# # Year Month Day Hour Transform
# Month Transform
@@ -399,7 +385,6 @@
VALUES
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
""",
-None,
),
(
[PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -419,7 +404,6 @@
VALUES
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
""",
-None,
),
(
[PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -439,7 +423,6 @@
VALUES
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
""",
-None,
),
# Year Transform
(
@@ -460,7 +443,6 @@
VALUES
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
""",
-None,
),
(
[PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -480,7 +462,6 @@
VALUES
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
""",
-None,
),
(
[PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -500,7 +481,6 @@
VALUES
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
""",
-None,
),
# # Day Transform
(
@@ -521,7 +501,6 @@
VALUES
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
""",
-None,
),
(
[PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -541,7 +520,6 @@
VALUES
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
""",
-None,
),
(
[PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -561,7 +539,6 @@
VALUES
(CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
""",
-None,
),
# Hour Transform
(
@@ -582,7 +559,6 @@
VALUES
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
""",
-None,
),
(
[PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -602,7 +578,6 @@
VALUES
(CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
""",
-None,
),
# Truncate Transform
(
@@ -623,7 +598,6 @@
VALUES
(12345, 'Sample data for int');
""",
-None,
),
(
[PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -643,7 +617,6 @@
VALUES
(4294967297, 'Sample data for long');
""",
-None,
),
(
[PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -663,7 +636,6 @@
VALUES
('abcdefg', 'Another sample for string');
""",
-None,
),
(
[PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -683,7 +655,6 @@
VALUES
(678.90, 'Associated string value for decimal 678.90')
""",
-None,
),
(
[PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -703,7 +674,6 @@
VALUES
(binary('HELLOICEBERG'), 'Sample data for binary');
""",
-None,
),
# Bucket Transform
(
@@ -724,7 +694,6 @@
VALUES
(10, 'Integer with value 10');
""",
-None,
),
# Test multiple field combinations could generate the Partition record and hive partition path correctly
(
@@ -753,7 +722,6 @@
VALUES
(CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
""",
-None,
),
# Test that special characters are URL-encoded
(
@@ -773,7 +741,6 @@
VALUES
('special string')
""",
-lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
),
],
)
@@ -787,7 +754,6 @@ def test_partition_key(
expected_hive_partition_path_slice: str,
spark_create_table_sql_for_justification: str,
spark_data_insert_sql_for_justification: str,
-make_compatible_name: Optional[Callable[[str], str]],
) -> None:
partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
spec = PartitionSpec(*partition_fields)
@@ -823,11 +789,6 @@
snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
)
# Special characters in partition value are sanitized when written to the data file's partition field
-# Use `make_compatible_name` to match the sanitize behavior
-sanitized_record = (
-    Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
-    if make_compatible_name
-    else expected_partition_record
-)
+sanitized_record = Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
assert spark_partition_for_justification == sanitized_record
assert expected_hive_partition_path_slice in spark_path_for_justification
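The rewritten assertion reuses the library's `make_compatible_name` instead of threading a per-case lambda through the parametrization. A sketch of the behavior the test now relies on, with the expected escapes inferred from the lambda deleted above:

```python
from pyiceberg.schema import make_compatible_name

# Avro-incompatible characters are replaced with "_xNN" hex escapes,
# matching the deleted `name.replace("#", "_x23").replace("+", "_x2B")`.
assert make_compatible_name("special#string+field") == "special_x23string_x2Bfield"
```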
