From 8ba2c2f983e6b9faf22425c5eb511df096161f46 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Wed, 8 Jan 2025 17:46:43 +0000
Subject: [PATCH 1/5] Revert "Add `make_name_compatible` suggestion so test passes"

This reverts commit 61cdd08c59f3f1d3119b5f907eb09dbbcf80b8c2.
---
 tests/integration/test_partitioning_key.py | 62 +++++-----------------
 1 file changed, 13 insertions(+), 49 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 1ac808c7d0..6779018003 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -18,7 +18,7 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, Callable, List, Optional
+from typing import Any, List

 import pytest
 from pyspark.sql import SparkSession
@@ -78,7 +78,7 @@


 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
     [
         # # Identity Transform
         (
@@ -99,7 +99,6 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -119,7 +118,6 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -139,7 +137,6 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -159,7 +156,6 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -183,7 +179,6 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -207,7 +202,6 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -227,7 +221,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -247,7 +240,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -272,7 +264,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -297,7 +288,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -317,7 +307,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -337,7 +326,6 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -357,7 +345,6 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -377,7 +364,6 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
-            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=MonthTransform(), name="timestamp_field_month")],
@@ -399,7 +385,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -419,7 +404,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -439,7 +423,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Year Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=YearTransform(), name="timestamp_field_year")],
@@ -460,7 +443,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -480,7 +462,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -500,7 +481,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # # Day Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=DayTransform(), name="timestamp_field_day")],
@@ -521,7 +501,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -541,7 +520,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -561,7 +539,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Hour Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=HourTransform(), name="timestamp_field_hour")],
@@ -582,7 +559,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -602,7 +578,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         # Truncate Transform
         (
             [PartitionField(source_id=4, field_id=1001, transform=TruncateTransform(10), name="int_field_trunc")],
@@ -623,7 +598,6 @@
             VALUES
             (12345, 'Sample data for int');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -643,7 +617,6 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -663,7 +636,6 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -683,7 +655,6 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -703,7 +674,6 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
-            None,
         ),
         # Bucket Transform
         (
             [PartitionField(source_id=4, field_id=1001, transform=BucketTransform(2), name="int_field_bucket")],
@@ -724,7 +694,6 @@
             VALUES
             (10, 'Integer with value 10');
             """,
-            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
             [
                 PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field"),
                 PartitionField(source_id=10, field_id=1002, transform=IdentityTransform(), name="date_field"),
             ],
@@ -753,27 +722,30 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
-            None,
         ),
         # Test that special characters are URL-encoded
         (
-            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
             ["special string"],
-            Record(**{"special#string+field": "special string"}),  # type: ignore
-            "special%23string%2Bfield=special+string",
+            Record(**{"special#string#field": "special string"}),  # type: ignore
+            "special%23string%23field=special+string",
+            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
+            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
+            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
+            # None,
+            # None,
             f"""CREATE TABLE {identifier} (
-                `special#string+field` string
+                `special#string#field` string
             )
             USING iceberg
             PARTITIONED BY (
-                identity(`special#string+field`)
+                identity(`special#string#field`)
             )
             """,
             f"""INSERT INTO {identifier}
             VALUES
             ('special string')
             """,
-            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
@@ -787,7 +759,6 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
-    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -822,12 +793,5 @@ def test_partition_key(
     spark_path_for_justification = (
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
-    # Special characters in partition value are sanitized when written to the data file's partition field
-    # Use `make_compatible_name` to match the sanitize behavior
-    sanitized_record = (
-        Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
-        if make_compatible_name
-        else expected_partition_record
-    )
-    assert spark_partition_for_justification == sanitized_record
+    assert spark_partition_for_justification == expected_partition_record
     assert expected_hive_partition_path_slice in spark_path_for_justification

From d303e13a550d6b02c5230ca4d8671c59ad1ea889 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Wed, 8 Jan 2025 17:54:58 +0000
Subject: [PATCH 2/5] Nit fixes to URL-encoding of partition field names

---
 pyiceberg/partitioning.py                  | 3 +--
 tests/integration/test_partitioning_key.py | 6 ++++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index c9b6316f59..cee85db9cc 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -237,8 +237,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             value_str = quote_plus(value_str, safe="")
             value_strs.append(value_str)

-            field_str = quote_plus(partition_field.name, safe="")
-            field_strs.append(field_str)
+            field_strs.append(quote_plus(partition_field.name, safe=""))

         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
         return path

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 6779018003..c7976e91f4 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -26,7 +26,7 @@

 from pyiceberg.catalog import Catalog
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, make_compatible_name
 from pyiceberg.transforms import (
     BucketTransform,
     DayTransform,
@@ -793,5 +793,7 @@ def test_partition_key(
     spark_path_for_justification = (
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
-    assert spark_partition_for_justification == expected_partition_record
+    # Special characters in partition value are sanitized when written to the data file's partition field
+    sanitized_record = Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
+    assert spark_partition_for_justification == sanitized_record
     assert expected_hive_partition_path_slice in spark_path_for_justification

From a4bb503f6df516952782bf2296c5516bc890b3f6 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Wed, 8 Jan 2025 17:59:16 +0000
Subject: [PATCH 3/5] Fix tests

---
 tests/integration/test_partitioning_key.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index c7976e91f4..3955259d33 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -725,21 +725,16 @@
         ),
         # Test that special characters are URL-encoded
         (
-            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
             ["special string"],
-            Record(**{"special#string#field": "special string"}),  # type: ignore
-            "special%23string%23field=special+string",
-            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
-            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
-            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
-            # None,
-            # None,
+            Record(**{"special#string+field": "special string"}),  # type: ignore
+            "special%23string%2Bfield=special+string",
             f"""CREATE TABLE {identifier} (
-                `special#string#field` string
+                `special#string+field` string
             )
             USING iceberg
             PARTITIONED BY (
-                identity(`special#string#field`)
+                identity(`special#string+field`)
             )
             """,
             f"""INSERT INTO {identifier}

From 312b442ee071dc018b79e75ff8467258df011966 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Thu, 9 Jan 2025 22:31:22 +0000
Subject: [PATCH 4/5] Collapse

---
 pyiceberg/partitioning.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index cee85db9cc..1813772217 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -234,9 +234,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             partition_field = self.fields[pos]
             value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos])

-            value_str = quote_plus(value_str, safe="")
-            value_strs.append(value_str)
-
+            value_strs.append(quote_plus(value_str, safe=""))
             field_strs.append(quote_plus(partition_field.name, safe=""))

         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])

From c75d637d4e6daa6c20f2de83181f2be3899fb913 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Fri, 10 Jan 2025 19:59:55 +0000
Subject: [PATCH 5/5] Make lint

---
 mkdocs/docs/api.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
index 8b106c1034..f1ef69b9cb 100644
--- a/mkdocs/docs/api.md
+++ b/mkdocs/docs/api.md
@@ -1077,6 +1077,7 @@ with table.update_schema() as update:
 with table.update_schema() as update:
     update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange")
 ```
+
 A complex type must exist before columns can be added to it. Fields in complex types are added in a tuple.

 ### Rename column
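
Note (illustration appended after the series, not part of any patch): the behavior the
patches converge on is that partition_to_path quote_plus-encodes both the partition
field name and the human-readable value, while make_compatible_name (imported from
pyiceberg.schema in PATCH 2) sanitizes special characters in the field names of the
partition Record. The helper below is a hypothetical sketch of that path encoding,
not a PyIceberg API:

    from urllib.parse import quote_plus

    def encode_partition_segment(name: str, value: str) -> str:
        # Mirrors the quote_plus encoding used in partition_to_path after
        # PATCH 4: '#' -> '%23', '+' -> '%2B', and space -> '+'.
        return quote_plus(name, safe="") + "=" + quote_plus(value, safe="")

    # Matches expected_hive_partition_path_slice in the special-characters test:
    print(encode_partition_segment("special#string+field", "special string"))
    # special%23string%2Bfield=special+string

    # Sanitization of the Record's field name, per the lambda removed in PATCH 1
    # (make_compatible_name is expected to behave the same way):
    print("special#string+field".replace("#", "_x23").replace("+", "_x2B"))
    # special_x23string_x2Bfield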