From 8ba2c2f983e6b9faf22425c5eb511df096161f46 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Wed, 8 Jan 2025 17:46:43 +0000
Subject: [PATCH 1/5] Revert "Add `make_name_compatible` suggestion so test passes"

This reverts commit 61cdd08c59f3f1d3119b5f907eb09dbbcf80b8c2.
---
 tests/integration/test_partitioning_key.py | 62 +++++-----------------
 1 file changed, 13 insertions(+), 49 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 1ac808c7d0..6779018003 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -18,7 +18,7 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, Callable, List, Optional
+from typing import Any, List

 import pytest
 from pyspark.sql import SparkSession
@@ -78,7 +78,7 @@


 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
     [
         # # Identity Transform
         (
@@ -99,7 +99,6 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -119,7 +118,6 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -139,7 +137,6 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -159,7 +156,6 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -183,7 +179,6 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -207,7 +202,6 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -227,7 +221,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -247,7 +240,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -272,7 +264,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -297,7 +288,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -317,7 +307,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -337,7 +326,6 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -357,7 +345,6 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -377,7 +364,6 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
-            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=MonthTransform(), name="timestamp_field_month")],
@@ -399,7 +385,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -419,7 +404,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -439,7 +423,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Year Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=YearTransform(), name="timestamp_field_year")],
@@ -460,7 +443,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -480,7 +462,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -500,7 +481,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # # Day Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=DayTransform(), name="timestamp_field_day")],
@@ -521,7 +501,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -541,7 +520,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -561,7 +539,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Hour Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=HourTransform(), name="timestamp_field_hour")],
@@ -582,7 +559,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -602,7 +578,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         # Truncate Transform
         (
             [PartitionField(source_id=4, field_id=1001, transform=TruncateTransform(10), name="int_field_trunc")],
@@ -623,7 +598,6 @@
             VALUES
             (12345, 'Sample data for int');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -643,7 +617,6 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -663,7 +636,6 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -683,7 +655,6 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -703,7 +674,6 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
-            None,
         ),
         # Bucket Transform
         (
             [PartitionField(source_id=4, field_id=1001, transform=BucketTransform(2), name="int_field_bucket")],
@@ -724,7 +694,6 @@
             VALUES
             (10, 'Integer with value 10');
             """,
-            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
             [
                 PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field"),
                 PartitionField(source_id=10, field_id=1002, transform=IdentityTransform(), name="date_field"),
             ],
@@ -753,27 +722,30 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
-            None,
         ),
         # Test that special characters are URL-encoded
         (
-            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
             ["special string"],
-            Record(**{"special#string+field": "special string"}),  # type: ignore
-            "special%23string%2Bfield=special+string",
+            Record(**{"special#string#field": "special string"}),  # type: ignore
+            "special%23string%23field=special+string",
+            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
+            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
+            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
+            # None,
+            # None,
             f"""CREATE TABLE {identifier} (
-                `special#string+field` string
+                `special#string#field` string
             )
             USING iceberg
             PARTITIONED BY (
-                identity(`special#string+field`)
+                identity(`special#string#field`)
             )
             """,
             f"""INSERT INTO {identifier}
             VALUES
             ('special string')
             """,
-            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
@@ -787,7 +759,6 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
-    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -822,12 +793,5 @@ def test_partition_key(
     spark_path_for_justification = (
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
-    # Special characters in partition value are sanitized when written to the data file's partition field
-    # Use `make_compatible_name` to match the sanitize behavior
-    sanitized_record = (
-        Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
-        if make_compatible_name
-        else expected_partition_record
-    )
-    assert spark_partition_for_justification == sanitized_record
+    assert spark_partition_for_justification == expected_partition_record
     assert expected_hive_partition_path_slice in spark_path_for_justification

From d303e13a550d6b02c5230ca4d8671c59ad1ea889 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Wed, 8 Jan 2025 17:54:58 +0000
Subject: [PATCH 2/5] Nit fixes to URL-encoding of partition field names

---
 pyiceberg/partitioning.py                  | 3 +--
 tests/integration/test_partitioning_key.py | 6 ++++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index c9b6316f59..cee85db9cc 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -237,8 +237,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             value_str = quote_plus(value_str, safe="")
             value_strs.append(value_str)

-            field_str = quote_plus(partition_field.name, safe="")
-            field_strs.append(field_str)
+            field_strs.append(quote_plus(partition_field.name, safe=""))

         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
         return path

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 6779018003..c7976e91f4 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -26,7 +26,7 @@

 from pyiceberg.catalog import Catalog
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, make_compatible_name
 from pyiceberg.transforms import (
     BucketTransform,
     DayTransform,
@@ -793,5 +793,7 @@ def test_partition_key(
     spark_path_for_justification = (
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
-    assert spark_partition_for_justification == expected_partition_record
+    # Special characters in partition value are sanitized when written to the data file's partition field
+    sanitized_record = Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
+    assert spark_partition_for_justification == sanitized_record
     assert expected_hive_partition_path_slice in spark_path_for_justification

From a4bb503f6df516952782bf2296c5516bc890b3f6 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Wed, 8 Jan 2025 17:59:16 +0000
Subject: [PATCH 3/5] Fix tests

---
 tests/integration/test_partitioning_key.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index c7976e91f4..3955259d33 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -725,21 +725,16 @@
         ),
         # Test that special characters are URL-encoded
         (
-            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string#field")],
+            [PartitionField(source_id=15, field_id=1001, transform=IdentityTransform(), name="special#string+field")],
             ["special string"],
-            Record(**{"special#string#field": "special string"}),  # type: ignore
-            "special%23string%23field=special+string",
-            # Spark currently writes differently to PyIceberg w.r.t special column name sanitization so justification
-            # (comparing expected value with Spark behavior) would fail: PyIceberg produces
-            # Record[special_x23string_x23field='special string'], not Record[special#string#field='special string'].
-            # None,
-            # None,
+            Record(**{"special#string+field": "special string"}),  # type: ignore
+            "special%23string%2Bfield=special+string",
             f"""CREATE TABLE {identifier} (
-                `special#string#field` string
+                `special#string+field` string
             )
             USING iceberg
             PARTITIONED BY (
-                identity(`special#string#field`)
+                identity(`special#string+field`)
             )
             """,
             f"""INSERT INTO {identifier}

From 312b442ee071dc018b79e75ff8467258df011966 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Thu, 9 Jan 2025 22:31:22 +0000
Subject: [PATCH 4/5] Collapse

---
 pyiceberg/partitioning.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index cee85db9cc..1813772217 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -234,9 +234,7 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             partition_field = self.fields[pos]
             value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos])

-            value_str = quote_plus(value_str, safe="")
-            value_strs.append(value_str)
-
+            value_strs.append(quote_plus(value_str, safe=""))
             field_strs.append(quote_plus(partition_field.name, safe=""))

         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])

From c75d637d4e6daa6c20f2de83181f2be3899fb913 Mon Sep 17 00:00:00 2001
From: Sreesh Maheshwar
Date: Fri, 10 Jan 2025 19:59:55 +0000
Subject: [PATCH 5/5] Make lint

---
 mkdocs/docs/api.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
index 8b106c1034..f1ef69b9cb 100644
--- a/mkdocs/docs/api.md
+++ b/mkdocs/docs/api.md
@@ -1077,6 +1077,7 @@ with table.update_schema() as update:
 with table.update_schema() as update:
     update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange")
 ```
+
 A complex type must exist before columns can be added to it. Fields in complex types are added in a tuple.

 ### Rename column
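
Note (illustration appended after the series, not part of any patch): the behavior the
patches converge on is that partition_to_path quote_plus-encodes both the partition
field name and the human-readable value, while make_compatible_name (imported from
pyiceberg.schema in PATCH 2) sanitizes special characters in the field names of the
partition Record. The helper below is a hypothetical sketch of that path encoding,
not a PyIceberg API:

    from urllib.parse import quote_plus

    def encode_partition_segment(name: str, value: str) -> str:
        # Mirrors the quote_plus encoding used in partition_to_path after
        # PATCH 4: '#' -> '%23', '+' -> '%2B', and space -> '+'.
        return quote_plus(name, safe="") + "=" + quote_plus(value, safe="")

    # Matches expected_hive_partition_path_slice in the special-characters test:
    print(encode_partition_segment("special#string+field", "special string"))
    # special%23string%2Bfield=special+string

    # Sanitization of the Record's field name, per the lambda removed in PATCH 1
    # (make_compatible_name is expected to behave the same way):
    print("special#string+field".replace("#", "_x23").replace("+", "_x2B"))
    # special_x23string_x2Bfield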