diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
index 8b106c1034..f1ef69b9cb 100644
--- a/mkdocs/docs/api.md
+++ b/mkdocs/docs/api.md
@@ -1077,6 +1077,7 @@ with table.update_schema() as update:
 with table.update_schema() as update:
     update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange")
 ```
+A complex type must exist before columns can be added to it. Fields in complex types are added in a tuple.
 
 ### Rename column
 
diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py
index c9b6316f59..1813772217 100644
--- a/pyiceberg/partitioning.py
+++ b/pyiceberg/partitioning.py
@@ -234,11 +234,8 @@ def partition_to_path(self, data: Record, schema: Schema) -> str:
             partition_field = self.fields[pos]
             value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos])
 
-            value_str = quote_plus(value_str, safe="")
-            value_strs.append(value_str)
-
-            field_str = quote_plus(partition_field.name, safe="")
-            field_strs.append(field_str)
+            value_strs.append(quote_plus(value_str, safe=""))
+            field_strs.append(quote_plus(partition_field.name, safe=""))
 
         path = "/".join([field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs)])
         return path
diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py
index 1ac808c7d0..3955259d33 100644
--- a/tests/integration/test_partitioning_key.py
+++ b/tests/integration/test_partitioning_key.py
@@ -18,7 +18,7 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, Callable, List, Optional
+from typing import Any, List
 
 import pytest
 from pyspark.sql import SparkSession
@@ -26,7 +26,7 @@
 
 from pyiceberg.catalog import Catalog
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, make_compatible_name
 from pyiceberg.transforms import (
     BucketTransform,
     DayTransform,
@@ -78,7 +78,7 @@
 
 
 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
     [
         # # Identity Transform
         (
@@ -99,7 +99,6 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -119,7 +118,6 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -139,7 +137,6 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -159,7 +156,6 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -183,7 +179,6 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -207,7 +202,6 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -227,7 +221,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -247,7 +240,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -272,7 +264,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -297,7 +288,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -317,7 +307,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -337,7 +326,6 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -357,7 +345,6 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -377,7 +364,6 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
-            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=MonthTransform(), name="timestamp_field_month")],
@@ -399,7 +385,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -419,7 +404,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -439,7 +423,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Year Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=YearTransform(), name="timestamp_field_year")],
@@ -460,7 +443,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -480,7 +462,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -500,7 +481,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # # Day Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=DayTransform(), name="timestamp_field_day")],
@@ -521,7 +501,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -541,7 +520,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -561,7 +539,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Hour Transform
         (
             [PartitionField(source_id=8, field_id=1001, transform=HourTransform(), name="timestamp_field_hour")],
@@ -582,7 +559,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -602,7 +578,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         # Truncate Transform
         (
             [PartitionField(source_id=4, field_id=1001, transform=TruncateTransform(10), name="int_field_trunc")],
@@ -623,7 +598,6 @@
             VALUES
             (12345, 'Sample data for int');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -643,7 +617,6 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -663,7 +636,6 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -683,7 +655,6 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -703,7 +674,6 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
-            None,
         ),
         # Bucket Transform
         (
             [PartitionField(source_id=4, field_id=1001, transform=BucketTransform(2), name="int_field_bucket")],
@@ -724,7 +694,6 @@
             VALUES
             (10, 'Integer with value 10');
             """,
-            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
@@ -753,7 +722,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
-            None,
         ),
         # Test that special characters are URL-encoded
         (
@@ -773,7 +741,6 @@
             VALUES
             ('special string')
             """,
-            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
@@ -787,7 +754,6 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
-    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
@@ -823,11 +789,6 @@ def test_partition_key(
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
     # Special characters in partition value are sanitized when written to the data file's partition field
-    # Use `make_compatible_name` to match the sanitize behavior
-    sanitized_record = (
-        Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
-        if make_compatible_name
-        else expected_partition_record
-    )
+    sanitized_record = Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
    assert spark_partition_for_justification == sanitized_record
    assert expected_hive_partition_path_slice in spark_path_for_justification
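
Note on the `api.md` change: the added sentence documents that a nested field can only be added once its parent struct exists, and that the nested field is addressed by a tuple path. A minimal sketch of that two-step flow, assuming a loaded `table` whose schema does not yet contain `details` (the column names are illustrative, and whether an empty `StructType()` is accepted as the initial parent is an assumption here, not something this diff shows):

```python
from pyiceberg.types import StringType, StructType

# Step 1: create the parent complex type first (assumption: an empty struct is accepted).
with table.update_schema() as update:
    update.add_column("details", StructType())

# Step 2: the nested field is addressed by a tuple path into the existing struct.
with table.update_schema() as update:
    update.add_column(("details", "confirmed_by"), StringType(), "Name of the exchange")
```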
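The `partitioning.py` hunk only restructures how `partition_to_path` collects its quoted strings; the behavior is unchanged: both the partition field name and the human-readable value go through `quote_plus(..., safe="")` and are joined into Hive-style `name=value` segments. A standalone sketch of that joining logic follows; the helper name and the sample field name are made up for illustration, while the `quote_plus` calls and the join expression mirror the lines in the diff:

```python
from typing import List
from urllib.parse import quote_plus


def hive_partition_path(field_names: List[str], human_values: List[str]) -> str:
    # Quote every field name and every human-readable value with no "safe"
    # characters, then build "name=value" segments separated by "/".
    field_strs = [quote_plus(name, safe="") for name in field_names]
    value_strs = [quote_plus(value, safe="") for value in human_values]
    return "/".join(field_str + "=" + value_str for field_str, value_str in zip(field_strs, value_strs))


# '#' and '+' become %23 and %2B; a space in the value becomes '+'.
assert hive_partition_path(["special#string+field"], ["special string"]) == "special%23string%2Bfield=special+string"
```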
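On the test side, the per-case `make_compatible_name` callable is dropped from the parametrization: the expected partition record is now always passed through the real `make_compatible_name` imported from `pyiceberg.schema`, which only makes sense if names that are already valid pass through unchanged. The removed lambda shows the sanitization the one affected case relied on; a minimal stand-in with the same effect on `#` and `+` is sketched below (the input name is illustrative, and the real function presumably handles more characters than these two):

```python
def sanitize_like_removed_lambda(name: str) -> str:
    # Same replacements the deleted parametrize entry spelled out by hand:
    # characters that are not Avro-friendly are rewritten as _xNN escapes.
    return name.replace("#", "_x23").replace("+", "_x2B")


assert sanitize_like_removed_lambda("special#string+field") == "special_x23string_x2Bfield"
assert sanitize_like_removed_lambda("boolean_field") == "boolean_field"  # already-valid names are untouched
```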