Nit fixes to URL-encoding of partition field names #1499
Changes from all commits:

- 8ba2c2f
- d303e13
- a4bb503
- 312b442
- 0aa6442
- c75d637
```diff
@@ -18,15 +18,15 @@
 import uuid
 from datetime import date, datetime, timedelta, timezone
 from decimal import Decimal
-from typing import Any, Callable, List, Optional
+from typing import Any, List
 
 import pytest
 from pyspark.sql import SparkSession
 from pyspark.sql.utils import AnalysisException
 
 from pyiceberg.catalog import Catalog
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec
-from pyiceberg.schema import Schema
+from pyiceberg.schema import Schema, make_compatible_name
 from pyiceberg.transforms import (
     BucketTransform,
     DayTransform,
```

> Reviewer comment on the `make_compatible_name` import: The changes in this file address #1457 (comment)
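For context on what that import provides: the test data below previously carried a per-case lambda replicating the sanitization (see the last parametrize case in this diff). Here is a minimal sketch of that behavior, assuming, as the removed lambda does, that only `#` and `+` need escaping; the real `make_compatible_name` in `pyiceberg.schema` handles the general case of Avro-incompatible names:

```python
# Hypothetical stand-in mirroring the lambda removed later in this diff.
# It hex-escapes characters that are not valid in Avro field names:
# '#' becomes '_x23' and '+' becomes '_x2B'.
def sanitize_field_name(name: str) -> str:
    return name.replace("#", "_x23").replace("+", "_x2B")


print(sanitize_field_name("special#string+field"))  # special_x23string_x2Bfield
```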
```diff
@@ -78,7 +78,7 @@
 
 
 @pytest.mark.parametrize(
-    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification, make_compatible_name",
+    "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification",
     [
         # # Identity Transform
         (
```
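The effect of dropping `make_compatible_name` from the argument list is visible in every case below: the trailing `None` placeholder disappears. A stripped-down sketch of the same pytest pattern, with hypothetical names, showing why the placeholders were needed in the first place:

```python
import pytest

# Before: an argument used by only one case forces a placeholder in all others.
cases_before = [
    ("plain", "plain", None),
    ("a#b", "a_x23b", lambda s: s.replace("#", "_x23")),
]

# After: apply the behavior unconditionally and drop the argument entirely.
cases_after = [
    ("plain", "plain"),
    ("a#b", "a_x23b"),
]


@pytest.mark.parametrize("raw, expected", cases_after)
def test_sanitize(raw: str, expected: str) -> None:
    assert raw.replace("#", "_x23") == expected
```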
```diff
@@ -99,7 +99,6 @@
             VALUES
             (false, 'Boolean field set to false');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=IdentityTransform(), name="string_field")],
@@ -119,7 +118,6 @@
             VALUES
             ('sample_string', 'Another string value')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int_field")],
@@ -139,7 +137,6 @@
             VALUES
             (42, 'Associated string value for int 42')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=IdentityTransform(), name="long_field")],
@@ -159,7 +156,6 @@
             VALUES
             (1234567890123456789, 'Associated string value for long 1234567890123456789')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=6, field_id=1001, transform=IdentityTransform(), name="float_field")],
@@ -183,7 +179,6 @@
             # VALUES
             # (3.14, 'Associated string value for float 3.14')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=7, field_id=1001, transform=IdentityTransform(), name="double_field")],
@@ -207,7 +202,6 @@
             # VALUES
             # (6.282, 'Associated string value for double 6.282')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -227,7 +221,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -247,7 +240,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=8, field_id=1001, transform=IdentityTransform(), name="timestamp_field")],
@@ -272,7 +264,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ), 'Associated string value for timestamp 2023-01-01T12:00:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=IdentityTransform(), name="timestamptz_field")],
@@ -297,7 +288,6 @@
             # VALUES
             # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00')
             # """
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=IdentityTransform(), name="date_field")],
@@ -317,7 +307,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Associated string value for date 2023-01-01')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=14, field_id=1001, transform=IdentityTransform(), name="uuid_field")],
@@ -337,7 +326,6 @@
             VALUES
             ('f47ac10b-58cc-4372-a567-0e02b2c3d479', 'Associated string value for UUID f47ac10b-58cc-4372-a567-0e02b2c3d479')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")],
@@ -357,7 +345,6 @@
             VALUES
             (CAST('example' AS BINARY), 'Associated string value for binary `example`')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")],
@@ -377,7 +364,6 @@
             VALUES
             (123.45, 'Associated string value for decimal 123.45')
             """,
-            None,
         ),
         # # Year Month Day Hour Transform
         # Month Transform
@@ -399,7 +385,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP_NTZ), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_field_month")],
@@ -419,7 +404,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_field_month")],
@@ -439,7 +423,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Year Transform
         (
@@ -460,7 +443,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event at 2023-01-01 11:55:59.999999');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_field_year")],
@@ -480,7 +462,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_field_year")],
@@ -500,7 +481,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # # Day Transform
         (
@@ -521,7 +501,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_field_day")],
@@ -541,7 +520,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_field_day")],
@@ -561,7 +539,6 @@
             VALUES
             (CAST('2023-01-01' AS DATE), 'Event on 2023-01-01');
             """,
-            None,
         ),
         # Hour Transform
         (
@@ -582,7 +559,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), 'Event within the 11th hour of 2023-01-01');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_field_hour")],
@@ -602,7 +578,6 @@
             VALUES
             (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Event at 2023-01-01 12:00:01.000999+03:00');
             """,
-            None,
         ),
         # Truncate Transform
         (
@@ -623,7 +598,6 @@
             VALUES
             (12345, 'Sample data for int');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="bigint_field_trunc")],
@@ -643,7 +617,6 @@
             VALUES
             (4294967297, 'Sample data for long');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(3), name="string_field_trunc")],
@@ -663,7 +636,6 @@
             VALUES
             ('abcdefg', 'Another sample for string');
             """,
-            None,
         ),
         (
             [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")],
@@ -683,7 +655,6 @@
             VALUES
             (678.90, 'Associated string value for decimal 678.90')
             """,
-            None,
         ),
         (
             [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")],
@@ -703,7 +674,6 @@
             VALUES
             (binary('HELLOICEBERG'), 'Sample data for binary');
             """,
-            None,
         ),
         # Bucket Transform
         (
@@ -724,7 +694,6 @@
             VALUES
             (10, 'Integer with value 10');
             """,
-            None,
         ),
         # Test multiple field combinations could generate the Partition record and hive partition path correctly
         (
@@ -753,7 +722,6 @@
             VALUES
             (CAST('2023-01-01 11:55:59.999999' AS TIMESTAMP), CAST('2023-01-01' AS DATE), 'some data');
             """,
-            None,
         ),
         # Test that special characters are URL-encoded
         (
@@ -773,7 +741,6 @@
             VALUES
             ('special string')
             """,
-            lambda name: name.replace("#", "_x23").replace("+", "_x2B"),
         ),
     ],
 )
```
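The special-characters case exercises two distinct encodings: the partition path slice is URL-encoded (per the PR title), while the field name written into the data file's partition record is hex-escaped. A sketch of the difference, using an assumed field name since the actual one is elided above:

```python
from urllib.parse import quote

field_name = "special#string+field"  # assumed for illustration

# In the Hive-style partition path, special characters are percent-encoded:
print(quote(field_name, safe=""))  # special%23string%2Bfield

# In the data file's partition record, the same name is hex-escaped to an
# Avro-compatible form (the job of make_compatible_name for these characters):
print(field_name.replace("#", "_x23").replace("+", "_x2B"))  # special_x23string_x2Bfield
```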
```diff
@@ -787,7 +754,6 @@ def test_partition_key(
     expected_hive_partition_path_slice: str,
     spark_create_table_sql_for_justification: str,
     spark_data_insert_sql_for_justification: str,
-    make_compatible_name: Optional[Callable[[str], str]],
 ) -> None:
     partition_field_values = [PartitionFieldValue(field, value) for field, value in zip(partition_fields, partition_values)]
     spec = PartitionSpec(*partition_fields)
```
```diff
@@ -823,11 +789,6 @@ def test_partition_key(
         snapshot.manifests(iceberg_table.io)[0].fetch_manifest_entry(iceberg_table.io)[0].data_file.file_path
     )
     # Special characters in partition value are sanitized when written to the data file's partition field
-    # Use `make_compatible_name` to match the sanitize behavior
-    sanitized_record = (
-        Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
-        if make_compatible_name
-        else expected_partition_record
-    )
+    sanitized_record = Record(**{make_compatible_name(k): v for k, v in vars(expected_partition_record).items()})
 
     assert spark_partition_for_justification == sanitized_record
     assert expected_hive_partition_path_slice in spark_path_for_justification
```

Reviewer discussion on this change:

> We can either do this and run … which one do you prefer?

> Sorry, not sure I understand. If it's always going to be …

> (Aside: it's not easy to have a default value for a …)

> Oh, I didn't know that! My comment was based on the fact that we can set a default … Yeah, that's also the point I want to make: make it clear that only certain test cases require it. But this is a nit comment; we don't necessarily have to do this.

> Yeah, not sure. I marginally prefer leaving it as it is now - it reads more nicely, prevents …

> (By `make lint` - see #1507)
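On the default-value aside in the thread: `pytest.mark.parametrize` has no built-in per-argument defaults, which is what made the trailing `None`s necessary. One hypothetical workaround (not what the PR does) is a helper that pads each case tuple:

```python
from typing import Any, List, Tuple


def with_defaults(cases: List[Tuple], arity: int, default: Any = None) -> List[Tuple]:
    """Pad every case tuple to `arity` with `default` (hypothetical helper)."""
    return [c + (default,) * (arity - len(c)) for c in cases]


print(with_defaults([(1, "1"), (2, "2", "custom")], arity=3))
# [(1, '1', None), (2, '2', 'custom')]
```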