Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ResidualVisitor to compute residuals #1388

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 212 additions & 1 deletion pyiceberg/expressions/visitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
from pyiceberg.manifest import DataFile, ManifestFile, PartitionFieldSummary
from pyiceberg.partitioning import PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.typedef import EMPTY_DICT, L, StructProtocol
from pyiceberg.typedef import EMPTY_DICT, L, Record, StructProtocol
from pyiceberg.types import (
DoubleType,
FloatType,
Expand Down Expand Up @@ -1731,3 +1731,214 @@ def _can_contain_nulls(self, field_id: int) -> bool:

def _can_contain_nans(self, field_id: int) -> bool:
return (nan_count := self.nan_counts.get(field_id)) is not None and nan_count > 0


class ResidualVisitor(BoundBooleanExpressionVisitor[BooleanExpression], ABC):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

schema: Schema
spec: PartitionSpec
case_sensitive: bool

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add:

expr: BooleanExpression

to the class variables as well?

def __init__(self, schema: Schema, spec: PartitionSpec, case_sensitive: bool, expr: BooleanExpression):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def __init__(self, schema: Schema, spec: PartitionSpec, case_sensitive: bool, expr: BooleanExpression):
def __init__(self, schema: Schema, spec: PartitionSpec, case_sensitive: bool, expr: BooleanExpression) -> None:

self.schema = schema
self.spec = spec
self.case_sensitive = case_sensitive
self.expr = expr

def eval(self, partition_data: Record) -> BooleanExpression:
    """Visit ``self.expr`` against the given partition data.

    The partition record is stored on ``self.struct`` so the ``visit_*``
    methods can evaluate bound terms against it.

    Args:
        partition_data: The partition record to evaluate against.

    Returns:
        Whatever ``visit`` returns for ``self.expr`` with this visitor.
    """
    self.struct = partition_data
    residual = visit(self.expr, visitor=self)
    return residual

def visit_true(self) -> BooleanExpression:
    """Return a new ``AlwaysTrue`` expression."""
    return AlwaysTrue()

def visit_false(self) -> BooleanExpression:
    """Return a new ``AlwaysFalse`` expression."""
    return AlwaysFalse()

def visit_not(self, child_result: BooleanExpression) -> BooleanExpression:
    """Wrap the child's residual in ``Not``.

    Args:
        child_result: Residual already computed for the negated child.
    """
    negated = Not(child_result)
    return negated

def visit_and(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression:
    """Combine the residuals of both operands with ``And``.

    Args:
        left_result: Residual already computed for the left operand.
        right_result: Residual already computed for the right operand.
    """
    conjunction = And(left_result, right_result)
    return conjunction

def visit_or(self, left_result: BooleanExpression, right_result: BooleanExpression) -> BooleanExpression:
    """Combine the residuals of both operands with ``Or``.

    Args:
        left_result: Residual already computed for the left operand.
        right_result: Residual already computed for the right operand.
    """
    disjunction = Or(left_result, right_result)
    return disjunction

def visit_is_null(self, term: BoundTerm[L]) -> BooleanExpression:
    """Evaluate an IS NULL predicate against ``self.struct``.

    Returns:
        ``AlwaysTrue()`` when the term evaluates to ``None``, otherwise
        ``AlwaysFalse()``.
    """
    value = term.eval(self.struct)
    return AlwaysTrue() if value is None else AlwaysFalse()

def visit_not_null(self, term: BoundTerm[L]) -> BooleanExpression:
    """Evaluate a NOT NULL predicate against ``self.struct``.

    Returns:
        ``AlwaysFalse()`` when the term evaluates to ``None``, otherwise
        ``AlwaysTrue()``.
    """
    value = term.eval(self.struct)
    return AlwaysFalse() if value is None else AlwaysTrue()

def visit_is_nan(self, term: BoundTerm[L]) -> BooleanExpression:
val = term.eval(self.struct)
if val is None:
return self.visit_true()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to Java, I think we can return AlwaysTrue directly, instead of calling visit_true(): https://github.com/apache/iceberg/blob/5fd16b5bfeb85e12b5a9ecb4e39504389d7b72ed/api/src/main/java/org/apache/iceberg/expressions/ResidualEvaluator.java#L157

Suggested change
return self.visit_true()
return AlwaysTrue()

else:
return self.visit_false()

def visit_not_nan(self, term: BoundTerm[L]) -> BooleanExpression:
    """Evaluate a NOT NaN predicate against ``self.struct``.

    The previous implementation only checked ``val is not None``, so an
    actual NaN partition value was reported as "not NaN". This version tests
    for NaN itself.

    Returns:
        ``self.visit_false()`` when the term evaluates to NaN, otherwise
        ``self.visit_true()``. Non-numeric values and ``None`` are not NaN
        and therefore yield ``self.visit_true()``.
    """
    val = term.eval(self.struct)
    # NaN is the only value that compares unequal to itself. Using the
    # self-comparison keeps this type-agnostic (str, bytes, UUID, ...), which
    # math.isnan is not.
    if val != val:
        return self.visit_false()
    return self.visit_true()
Comment on lines +1786 to +1790
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Java takes a different approach and checks for NaN:

Suggested change
val = term.eval(self.struct)
if val is not None:
return self.visit_true()
else:
return self.visit_false()
if isnan(term.eval(self.struct)):
return self.visit_true()
else:
return self.visit_false()

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change causes a type-checking problem: term.eval() returns an L, which can be of type str, bytes, or UUID, none of which isnan supports.

pyiceberg/expressions/visitors.py:1799: error: Argument 1 to "isnan" has incompatible type "str"; expected "Union[SupportsFloat, SupportsIndex]"  [arg-type]
pyiceberg/expressions/visitors.py:1799: error: Argument 1 to "isnan" has incompatible type "bytes"; expected "Union[SupportsFloat, SupportsIndex]"  [arg-type]
pyiceberg/expressions/visitors.py:1799: error: Argument 1 to "isnan" has incompatible type "UUID"; expected "Union[SupportsFloat, SupportsIndex]"  [arg-type]
pyiceberg/expressions/visitors.py:1805: error: Argument 1 to "isnan" has incompatible type "str"; expected "Union[SupportsFloat, SupportsIndex]"  [arg-type]
pyiceberg/expressions/visitors.py:1805: error: Argument 1 to "isnan" has incompatible type "bytes"; expected "Union[SupportsFloat, SupportsIndex]"  [arg-type]
pyiceberg/expressions/visitors.py:1805: error: Argument 1 to "isnan" has incompatible type "UUID"; expected "Union[SupportsFloat, SupportsIndex]"  [arg-type]


def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate ``term < literal`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term is strictly less than
        ``literal.value``, otherwise ``self.visit_false()``.
    """
    is_less = term.eval(self.struct) < literal.value
    return self.visit_true() if is_less else self.visit_false()

def visit_less_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate ``term <= literal`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term is less than or equal
        to ``literal.value``, otherwise ``self.visit_false()``.
    """
    within_bound = term.eval(self.struct) <= literal.value
    return self.visit_true() if within_bound else self.visit_false()

def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate ``term > literal`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term is strictly greater
        than ``literal.value``, otherwise ``self.visit_false()``.
    """
    is_greater = term.eval(self.struct) > literal.value
    return self.visit_true() if is_greater else self.visit_false()

def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate ``term >= literal`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term is greater than or
        equal to ``literal.value``, otherwise ``self.visit_false()``.
    """
    within_bound = term.eval(self.struct) >= literal.value
    return self.visit_true() if within_bound else self.visit_false()

def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate ``term == literal`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term equals
        ``literal.value``, otherwise ``self.visit_false()``.
    """
    matches = term.eval(self.struct) == literal.value
    return self.visit_true() if matches else self.visit_false()

def visit_not_equal(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate ``term != literal`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term differs from
        ``literal.value``, otherwise ``self.visit_false()``.
    """
    differs = term.eval(self.struct) != literal.value
    return self.visit_true() if differs else self.visit_false()

def visit_in(self, term: BoundTerm[L], literals: Set[L]) -> BooleanExpression:
    """Evaluate ``term IN literals`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term is a member of
        ``literals``, otherwise ``self.visit_false()``.
    """
    is_member = term.eval(self.struct) in literals
    return self.visit_true() if is_member else self.visit_false()

def visit_not_in(self, term: BoundTerm[L], literals: Set[L]) -> BooleanExpression:
    """Evaluate ``term NOT IN literals`` against ``self.struct``.

    Returns:
        ``self.visit_true()`` when the evaluated term is absent from
        ``literals``, otherwise ``self.visit_false()``.
    """
    is_absent = term.eval(self.struct) not in literals
    return self.visit_true() if is_absent else self.visit_false()

def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate a STARTS WITH predicate against ``self.struct``.

    Both the evaluated term and ``literal.value`` are converted with
    ``str()`` before the prefix comparison.

    Returns:
        ``AlwaysTrue()`` when the evaluated term is not ``None`` and its
        string form starts with the literal's string form, otherwise
        ``AlwaysFalse()``.
    """
    value = term.eval(self.struct)
    if value is None:
        return AlwaysFalse()
    prefix = str(literal.value)
    return AlwaysTrue() if str(value).startswith(prefix) else AlwaysFalse()

def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> BooleanExpression:
    """Evaluate a NOT STARTS WITH predicate against ``self.struct``.

    The previous implementation applied ``not`` to the expression object
    returned by ``visit_starts_with``. That tests the object's truthiness
    rather than whether the prefix matched, so the method could not
    distinguish the two outcomes. This version evaluates the prefix condition
    directly, using the same ``str()``-based comparison as
    ``visit_starts_with``, and negates it.

    Returns:
        ``self.visit_false()`` when the evaluated term is not ``None`` and its
        string form starts with the literal's string form, otherwise
        ``self.visit_true()`` (including when the term evaluates to ``None``).
    """
    value = term.eval(self.struct)
    if value is not None and str(value).startswith(str(literal.value)):
        return self.visit_false()
    return self.visit_true()

def visit_bound_predicate(self, predicate: BoundPredicate[Any]) -> BooleanExpression:
"""
If there is no strict projection or if it evaluates to false, then return the predicate.

Get the strict projection and inclusive projection of this predicate in partition data,
then use them to determine whether to return the original predicate. The strict projection
returns true iff the original predicate would have returned true, so the predicate can be
eliminated if the strict projection evaluates to true. Similarly the inclusive projection
returns false iff the original predicate would have returned false, so the predicate can
also be eliminated if the inclusive projection evaluates to false.

"""
parts = self.spec.fields_by_source_id(predicate.term.ref().field.field_id)
if parts == []:
return predicate

from pyiceberg.types import StructType
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's move this import to the top


def struct_to_schema(struct: StructType) -> Schema:
return Schema(*list(struct.fields))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The conversion to a list is not needed:

Suggested change
return Schema(*list(struct.fields))
return Schema(*struct.fields)
python3
Python 3.10.14 (main, Mar 19 2024, 21:46:16) [Clang 15.0.0 (clang-1500.3.9.4)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> def vo(*int):
...     print(int)
... 
>>> vo(*(1,2,3))
(1, 2, 3)


for part in parts:
strict_projection = part.transform.strict_project(part.name, predicate)
strict_result = None

if strict_projection is not None:
bound = strict_projection.bind(struct_to_schema(self.spec.partition_type(self.schema)))
if isinstance(bound, BoundPredicate):
strict_result = super().visit_bound_predicate(bound)
else:
strict_result = bound
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's keep the comments from Java in here, I think they are pretty helpful:

Suggested change
strict_result = bound
# if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse
strict_result = bound


if strict_result is not None and isinstance(strict_result, AlwaysTrue):
return AlwaysTrue()

inclusive_projection = part.transform.project(part.name, predicate)
inclusive_result = None
if inclusive_projection is not None:
bound_inclusive = inclusive_projection.bind(struct_to_schema(self.spec.partition_type(self.schema)))
if isinstance(bound_inclusive, BoundPredicate):
# using predicate method specific to inclusive
inclusive_result = super().visit_bound_predicate(bound_inclusive)
else:
# if the result is not a predicate, then it must be a constant like alwaysTrue or
# alwaysFalse
inclusive_result = bound_inclusive
if inclusive_result is not None and isinstance(inclusive_result, AlwaysFalse):
return AlwaysFalse()

return predicate

def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpression:
    """Compute the residual of an unbound predicate.

    The predicate is bound to ``self.schema`` using the visitor's configured
    ``self.case_sensitive`` setting (previously binding was always
    case-sensitive, ignoring the constructor argument).

    Args:
        predicate: The unbound predicate to evaluate.

    Returns:
        * the bound expression itself when binding does not produce a
          ``BoundPredicate``;
        * the original unbound ``predicate`` when the bound residual is
          neither ``AlwaysTrue()`` nor ``AlwaysFalse()``;
        * otherwise the constant residual.
    """
    bound = predicate.bind(self.schema, case_sensitive=self.case_sensitive)

    if not isinstance(bound, BoundPredicate):
        # if binding didn't result in a Predicate, return the expression
        return bound

    bound_residual = self.visit_bound_predicate(predicate=bound)
    if bound_residual not in (AlwaysFalse(), AlwaysTrue()):
        # the residual is still a predicate: hand back the original unbound one
        return predicate

    # use the non-predicate residual (e.g. alwaysTrue)
    return bound_residual


class ResidualEvaluator(ResidualVisitor):
    """Residual visitor exposing ``residual_for``, which delegates to ``eval``."""

    def residual_for(self, partition_data: Record) -> BooleanExpression:
        """Return the result of ``self.eval`` for the given partition record.

        Args:
            partition_data: The partition record to evaluate against.
        """
        residual = self.eval(partition_data)
        return residual


class UnpartitionedResidualEvaluator(ResidualEvaluator):
# Finds the residuals of an Expression for the partitions in the given PartitionSpec
def __init__(self, schema: Schema, expr: BooleanExpression):
from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's move this import to the top as well 👍


super().__init__(schema=schema, spec=UNPARTITIONED_PARTITION_SPEC, expr=expr, case_sensitive=False)
self.expr = expr

def residual_for(self, partition_data: Record) -> BooleanExpression:
return self.expr


def residual_evaluator_of(
    spec: PartitionSpec, expr: BooleanExpression, case_sensitive: bool, schema: Schema
) -> ResidualEvaluator:
    """Build the residual evaluator appropriate for ``spec``.

    Args:
        spec: Partition spec whose ``fields`` decide which evaluator is built.
        expr: Expression the evaluator is constructed with.
        case_sensitive: Forwarded to ``ResidualEvaluator``; not forwarded to
            ``UnpartitionedResidualEvaluator``.
        schema: Schema the evaluator is constructed with.

    Returns:
        An ``UnpartitionedResidualEvaluator`` when ``spec.fields`` is empty,
        otherwise a ``ResidualEvaluator``.
    """
    if len(spec.fields) == 0:
        return UnpartitionedResidualEvaluator(schema=schema, expr=expr)
    return ResidualEvaluator(spec=spec, expr=expr, schema=schema, case_sensitive=case_sensitive)
Loading