Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update like statements to reflect sql behaviors #91

Merged
merged 6 commits into from
Oct 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions pyiceberg/expressions/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
from decimal import Decimal

from pyparsing import (
Expand Down Expand Up @@ -51,7 +52,6 @@
NotIn,
NotNaN,
NotNull,
NotStartsWith,
Or,
Reference,
StartsWith,
Expand All @@ -78,6 +78,8 @@
identifier = Word(alphas, alphanums + "_$").set_results_name("identifier")
column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column")

like_regex = r'(?P<valid_wildcard>(?<!\\)%$)|(?P<invalid_wildcard>(?<!\\)%)'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My first choice would not be a regex, but it works well 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't want to complicate the parser, so this seemed like the most straightforward path.



@column.set_parse_action
def _(result: ParseResults) -> Reference:
Expand Down Expand Up @@ -217,12 +219,25 @@ def _(result: ParseResults) -> BooleanExpression:

@starts_with.set_parse_action
def _(result: ParseResults) -> BooleanExpression:
return StartsWith(result.column, result.raw_quoted_string)
return _evaluate_like_statement(result)


@not_starts_with.set_parse_action
def _(result: ParseResults) -> BooleanExpression:
return NotStartsWith(result.column, result.raw_quoted_string)
return ~_evaluate_like_statement(result)


def _evaluate_like_statement(result: ParseResults) -> BooleanExpression:
literal_like: StringLiteral = result.raw_quoted_string

match = re.search(like_regex, literal_like.value)

if match and match.groupdict()['invalid_wildcard']:
raise ValueError("LIKE expressions only supports wildcard, '%', at the end of a string")
elif match and match.groupdict()['valid_wildcard']:
return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%')))
else:
return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%')))


predicate = (comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name("predicate")
Expand Down
22 changes: 20 additions & 2 deletions tests/expressions/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,12 +168,30 @@ def test_multiple_and_or() -> None:
) == parser.parse("foo is not null and foo < 5 or (foo > 10 and foo < 100 and bar is null)")


def test_like_equality() -> None:
assert EqualTo("foo", "data") == parser.parse("foo LIKE 'data'")
assert EqualTo("foo", "data%") == parser.parse("foo LIKE 'data\\%'")


def test_starts_with() -> None:
assert StartsWith("foo", "data") == parser.parse("foo LIKE 'data'")
assert StartsWith("foo", "data") == parser.parse("foo LIKE 'data%'")
assert StartsWith("foo", "some % data") == parser.parse("foo LIKE 'some \\% data%'")
assert StartsWith("foo", "some data%") == parser.parse("foo LIKE 'some data\\%%'")


def test_invalid_likes() -> None:
invalid_statements = ["foo LIKE '%data%'", "foo LIKE 'da%ta'", "foo LIKE '%data'"]

for statement in invalid_statements:
with pytest.raises(ValueError) as exc_info:
parser.parse(statement)

assert "LIKE expressions only supports wildcard, '%', at the end of a string" in str(exc_info)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice one, this was one of my concerns 👍



def test_not_starts_with() -> None:
assert NotStartsWith("foo", "data") == parser.parse("foo NOT LIKE 'data'")
assert NotEqualTo("foo", "data") == parser.parse("foo NOT LIKE 'data'")
assert NotStartsWith("foo", "data") == parser.parse("foo NOT LIKE 'data%'")


def test_with_function() -> None:
Expand Down