Merge branch 'main' of github.com:apache/iceberg-python into fd-warnings

apache · Nov 2, 2023 · 35a4311 · 35a4311
2 parents c2c4504 + d92bc18
commit 35a4311
Show file tree

Hide file tree

Showing 21 changed files with 523 additions and 188 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -70,3 +70,11 @@ repos:
           ]
         additional_dependencies:
           - tomli==2.0.1
+  - repo: https://github.com/ikamensh/flynt
+    rev: 1.0.1
+    hooks:
+      - id: flynt
+        args:
+          # --line-length is set to a high value to deal with very long lines
+          - --line-length
+          - '99999'
diff --git a/dev/provision.py b/dev/provision.py
@@ -301,3 +301,23 @@
 );
 """
 )
+
+spark.sql(
+    """
+CREATE TABLE default.test_table_sanitized_character (
+    `letter/abc` string
+)
+USING iceberg
+TBLPROPERTIES (
+    'format-version'='1'
+);
+"""
+)
+
+spark.sql(
+    f"""
+INSERT INTO default.test_table_sanitized_character
+VALUES
+    ('123')
+"""
+)
diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
@@ -54,6 +54,12 @@ catalog = load_catalog(
 )
 ```
 
+If the catalog has not been initialized before, you need to run:
+
+```python
+catalog.create_tables()
+```
+
 Let's create a namespace:
 
 ```python
@@ -124,6 +130,7 @@ sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))
 catalog.create_table(
     identifier="docs_example.bids",
     schema=schema,
+    location="s3://pyiceberg",
     partition_spec=partition_spec,
     sort_order=sort_order,
 )

diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md
@@ -54,7 +54,7 @@ Both the source distribution (`sdist`) and the binary distributions (`wheels`) n
 
 Before committing the files to the Apache SVN artifact distribution SVN hashes need to be generated, and those need to be signed with gpg to make sure that they are authentic.
 
-Go to [Github Actions and run the `Python release` action](https://github.com/apache/iceberg/actions/workflows/python-release.yml). **Set the version to main, since we cannot modify the source**. Download the zip, and sign the files:
+Go to [Github Actions and run the `Python release` action](https://github.com/apache/iceberg-python/actions/workflows/python-release.yml). **Set the version to main, since we cannot modify the source**. Download the zip, and sign the files:
 
 ```bash
 cd release-main/
@@ -106,8 +106,8 @@ A summary of the high level features:
 The commit ID is $LAST_COMMIT_ID
 
 * This corresponds to the tag: $GIT_TAG ($GIT_TAG_HASH)
-* https://github.com/apache/iceberg/releases/tag/$GIT_TAG
-* https://github.com/apache/iceberg/tree/$LAST_COMMIT_ID
+* https://github.com/apache/iceberg-python/releases/tag/$GIT_TAG
+* https://github.com/apache/iceberg-python/tree/$LAST_COMMIT_ID
 
 The release tarball, signature, and checksums are here:
 
@@ -163,4 +163,4 @@ Thanks to everyone for contributing!
 
 ## Release the docs
 
-A committer triggers the [`Python Docs` Github Actions](https://github.com/apache/iceberg/actions/workflows/python-ci-docs.yml) through the UI by selecting the branch that just has been released. This will publish the new docs.
+A committer triggers the [`Python Docs` Github Actions](https://github.com/apache/iceberg-python/actions/workflows/python-ci-docs.yml) through the UI by selecting the branch that just has been released. This will publish the new docs.
diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md
@@ -38,17 +38,17 @@ You can install the latest release version from pypi:
 pip install "pyiceberg[s3fs,hive]"
 ```
 
-Install it directly for Github (not recommended), but sometimes handy:
+Install it directly for GitHub (not recommended), but sometimes handy:
 
 ```
-pip install "git+https://github.com/apache/iceberg.git#subdirectory=python&egg=pyiceberg[s3fs]"
+pip install "git+https://github.com/apache/iceberg-python.git#egg=pyiceberg[s3fs]"
 ```
 
 Or clone the repository for local development:
 
 ```sh
-git clone https://github.com/apache/iceberg.git
-cd iceberg/python
+git clone https://github.com/apache/iceberg-python.git
+cd iceberg-python
 pip3 install -e ".[s3fs,hive]"
 ```
 

diff --git a/mkdocs/docs/verify-release.md b/mkdocs/docs/verify-release.md
@@ -59,7 +59,7 @@ done
 
 ```sh
 cd  /tmp/pyiceberg/
-for name in $(ls /tmp/pyiceberg/pyiceberg-*.whl.asc.sha512 /tmp/pyiceberg/pyiceberg-*.tar.gz.asc.sha512)
+for name in $(ls /tmp/pyiceberg/pyiceberg-*.whl.sha512 /tmp/pyiceberg/pyiceberg-*.tar.gz.sha512)
 do
     shasum -a 512 --check ${name}
 done

diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt
@@ -16,13 +16,13 @@
 # under the License.
 
 mkdocs==1.5.3
-griffe==0.36.5
+griffe==0.36.7
 jinja2==3.1.2
 mkdocstrings==0.23.0
 mkdocstrings-python==1.7.3
 mkdocs-literate-nav==0.6.1
 mkdocs-autorefs==0.5.0
 mkdocs-gen-files==0.5.0
-mkdocs-material==9.4.2
+mkdocs-material==9.4.7
 mkdocs-material-extensions==1.2
 mkdocs-section-index==0.3.8
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyiceberg/expressions/__init__.py b/pyiceberg/expressions/__init__.py
@@ -346,7 +346,7 @@ def __init__(self, term: BoundTerm[L]):
 
     def __eq__(self, other: Any) -> bool:
         """Return the equality of two instances of the BoundPredicate class."""
-        if isinstance(other, BoundPredicate):
+        if isinstance(other, self.__class__):
             return self.term == other.term
         return False
 
@@ -364,7 +364,7 @@ def __init__(self, term: Union[str, UnboundTerm[Any]]):
 
     def __eq__(self, other: Any) -> bool:
         """Return the equality of two instances of the UnboundPredicate class."""
-        return self.term == other.term if isinstance(other, UnboundPredicate) else False
+        return self.term == other.term if isinstance(other, self.__class__) else False
 
     @abstractmethod
     def bind(self, schema: Schema, case_sensitive: bool = True) -> BooleanExpression:
@@ -531,7 +531,7 @@ def __repr__(self) -> str:
 
     def __eq__(self, other: Any) -> bool:
         """Return the equality of two instances of the SetPredicate class."""
-        return self.term == other.term and self.literals == other.literals if isinstance(other, SetPredicate) else False
+        return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False
 
     def __getnewargs__(self) -> Tuple[UnboundTerm[L], Set[Literal[L]]]:
         """Pickle the SetPredicate class."""
@@ -567,7 +567,7 @@ def __repr__(self) -> str:
 
     def __eq__(self, other: Any) -> bool:
         """Return the equality of two instances of the BoundSetPredicate class."""
-        return self.term == other.term and self.literals == other.literals if isinstance(other, BoundSetPredicate) else False
+        return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False
 
     def __getnewargs__(self) -> Tuple[BoundTerm[L], Set[Literal[L]]]:
         """Pickle the BoundSetPredicate class."""
@@ -595,7 +595,7 @@ def __invert__(self) -> BoundNotIn[L]:
 
     def __eq__(self, other: Any) -> bool:
         """Return the equality of two instances of the BoundIn class."""
-        return self.term == other.term and self.literals == other.literals if isinstance(other, BoundIn) else False
+        return self.term == other.term and self.literals == other.literals if isinstance(other, self.__class__) else False
 
     @property
     def as_unbound(self) -> Type[In[L]]:
@@ -664,12 +664,6 @@ def __invert__(self) -> In[L]:
         """Transform the Expression into its negated version."""
         return In[L](self.term, self.literals)
 
-    def __eq__(self, other: Any) -> bool:
-        """Return the equality of two instances of the NotIn class."""
-        if isinstance(other, NotIn):
-            return self.term == other.term and self.literals == other.literals
-        return False
-
     @property
     def as_bound(self) -> Type[BoundNotIn[L]]:
         return BoundNotIn[L]
@@ -701,7 +695,7 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundLiteralPredi
 
     def __eq__(self, other: Any) -> bool:
         """Return the equality of two instances of the LiteralPredicate class."""
-        if isinstance(other, LiteralPredicate):
+        if isinstance(other, self.__class__):
             return self.term == other.term and self.literal == other.literal
         return False
 
@@ -725,7 +719,7 @@ def __init__(self, term: BoundTerm[L], literal: Literal[L]):  # pylint: disable=
 
     def __eq__(self, other: Any) -> bool:
         """Return the equality of two instances of the BoundLiteralPredicate class."""
-        if isinstance(other, BoundLiteralPredicate):
+        if isinstance(other, self.__class__):
             return self.term == other.term and self.literal == other.literal
         return False
 
@@ -890,9 +884,9 @@ def as_bound(self) -> Type[BoundStartsWith[L]]:
 
 
 class NotStartsWith(LiteralPredicate[L]):
-    def __invert__(self) -> NotStartsWith[L]:
+    def __invert__(self) -> StartsWith[L]:
         """Transform the Expression into its negated version."""
-        return NotStartsWith[L](self.term, self.literal)
+        return StartsWith[L](self.term, self.literal)
 
     @property
     def as_bound(self) -> Type[BoundNotStartsWith[L]]:

diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py
@@ -14,6 +14,7 @@
 #  KIND, either express or implied.  See the License for the
 #  specific language governing permissions and limitations
 #  under the License.
+import re
 from decimal import Decimal
 
 from pyparsing import (
@@ -51,7 +52,6 @@
     NotIn,
     NotNaN,
     NotNull,
-    NotStartsWith,
     Or,
     Reference,
     StartsWith,
@@ -78,6 +78,8 @@
 identifier = Word(alphas, alphanums + "_$").set_results_name("identifier")
 column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column")
 
+like_regex = r'(?P<valid_wildcard>(?<!\\)%$)|(?P<invalid_wildcard>(?<!\\)%)'
+
 
 @column.set_parse_action
 def _(result: ParseResults) -> Reference:
@@ -217,12 +219,25 @@ def _(result: ParseResults) -> BooleanExpression:
 
 @starts_with.set_parse_action
 def _(result: ParseResults) -> BooleanExpression:
-    return StartsWith(result.column, result.raw_quoted_string)
+    return _evaluate_like_statement(result)
 
 
 @not_starts_with.set_parse_action
 def _(result: ParseResults) -> BooleanExpression:
-    return NotStartsWith(result.column, result.raw_quoted_string)
+    return ~_evaluate_like_statement(result)
+
+
+def _evaluate_like_statement(result: ParseResults) -> BooleanExpression:
+    literal_like: StringLiteral = result.raw_quoted_string
+
+    match = re.search(like_regex, literal_like.value)
+
+    if match and match.groupdict()['invalid_wildcard']:
+        raise ValueError("LIKE expressions only supports wildcard, '%', at the end of a string")
+    elif match and match.groupdict()['valid_wildcard']:
+        return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%')))
+    else:
+        return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%')))
 
 
 predicate = (comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name("predicate")
@@ -252,4 +267,4 @@ def handle_or(result: ParseResults) -> Or:
 
 def parse(expr: str) -> BooleanExpression:
     """Parse a boolean expression."""
-    return boolean_expression.parse_string(expr)[0]
+    return boolean_expression.parse_string(expr, parse_all=True)[0]
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
@@ -113,6 +113,7 @@
     pre_order_visit,
     promote,
     prune_columns,
+    sanitize_column_names,
     visit,
     visit_with_partner,
 )
@@ -632,7 +633,7 @@ def visit_pyarrow(obj: Union[pa.DataType, pa.Schema], visitor: PyArrowSchemaVisi
     Raises:
         NotImplementedError: If attempting to visit an unrecognized object type.
     """
-    raise NotImplementedError("Cannot visit non-type: %s" % obj)
+    raise NotImplementedError(f"Cannot visit non-type: {obj}")
 
 
 @visit_pyarrow.register(pa.Schema)
@@ -830,7 +831,7 @@ def _task_to_table(
             bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
             pyarrow_filter = expression_to_pyarrow(bound_file_filter)
 
-        file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
+        file_project_schema = sanitize_column_names(prune_columns(file_schema, projected_field_ids, select_full_types=False))
 
         if file_schema is None:
             raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")
@@ -1099,8 +1100,8 @@ def map_value_partner(self, partner_map: Optional[pa.Array]) -> Optional[pa.Arra
         return partner_map.items if isinstance(partner_map, pa.MapArray) else None
 
 
-def _primitive_to_phyisical(iceberg_type: PrimitiveType) -> str:
-    return visit(iceberg_type, _PRIMITIVE_TO_PHYISCAL_TYPE_VISITOR)
+def _primitive_to_physical(iceberg_type: PrimitiveType) -> str:
+    return visit(iceberg_type, _PRIMITIVE_TO_PHYSICAL_TYPE_VISITOR)
 
 
 class PrimitiveToPhysicalType(SchemaVisitorPerPrimitiveType[str]):
@@ -1162,7 +1163,7 @@ def visit_binary(self, binary_type: BinaryType) -> str:
         return "BYTE_ARRAY"
 
 
-_PRIMITIVE_TO_PHYISCAL_TYPE_VISITOR = PrimitiveToPhysicalType()
+_PRIMITIVE_TO_PHYSICAL_TYPE_VISITOR = PrimitiveToPhysicalType()
 
 
 class StatsAggregator:
@@ -1175,7 +1176,7 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc
         self.current_max = None
         self.trunc_length = trunc_length
 
-        expected_physical_type = _primitive_to_phyisical(iceberg_type)
+        expected_physical_type = _primitive_to_physical(iceberg_type)
         if expected_physical_type != physical_type_string:
             raise ValueError(
                 f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}"