Merge branch 'master' into fix/column-name-wildcard-prefix-table-name

macbre · Sep 11, 2024 · 7ea22c1 · 7ea22c1
2 parents 4496724 + c2849ea
commit 7ea22c1
Show file tree

Hide file tree

Showing 23 changed files with 818 additions and 540 deletions.
diff --git a/.github/workflows/auto-merge-dependabot.yml b/.github/workflows/auto-merge-dependabot.yml
@@ -13,7 +13,7 @@ jobs:
         steps:
             - name: Dependabot metadata
               id: metadata
-              uses: dependabot/fetch-metadata@v1.3.3
+              uses: dependabot/fetch-metadata@v2.2.0
               with:
                   github-token: "${{ secrets.GITHUB_TOKEN }}"
 

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
@@ -12,11 +12,11 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
       with:
-        python-version: 3.8
+        python-version: 3.x
 
     - name: Install black
       run: |

diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
@@ -13,24 +13,26 @@ jobs:
     runs-on: ubuntu-latest
 
     strategy:
+      fail-fast: false
       matrix:
         # https://github.com/actions/python-versions/blob/main/versions-manifest.json
+        # https://devguide.python.org/versions/#supported-versions
         python-version:
-        - "3.6"
-        - "3.7"
         - "3.8"
         - "3.9"
         - "3.10"
+        - "3.11"
+        - "3.12"
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
 
     - name: Load ~/.cache directory and Poetry .venv
-      uses: actions/cache@v3
+      uses: actions/cache@v4
       with:
         path: |
           ~/.cache
@@ -45,8 +47,9 @@ jobs:
 
     # https://github.com/marketplace/actions/install-poetry-action
     - name: Install Poetry
-      uses: snok/install-poetry@v1.3
+      uses: snok/install-poetry@v1.4.1
       with:
+        version: latest
         virtualenvs-create: true
         virtualenvs-in-project: true
 

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -16,14 +16,14 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
     - name: Set up Python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: '3.x'
 
     - name: Install Poetry
-      uses: snok/install-poetry@v1.3
+      uses: snok/install-poetry@v1.4.1
       with:
         virtualenvs-create: true
         virtualenvs-in-project: true

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sql_metadata"
-version = "2.6.0"
+version = "2.13.0"
 license="MIT"
 description = "Uses tokenized query returned by python-sqlparse and generates query metadata"
 authors = ["Maciej Brencz <[email protected]>", "Radosław Drążkiewicz <[email protected]>"]
@@ -13,17 +13,17 @@ packages = [
 ]
 
 [tool.poetry.dependencies]
-python = "^3.6.2"
-sqlparse = "^0.4.1"
+python = "^3.8"
+sqlparse = ">=0.4.1,<0.6.0"
 
 [tool.poetry.dev-dependencies]
-black = "^22.6"
-coverage = {extras = ["toml"], version = "^6.2"}
-pylint = "^2.13.9"
-pytest = "^7.0.1"
-pytest-cov = "^3.0.0"
+black = "^24.8"
+coverage = {extras = ["toml"], version = "^6.5"}
+pylint = "^3.2.7"
+pytest = "^8.3.3"
+pytest-cov = "^5.0.0"
 coveralls = "^3.3.1"
-flake8 = "^4.0.1"
+flake8 = "^5.0.4"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]

diff --git a/sql_metadata/__init__.py b/sql_metadata/__init__.py
@@ -2,6 +2,7 @@
 Module for parsing sql queries and returning columns,
 tables, names of with statements etc.
 """
+
 # pylint:disable=unsubscriptable-object
 from sql_metadata.parser import Parser
 from sql_metadata.keywords_lists import QueryType

diff --git a/sql_metadata/compat.py b/sql_metadata/compat.py
@@ -11,6 +11,7 @@
 from sql_metadata.compat import get_query_columns, get_query_tables
 
 """
+
 # pylint:disable=missing-function-docstring
 from typing import List, Optional, Tuple
 

diff --git a/sql_metadata/generalizator.py b/sql_metadata/generalizator.py
@@ -1,6 +1,7 @@
 """
 Module used to produce generalized sql out of given query
 """
+
 import re
 import sqlparse
 
@@ -47,7 +48,8 @@ def without_comments(self) -> str:
         :rtype: str
         """
         sql = sqlparse.format(self._raw_query, strip_comments=True)
-        sql = re.sub(r"\s{2,}", " ", sql)
+        sql = sql.replace("\n", " ")
+        sql = re.sub(r"[ \t]+", " ", sql)
         return sql
 
     @property

diff --git a/sql_metadata/keywords_lists.py b/sql_metadata/keywords_lists.py
@@ -9,6 +9,7 @@
 KEYWORDS_BEFORE_COLUMNS = {
     "SELECT",
     "WHERE",
+    "HAVING",
     "ORDERBY",
     "GROUPBY",
     "ON",
@@ -28,6 +29,7 @@
     "RIGHTJOIN",
     "LEFTOUTERJOIN",
     "RIGHTOUTERJOIN",
+    "NATURALJOIN",
     "INTO",
     "UPDATE",
     "TABLE",
@@ -48,13 +50,15 @@
     "RIGHTJOIN",
     "LEFTOUTERJOIN",
     "RIGHTOUTERJOIN",
+    "NATURALJOIN",
 }
 
 # section of a query in which column can exists
 # based on last normalized keyword
 COLUMNS_SECTIONS = {
     "SELECT": "select",
     "WHERE": "where",
+    "HAVING": "having",
     "ORDERBY": "order_by",
     "ON": "join",
     "USING": "join",
@@ -77,6 +81,7 @@ class QueryType(str, Enum):
     CREATE = "CREATE TABLE"
     ALTER = "ALTER TABLE"
     DROP = "DROP TABLE"
+    TRUNCATE = "TRUNCATE TABLE"
 
 
 class TokenType(str, Enum):
@@ -104,6 +109,8 @@ class TokenType(str, Enum):
     "CREATETABLE": QueryType.CREATE,
     "ALTERTABLE": QueryType.ALTER,
     "DROPTABLE": QueryType.DROP,
+    "CREATEFUNCTION": QueryType.CREATE,
+    "TRUNCATETABLE": QueryType.TRUNCATE,
 }
 
 # all the keywords we care for - rest is ignored in assigning

diff --git a/sql_metadata/parser.py b/sql_metadata/parser.py
@@ -30,8 +30,9 @@ class Parser:  # pylint: disable=R0902
     Main class to parse sql query
     """
 
-    def __init__(self, sql: str = "") -> None:
+    def __init__(self, sql: str = "", disable_logging: bool = False) -> None:
         self._logger = logging.getLogger(self.__class__.__name__)
+        self._logger.disabled = disable_logging
 
         self._raw_query = sql
         self._query = self._preprocess_query()
@@ -66,6 +67,7 @@ def __init__(self, sql: str = "") -> None:
         self._nested_level = 0
         self._parenthesis_level = 0
         self._open_parentheses: List[SQLToken] = []
+        self._preceded_keywords: List[SQLToken] = []
         self._aliases_to_check = None
         self._is_in_nested_function = False
         self._is_in_with_block = False
@@ -112,7 +114,9 @@ def query_type(self) -> str:
             )
             .position
         )
-        if tokens[index].normalized in ["CREATE", "ALTER", "DROP"]:
+        if tokens[index].normalized == "CREATE":
+            switch = self._get_switch_by_create_query(tokens, index)
+        elif tokens[index].normalized in ("ALTER", "DROP", "TRUNCATE"):
             switch = tokens[index].normalized + tokens[index + 1].normalized
         else:
             switch = tokens[index].normalized
@@ -123,7 +127,7 @@ def query_type(self) -> str:
         return self._query_type
 
     @property
-    def tokens(self) -> List[SQLToken]:
+    def tokens(self) -> List[SQLToken]:  # noqa: C901
         """
         Tokenizes the query
         """
@@ -163,6 +167,8 @@ def tokens(self) -> List[SQLToken]:
             elif token.is_right_parenthesis:
                 token.token_type = TokenType.PARENTHESIS
                 self._determine_closing_parenthesis_type(token=token)
+                if token.is_subquery_end:
+                    last_keyword = self._preceded_keywords.pop()
 
             last_keyword = self._determine_last_relevant_keyword(
                 token=token, last_keyword=last_keyword
@@ -356,6 +362,14 @@ def tables(self) -> List[str]:
                     )
                 ):
                     continue
+
+                # handle INSERT INTO ON DUPLICATE KEY UPDATE queries
+                if (
+                    token.last_keyword_normalized == "UPDATE"
+                    and self.query_type == "INSERT"
+                ):
+                    continue
+
                 table_name = str(token.value.strip("`"))
                 token.token_type = TokenType.TABLE
                 tables.append(table_name)
@@ -381,8 +395,12 @@ def limit_and_offset(self) -> Optional[Tuple[int, int]]:
                 elif token.last_keyword_normalized == "OFFSET":
                     # OFFSET <offset>
                     offset = int(token.value)
-                elif token.previous_token.is_punctuation:
+                elif (
+                    token.previous_token.is_punctuation
+                    and token.last_keyword_normalized == "LIMIT"
+                ):
                     # LIMIT <offset>,<limit>
+                    # this branch is taken only once the limit has already been parsed
                     offset = limit
                     limit = int(token.value)
 
@@ -447,7 +465,8 @@ def with_names(self) -> List[str]:
                         while token.next_token and not token.is_with_query_end:
                             token = token.next_token
                         is_end_of_with_block = (
-                            token.next_token_not_comment.normalized
+                            token.next_token_not_comment is None
+                            or token.next_token_not_comment.normalized
                             in WITH_ENDING_KEYWORDS
                         )
                         if is_end_of_with_block:
@@ -488,7 +507,7 @@ def with_queries(self) -> Dict[str, str]:
                 True, value_attribute="is_with_query_end", direction="right"
             )
             query_token = with_start.next_token
-            while query_token != with_end:
+            while query_token is not None and query_token != with_end:
                 current_with_query.append(query_token)
                 query_token = query_token.next_token
             with_query_text = "".join([x.stringified_token for x in current_with_query])
@@ -517,12 +536,16 @@ def subqueries(self) -> Dict:
                 ):
                     current_subquery.append(inner_token)
                     inner_token = inner_token.next_token
+
+                query_name = None
                 if inner_token.next_token.value in self.subqueries_names:
                     query_name = inner_token.next_token.value
-                else:
+                elif inner_token.next_token.is_as_keyword:
                     query_name = inner_token.next_token.next_token.value
+
                 subquery_text = "".join([x.stringified_token for x in current_subquery])
-                subqueries[query_name] = subquery_text
+                if query_name is not None:
+                    subqueries[query_name] = subquery_text
 
             token = token.next_token
 
@@ -606,7 +629,7 @@ def without_comments(self) -> str:
         """
         Removes comments from SQL query
         """
-        return Generalizator(self.query).without_comments
+        return Generalizator(self._raw_query).without_comments
 
     @property
     def generalize(self) -> str:
@@ -647,6 +670,10 @@ def _handle_with_name_save(token: SQLToken, with_names: List[str]) -> None:
             token.is_with_columns_end = True
             token.is_nested_function_end = False
             start_token = token.find_nearest_token("(")
+            # a malformed query such as: with (col1, col2) as (subquery) as ...
+            # would loop forever below, so reject it by raising an exception
+            if start_token.is_with_query_start:
+                raise ValueError("This query is wrong")
             start_token.is_with_columns_start = True
             start_token.is_nested_function_start = False
             prev_token = start_token.previous_token
@@ -847,12 +874,13 @@ def _determine_opening_parenthesis_type(self, token: SQLToken):
             # inside subquery / derived table
             token.is_subquery_start = True
             self._subquery_level += 1
+            self._preceded_keywords.append(token.last_keyword_normalized)
             token.subquery_level = self._subquery_level
         elif token.previous_token.normalized in KEYWORDS_BEFORE_COLUMNS.union({","}):
             # we are in columns and in a column subquery definition
             token.is_column_definition_start = True
         elif (
-            token.previous_token.is_as_keyword
+            token.previous_token_not_comment.is_as_keyword
             and token.last_keyword_normalized != "WINDOW"
         ):
             # window clause also contains AS keyword, but it is not a query
@@ -955,12 +983,13 @@ def replace_back_quotes_in_string(match):
         # as double quotes are not properly handled in sqlparse
         query = re.sub(r"'.*?'", replace_quotes_in_string, self._raw_query)
         query = re.sub(r'"([^`]+?)"', r"`\1`", query)
-        query = re.sub(r'"([^`]+?)"\."([^`]+?)"', r"`\1`.`\2`", query)
         query = re.sub(r"'.*?'", replace_back_quotes_in_string, query)
 
         return query
 
     def _determine_last_relevant_keyword(self, token: SQLToken, last_keyword: str):
+        if token.value == "," and token.last_keyword_normalized == "ON":
+            return "FROM"
         if token.is_keyword and "".join(token.normalized.split()) in RELEVANT_KEYWORDS:
             if (
                 not (
@@ -1056,3 +1085,19 @@ def _flatten_sqlparse(self):
                             yield tok
             else:
                 yield token
+
+    @staticmethod
+    def _get_switch_by_create_query(tokens: List[SQLToken], index: int) -> str:
+        """
+        Build the switch (query type lookup key) for a query starting with CREATE.
+        """
+        switch = tokens[index].normalized + tokens[index + 1].normalized
+
+        # Hive CREATE FUNCTION: FUNCTION is the 1st or 2nd token after CREATE
+        if any(
+            index + i < len(tokens) and tokens[index + i].normalized == "FUNCTION"
+            for i in (1, 2)
+        ):
+            switch = "CREATEFUNCTION"
+
+        return switch
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ @@
     from sql_metadata.compat import get_query_columns, get_query_tables
     """
     # pylint:disable=missing-function-docstring
     from typing import List, Optional, Tuple
@@ Expand Down @@