Skip to content

Commit

Permalink
Merge branch 'master' into fix/column-name-wildcard-prefix-table-name
Browse files Browse the repository at this point in the history
  • Loading branch information
macbre authored Sep 11, 2024
2 parents 4496724 + c2849ea commit 7ea22c1
Show file tree
Hide file tree
Showing 23 changed files with 818 additions and 540 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/auto-merge-dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
steps:
- name: Dependabot metadata
id: metadata
uses: dependabot/fetch-metadata@v1.3.3
uses: dependabot/fetch-metadata@v2.2.0
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"

Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: 3.x

- name: Install black
run: |
Expand Down
15 changes: 9 additions & 6 deletions .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,26 @@ jobs:
runs-on: ubuntu-latest

strategy:
fail-fast: false
matrix:
# https://github.com/actions/python-versions/blob/main/versions-manifest.json
# https://devguide.python.org/versions/#supported-versions
python-version:
- "3.6"
- "3.7"
- "3.8"
- "3.9"
- "3.10"
- "3.11"
- "3.12"

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Load ~/.cache directory and Poetry .venv
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: |
~/.cache
Expand All @@ -45,8 +47,9 @@ jobs:

# https://github.com/marketplace/actions/install-poetry-action
- name: Install Poetry
uses: snok/install-poetry@v1.3
uses: snok/install-poetry@v1.4.1
with:
version: latest
virtualenvs-create: true
virtualenvs-in-project: true

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.x'

- name: Install Poetry
uses: snok/install-poetry@v1.3
uses: snok/install-poetry@v1.4.1
with:
virtualenvs-create: true
virtualenvs-in-project: true
Expand Down
893 changes: 389 additions & 504 deletions poetry.lock

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sql_metadata"
version = "2.6.0"
version = "2.13.0"
license="MIT"
description = "Uses tokenized query returned by python-sqlparse and generates query metadata"
authors = ["Maciej Brencz <[email protected]>", "Radosław Drążkiewicz <[email protected]>"]
Expand All @@ -13,17 +13,17 @@ packages = [
]

[tool.poetry.dependencies]
python = "^3.6.2"
sqlparse = "^0.4.1"
python = "^3.8"
sqlparse = ">=0.4.1,<0.6.0"

[tool.poetry.dev-dependencies]
black = "^22.6"
coverage = {extras = ["toml"], version = "^6.2"}
pylint = "^2.13.9"
pytest = "^7.0.1"
pytest-cov = "^3.0.0"
black = "^24.8"
coverage = {extras = ["toml"], version = "^6.5"}
pylint = "^3.2.7"
pytest = "^8.3.3"
pytest-cov = "^5.0.0"
coveralls = "^3.3.1"
flake8 = "^4.0.1"
flake8 = "^5.0.4"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
1 change: 1 addition & 0 deletions sql_metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Module for parsing sql queries and returning columns,
tables, names of with statements etc.
"""

# pylint:disable=unsubscriptable-object
from sql_metadata.parser import Parser
from sql_metadata.keywords_lists import QueryType
Expand Down
1 change: 1 addition & 0 deletions sql_metadata/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sql_metadata.compat import get_query_columns, get_query_tables
"""

# pylint:disable=missing-function-docstring
from typing import List, Optional, Tuple

Expand Down
4 changes: 3 additions & 1 deletion sql_metadata/generalizator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Module used to produce generalized sql out of given query
"""

import re
import sqlparse

Expand Down Expand Up @@ -47,7 +48,8 @@ def without_comments(self) -> str:
:rtype: str
"""
sql = sqlparse.format(self._raw_query, strip_comments=True)
sql = re.sub(r"\s{2,}", " ", sql)
sql = sql.replace("\n", " ")
sql = re.sub(r"[ \t]+", " ", sql)
return sql

@property
Expand Down
7 changes: 7 additions & 0 deletions sql_metadata/keywords_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
KEYWORDS_BEFORE_COLUMNS = {
"SELECT",
"WHERE",
"HAVING",
"ORDERBY",
"GROUPBY",
"ON",
Expand All @@ -28,6 +29,7 @@
"RIGHTJOIN",
"LEFTOUTERJOIN",
"RIGHTOUTERJOIN",
"NATURALJOIN",
"INTO",
"UPDATE",
"TABLE",
Expand All @@ -48,13 +50,15 @@
"RIGHTJOIN",
"LEFTOUTERJOIN",
"RIGHTOUTERJOIN",
"NATURALJOIN",
}

# section of a query in which a column can exist
# based on last normalized keyword
COLUMNS_SECTIONS = {
"SELECT": "select",
"WHERE": "where",
"HAVING": "having",
"ORDERBY": "order_by",
"ON": "join",
"USING": "join",
Expand All @@ -77,6 +81,7 @@ class QueryType(str, Enum):
CREATE = "CREATE TABLE"
ALTER = "ALTER TABLE"
DROP = "DROP TABLE"
TRUNCATE = "TRUNCATE TABLE"


class TokenType(str, Enum):
Expand Down Expand Up @@ -104,6 +109,8 @@ class TokenType(str, Enum):
"CREATETABLE": QueryType.CREATE,
"ALTERTABLE": QueryType.ALTER,
"DROPTABLE": QueryType.DROP,
"CREATEFUNCTION": QueryType.CREATE,
"TRUNCATETABLE": QueryType.TRUNCATE,
}

# all the keywords we care for - rest is ignored in assigning
Expand Down
67 changes: 56 additions & 11 deletions sql_metadata/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ class Parser: # pylint: disable=R0902
Main class to parse sql query
"""

def __init__(self, sql: str = "") -> None:
def __init__(self, sql: str = "", disable_logging: bool = False) -> None:
self._logger = logging.getLogger(self.__class__.__name__)
self._logger.disabled = disable_logging

self._raw_query = sql
self._query = self._preprocess_query()
Expand Down Expand Up @@ -66,6 +67,7 @@ def __init__(self, sql: str = "") -> None:
self._nested_level = 0
self._parenthesis_level = 0
self._open_parentheses: List[SQLToken] = []
self._preceded_keywords: List[SQLToken] = []
self._aliases_to_check = None
self._is_in_nested_function = False
self._is_in_with_block = False
Expand Down Expand Up @@ -112,7 +114,9 @@ def query_type(self) -> str:
)
.position
)
if tokens[index].normalized in ["CREATE", "ALTER", "DROP"]:
if tokens[index].normalized == "CREATE":
switch = self._get_switch_by_create_query(tokens, index)
elif tokens[index].normalized in ("ALTER", "DROP", "TRUNCATE"):
switch = tokens[index].normalized + tokens[index + 1].normalized
else:
switch = tokens[index].normalized
Expand All @@ -123,7 +127,7 @@ def query_type(self) -> str:
return self._query_type

@property
def tokens(self) -> List[SQLToken]:
def tokens(self) -> List[SQLToken]: # noqa: C901
"""
Tokenizes the query
"""
Expand Down Expand Up @@ -163,6 +167,8 @@ def tokens(self) -> List[SQLToken]:
elif token.is_right_parenthesis:
token.token_type = TokenType.PARENTHESIS
self._determine_closing_parenthesis_type(token=token)
if token.is_subquery_end:
last_keyword = self._preceded_keywords.pop()

last_keyword = self._determine_last_relevant_keyword(
token=token, last_keyword=last_keyword
Expand Down Expand Up @@ -356,6 +362,14 @@ def tables(self) -> List[str]:
)
):
continue

# handle INSERT INTO ON DUPLICATE KEY UPDATE queries
if (
token.last_keyword_normalized == "UPDATE"
and self.query_type == "INSERT"
):
continue

table_name = str(token.value.strip("`"))
token.token_type = TokenType.TABLE
tables.append(table_name)
Expand All @@ -381,8 +395,12 @@ def limit_and_offset(self) -> Optional[Tuple[int, int]]:
elif token.last_keyword_normalized == "OFFSET":
# OFFSET <offset>
offset = int(token.value)
elif token.previous_token.is_punctuation:
elif (
token.previous_token.is_punctuation
and token.last_keyword_normalized == "LIMIT"
):
# LIMIT <offset>,<limit>
# enter this condition only when the limit has already been parsed
offset = limit
limit = int(token.value)

Expand Down Expand Up @@ -447,7 +465,8 @@ def with_names(self) -> List[str]:
while token.next_token and not token.is_with_query_end:
token = token.next_token
is_end_of_with_block = (
token.next_token_not_comment.normalized
token.next_token_not_comment is None
or token.next_token_not_comment.normalized
in WITH_ENDING_KEYWORDS
)
if is_end_of_with_block:
Expand Down Expand Up @@ -488,7 +507,7 @@ def with_queries(self) -> Dict[str, str]:
True, value_attribute="is_with_query_end", direction="right"
)
query_token = with_start.next_token
while query_token != with_end:
while query_token is not None and query_token != with_end:
current_with_query.append(query_token)
query_token = query_token.next_token
with_query_text = "".join([x.stringified_token for x in current_with_query])
Expand Down Expand Up @@ -517,12 +536,16 @@ def subqueries(self) -> Dict:
):
current_subquery.append(inner_token)
inner_token = inner_token.next_token

query_name = None
if inner_token.next_token.value in self.subqueries_names:
query_name = inner_token.next_token.value
else:
elif inner_token.next_token.is_as_keyword:
query_name = inner_token.next_token.next_token.value

subquery_text = "".join([x.stringified_token for x in current_subquery])
subqueries[query_name] = subquery_text
if query_name is not None:
subqueries[query_name] = subquery_text

token = token.next_token

Expand Down Expand Up @@ -606,7 +629,7 @@ def without_comments(self) -> str:
"""
Removes comments from SQL query
"""
return Generalizator(self.query).without_comments
return Generalizator(self._raw_query).without_comments

@property
def generalize(self) -> str:
Expand Down Expand Up @@ -647,6 +670,10 @@ def _handle_with_name_save(token: SQLToken, with_names: List[str]) -> None:
token.is_with_columns_end = True
token.is_nested_function_end = False
start_token = token.find_nearest_token("(")
# A malformed query such as: with (col1, col2) as (subquery) as ...
# would otherwise enter an infinite loop, so raise an exception instead.
if start_token.is_with_query_start:
raise ValueError("This query is wrong")
start_token.is_with_columns_start = True
start_token.is_nested_function_start = False
prev_token = start_token.previous_token
Expand Down Expand Up @@ -847,12 +874,13 @@ def _determine_opening_parenthesis_type(self, token: SQLToken):
# inside subquery / derived table
token.is_subquery_start = True
self._subquery_level += 1
self._preceded_keywords.append(token.last_keyword_normalized)
token.subquery_level = self._subquery_level
elif token.previous_token.normalized in KEYWORDS_BEFORE_COLUMNS.union({","}):
# we are in columns and in a column subquery definition
token.is_column_definition_start = True
elif (
token.previous_token.is_as_keyword
token.previous_token_not_comment.is_as_keyword
and token.last_keyword_normalized != "WINDOW"
):
# window clause also contains AS keyword, but it is not a query
Expand Down Expand Up @@ -955,12 +983,13 @@ def replace_back_quotes_in_string(match):
# as double quotes are not properly handled in sqlparse
query = re.sub(r"'.*?'", replace_quotes_in_string, self._raw_query)
query = re.sub(r'"([^`]+?)"', r"`\1`", query)
query = re.sub(r'"([^`]+?)"\."([^`]+?)"', r"`\1`.`\2`", query)
query = re.sub(r"'.*?'", replace_back_quotes_in_string, query)

return query

def _determine_last_relevant_keyword(self, token: SQLToken, last_keyword: str):
if token.value == "," and token.last_keyword_normalized == "ON":
return "FROM"
if token.is_keyword and "".join(token.normalized.split()) in RELEVANT_KEYWORDS:
if (
not (
Expand Down Expand Up @@ -1056,3 +1085,19 @@ def _flatten_sqlparse(self):
yield tok
else:
yield token

@staticmethod
def _get_switch_by_create_query(tokens: List[SQLToken], index: int) -> str:
    """
    Build the lookup key ("switch") that identifies the kind of CREATE query.

    The key is the normalized CREATE token concatenated with the normalized
    token that directly follows it. When ``FUNCTION`` appears one or two
    tokens after CREATE, the key is ``CREATEFUNCTION`` instead.

    :param tokens: tokens of the parsed query
    :param index: position of the CREATE token within ``tokens``
    :return: concatenated key, or the bare normalized CREATE token when
        nothing follows it
    :rtype: str
    """
    next_position = index + 1
    if next_position >= len(tokens):
        # CREATE is the last token: there is nothing to concatenate, so hand
        # the bare keyword back and let the caller deal with it instead of
        # failing here with an IndexError.
        return tokens[index].normalized

    switch = tokens[index].normalized + tokens[next_position].normalized

    # Hive CREATE FUNCTION - FUNCTION may be the first or the second token
    # after CREATE, hence both offsets are inspected (with bounds checks).
    if any(
        index + i < len(tokens) and tokens[index + i].normalized == "FUNCTION"
        for i in (1, 2)
    ):
        switch = "CREATEFUNCTION"

    return switch
Loading

0 comments on commit 7ea22c1

Please sign in to comment.