diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 27645fa7..255cd7a5 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -60,7 +60,7 @@ jobs: run: poetry run pytest -vv --cov=sql_metadata --cov-report=term - name: Lint with pylint - run: poetry run pylint sql_metadata.py + run: poetry run pylint sql_metadata - name: Build a distribution package run: poetry build -vvv diff --git a/Makefile b/Makefile index 73f8fe5c..cfcf1929 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ coverage: poetry run pytest -vv --cov=sql_metadata --cov-report=term lint: - poetry run pylint sql_metadata.py + poetry run pylint sql_metadata publish: # run git tag -a v0.0.0 before running make publish diff --git a/README.md b/README.md index 2e6f4f15..7dfa5d7f 100644 --- a/README.md +++ b/README.md @@ -21,48 +21,189 @@ Supported queries syntax: pip install sql-metadata ``` +### Extracting raw sql-metadata tokens + +```python +from sql_metadata import Parser + +# extract raw sql-metadata tokens +Parser("SELECT * FROM foo").tokens +# ['SELECT', '*', 'FROM', 'foo'] +``` + +### Extracting columns from query + ```python ->>> import sql_metadata +from sql_metadata import Parser + +# get columns from query - for more examples see `tests/test_getting_columns.py` +Parser("SELECT test, id FROM foo, bar").columns +# ['test', 'id'] + +Parser("INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,`time`) VALUES ('442001','27574631','20180228130846')").columns +# ['article_id', 'user_id', 'time'] ->>> sql_metadata.get_query_tokens("SELECT * FROM foo") -[, , , ] +parser = Parser("SELECT a.* FROM product_a.users AS a JOIN product_b.users AS b ON a.ip_address = b.ip_address") ->>> sql_metadata.get_query_columns("SELECT test, id FROM foo, bar") -[u'test', u'id'] +# note that aliases are auto-resolved +parser.columns +# ['product_a.users.*', 'product_a.users.ip_address', 'product_b.users.ip_address'] ->>> 
sql_metadata.get_query_tables("SELECT a.* FROM product_a.users AS a JOIN product_b.users AS b ON a.ip_address = b.ip_address") -['product_a.users', 'product_b.users'] +# note that you can also extract columns with their place in the query +# which will return dict with lists divided into select, where, order_by, join, insert and update +parser.columns_dict +# {'select': ['product_a.users.*'], 'join': ['product_a.users.ip_address', 'product_b.users.ip_address']} +``` + +### Extracting tables from query ->>> sql_metadata.get_query_columns("INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,`time`) VALUES ('442001','27574631','20180228130846')") -['article_id', 'user_id', 'time'] +```python +from sql_metadata import Parser ->>> sql_metadata.get_query_columns("SELECT a.* FROM product_a.users AS a JOIN product_b.users AS b ON a.ip_address = b.ip_address") -['a.*', 'a.ip_address', 'b.ip_address'] +# get tables from query - for more examples see `tests/test_getting_tables.py` +Parser("SELECT a.* FROM product_a.users AS a JOIN product_b.users AS b ON a.ip_address = b.ip_address").tables +# ['product_a.users', 'product_b.users'] ->>> sql_metadata.get_query_tables("SELECT test, id FROM foo, bar") -[u'foo', u'bar'] +Parser("SELECT test, id FROM foo, bar").tables +# ['foo', 'bar'] ->>> sql_metadata.get_query_limit_and_offset('SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 1000') -(50, 1000) +# you can also extract aliases of the tables as a dictionary +parser = Parser("SELECT f.test FROM foo AS f") ->>> sql_metadata.get_query_limit_and_offset('SELECT foo_limit FROM bar_offset limit 2000,50') -(50, 2000) +# get table aliases +parser.tables_aliases +# {'f': 'foo'} ->>> sql_metadata.get_query_table_aliases("SELECT test FROM foo AS f") -{'f': 'foo'} +# note that aliases are auto-resolved for columns +parser.columns +# ["foo.test"] ``` -> See `test/test_query.py` file for more examples of a bit more complex queries. 
+### Extracting values from query +```python +from sql_metadata import Parser + +parser = Parser( + "INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,`time`) " + "VALUES ('442001','27574631','20180228130846')" +) +# extract values from query +parser.values +# ["442001", "27574631", "20180228130846"] + +# extract a dictionary with column-value pairs +parser.values_dict +#{"article_id": "442001", "user_id": "27574631", "time": "20180228130846"} + +# if column names are not set auto-add placeholders +parser = Parser( + "INSERT IGNORE INTO `table` VALUES (9, 2.15, '123', '2017-01-01');" +) +parser.values +# [9, 2.15, "123", "2017-01-01"] + +parser.values_dict +#{"column_1": 9, "column_2": 2.15, "column_3": "123", "column_4": "2017-01-01"} +``` -### Queries normalization +### Extracting limit and offset ```python ->>> from sql_metadata import generalize_sql ->>> generalize_sql('SELECT /* Test */ foo FROM bar WHERE id in (1, 2, 56)') -'SELECT foo FROM bar WHERE id in (XYZ)' +from sql_metadata import Parser + +Parser('SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 1000').limit_and_offset +# (50, 1000) + +Parser('SELECT foo_limit FROM bar_offset limit 2000,50').limit_and_offset +# (50, 2000) +``` + +### Extracting with names + +```python +from sql_metadata import Parser + +parser = Parser( + """ +WITH + database1.tableFromWith AS (SELECT aa.* FROM table3 as aa + left join table4 on aa.col1=table4.col2), + test as (select * from table3) +SELECT + "xxxxx" +FROM + database1.tableFromWith alias +LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") +""" +) + +# get names/ aliases of with statements +parser.with_names +# ["database1.tableFromWith", "test"] + +# note that names of with statements do not appear in tables +parser.tables +# ["table3", "table4", "database2.table2"] +``` + +### Extracting sub-queries + +```python +from sql_metadata import Parser + +parser = Parser( +""" +SELECT COUNT(1) FROM +(SELECT std.task_id FROM 
some_task_detail std WHERE std.STATUS = 1) a +JOIN (SELECT st.task_id FROM some_task st WHERE task_type_id = 80) b +ON a.task_id = b.task_id; +""" +) + +# get sub-queries dictionary +parser.subqueries +# {"a": "SELECT std.task_id FROM some_task_detail std WHERE std.STATUS = 1", +# "b": "SELECT st.task_id FROM some_task st WHERE task_type_id = 80"} + + +# get names/ aliases of sub-queries / derived tables +parser.subqueries_names +# ["a", "b"] + +# note that you can also exclude columns coming from sub-queries +# all columns +parser.columns +#["some_task_detail.task_id", "some_task_detail.STATUS", "some_task.task_id", +# "task_type_id", "a.task_id", "b.task_id"] + +# without subqueries +parser.columns_without_subqueries +#["some_task_detail.task_id", "some_task_detail.STATUS", "some_task.task_id", +# "task_type_id"] +``` + +See the `tests` directory for more examples of more complex queries. + +### Queries normalization and comments extraction + +```python +from sql_metadata import Parser +parser = Parser('SELECT /* Test */ foo FROM bar WHERE id in (1, 2, 56)') + +# generalize query +parser.generalize +# 'SELECT foo FROM bar WHERE id in (XYZ)' + +# remove comments +parser.without_comments +# 'SELECT foo FROM bar WHERE id in (1, 2, 56)' + +# extract comments +parser.comments +# ['/* Test */'] ``` -> See `test/test_normalization.py` file for more examples of a bit more complex queries. +See the `test/test_normalization.py` file for more examples of more complex queries. 
## Stargazers over time diff --git a/pyproject.toml b/pyproject.toml index 0323b1e7..6346a428 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ homepage = "https://github.com/macbre/sql-metadata" repository = "https://github.com/macbre/sql-metadata" packages = [ - { include="sql_metadata.py" } + { include="sql_metadata" } ] [tool.poetry.dependencies] diff --git a/sql_metadata.py b/sql_metadata.py deleted file mode 100644 index 4a754ed4..00000000 --- a/sql_metadata.py +++ /dev/null @@ -1,468 +0,0 @@ -""" -This module provides SQL query parsing functions -""" -# pylint:disable=unsubscriptable-object -import re -from typing import List, Tuple, Optional, Dict - -import sqlparse - -from sqlparse.sql import TokenList -from sqlparse.tokens import Name, Whitespace, Wildcard, Number, Punctuation - - -def unique(_list: List) -> List: - """ - Makes the list have unique items only and maintains the order - - list(set()) won't provide that - - :type _list list - :rtype: list - """ - ret = [] - - for item in _list: - if item not in ret: - ret.append(item) - - return ret - - -def preprocess_query(query: str) -> str: - """ - Perform initial query cleanup - - :type query str - :rtype str - """ - # 0. remove newlines - query = query.replace("\n", " ") - - # 1. remove aliases - # FROM `dimension_wikis` `dw` - # INNER JOIN `fact_wam_scores` `fwN` - query = re.sub( - r"(\s(FROM|JOIN)\s`[^`]+`)\s`[^`]+`", r"\1", query, flags=re.IGNORECASE - ) - - # 2. `database`.`table` notation -> database.table - query = re.sub(r"`([^`]+)`\.`([^`]+)`", r"\1.\2", query) - - # 2. 
database.table notation -> table - # query = re.sub(r'([a-z_0-9]+)\.([a-z_0-9]+)', r'\2', query, flags=re.IGNORECASE) - - return query - - -def get_query_tokens(query: str) -> List[sqlparse.sql.Token]: - """ - :type query str - :rtype: list[sqlparse.sql.Token] - """ - query = preprocess_query(query) - parsed = sqlparse.parse(query) - - # handle empty queries (#12) - if not parsed: - return [] - - tokens = TokenList(parsed[0].tokens).flatten() - # print([(token.value, token.ttype) for token in tokens]) - - return [token for token in tokens if token.ttype is not Whitespace] - - -def get_query_columns(query: str) -> List[str]: - """ - :type query str - :rtype: list[str] - """ - columns = [] - last_keyword = None - last_token = None - - # print(preprocess_query(query)) - - # these keywords should not change the state of a parser - # and not "reset" previously found SELECT keyword - keywords_ignored = [ - "AS", - "AND", - "OR", - "IN", - "IS", - "NULL", - "NOT", - "NOT NULL", - "LIKE", - "CASE", - "WHEN", - "DISTINCT", - "UNIQUE", - ] - - # these keywords are followed by columns reference - keywords_before_columns = ["SELECT", "WHERE", "ORDER BY", "ON"] - - # these function should be ignored - # and not "reset" previously found SELECT keyword - functions_ignored = [ - "COUNT", - "MIN", - "MAX", - "FROM_UNIXTIME", - "DATE_FORMAT", - "CAST", - "CONVERT", - ] - - tables_aliases = get_query_table_aliases(query) - - def resolve_table_alias(_table_name: str) -> str: - """ - Resolve aliases, e.g. SELECT bar.column FROM foo AS bar - """ - if _table_name in tables_aliases: - return tables_aliases[_table_name] - return _table_name - - for token in get_query_tokens(query): - if token.is_keyword and token.value.upper() not in keywords_ignored: - # keep the name of the last keyword, e.g. 
SELECT, FROM, WHERE, (ORDER) BY - last_keyword = token.value.upper() - # print('keyword', last_keyword) - elif token.ttype is Name: - # analyze the name tokens, column names and where condition values - if ( - last_keyword in keywords_before_columns - and last_token.value.upper() not in ["AS"] - ): - if token.value.upper() not in functions_ignored: - if str(last_token) == ".": - # print('DOT', last_token, columns[-1]) - - # we have table.column notation example - # append column name to the last entry of columns - # as it is a table name in fact - table_name = resolve_table_alias(columns[-1]) - - columns[-1] = "{}.{}".format(table_name, token) - else: - columns.append(str(token.value)) - elif last_keyword in ["INTO"] and last_token.ttype is Punctuation: - # INSERT INTO `foo` (col1, `col2`) VALUES (..) - # print(last_keyword, token, last_token) - columns.append(str(token.value).strip("`")) - elif token.ttype is Wildcard: - # handle * wildcard in SELECT part, but ignore count(*) - # print(last_keyword, last_token, token.value) - if last_keyword == "SELECT" and last_token.value != "(": - - if str(last_token) == ".": - # handle SELECT foo.* - table_name = resolve_table_alias(columns[-1]) - columns[-1] = "{}.{}".format(table_name, str(token)) - else: - columns.append(str(token.value)) - - last_token = token - - return unique(columns) - - -def _get_token_normalized_value(token: sqlparse.sql.Token) -> str: - return token.value.translate(str.maketrans("", "", " \n\t\r")).upper() - - -def _update_table_names( - tables: List[str], tokens: List[sqlparse.sql.Token], index: int, last_keyword: str -) -> List[str]: - """ - Return new table names matching database.table or database.schema.table notation - - :type tables list[str] - :type tokens list[sqlparse.sql.Token] - :type index int - :type last_keyword str - :rtype: list[str] - """ - - token = tokens[index] - last_token = tokens[index - 1].value.upper() if index > 0 else None - next_token = tokens[index + 1].value.upper() if 
index + 1 < len(tokens) else None - - if ( - last_keyword - in [ - "FROM", - "JOIN", - "INNERJOIN", - "FULLJOIN", - "FULLOUTERJOIN", - "LEFTJOIN", - "RIGHTJOIN", - "LEFTOUTERJOIN", - "RIGHTOUTERJOIN", - "INTO", - "UPDATE", - "TABLE", - ] - and last_token not in ["AS", "WITH"] - and token.value not in ["AS", "SELECT"] - ): - if last_token == "." and next_token != ".": - # we have database.table notation example - table_name = "{}.{}".format(tokens[index - 2], tokens[index]) - if len(tables) > 0: - tables[-1] = table_name - else: - tables.append(table_name) - - schema_notation_match = (Name, ".", Name, ".", Name) - schema_notation_tokens = ( - ( - tokens[index - 4].ttype, - tokens[index - 3].value, - tokens[index - 2].ttype, - tokens[index - 1].value, - tokens[index].ttype, - ) - if len(tokens) > 4 - else None - ) - if schema_notation_tokens == schema_notation_match: - # we have database.schema.table notation example - table_name = "{}.{}.{}".format( - tokens[index - 4], tokens[index - 2], tokens[index] - ) - if len(tables) > 0: - tables[-1] = table_name - else: - tables.append(table_name) - elif _get_token_normalized_value(tokens[index - 1]) not in [",", last_keyword]: - # it's not a list of tables, e.g. SELECT * FROM foo, bar - # hence, it can be the case of alias without AS, e.g. 
SELECT * FROM foo bar - pass - else: - table_name = str(token.value.strip("`")) - tables.append(table_name) - - return tables - - -def get_query_tables(query: str) -> List[str]: - """ - :type query str - :rtype: list[str] - """ - tables = [] - last_keyword = None - - table_syntax_keywords = [ - # SELECT queries - "FROM", - "WHERE", - "JOIN", - "INNERJOIN", - "FULLJOIN", - "FULLOUTERJOIN", - "LEFTOUTERJOIN", - "RIGHTOUTERJOIN", - "LEFTJOIN", - "RIGHTJOIN", - "ON", - "UNION", - "UNIONALL", - # INSERT queries - "INTO", - "VALUES", - # UPDATE queries - "UPDATE", - "SET", - # Hive queries - "TABLE", # INSERT TABLE - ] - - # print(query, get_query_tokens(query)) - query = query.replace('"', "") - tokens = get_query_tokens(query) - - for index, token in enumerate(tokens): - # remove whitespaces from token value and uppercase - token_val_norm = _get_token_normalized_value(token) - - # print([token, token_val_norm, token.ttype, last_keyword]) - - if token.is_keyword and token_val_norm in table_syntax_keywords: - # keep the name of the last keyword, the next one can be a table name - last_keyword = token_val_norm - # print('keyword', last_keyword) - elif str(token) == "(" and last_keyword in ["INTO", "VALUES"]: - # reset the last_keyword for INSERT `foo` VALUES(id, bar) ... - # reset the last_keyword for INSERT `foo` (col1, col2) VALUES(id, bar) ... 
- last_keyword = None - elif token.is_keyword and token_val_norm in ["FORCE", "ORDER", "GROUPBY"]: - # reset the last_keyword for queries like: - # "SELECT x FORCE INDEX" - # "SELECT x ORDER BY" - # "SELECT x FROM y GROUP BY x" - last_keyword = None - elif ( - token.is_keyword - and token_val_norm == "SELECT" - and last_keyword in ["INTO", "TABLE"] - ): - # reset the last_keyword for "INSERT INTO SELECT" and "INSERT TABLE SELECT" queries - last_keyword = None - elif token.ttype is Name or token.is_keyword: - tables = _update_table_names(tables, tokens, index, last_keyword) - - return unique(tables) - - -def get_query_limit_and_offset(query: str) -> Optional[Tuple[int, int]]: - """ - :type query str - :rtype: (int, int) - """ - limit = None - offset = None - last_keyword = None - last_token = None - - # print(query) - for token in get_query_tokens(query): - # print([token, token.ttype, last_keyword]) - - if token.is_keyword and token.value.upper() in ["LIMIT", "OFFSET"]: - last_keyword = token.value.upper() - elif token.ttype is Number.Integer: - # print([token, last_keyword, last_token_was_integer]) - if last_keyword == "LIMIT": - # LIMIT - limit = int(token.value) - last_keyword = None - elif last_keyword == "OFFSET": - # OFFSET - offset = int(token.value) - last_keyword = None - elif last_token and last_token.ttype is Punctuation: - # LIMIT , - offset = limit - limit = int(token.value) - - last_token = token - - if limit is None: - return None - - return limit, offset or 0 - - -def get_query_table_aliases(query: str) -> Dict[str, str]: - """ - Returns tables aliases mapping from a given query - - E.g. 
SELECT a.* FROM users1 AS a JOIN users2 AS b ON a.ip_address = b.ip_address - will give you {'a': 'users1', 'b': 'users2'} - """ - aliases = dict() - last_keyword_token = None - last_table_name = None - - for token in get_query_tokens(query): - # print(token.ttype, token, last_table_name) - - # handle "FROM foo alias" syntax (i.e, "AS" keyword is missing) - # if last_table_name and token.ttype is Name: - # aliases[token.value] = last_table_name - # last_table_name = False - - if last_keyword_token: - if last_keyword_token.value.upper() in ["FROM", "JOIN", "INNER JOIN"]: - last_table_name = token.value - - elif last_table_name and last_keyword_token.value.upper() in ["AS"]: - aliases[token.value] = last_table_name - last_table_name = False - - last_keyword_token = token if token.is_keyword else False - - return aliases - - -# SQL queries normalization (#16) -def normalize_likes(sql: str) -> str: - """ - Normalize and wrap LIKE statements - - :type sql str - :rtype: str - """ - sql = sql.replace("%", "") - - # LIKE '%bot' - sql = re.sub(r"LIKE '[^\']+'", "LIKE X", sql) - - # or all_groups LIKE X or all_groups LIKE X - matches = re.finditer(r"(or|and) [^\s]+ LIKE X", sql, flags=re.IGNORECASE) - matches = [match.group(0) for match in matches] if matches else None - - if matches: - for match in set(matches): - sql = re.sub(r"(\s?" + re.escape(match) + ")+", " " + match + " ...", sql) - - return sql - - -def remove_comments_from_sql(sql: str) -> str: - """ - Removes comments from SQL query - - :type sql str|None - :rtype: str - """ - return re.sub(r"\s?/\*.+\*/", "", sql) - - -def generalize_sql(sql: Optional[str]) -> Optional[str]: - """ - Removes most variables from an SQL query and replaces them with X or N for numbers. - - Based on Mediawiki's DatabaseBase::generalizeSQL - - :type sql str|None - :rtype: str - """ - if sql is None: - return None - - # multiple spaces - sql = re.sub(r"\s{2,}", " ", sql) - - # MW comments - # e.g. 
/* CategoryDataService::getMostVisited N.N.N.N */ - sql = remove_comments_from_sql(sql) - - # handle LIKE statements - sql = normalize_likes(sql) - - sql = re.sub(r"\\\\", "", sql) - sql = re.sub(r"\\'", "", sql) - sql = re.sub(r'\\"', "", sql) - sql = re.sub(r"'[^\']*'", "X", sql) - sql = re.sub(r'"[^\"]*"', "X", sql) - - # All newlines, tabs, etc replaced by single space - sql = re.sub(r"\s+", " ", sql) - - # All numbers => N - sql = re.sub(r"-?[0-9]+", "N", sql) - - # WHERE foo IN ('880987','882618','708228','522330') - sql = re.sub( - r" (IN|VALUES)\s*\([^,]+,[^)]+\)", " \\1 (XYZ)", sql, flags=re.IGNORECASE - ) - - return sql.strip() diff --git a/sql_metadata/__init__.py b/sql_metadata/__init__.py new file mode 100644 index 00000000..03ef1744 --- /dev/null +++ b/sql_metadata/__init__.py @@ -0,0 +1,7 @@ +""" +Module for parsing sql queries and returning columns, tables, names of with statements etc. +""" +# pylint:disable=unsubscriptable-object +from sql_metadata.parser import Parser + +__all__ = ["Parser"] diff --git a/sql_metadata/generalizator.py b/sql_metadata/generalizator.py new file mode 100644 index 00000000..c60ba2fc --- /dev/null +++ b/sql_metadata/generalizator.py @@ -0,0 +1,106 @@ +""" +Module used to produce generalized sql out of given query +""" +import re +from typing import List, Optional + +import sqlparse + + +class Generalizator: + """ + Class used to produce generalized sql out of given query + """ + + def __init__(self, sql: str = ""): + self._raw_query = sql + + # SQL queries normalization (#16) + @staticmethod + def _normalize_likes(sql: str) -> str: + """ + Normalize and wrap LIKE statements + + :type sql str + :rtype: str + """ + sql = sql.replace("%", "") + + # LIKE '%bot' + sql = re.sub(r"LIKE '[^\']+'", "LIKE X", sql) + + # or all_groups LIKE X or all_groups LIKE X + matches = re.finditer(r"(or|and) [^\s]+ LIKE X", sql, flags=re.IGNORECASE) + matches = [match.group(0) for match in matches] if matches else None + + if matches: + for 
match in set(matches): + sql = re.sub( + r"(\s?" + re.escape(match) + ")+", " " + match + " ...", sql + ) + + return sql + + @property + def comments(self) -> List[str]: + """ + Gets comments from SQL query + + :rtype: str + """ + comments = re.findall(r"\s?/\*.+?\*/", self._raw_query) + return [x.strip() for x in comments] + + @property + def without_comments(self) -> str: + """ + Removes comments from SQL query + + :rtype: str + """ + sql = sqlparse.format(self._raw_query, strip_comments=True) + sql = re.sub(r"\s{2,}", " ", sql) + return sql + + @property + def generalize(self) -> Optional[str]: + """ + Removes most variables from an SQL query and replaces them with X or N for numbers. + + Based on Mediawiki's DatabaseBase::generalizeSQL + + :type sql str|None + :rtype: str + """ + if self._raw_query is None: + return None + + # MW comments + # e.g. /* CategoryDataService::getMostVisited N.N.N.N */ + sql = self.without_comments + sql = sql.replace('"', "") + + # multiple spaces + sql = re.sub(r"\s{2,}", " ", sql) + + # handle LIKE statements + sql = self._normalize_likes(sql) + + sql = re.sub(r"\\\\", "", sql) + sql = re.sub(r"\\'", "", sql) + sql = re.sub(r'\\"', "", sql) + sql = re.sub(r"'[^\']*'", "X", sql) + sql = re.sub(r'"[^\"]*"', "X", sql) + + # All newlines, tabs, etc replaced by single space + sql = re.sub(r"\s+", " ", sql) + + # All numbers => N + sql = re.sub(r"-?[0-9]+", "N", sql) + + # WHERE foo IN ('880987','882618','708228','522330') + sql = re.sub( + r" (IN|VALUES)\s*\([^,]+,[^)]+\)", " \\1 (XYZ)", sql, flags=re.IGNORECASE + ) + + return sql.strip() diff --git a/sql_metadata/keywords_lists.py b/sql_metadata/keywords_lists.py new file mode 100644 index 00000000..c8c5b901 --- /dev/null +++ b/sql_metadata/keywords_lists.py @@ -0,0 +1,82 @@ +""" +Module provide lists of sql keywords that should trigger or skip +checks for tables an columns +""" +# these keywords should not change the state of a parser +# and not "reset" previously found SELECT 
keyword +KEYWORDS_IGNORED = [ + "AS", + "AND", + "OR", + "IN", + "IS", + "NULL", + "NOT", + "NOT NULL", + "LIKE", + "CASE", + "WHEN", + "DISTINCT", + "UNIQUE", +] + +# these function should be ignored +# and not "reset" previously found SELECT keyword +FUNCTIONS_IGNORED = [ + "COUNT", + "MIN", + "MAX", + "FROM_UNIXTIME", + "DATE_FORMAT", + "CAST", + "CONVERT", + "YEAR", + "MONTH", + "YEARWEEK", + "DAY", + "AVG", + "SUM", + "IFNULL", + "DATEDIFF", + "DIV", + "MID", + "WEEKDAY", + "NOW", + "LAST_DAY", + "DATE_ADD", + "COALESCE", +] +# these keywords are followed by columns reference +KEYWORDS_BEFORE_COLUMNS = ["SELECT", "WHERE", "ORDERBY", "ON", "SET"] + +# normalized list of table preceding keywords +TABLE_ADJUSTMENT_KEYWORDS = [ + "FROM", + "JOIN", + "INNERJOIN", + "FULLJOIN", + "FULLOUTERJOIN", + "LEFTJOIN", + "RIGHTJOIN", + "LEFTOUTERJOIN", + "RIGHTOUTERJOIN", + "INTO", + "UPDATE", + "TABLE", +] + +# next statement beginning after with statement +WITH_ENDING_KEYWORDS = ["UPDATE", "SELECT", "DELETE", "REPLACE"] + +# subquery preceding keywords +SUBQUERY_PRECEDING_KEYWORDS = [ + "FROM", + "JOIN", + "INNERJOIN", + "FULLJOIN", + "FULLOUTERJOIN", + "LEFTJOIN", + "RIGHTJOIN", + "LEFTOUTERJOIN", + "RIGHTOUTERJOIN", +] diff --git a/sql_metadata/parser.py b/sql_metadata/parser.py new file mode 100644 index 00000000..472ad578 --- /dev/null +++ b/sql_metadata/parser.py @@ -0,0 +1,506 @@ +""" +This module provides SQL query parsing functions +""" +import re +from typing import Dict, List, Optional, Tuple + +import sqlparse +from sqlparse.sql import TokenList +from sqlparse.tokens import Name, Number, Punctuation, Whitespace, Wildcard + +from sql_metadata.generalizator import Generalizator +from sql_metadata.keywords_lists import ( + FUNCTIONS_IGNORED, + KEYWORDS_BEFORE_COLUMNS, + KEYWORDS_IGNORED, + SUBQUERY_PRECEDING_KEYWORDS, + TABLE_ADJUSTMENT_KEYWORDS, + WITH_ENDING_KEYWORDS, +) +from sql_metadata.token import EmptyToken, SQLToken +from sql_metadata.utils import 
UniqueList + + +class Parser: # pylint: disable=R0902 + """ + Main class to parse sql query + """ + + def __init__(self, sql: str = "") -> None: + self._raw_query = sql + self._query = self._preprocess_query() + + self._tokens = None + + self._columns = None + self._columns_dict = None + + self._tables = None + self._table_aliases = None + + self._with_names = None + self._subqueries = None + self._subqueries_names = None + + self._limit_and_offset = None + + self._values = None + self._values_dict = None + + @property + def query(self) -> str: + """ + Returns preprocessed query + """ + return self._query + + @property + def tokens(self) -> List[SQLToken]: + """ + :rtype: list[SQLToken] + """ + if self._tokens is not None: + return self._tokens + + parsed = sqlparse.parse(self.query) + tokens = [] + # handle empty queries (#12) + if not parsed: + return tokens + + sqlparse_tokens = TokenList(parsed[0].tokens).flatten() + non_empty_tokens = [ + token for token in sqlparse_tokens if token.ttype is not Whitespace + ] + last_keyword = None + subquery_level = 0 + open_parenthesises = [] + for index, tok in enumerate(non_empty_tokens): + token = SQLToken( + value=tok.value, + is_keyword=tok.is_keyword, + is_name=tok.ttype is Name, + is_punctuation=tok.ttype is Punctuation, + is_dot=str(tok) == ".", + is_wildcard=tok.ttype is Wildcard, + is_integer=tok.ttype is Number.Integer, + is_float=tok.ttype is Number.Float, + is_left_parenthesis=str(tok) == "(", + is_right_parenthesis=str(tok) == ")", + position=index, + last_keyword=last_keyword, + next_token=EmptyToken, + previous_token=EmptyToken, + subquery_level=subquery_level, + ) + if index > 0: + token.previous_token = tokens[index - 1] + tokens[index - 1].next_token = token + + if ( + token.is_left_parenthesis + and token.previous_token.normalized not in SUBQUERY_PRECEDING_KEYWORDS + ): + token.is_nested_function_start = True + open_parenthesises.append(token) + elif ( + token.is_left_parenthesis + and 
token.previous_token.normalized in SUBQUERY_PRECEDING_KEYWORDS + ): + token.is_subquery_start = True + subquery_level += 1 + token.subquery_level = subquery_level + open_parenthesises.append(token) + elif token.is_right_parenthesis: + last_open_parenthesis = open_parenthesises.pop(-1) + if last_open_parenthesis.is_subquery_start: + token.is_subquery_end = True + subquery_level -= 1 + else: + token.is_nested_function_end = True + + if tok.is_keyword and tok.normalized not in KEYWORDS_IGNORED: + last_keyword = tok.normalized + tokens.append(token) + + self._tokens = tokens + return tokens + + @property + def columns(self) -> List[str]: + """ + :rtype: list[str] + """ + if self._columns is not None: + return self._columns + columns = UniqueList() + tables_aliases = self.tables_aliases + subqueries_names = self.subqueries_names + + for token in self.tokens: + if token.is_name and not token.next_token.is_dot: + # analyze the name tokens, column names and where condition values + if ( + token.last_keyword_normalized in KEYWORDS_BEFORE_COLUMNS + and token.previous_token.normalized != "AS" + ): + if token.normalized not in FUNCTIONS_IGNORED and not ( + # aliases of sub-queries i.e.: select from (...) + token.previous_token.is_right_parenthesis + and token.value in subqueries_names + ): + column = token.table_prefixed_column(tables_aliases) + self._add_to_columns_subsection( + keyword=token.last_keyword_normalized, column=column + ) + columns.append(column) + elif ( + token.last_keyword_normalized == "INTO" + and token.previous_token.is_punctuation + ): + # INSERT INTO `foo` (col1, `col2`) VALUES (..) 
+ column = str(token.value).strip("`") + self._add_to_columns_subsection( + keyword=token.last_keyword_normalized, column=column + ) + columns.append(column) + elif ( + token.is_wildcard + and token.last_keyword_normalized == "SELECT" + and not token.previous_token.is_left_parenthesis + ): + # handle * wildcard in SELECT part, but ignore count(*) + column = token.table_prefixed_column(tables_aliases) + self._add_to_columns_subsection( + keyword=token.last_keyword_normalized, column=column + ) + columns.append(column) + + self._columns = columns + return self._columns + + @property + def columns_without_subqueries(self) -> List: + """ + Returns columns without ones explicitly coming from sub-queries + """ + columns = self.columns + subqueries = self.subqueries_names + return [column for column in columns if column.split(".")[0] not in subqueries] + + @property + def columns_dict(self) -> Dict[str, List[str]]: + """ + Returns dictionary of column names divided into section of the query in which + given column is present. 
+ + Sections consist of: select, where, order_by, join, insert and update + """ + if self._columns_dict: + return self._columns_dict + _ = self.columns + return self._columns_dict + + @property + def tables(self) -> List[str]: + """ + :rtype: list[str] + """ + if self._tables is not None: + return self._tables + tables = UniqueList() + with_names = self.with_names + + for token in self.tokens: + if ( + (token.is_name or token.is_keyword) + and token.last_keyword_normalized in TABLE_ADJUSTMENT_KEYWORDS + and token.previous_token.normalized not in ["AS", "WITH"] + and token.normalized not in ["AS", "SELECT"] + ): + if token.next_token.is_dot: + pass # part of the qualified name + elif token.previous_token.is_dot: + tables.append(token.left_expanded) # full qualified name + elif ( + token.previous_token.normalized != token.last_keyword_normalized + and not token.previous_token.is_punctuation + ) or token.previous_token.is_right_parenthesis: + # it's not a list of tables, e.g. SELECT * FROM foo, bar + # hence, it can be the case of alias without AS, + # e.g. 
SELECT * FROM foo bar + # or an alias of subquery (SELECT * FROM foo) bar + pass + elif ( + token.last_keyword_normalized == "INTO" and token.is_in_parenthesis + ): + # we are in of INSERT INTO () + pass + else: + table_name = str(token.value.strip("`")) + tables.append(table_name) + + self._tables = tables - with_names + return self._tables + + @property + def limit_and_offset(self) -> Optional[Tuple[int, int]]: + """ + Returns value for limit and offset if set + + :rtype: (int, int) + """ + if self._limit_and_offset is not None: + return self._limit_and_offset + limit = None + offset = None + + for token in self.tokens: + if token.is_integer: + if token.last_keyword_normalized == "LIMIT" and not limit: + # LIMIT + limit = int(token.value) + elif token.last_keyword_normalized == "OFFSET": + # OFFSET + offset = int(token.value) + elif token.previous_token.is_punctuation: + # LIMIT , + offset = limit + limit = int(token.value) + + if limit is None: + return None + + self._limit_and_offset = limit, offset or 0 + return self._limit_and_offset + + @property + def tables_aliases(self) -> Dict[str, str]: + """ + Returns tables aliases mapping from a given query + + E.g. SELECT a.* FROM users1 AS a JOIN users2 AS b ON a.ip_address = b.ip_address + will give you {'a': 'users1', 'b': 'users2'} + """ + if self._table_aliases is not None: + return self._table_aliases + aliases = dict() + tables = self.tables + + for token in self.tokens: + if ( + token.last_keyword_normalized in TABLE_ADJUSTMENT_KEYWORDS + and token.is_name + and token.next_token.normalized != "AS" + and not token.next_token.is_dot + ): + if token.previous_token.normalized == "AS": + # potential .
as + potential_table_name = token.get_nth_previous(2).left_expanded + else: + # potential .
+ potential_table_name = token.previous_token.left_expanded + + if potential_table_name in tables: + aliases[token.value] = potential_table_name + + self._table_aliases = aliases + return self._table_aliases + + @property + def with_names(self) -> List[str]: + """ + Returns with statements aliases list from a given query + + E.g. WITH database1.tableFromWith AS (SELECT * FROM table3) + SELECT "xxxxx" FROM database1.tableFromWith alias + LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") + will return ["database1.tableFromWith"] + """ + if self._with_names is not None: + return self._with_names + with_names = UniqueList() + for token in self.tokens: + if token.previous_token.normalized == "WITH": + in_with = True + while in_with and token.next_token: + # name is first + if token.next_token.normalized == "AS": + with_names.append(token.left_expanded) + # move to next with if exists, this with ends with + # ) + , if many withs or ) + select if one + # need to move to next as AS can be in + # sub-queries inside with definition + while token.next_token and not ( + token.is_right_parenthesis + and ( + token.next_token.is_punctuation + or token.next_token.normalized in WITH_ENDING_KEYWORDS + ) + ): + token = token.next_token + if token.next_token.normalized in WITH_ENDING_KEYWORDS: + in_with = False + else: + token = token.next_token + + self._with_names = with_names + return self._with_names + + @property + def subqueries(self) -> Dict: + """ + Returns a dictionary with all sub-queries existing in query + """ + if self._subqueries is not None: + return self._subqueries + subqueries = dict() + token = self.tokens[0] + while token.next_token: + if token.previous_token.is_subquery_start: + current_subquery = [] + current_level = token.subquery_level + inner_token = token + while ( + inner_token.next_token + and not inner_token.next_token.subquery_level < current_level + ): + current_subquery.append(inner_token) + inner_token = inner_token.next_token + if 
inner_token.next_token.value in self.subqueries_names: + query_name = inner_token.next_token.value + else: + query_name = inner_token.next_token.next_token.value + subquery_text = "".join([x.stringified_token for x in current_subquery]) + subqueries[query_name] = subquery_text + + token = token.next_token + + self._subqueries = subqueries + return self._subqueries + + @property + def subqueries_names(self) -> List[str]: + """ + Returns sub-queries aliases list from a given query + + e.g. SELECT COUNT(1) FROM + (SELECT std.task_id FROM some_task_detail std WHERE std.STATUS = 1) a + JOIN (SELECT st.task_id FROM some_task st WHERE task_type_id = 80) b + ON a.task_id = b.task_id; + will return ["a", "b"] + """ + if self._subqueries_names is not None: + return self._subqueries_names + subqueries_names = UniqueList() + for token in self.tokens: + if (token.previous_token.is_subquery_end and token.normalized != "AS") or ( + token.previous_token.normalized == "AS" + and token.get_nth_previous(2).is_subquery_end + ): + subqueries_names.append(str(token)) + + self._subqueries_names = subqueries_names + return self._subqueries_names + + @property + def values(self) -> List: + """ + Returns list of values from insert queries + """ + if self._values: + return self._values + values = [] + for token in self.tokens: + if ( + token.last_keyword_normalized == "VALUES" + and token.is_in_parenthesis + and token.next_token.is_punctuation + ): + if token.is_integer: + value = int(token.value) + elif token.is_float: + value = float(token.value) + else: + value = token.value.strip("'\"") + values.append(value) + self._values = values + return self._values + + @property + def values_dict(self) -> Dict: + """ + Returns dictionary of column-value pairs. + If columns are not set the auto generated column_ are added. 
+ """ + values = self.values + if self._values_dict or not values: + return self._values_dict + columns = self.columns + if not columns: + columns = [f"column_{ind + 1}" for ind in range(len(values))] + values_dict = dict(zip(columns, values)) + self._values_dict = values_dict + return self._values_dict + + @property + def comments(self) -> List[str]: + """ + Return comments from SQL query + + :rtype: List[str] + """ + return Generalizator(self._raw_query).comments + + @property + def without_comments(self) -> str: + """ + Removes comments from SQL query + + :rtype: str + """ + return Generalizator(self._raw_query).without_comments + + @property + def generalize(self) -> Optional[str]: + """ + Removes most variables from an SQL query + and replaces them with X or N for numbers. + + Based on Mediawiki's DatabaseBase::generalizeSQL + + :rtype: Optional[str] + """ + return Generalizator(self._raw_query).generalize + + def _add_to_columns_subsection(self, keyword: str, column: str): + sections = { + "SELECT": "select", + "WHERE": "where", + "ORDERBY": "order_by", + "ON": "join", + "INTO": "insert", + "SET": "update", + } + section = sections[keyword] + self._columns_dict = self._columns_dict or dict() + self._columns_dict.setdefault(section, UniqueList()).append(column) + + def _preprocess_query(self) -> str: + """ + Perform initial query cleanup + + :rtype str + """ + if self._raw_query == "": + return "" + + # 0. remove newlines + query = self._raw_query.replace("\n", " ") + # 1. remove quotes " + query = query.replace('"', "") + + # 2. 
`database`.`table` notation -> database.table + query = re.sub(r"`([^`]+)`\.`([^`]+)`", r"\1.\2", query) + + return query diff --git a/sql_metadata/token.py b/sql_metadata/token.py new file mode 100644 index 00000000..7ec6e763 --- /dev/null +++ b/sql_metadata/token.py @@ -0,0 +1,162 @@ +""" +Module contains internal SQLToken that creates linked list +""" +import dataclasses +from typing import Dict, Optional + +from sql_metadata.keywords_lists import FUNCTIONS_IGNORED + + +@dataclasses.dataclass +class SQLToken: # pylint: disable=R0902 + """ + Class representing single token and connected into linked list + """ + + value: Optional[str] + is_keyword: bool + is_name: bool + is_dot: bool + is_punctuation: bool + is_wildcard: bool + is_integer: bool + is_float: bool + + is_left_parenthesis: bool + is_right_parenthesis: bool + subquery_level: int + position: int + + is_subquery_start: bool = False + is_subquery_end: bool = False + is_nested_function_start: bool = False + is_nested_function_end: bool = False + + last_keyword: Optional[str] = None + previous_token: Optional["SQLToken"] = None + next_token: Optional["SQLToken"] = None + + def __str__(self): + """ + String representation + """ + return self.value.strip('"') + + def __repr__(self) -> str: # pragma: no cover + """ + Representation - useful for debugging + """ + repr_str = ["=".join([str(k), str(v)]) for k, v in self.__dict__.items()] + return f"SQLToken({','.join(repr_str)})" + + @property + def normalized(self) -> str: + """ + Property returning uppercase value without end lines and spaces + """ + return self.value.translate(str.maketrans("", "", " \n\t\r")).upper() + + @property + def stringified_token(self) -> str: + """ + Returns string representation with whitespace or not - used to rebuild query + from list of tokens + """ + if self.previous_token: + if ( + self.normalized in [")", ".", ","] + or self.previous_token.normalized in ["(", "."] + or ( + self.is_left_parenthesis + and 
self.previous_token.normalized in FUNCTIONS_IGNORED + ) + ): + return str(self) + return f" {self}" + return str(self) # pragma: no cover + + @property + def last_keyword_normalized(self) -> str: + """ + Property returning uppercase last keyword without end lines and spaces + """ + if self.last_keyword: + return self.last_keyword.translate(str.maketrans("", "", " \n\t\r")).upper() + return "" + + @property + def left_expanded(self) -> str: + """ + Property tries to expand value with dot notation if left token is a dot + to capture whole groups like .
or ..
+ """ + value = str(self) + token = self + while token.previous_token.is_dot: + if token.get_nth_previous(2) and token.get_nth_previous(2).is_name: + value = f"{token.get_nth_previous(2)}." + value + token = token.get_nth_previous(2) + return value.strip("`") + + @property + def is_in_parenthesis(self) -> bool: + """ + Property checks if token is surrounded with brackets () + """ + token = self + left_parenthesis = False + right_parenthesis = False + while token.previous_token: + if token.previous_token.is_left_parenthesis: + left_parenthesis = True + break + token = token.previous_token + token = self + while token.next_token: + if token.next_token.is_right_parenthesis: + right_parenthesis = True + break + token = token.next_token + + return left_parenthesis and right_parenthesis + + def table_prefixed_column(self, table_aliases: Dict) -> str: + """ + Substitutes table alias with actual table name + """ + value = self.left_expanded + if "." in value: + parts = value.split(".") + if len(parts) > 3: # pragma: no cover + raise ValueError(f"Wrong columns name: {value}") + parts[0] = table_aliases.get(parts[0], parts[0]) + value = ".".join(parts) + return value + + def get_nth_previous(self, level: int): + """ + Function iterates previous tokens getting nth previous token + """ + assert level >= 1 + if self.previous_token: + if level > 1: + return self.previous_token.get_nth_previous(level=level - 1) + return self.previous_token + return None # pragma: no cover + + +EmptyToken = SQLToken( + value="", + is_keyword=False, + is_name=False, + is_punctuation=False, + is_dot=False, + is_wildcard=False, + is_integer=False, + is_float=False, + is_left_parenthesis=False, + is_right_parenthesis=False, + last_keyword=None, + subquery_level=0, + position=-1, +) diff --git a/sql_metadata/utils.py b/sql_metadata/utils.py new file mode 100644 index 00000000..7620a9b7 --- /dev/null +++ b/sql_metadata/utils.py @@ -0,0 +1,17 @@ +""" +Module with various utils +""" +from typing import 
Any, List + + +class UniqueList(list): + """ + List that keeps it's items unique + """ + + def append(self, item: Any) -> None: + if item not in self: + super().append(item) + + def __sub__(self, other) -> List: + return [x for x in self if x not in other] diff --git a/test/test.sql b/test/test.sql new file mode 100644 index 00000000..48c9999e --- /dev/null +++ b/test/test.sql @@ -0,0 +1,215 @@ +SELECT * + +FROM EBH_DM_CRM.CRM_CUSTOMER C + +LEFT JOIN KMR_FINANCIAL_INDEX_STAGE_01 RB_ACCT +ON RB_ACCT.CUSTOMER_ID = C.CUSTOMER_ID + +LEFT JOIN KMR_FINANCIAL_INDEX_STAGE_07 OD_FAC +ON OD_FAC.CUSTOMER_ID = C.CUSTOMER_ID + +LEFT JOIN KMR_FINANCIAL_INDEX_STAGE_02 CARD +ON CARD.CUSTOMER_ID = C.CUSTOMER_ID + +LEFT JOIN KMR_FINANCIAL_INDEX_STAGE_03 CL_LOAN +ON CL_LOAN.CUSTOMER_ID = C.CUSTOMER_ID + +LEFT JOIN KMR_FINANCIAL_INDEX_STAGE_05 CL_INV +ON CL_INV.CUSTOMER_ID = C.CUSTOMER_ID + +LEFT JOIN KMR_FINANCIAL_INDEX_STAGE_04 BEF_ZRT +ON BEF_ZRT.CUSTOMER_SYMBOLS_SID = C.SYMBOLS_CUSTOMER_SID + +LEFT JOIN (SELECT CA.CUSTOMER_ID, +SUM(CASE +WHEN P.KMR_TYPE_GROUP IN ('CRED_MORTG', 'CRED_HOUSE_NORM', 'CRED_HOUSE_SUBV') AND LPRE.RELATED_LOAN IS NULL AND L.ROTATION_CODE = 'N' AND CF.INVOICE_TYPE = 'PRI' THEN +CF.PAID_AMT +ELSE +0 +END) AS FIN_CREDIT_MORTGAGE_AP_AMT, +SUM(CASE +WHEN P.KMR_TYPE_GROUP IN +('CRED_PERS', 'CRED_OTHER_LOMB', 'CRED_OTHER_GAR', 'CRED_OTHER_GAR_FRPR', 'CRED_OTHER_INVEST', 'CRED_OTHER_CURR', 'CRED_OTHER_SUBVINV', 'CRED_OTHER_OTHER') AND +LPRE.RELATED_LOAN IS NULL AND L.ROTATION_CODE = 'N' AND CF.INVOICE_TYPE = 'PRI' THEN +CF.PAID_AMT +ELSE +0 +END) AS FIN_CREDIT_SHORT_AP_AMT +FROM SCHEMA.CRM_ARRANGEMENT AR + + JOIN SCHEMA.CRM_LOAN L + ON L.LOAN_ID = AR.LOAN_ID + AND L.START_OF_VALIDITY <= DATE('2020-01-01') + AND L.END_OF_VALIDITY > DATE('2020-01-01') + + JOIN SCHEMA.CRM_CASH_FLOW CF + ON CF.LOAN_ID = L.LOAN_ID + AND CF.RECEIPT_DATE BETWEEN TRUNCATE(&GLOBAL_P_EFFECTIVE_LOAD_DATE, 'MONTH') AND &GLOBAL_P_EFFECTIVE_LOAD_DATE + AND CF.EFFECTIVE_LOAD_DATE BETWEEN 
TRUNCATE(&GLOBAL_P_EFFECTIVE_LOAD_DATE, 'MONTH') AND DATE('2020-01-01') + + JOIN SCHEMA.CRM_REL_CUSTOMER_ARRANGEMENT CA + ON CA.ARRANGEMENT_ID = AR.ARRANGEMENT_ID + AND CA.START_OF_VALIDITY <= DATE('2020-01-01') + AND CA.END_OF_VALIDITY > DATE('2020-01-01') + AND CA.RELATION_TYPE = 'PRIMARY_CUSTOMER' + + LEFT JOIN SCHEMA.CRM_LOAN LPRE + ON LPRE.LOAN_SID1 = L.RELATED_LOAN + AND LPRE.START_OF_VALIDITY <= DATE('2020-01-01') + AND LPRE.END_OF_VALIDITY > DATE('2020-01-01') + + LEFT JOIN SCHEMA.CRM_PRODUCT P + ON P.PRODUCT = L.LOAN_CODE2 + AND P.DML <> 'D' + AND P.START_OF_VALIDITY <= DATE('2020-01-01') + AND P.END_OF_VALIDITY > DATE('2020-01-01') + + WHERE 1 = 1 + AND AR.START_OF_VALIDITY <= DATE('2020-01-01') + AND AR.END_OF_VALIDITY > DATE('2020-01-01') + GROUP BY CA.CUSTOMER_ID) QRM_ELOTORL +ON QRM_ELOTORL.CUSTOMER_ID = C.CUSTOMER_ID +LEFT JOIN (SELECT PD.CUSTOMER_ID, +CASE +WHEN MAX(PD.DAYS_PAST_DUE) > 30 THEN +MAX(PD.DAYS_PAST_DUE) - 30 +ELSE +0 +END AS EXP_CRED_DAYS_OVR_30_DAYS, +SUM(CASE +WHEN PD.DAYS_PAST_DUE > 30 THEN +PD.PAST_DUE_AMOUNT +ELSE +0 +END) AS EXP_CRED_HUF_OVR_30_DAYS_AMT, +CASE +WHEN MAX(PD.DAYS_PAST_DUE) > 90 THEN +MAX(PD.DAYS_PAST_DUE) - 90 +ELSE +0 +END AS EXP_CRED_DAYS_OVR_90_DAYS, +SUM(CASE +WHEN PD.DAYS_PAST_DUE > 90 THEN +PD.PAST_DUE_AMOUNT +ELSE +0 +END) AS EXP_CRED_HUF_OVR_90_DAYS_AMT, +MAX(PD.DAYS_PAST_DUE) AS EXP_CRED_DAYS, +SUM(PD.PAST_DUE_AMOUNT) AS EXP_CRED_AMT + + FROM SCHEMA.CRM_PAST_DUE PD + + WHERE 1 = 1 + AND PD.PAST_DUE_SID4 = 'FACILITY_MAX_DPD' + AND PD.DAYS_PAST_DUE > 0 + AND PD.PAST_DUE_AMOUNT > 0 + AND PD.EFFECTIVE_LOAD_DATE = &GLOBAL_P_EFFECTIVE_LOAD_DATE + GROUP BY PD.CUSTOMER_ID) KESEDELMEK +ON KESEDELMEK.CUSTOMER_ID = C.CUSTOMER_ID +LEFT JOIN (SELECT BLL.CUSTOMER_ID, +SUM(CASE +WHEN P.TMO_HIER_LEVEL_1 = 'HITEL' AND P.TMO_HIER_LEVEL_2 = 'MIKROVÃLLALATI HITEL' AND +(P.TMO_HIER_LEVEL_4 = 'FAKTOR' OR P.TMO_HIER_LEVEL_3 = 'KÃNYSZERHITEL' OR AR.ESTIMATED_END_DATE - L.FIRST_DISBURSEMENT_DATE <= 365) +AND +AR.ESTIMATED_END_DATE 
> DAT.CALENDAR_MONTH_LAST_WORKDAY THEN +NVL(BLL.OUTSTANDING_AMT * E.EXCHANGE_RATE_VALUE, 0) +ELSE +0 +END) AS FIN_CRED_OTH_SHORT_BAL, + + SUM(CASE + WHEN (P.TMO_HIER_LEVEL_1 = 'HITEL' AND P.TMO_HIER_LEVEL_2 = 'MIKROVÃLLALATI HITEL' AND + (AR.ESTIMATED_END_DATE - L.FIRST_DISBURSEMENT_DATE > 365 OR L.FIRST_DISBURSEMENT_DATE IS NULL)) AND AR.ESTIMATED_END_DATE > DAT.CALENDAR_MONTH_LAST_WORKDAY THEN + NVL(BLL.OUTSTANDING_AMT * E.EXCHANGE_RATE_VALUE, 0) + ELSE + 0 + END) AS FIN_CRED_OTH_LONG_BAL + + FROM SCHEMA.CRM_ARRANGEMENT AR + + JOIN SCHEMA.CRM_LOAN L + ON L.LOAN_ID = AR.LOAN_ID + AND L.START_OF_VALIDITY <= DATE('2020-01-01') + AND L.END_OF_VALIDITY > DATE('2020-01-01') + + JOIN (SELECT CALENDAR_MONTH_LAST_WORKDAY + FROM SCHEMA.CRM_DATE DAT + WHERE DAT.REFERENCE_DAY = &GLOBAL_P_EFFECTIVE_LOAD_DATE + AND DAT.START_OF_VALIDITY <= DATE('2020-01-01') + AND DAT.END_OF_VALIDITY > DATE('2020-01-01') + ) DAT + ON 1 = 1 + + LEFT JOIN SCHEMA.CRM_BALANCE_LOAN_LOAN BLL + ON BLL.LOAN_ID = L.LOAN_ID + AND BLL.EFFECTIVE_LOAD_DATE = DATE('2020-01-01') + + LEFT JOIN SCHEMA.CRM_EXCHANGE_RATE E + ON E.EXCHANGE_RATE_DATE = DATE('2020-01-01') + AND E.EFFECTIVE_LOAD_DATE = DATE('2020-01-01') + AND E.TARGET_CURRENCY = BLL.CCY + AND E.EXCHANGE_RATE_CODE = 'FT0' + + LEFT JOIN SCHEMA.CRM_PRODUCT P + ON P.PRODUCT = L.LOAN_CODE2 + AND P.MODUL = 'CL' + AND NVL(P.RB_FORCED_LOAN, '#') = '#' + AND P.DML <> 'D' + AND P.START_OF_VALIDITY <= DATE('2020-01-01') + AND P.END_OF_VALIDITY > DATE('2020-01-01') + + WHERE 1 = 1 + AND AR.START_OF_VALIDITY <= DATE('2020-01-01') + AND AR.END_OF_VALIDITY > DATE('2020-01-01') + AND AR.ARRANGEMENT_TYPE = 'LOAN' + GROUP BY BLL.CUSTOMER_ID) SHORT_LONG +ON SHORT_LONG.CUSTOMER_ID = C.CUSTOMER_ID +LEFT JOIN (SELECT BLE.CUSTOMER_ID, SUM(BLE.FUTURE_CAPITAL) AS FI_CRED_LEASING_BAL, COUNT(LE.LEASING_ID) AS PR_CRED_LEASING_CNT + + FROM SCHEMA.CRM_LEASING LE + + JOIN SCHEMA.CRM_BALANCE_LEASING BLE + ON BLE.LEASING_ID = LE.LEASING_ID + AND BLE.CUSTOMER_ID IS NOT NULL + AND 
BLE.EFFECTIVE_LOAD_DATE = DATE('2020-01-01') + + WHERE 1 = 1 + AND LE.START_OF_VALIDITY <= DATE('2020-01-01') + AND LE.END_OF_VALIDITY > DATE('2020-01-01') + + GROUP BY BLE.CUSTOMER_ID) DEALS_AND_INTERESTS +ON DEALS_AND_INTERESTS.CUSTOMER_ID = C.CUSTOMER_ID +LEFT JOIN (SELECT SCB.SYMBOLS_ID, COUNT(DISTINCT SCB.CONTRACT_ID) AS BPR_INS_LIFE_CNT, SUM(SCB.AMOUNT) AS FIN_INS_LIFE_BAL +FROM SCHEMA.CRM_RL_SUBSIDIARY_COMPANIE_BAT SCB +WHERE SCB.SOURCE_TCH = 'BIZTOSITO' +AND SCB.EFFECTIVE_LOAD_DATE = &GLOBAL_P_EFFECTIVE_LOAD_DATE +GROUP BY SCB.SYMBOLS_ID) LIFE_INSUR +ON LIFE_INSUR.SYMBOLS_ID = C.SYMBOLS_CUSTOMER_SID + +LEFT JOIN (SELECT PD.CUSTOMER_ID, MAX(PD.DAYS_PAST_DUE) AS EXP_CRED_DAYS_MAX, ROUND(SUM(PD.PAST_DUE_AMOUNT)) AS EXP_CRED_AMT_MAX + + FROM SCHEMA.CRM_PAST_DUE PD + + JOIN SCHEMA.CRM_ARRANGEMENT AR + ON AR.ARRANGEMENT_ID = PD.ARRANGEMENT_ID + AND AR.START_OF_VALIDITY <= DATE('2020-01-01') + AND AR.END_OF_VALIDITY > DATE('2020-01-01') + AND AR.ARRANGEMENT_TYPE = 'LOAN' + + JOIN SCHEMA.CRM_LOAN L + ON L.LOAN_ID = AR.LOAN_ID + AND L.START_OF_VALIDITY <= DATE('2020-01-01') + AND L.END_OF_VALIDITY > DATE('2020-01-01') + AND L.LOAN_CODE1 NOT LIKE 'T%' + + WHERE 1 = 1 + AND PD.PAST_DUE_SID4 = 'FACILITY_MAX_DPD' + AND PD.EFFECTIVE_LOAD_DATE = &GLOBAL_P_EFFECTIVE_LOAD_DATE + GROUP BY PD.CUSTOMER_ID) EXPIRED_LOAN +ON EXPIRED_LOAN.CUSTOMER_ID = C.CUSTOMER_ID +LEFT JOIN KMR_FINANCIAL_INDEX_STAGE_06 SZCH +ON SZCH.CUSTOMER_ID = C.CUSTOMER_ID + +WHERE 1 = 1 +AND C.START_OF_VALIDITY <= DATE('2020-01-01') +AND C.END_OF_VALIDITY > DATE('2020-01-01') \ No newline at end of file diff --git a/test/test_aliases.py b/test/test_aliases.py index b1e80abc..90ee5d42 100644 --- a/test/test_aliases.py +++ b/test/test_aliases.py @@ -1,25 +1,25 @@ -from sql_metadata import get_query_tables, get_query_columns, get_query_table_aliases +from sql_metadata.parser import Parser def test_get_query_table_aliases(): - assert get_query_table_aliases("SELECT bar FROM foo") == {} - assert 
get_query_table_aliases("SELECT bar FROM foo AS f") == {"f": "foo"} - # assert get_query_table_aliases('SELECT bar FROM foo f') == {'f': 'foo'} - assert get_query_table_aliases("SELECT bar AS value FROM foo AS f") == {"f": "foo"} - assert get_query_table_aliases( + assert Parser("SELECT bar FROM foo").tables_aliases == {} + assert Parser("SELECT bar FROM foo AS f").tables_aliases == {"f": "foo"} + assert Parser("SELECT bar FROM foo f").tables_aliases == {"f": "foo"} + assert Parser("SELECT bar AS value FROM foo AS f").tables_aliases == {"f": "foo"} + assert Parser( "SELECT bar AS value FROM foo AS f INNER JOIN dimensions AS d ON f.id = d.id" - ) == {"f": "foo", "d": "dimensions"} - assert get_query_table_aliases( - "SELECT e.foo FROM (SELECT * FROM bar) AS e" + ).tables_aliases == {"f": "foo", "d": "dimensions"} + assert ( + Parser("SELECT e.foo FROM (SELECT * FROM bar) AS e").tables_aliases == {} ), "Sub-query aliases are ignored" - assert get_query_table_aliases( + assert Parser( "SELECT a.* FROM product_a AS a JOIN product_b AS b ON a.ip_address = b.ip_address" - ) == {"a": "product_a", "b": "product_b"} + ).tables_aliases == {"a": "product_a", "b": "product_b"} def test_select_aliases(): - assert get_query_columns("SELECT e.foo FROM bar AS e") == ["bar.foo"] - # assert get_query_columns('SELECT e.foo FROM bar e') == ['bar.foo'] + assert Parser("SELECT e.foo FROM bar AS e").columns == ["bar.foo"] + assert Parser("SELECT e.foo FROM bar e").columns == ["bar.foo"] def test_tables_aliases_are_resolved(): @@ -28,10 +28,11 @@ def test_tables_aliases_are_resolved(): """ sql = "SELECT a.* FROM users1 AS a JOIN users2 AS b ON a.ip_address = b.ip_address" - assert get_query_tables(sql) == ["users1", "users2"] - assert get_query_table_aliases(sql) == {"a": "users1", "b": "users2"} - assert get_query_columns(sql) == [ + parser = Parser(sql) + assert parser.tables == ["users1", "users2"] + assert parser.tables_aliases == {"a": "users1", "b": "users2"} + assert parser.columns 
== [ "users1.*", "users1.ip_address", "users2.ip_address", - ], "Should resolve table aliases" + ] diff --git a/test/test_caching.py b/test/test_caching.py new file mode 100644 index 00000000..403dd67a --- /dev/null +++ b/test/test_caching.py @@ -0,0 +1,13 @@ +import pytest + +from sql_metadata import Parser + + +def test_cleared_cache(): + parser = Parser("Select * from test") + assert parser.tables == ["test"] + + with pytest.raises(AttributeError): + parser.query = "Select * from test2" + + assert parser._tables == ["test"] diff --git a/test/test_column_aliases.py b/test/test_column_aliases.py new file mode 100644 index 00000000..4cac88dc --- /dev/null +++ b/test/test_column_aliases.py @@ -0,0 +1,47 @@ +from sql_metadata import Parser + + +def test_column_aliases(): + query = """ + SELECT yearweek(SignDate) as Aggregation, + BusinessSource, + (select sum(C2Count) + from (select count(C2) as C2Count, BusinessSource, yearweek(Start1) Start1, yearweek(End1) End1 + from ( + select ContractID as C2, BusinessSource, StartDate as Start1, EndDate as End1 + from data_contracts_report + ) sq2 + group by 2, 3, 4) sq + where Start1 <= yearweek(SignDate) + and End1 >= yearweek(SignDate) + and sq.BusinessSource = mq.BusinessSource) CountOfConsultants +FROM data_contracts_report mq +where SignDate >= last_day(date_add(now(), interval -13 month)) +group by 1, 2 +order by 1, 2; + """ + parser = Parser(query) + assert parser.tables == ["data_contracts_report"] + assert parser.subqueries_names == ["sq2", "sq"] + assert parser.subqueries == { + "sq": "select count(C2) as C2Count, BusinessSource, yearweek(Start1) Start1, " + "yearweek(End1) End1 from (select ContractID as C2, BusinessSource, " + "StartDate as Start1, EndDate as End1 from data_contracts_report) sq2 " + "group by 2, 3, 4", + "sq2": "select ContractID as C2, BusinessSource, StartDate as Start1, EndDate " + "as End1 from data_contracts_report", + } + assert parser.columns == [ + "SignDate", + "BusinessSource", + 
"C2Count", + "C2", + "Start1", + "End1", + "ContractID", + "StartDate", + "EndDate", + "sq.BusinessSource", + "data_contracts_report.BusinessSource", + "CountOfConsultants", + ] diff --git a/test/test_comments.py b/test/test_comments.py new file mode 100644 index 00000000..99b060fa --- /dev/null +++ b/test/test_comments.py @@ -0,0 +1,30 @@ +from sql_metadata import Parser + + +def test_getting_comments(): + parser = Parser( + "INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,`time`) VALUES ('442001','27574631','20180228130846')" + ) + assert parser.comments == ["/* VoteHelper::addVote xxx */"] + + parser = Parser( + "SELECT /* CategoryPaginationViewer::processSection */ " + "page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` " + "INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) " + " /* We should add more conditions */ " + "WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' " + " /* Verify with accounting */ " + "ORDER BY cl_sortkey LIMIT 927600,200" + ) + assert parser.comments == [ + "/* CategoryPaginationViewer::processSection */", + "/* We should add more conditions */", + "/* Verify with accounting */", + ] + assert parser.without_comments == ( + "SELECT page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix " + "FROM `page` " + "INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) " + "WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' " + "ORDER BY cl_sortkey LIMIT 927600,200" + ) diff --git a/test/test_complex_aliases.py b/test/test_complex_aliases.py new file mode 100644 index 00000000..5193e48c --- /dev/null +++ b/test/test_complex_aliases.py @@ -0,0 +1,57 @@ +import pathlib + +from sql_metadata import Parser + +dir_path = pathlib.Path(__file__).parent.absolute() + + +def test_complex_query_aliases(): + sql_filename = f"{dir_path}/test.sql" + with open(sql_filename, "r", encoding="latin-1") as content_file: + content = 
content_file.read() + parser = Parser(content) + assert parser.tables_aliases == { + "C": "EBH_DM_CRM.CRM_CUSTOMER", + "RB_ACCT": "KMR_FINANCIAL_INDEX_STAGE_01", + "OD_FAC": "KMR_FINANCIAL_INDEX_STAGE_07", + "CARD": "KMR_FINANCIAL_INDEX_STAGE_02", + "CL_LOAN": "KMR_FINANCIAL_INDEX_STAGE_03", + "CL_INV": "KMR_FINANCIAL_INDEX_STAGE_05", + "BEF_ZRT": "KMR_FINANCIAL_INDEX_STAGE_04", + "AR": "SCHEMA.CRM_ARRANGEMENT", + "L": "SCHEMA.CRM_LOAN", + "CF": "SCHEMA.CRM_CASH_FLOW", + "CA": "SCHEMA.CRM_REL_CUSTOMER_ARRANGEMENT", + "LPRE": "SCHEMA.CRM_LOAN", + "P": "SCHEMA.CRM_PRODUCT", + "PD": "SCHEMA.CRM_PAST_DUE", + "DAT": "SCHEMA.CRM_DATE", + "BLL": "SCHEMA.CRM_BALANCE_LOAN_LOAN", + "E": "SCHEMA.CRM_EXCHANGE_RATE", + "LE": "SCHEMA.CRM_LEASING", + "BLE": "SCHEMA.CRM_BALANCE_LEASING", + "SCB": "SCHEMA.CRM_RL_SUBSIDIARY_COMPANIE_BAT", + "SZCH": "KMR_FINANCIAL_INDEX_STAGE_06", + } + assert parser.tables == [ + "EBH_DM_CRM.CRM_CUSTOMER", + "KMR_FINANCIAL_INDEX_STAGE_01", + "KMR_FINANCIAL_INDEX_STAGE_07", + "KMR_FINANCIAL_INDEX_STAGE_02", + "KMR_FINANCIAL_INDEX_STAGE_03", + "KMR_FINANCIAL_INDEX_STAGE_05", + "KMR_FINANCIAL_INDEX_STAGE_04", + "SCHEMA.CRM_ARRANGEMENT", + "SCHEMA.CRM_LOAN", + "SCHEMA.CRM_CASH_FLOW", + "SCHEMA.CRM_REL_CUSTOMER_ARRANGEMENT", + "SCHEMA.CRM_PRODUCT", + "SCHEMA.CRM_PAST_DUE", + "SCHEMA.CRM_DATE", + "SCHEMA.CRM_BALANCE_LOAN_LOAN", + "SCHEMA.CRM_EXCHANGE_RATE", + "SCHEMA.CRM_LEASING", + "SCHEMA.CRM_BALANCE_LEASING", + "SCHEMA.CRM_RL_SUBSIDIARY_COMPANIE_BAT", + "KMR_FINANCIAL_INDEX_STAGE_06", + ] diff --git a/test/test_get_query_columns.py b/test/test_get_query_columns.py deleted file mode 100644 index 16ece633..00000000 --- a/test/test_get_query_columns.py +++ /dev/null @@ -1,100 +0,0 @@ -from sql_metadata import get_query_columns - - -def test_get_query_columns(): - assert get_query_columns("SELECT * FROM `test_table`") == ["*"] - assert get_query_columns("SELECT foo.* FROM `test_table`") == ["foo.*"] - assert get_query_columns("SELECT foo FROM 
`test_table`") == ["foo"] - assert get_query_columns("SELECT count(foo) FROM `test_table`") == ["foo"] - assert get_query_columns("SELECT COUNT(foo), max(time_id) FROM `test_table`") == [ - "foo", - "time_id", - ] - assert get_query_columns("SELECT id, foo FROM test_table WHERE id = 3") == [ - "id", - "foo", - ] - assert get_query_columns( - "SELECT id, foo FROM test_table WHERE foo_id = 3 AND bar = 5" - ) == ["id", "foo", "foo_id", "bar"] - assert get_query_columns( - "SELECT foo, count(*) as bar FROM `test_table` WHERE id = 3" - ) == ["foo", "id"] - assert get_query_columns("SELECT foo, test as bar FROM `test_table`") == [ - "foo", - "test", - ] - assert get_query_columns("SELECT /* a comment */ bar FROM test_table") == ["bar"] - - -def test_get_query_columns_order_by(): - assert get_query_columns("SELECT foo FROM bar ORDER BY id") == ["foo", "id"] - assert get_query_columns("SELECT foo FROM bar WHERE id > 20 ORDER BY id") == [ - "foo", - "id", - ] - assert get_query_columns("SELECT id, foo FROM bar ORDER BY id DESC") == [ - "id", - "foo", - ] - assert get_query_columns("SELECT user_id,foo FROM bar ORDER BY id LIMIT 20") == [ - "user_id", - "foo", - "id", - ] - - -def test_get_query_columns_complex(): - # @see https://github.com/macbre/sql-metadata/issues/6 - assert get_query_columns( - "SELECT 1 as c FROM foo_pageviews WHERE time_id = '2018-01-07 00:00:00' AND period_id = '2' LIMIT 1" - ) == ["time_id", "period_id"] - - # table aliases - assert get_query_columns( - "SELECT r.wiki_id AS id, pageviews_7day AS pageviews FROM report_wiki_recent_pageviews AS r " - "INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.is_public = '1' " - "AND r.lang IN ( 'en', 'ru' ) AND r.hub_name = 'gaming' ORDER BY pageviews DESC LIMIT 300" - ) == [ - "report_wiki_recent_pageviews.wiki_id", - "pageviews_7day", - "dimension_wikis.wiki_id", - "dimension_wikis.is_public", - "report_wiki_recent_pageviews.lang", - "report_wiki_recent_pageviews.hub_name", - "pageviews", - ] - 
- # self joins - assert get_query_columns( - "SELECT count(fw1.wiki_id) as wam_results_total FROM `fact_wam_scores` `fw1` " - "left join `fact_wam_scores` `fw2` ON ((fw1.wiki_id = fw2.wiki_id) AND " - "(fw2.time_id = FROM_UNIXTIME(1466380800))) left join `dimension_wikis` `dw` " - "ON ((fw1.wiki_id = dw.wiki_id)) WHERE (fw1.time_id = FROM_UNIXTIME(1466467200)) " - "AND (dw.url like '%%' OR dw.title like '%%') AND fw1.vertical_id IN " - "('0','1','2','3','4','5','6','7') AND (fw1.wiki_id NOT " - "IN ('23312','70256','168929','463633','381622','1089624')) " - "AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" - ) == [ - "fw1.wiki_id", - "fw2.wiki_id", - "fw2.time_id", - "dw.wiki_id", - "fw1.time_id", - "dw.url", - "dw.title", - "fw1.vertical_id", - ] - - assert get_query_columns( - "SELECT date_format(time_id,'%Y-%m-%d') AS date, pageviews AS cnt FROM rollup_wiki_pageviews WHERE period_id = '2' AND wiki_id = '1676379' AND time_id BETWEEN '2018-01-08' AND '2018-01-01'" - ) == ["time_id", "pageviews", "period_id", "wiki_id"] - - assert get_query_columns( - "INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,`time`) VALUES ('442001','27574631','20180228130846')" - ) == ["article_id", "user_id", "time"] - - # REPLACE queries - assert get_query_columns( - "REPLACE INTO `page_props` (pp_page,pp_propname,pp_value) VALUES ('47','infoboxes','')" - ) == ["pp_page", "pp_propname", "pp_value"] diff --git a/test/test_getting_columns.py b/test/test_getting_columns.py new file mode 100644 index 00000000..6802691b --- /dev/null +++ b/test/test_getting_columns.py @@ -0,0 +1,212 @@ +from sql_metadata.parser import Parser + + +def test_cast_and_convert_functions(): + # https://dev.mysql.com/doc/refman/8.0/en/cast-functions.html + parser = Parser("SELECT count(c) as test, id FROM foo where cast(d as bigint) > e") + assert parser.columns == ["c", "id", "d", "e"] + assert parser.columns_dict == {"select": ["c", "id"], "where": ["d", "e"]} + + parser = 
Parser("SELECT CONVERT(latin1_column USING utf8) FROM latin1_table;") + assert parser.columns == ["latin1_column"] + assert parser.columns_dict == {"select": ["latin1_column"]} + + +def test_queries_with_null_conditions(): + parser = Parser( + "SELECT id FROM cm WHERE cm.status = 1 AND cm.OPERATIONDATE IS NULL AND cm.OID IN(123123);" + ) + assert parser.columns == ["id", "cm.status", "cm.OPERATIONDATE", "cm.OID"] + assert parser.columns_dict == { + "select": ["id"], + "where": ["cm.status", "cm.OPERATIONDATE", "cm.OID"], + } + + parser = Parser( + "SELECT id FROM cm WHERE cm.status = 1 AND cm.OPERATIONDATE IS NOT NULL AND cm.OID IN(123123);" + ) + assert parser.columns == ["id", "cm.status", "cm.OPERATIONDATE", "cm.OID"] + assert parser.columns_dict == { + "select": ["id"], + "where": ["cm.status", "cm.OPERATIONDATE", "cm.OID"], + } + + +def test_queries_with_distinct(): + assert Parser("SELECT DISTINCT DATA.ASSAY_ID FROM foo").columns == ["DATA.ASSAY_ID"] + + assert Parser("SELECT UNIQUE DATA.ASSAY_ID FROM foo").columns == ["DATA.ASSAY_ID"] + + +def test_joins(): + assert ["page_title", "rd_title", "rd_namespace", "page_id", "rd_from",] == Parser( + "SELECT page_title FROM `redirect` INNER JOIN `page` " + "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))" + ).columns + + +def test_getting_columns(): + assert Parser("SELECT * FROM `test_table`").columns == ["*"] + assert Parser("SELECT foo.* FROM `test_table`").columns == ["foo.*"] + assert Parser("SELECT foo FROM `test_table`").columns == ["foo"] + assert Parser("SELECT count(foo) FROM `test_table`").columns == ["foo"] + assert Parser("SELECT COUNT(foo), max(time_id) FROM `test_table`").columns == [ + "foo", + "time_id", + ] + assert Parser("SELECT id, foo FROM test_table WHERE id = 3").columns == [ + "id", + "foo", + ] + assert Parser( + "SELECT id, foo FROM test_table WHERE foo_id = 3 AND bar = 5" + ).columns == ["id", "foo", "foo_id", "bar"] + assert Parser( + "SELECT foo, count(*) as bar 
FROM `test_table` WHERE id = 3" + ).columns == ["foo", "id"] + assert Parser("SELECT foo, test as bar FROM `test_table`").columns == [ + "foo", + "test", + ] + assert Parser("SELECT /* a comment */ bar FROM test_table").columns == ["bar"] + + +def test_columns_with_order_by(): + assert Parser("SELECT foo FROM bar ORDER BY id").columns == ["foo", "id"] + assert Parser("SELECT foo FROM bar WHERE id > 20 ORDER BY id").columns == [ + "foo", + "id", + ] + assert Parser("SELECT id, foo FROM bar ORDER BY id DESC").columns == [ + "id", + "foo", + ] + assert Parser("SELECT user_id,foo FROM bar ORDER BY id LIMIT 20").columns == [ + "user_id", + "foo", + "id", + ] + + +def test_update_and_replace(): + # UPDATE queries + parser = Parser( + "UPDATE `page` SET page_touched = other_column WHERE page_id = 'test'" + ) + assert parser.columns == ["page_touched", "other_column", "page_id"] + assert parser.columns_dict == { + "update": ["page_touched", "other_column"], + "where": ["page_id"], + } + + parser = Parser("UPDATE `page` SET page_touched = 'value' WHERE page_id = 'test'") + assert parser.columns == ["page_touched", "page_id"] + assert parser.columns_dict == {"update": ["page_touched"], "where": ["page_id"]} + + # REPLACE queries + parser = Parser( + "REPLACE INTO `page_props` (pp_page,pp_propname,pp_value) VALUES ('47','infoboxes','')" + ) + assert parser.columns == ["pp_page", "pp_propname", "pp_value"] + assert parser.columns_dict == {"insert": ["pp_page", "pp_propname", "pp_value"]} + + +def test_complex_queries_columns(): + # @see https://github.com/macbre/sql-metadata/issues/6 + assert Parser( + "SELECT 1 as c FROM foo_pageviews WHERE time_id = '2018-01-07 00:00:00' AND period_id = '2' LIMIT 1" + ).columns == ["time_id", "period_id"] + + # table aliases + parser = Parser( + "SELECT r.wiki_id AS id, pageviews_7day AS pageviews FROM report_wiki_recent_pageviews AS r " + "INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.is_public = '1' " + "AND r.lang IN ( 
'en', 'ru' ) AND r.hub_name = 'gaming' ORDER BY pageviews DESC LIMIT 300" + ) + assert parser.columns == [ + "report_wiki_recent_pageviews.wiki_id", + "pageviews_7day", + "dimension_wikis.wiki_id", + "dimension_wikis.is_public", + "report_wiki_recent_pageviews.lang", + "report_wiki_recent_pageviews.hub_name", + "pageviews", + ] + assert parser.columns_dict == { + "select": ["report_wiki_recent_pageviews.wiki_id", "pageviews_7day"], + "join": ["report_wiki_recent_pageviews.wiki_id", "dimension_wikis.wiki_id"], + "where": [ + "dimension_wikis.is_public", + "report_wiki_recent_pageviews.lang", + "report_wiki_recent_pageviews.hub_name", + ], + "order_by": ["pageviews"], + } + + # self joins + parser = Parser( + "SELECT count(fw1.wiki_id) as wam_results_total FROM `fact_wam_scores` `fw1` " + "left join `fact_wam_scores` `fw2` ON ((fw1.wiki_id = fw2.wiki_id) AND " + "(fw2.time_id = FROM_UNIXTIME(1466380800))) left join `dimension_wikis` `dw` " + "ON ((fw1.wiki_id = dw.wiki_id)) WHERE (fw1.time_id = FROM_UNIXTIME(1466467200)) " + "AND (dw.url like '%%' OR dw.title like '%%') AND fw1.vertical_id IN " + "('0','1','2','3','4','5','6','7') AND (fw1.wiki_id NOT " + "IN ('23312','70256','168929','463633','381622','1089624')) " + "AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" + ) + assert parser.columns == [ + "fw1.wiki_id", + "fw2.wiki_id", + "fw2.time_id", + "dw.wiki_id", + "fw1.time_id", + "dw.url", + "dw.title", + "fw1.vertical_id", + ] + assert parser.columns_dict == { + "select": ["fw1.wiki_id"], + "join": ["fw1.wiki_id", "fw2.wiki_id", "fw2.time_id", "dw.wiki_id"], + "where": [ + "fw1.time_id", + "dw.url", + "dw.title", + "fw1.vertical_id", + "fw1.wiki_id", + ], + } + + assert Parser( + "SELECT date_format(time_id,'%Y-%m-%d') AS date, pageviews AS cnt FROM rollup_wiki_pageviews WHERE period_id = '2' AND wiki_id = '1676379' AND time_id BETWEEN '2018-01-08' AND '2018-01-01'" + ).columns == ["time_id", "pageviews", "period_id", "wiki_id"] + + parser = Parser( + 
"INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,`time`) VALUES ('442001','27574631','20180228130846')" + ) + assert parser.columns == ["article_id", "user_id", "time"] + + # REPLACE queries + parser = Parser( + "REPLACE INTO `page_props` (pp_page,pp_propname,pp_value) VALUES ('47','infoboxes','')" + ) + assert parser.columns == ["pp_page", "pp_propname", "pp_value"] + assert parser.columns_dict == {"insert": ["pp_page", "pp_propname", "pp_value"]} + + assert Parser( + "SELECT /* CategoryPaginationViewer::processSection */ " + "page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` " + "INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) " + "WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' " + "ORDER BY cl_sortkey LIMIT 927600,200" + ).columns_dict == { + "select": [ + "page_namespace", + "page_title", + "page_len", + "page_is_redirect", + "cl_sortkey_prefix", + ], + "join": ["cl_from", "page_id"], + "where": ["cl_type", "cl_to"], + "order_by": ["cl_sortkey"], + } diff --git a/test/test_getting_tables.py b/test/test_getting_tables.py new file mode 100644 index 00000000..ea776e13 --- /dev/null +++ b/test/test_getting_tables.py @@ -0,0 +1,404 @@ +from sql_metadata.parser import Parser + + +def test_simple_queries_tables(): + assert ["test_table"] == Parser("SELECT * FROM `test_table`").tables + + assert ["0001_test_table"] == Parser("SELECT * FROM `0001_test_table`").tables + + assert ["test_table"] == Parser("SELECT foo FROM `test_table`").tables + + assert ["s.t"] == Parser("SELECT * FROM s.t").tables + + assert ["db.test_table"] == Parser("SELECT foo FROM `db`.`test_table`").tables + + assert ["test_table"] == Parser("SELECT foo FROM test_table WHERE id = 1").tables + + assert ["test_table", "second_table"] == Parser( + "SELECT foo FROM test_table, second_table WHERE id = 1" + ).tables + + assert ["revision", "page", "wikicities_user"] == Parser( + "SELECT 
rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N" + ).tables + + assert ["events"] == Parser( + "SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' AND wiki_id = '1289985' GROUP BY date WITH ROLLUP" + ).tables + + +def test_complex_query_tables(): + # complex queries + # @see https://github.com/macbre/query-digest/issues/16 + assert ["report_wiki_recent_pageviews", "dimension_wikis"] == Parser( + "SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N" + ).tables + + assert ["dimension_wikis", "fact_wam_scores"] == Parser( + "SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC" + ).tables + + assert ["fact_wam_scores", "dimension_wikis"] == Parser( + "SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" + ).tables + + assert ["revision", "page", "wikicities_cN.user"] == Parser( + "SELECT 
rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_cN`.`user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N" + ).tables + + # complex queries, take two + # @see https://github.com/macbre/sql-metadata/issues/6 + assert ["foo_pageviews"] == Parser( + "SELECT 1 as c FROM foo_pageviews WHERE time_id = '2018-01-07 00:00:00' AND period_id = '2' LIMIT 1" + ).tables + + # table aliases + assert ["report_wiki_recent_pageviews", "dimension_wikis"] == Parser( + "SELECT r.wiki_id AS id, pageviews_7day AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = '1' AND r.lang IN ( 'en', 'ru' ) AND r.hub_name = 'gaming' ORDER BY pageviews DESC LIMIT 300" + ).tables + + # include multiple FROM tables when they prefixed + # @see https://github.com/macbre/sql-metadata/issues/38 + assert ["MYDB1.TABLE1", "MYDB2.TABLE2"] == Parser( + "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM MYDB1.TABLE1 AS A, MYDB2.TABLE2 AS B" + ).tables + + # test whitespaces in keywords + # @see https://github.com/macbre/sql-metadata/issues/80 + assert ( + ["tab", "tab2"] + == Parser( + """select a,b,c from tab full outer \r\n\t join tab2 on (col1 = col2) group + \r\n \t by a, b, c """ + ).tables + ) + + +def test_joins(): + # self joins + assert ["fact_wam_scores", "dimension_wikis"] == Parser( + "SELECT count(fw1.wiki_id) as wam_results_total FROM `fact_wam_scores` `fw1` left join `fact_wam_scores` `fw2` ON ((fw1.wiki_id = fw2.wiki_id) AND (fw2.time_id = FROM_UNIXTIME(1466380800))) left join `dimension_wikis` `dw` ON ((fw1.wiki_id = dw.wiki_id)) WHERE (fw1.time_id = FROM_UNIXTIME(1466467200)) AND (dw.url like '%%' OR dw.title like '%%') AND fw1.vertical_id IN 
('0','1','2','3','4','5','6','7') AND (fw1.wiki_id NOT IN ('23312','70256','168929','463633','381622','524772','476782','9764','214934','170145','529622','52149','96420','390','468156','690804','197434','29197','88043','37317','466775','402313','169142','746246','119847','57268','1089624')) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" + ).tables + + assert ["rollup_wiki_pageviews"] == Parser( + "SELECT date_format(time_id,'%Y-%m-%d') AS date, pageviews AS cnt FROM rollup_wiki_pageviews WHERE period_id = '2' AND wiki_id = '1676379' AND time_id BETWEEN '2018-01-08' AND '2018-01-01'" + ).tables + + # JOINs + assert ["product_a.users", "product_b.users"] == Parser( + "SELECT a.* FROM product_a.users AS a JOIN product_b.users AS b ON a.ip_address = b.ip_address" + ).tables + + assert ["redirect", "page"] == Parser( + "SELECT page_title FROM `redirect` INNER JOIN `page` " + "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))" + ).tables + + assert ["redirect", "page"] == Parser( + "SELECT page_title FROM `redirect` INNER JOIN `page` `foo` " + "ON (rd_title = 'foo' AND rd_namespace = '100' AND (foo.page_id = rd_from))" + ).tables + + # see #34 + assert ["foos", "bars"] == Parser( + "SELECT foo FROM `foos` JOIN `bars` ON (foos.id = bars.id)" + ).tables + + assert ["foos", "bars"] == Parser( + "SELECT foo FROM `foos` FULL JOIN `bars` ON (foos.id = bars.id)" + ).tables + + assert ["foos", "bars"] == Parser( + "SELECT foo FROM `foos` FULL OUTER JOIN `bars` ON (foos.id = bars.id)" + ).tables + + assert ["foos", "bars"] == Parser( + "SELECT foo FROM `foos` RIGHT OUTER JOIN `bars` ON (foos.id = bars.id)" + ).tables + + assert ["foos", "bars"] == Parser( + "SELECT foo FROM `foos` LEFT OUTER JOIN `bars` ON (foos.id = bars.id)" + ).tables + + +def test_quoted_names(): + # handle quoted names + assert ["MYDB.MYTABLE"] == Parser('SELECT COUNT(*) FROM "MYDB".MYTABLE').tables + + assert ["MYDB.MYTABLE"] == Parser('SELECT COUNT(*) FROM MYDB."MYTABLE"').tables 
+ + assert ["MYDB.MYTABLE"] == Parser('SELECT COUNT(*) FROM "MYDB"."MYTABLE"').tables + + assert ["MYDB.MYSCHEMA.MYTABLE"] == Parser( + 'SELECT COUNT(*) FROM "MYDB".MYSCHEMA.MYTABLE' + ).tables + + assert ["MYDB.MYSCHEMA.MYTABLE"] == Parser( + 'SELECT COUNT(*) FROM MYDB."MYSCHEMA".MYTABLE' + ).tables + + assert ["MYDB.MYSCHEMA.MYTABLE"] == Parser( + 'SELECT COUNT(*) FROM MYDB.MYSCHEMA."MYTABLE"' + ).tables + + assert ["MYDB.MYSCHEMA.MYTABLE"] == Parser( + 'SELECT COUNT(*) FROM "MYDB"."MYSCHEMA"."MYTABLE"' + ).tables + + +def test_update_and_replace(): + # UPDATE queries + assert ["page"] == Parser( + "UPDATE `page` SET page_touched = X WHERE page_id = X" + ).tables + + # REPLACE queries + assert ["page_props"] == Parser( + "REPLACE INTO `page_props` (pp_page,pp_propname,pp_value) VALUES ('47','infoboxes','')" + ).tables + + +def test_order_bys(): + # ORDER BY + assert ["bar"] == Parser("SELECT foo FROM bar ORDER BY id").tables + + assert ["bar"] == Parser("SELECT foo FROM bar WHERE id > 20 ORDER BY id").tables + + assert ["bar"] == Parser("SELECT foo FROM bar ORDER BY id DESC").tables + + assert ["bar"] == Parser("SELECT foo FROM bar ORDER BY id LIMIT 20").tables + + +def test_three_part_qualified_names(): + # database.schema.table formats + assert ["MYDB1.MYSCHEMA1.MYTABLE1"] == Parser( + "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1" + ).tables + + assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == Parser( + "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1 JOIN MYDB2.MYSCHEMA2.MYTABLE2" + ).tables + + assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == Parser( + "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1 INNER JOIN MYDB2.MYSCHEMA2.MYTABLE2" + ).tables + + assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == Parser( + "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1 A LEFT JOIN MYDB2.MYSCHEMA2.MYTABLE2 B ON A.COL = B.COL" + ).tables + + assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == Parser( + "SELECT * FROM 
MYDB1.MYSCHEMA1.MYTABLE1 INNER JOIN MYDB2.MYSCHEMA2.MYTABLE2" + ).tables + + +def test_insert_queries(): + # INSERT queries + assert ["0070_insert_ignore_table"] == Parser( + "INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');" + ).tables + + assert ["0070_insert_ignore_table"] == Parser( + "INSERT into `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');" + ).tables + + assert ["foo"] == Parser("INSERT INTO `foo` (id,text) VALUES (X,X)").tables + + assert ["page_vote"] == Parser( + "INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,time) VALUES ('442001','27574631','20180228130846')" + ).tables + + +def test_select_aliases(): + assert Parser("SELECT e.foo FROM bar AS e").tables == ["bar"] + assert Parser("SELECT e.foo FROM bar e").tables == ["bar"] + assert Parser("SELECT e.foo FROM (SELECT * FROM bar) AS e").tables == ["bar"] + assert Parser("SELECT e.foo FROM (SELECT * FROM bar) e").tables == ["bar"] + + +def test_table_name_with_group_by(): + expected_tables = ["SH.sales"] + + assert ( + Parser("SELECT s.cust_id,count(s.cust_id) FROM SH.sales s").tables + == expected_tables + ) + + assert ( + Parser( + "SELECT s.cust_id,count(s.cust_id) FROM SH.sales s GROUP BY s.cust_id" + ).tables + == expected_tables + ) + + assert ( + Parser( + """ + SELECT s.cust_id,count(s.cust_id) FROM SH.sales s + GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2' + """.strip() + ).tables + == expected_tables + ) + + +def test_datasets(): + # see https://github.com/macbre/sql-metadata/issues/38 + assert Parser( + "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM TABLE1 AS A, TABLE2 AS B" + ).tables == ["TABLE1", "TABLE2"] + + assert Parser( + "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM DATASET1.TABLE1, DATASET2.TABLE2" + ).tables == ["DATASET1.TABLE1", "DATASET2.TABLE2"] + + assert Parser( + "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM DATASET1.TABLE1 AS A, 
DATASET2.TABLE2 AS B" + ).tables == ["DATASET1.TABLE1", "DATASET2.TABLE2"] + + +def test_queries_with_distinct(): + assert Parser("SELECT DISTINCT DATA.ASSAY_ID FROM foo").tables == ["foo"] + + +def test_table_names_with_dashes(): + assert Parser("SELECT * FROM `schema-with-dash.tablename`").tables == [ + "schema-with-dash.tablename" + ] + + +def test_unions(): + # @see https://github.com/macbre/sql-metadata/issues/79 + assert ["tab1", "tab2"] == Parser( + "select col1, col2, col3 from tab1 union all select col4, col5, col6 from tab2" + ).tables + + # @see https://github.com/macbre/sql-metadata/issues/94 + assert ["d", "g"] == Parser( + "SELECT a,b,c FROM d UNION ALL SELECT e,f FROM g" + ).tables + + +def test_with_brackets(): + assert ( + ["database1.table1", "database2.table2"] + == Parser( + """ +SELECT + "xxxxx" +FROM + (database1.table1 alias +LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx")) +""" + ).tables + ) + + assert ( + ["inner_table"] + == Parser( + """ +SELECT + t.foo +FROM + (SELECT foo FROM inner_table + WHERE bar = '1') t +""" + ).tables + ) + + +def test_db2_query(): + query = """ + select ca.IDENTIFICATION_CODE identificationCode, +eo.KBO_NUMBER kboNumber, +eo.PARTY_NAME, +ca.total_guaranteed totale_borgtocht, +coalesce(sum(ae1.remainder),0) Saldo, +coalesce(sum(ae3.remainder),0) uitstel_van_betaling, +coalesce(sum(ae4.remainder),0) reservering_aangifte, +coalesce(sum(ae5.remainder),0) reservering_vergunning, +coalesce(sum(ae6.remainder),0) zekerheid_douanevervoer, +coalesce(sum(ae7.remainder),0) zekerheid_accijnsbeweging, +coalesce(sum(ae8.remainder),0) FRCT +from CUSTOMER_ACCOUNT ca +inner join economic_operator eo on eo.id = ca.economic_operator_id +join contact_details cd on cd.id = ca.contact_details_id +left join ( ca1_remainder_total_guaranteed crtg +inner join accounting_entity ae1 on ae1.id = crtg.accounting_entity_id) +on crtg.id = ca.ca1_id +left join (ca3_credit_account cca inner join accounting_entity ae3 on ae3.id = 
+cca.accounting_entity_id) on cca.id = ca.ca3_id +left join (ca4_reservations_declaration crd inner join accounting_entity ae4 on +ae4.id = crd.accounting_entity_id) on crd.id = ca.ca4_id +left join (ca5_reservations_permits crp inner join accounting_entity ae5 on ae5.id += crp.accounting_entity_id) on crp.id = ca.ca5_id +left join (CA6_GUARANTEE_CUSTOMS_TRANSPORT gct inner join accounting_entity ae6 on +ae6.id = gct.accounting_entity_id) on gct.id = ca.ca6_id +left join (CA7_GUARANTEE_EXCISE_PRODUCTS gep inner join accounting_entity ae7 on +ae7.id = gep.accounting_entity_id) on gep.id = ca.ca7_id +left join (ca8_frct cf inner join ca8_frct_per_discharge cfpd on cfpd.CA8_ID = +cf.id inner join accounting_entity ae8 on ae8.id = cfpd.accounting_entity_id) on +cf.id = ca.ca8_id +group by eo.PARTY_NAME,eo.KBO_NUMBER, ca.IDENTIFICATION_CODE, ca.total_guaranteed +order by eo.KBO_NUMBER, ca.IDENTIFICATION_CODE +with ur + """ + parser = Parser(query) + assert parser.tables == [ + "CUSTOMER_ACCOUNT", + "economic_operator", + "contact_details", + "ca1_remainder_total_guaranteed", + "accounting_entity", + "ca3_credit_account", + "ca4_reservations_declaration", + "ca5_reservations_permits", + "CA6_GUARANTEE_CUSTOMS_TRANSPORT", + "CA7_GUARANTEE_EXCISE_PRODUCTS", + "ca8_frct", + "ca8_frct_per_discharge", + ] + assert parser.columns == [ + "CUSTOMER_ACCOUNT.IDENTIFICATION_CODE", + "identificationCode", + "economic_operator.KBO_NUMBER", + "kboNumber", + "economic_operator.PARTY_NAME", + "CUSTOMER_ACCOUNT.total_guaranteed", + "totale_borgtocht", + "accounting_entity.remainder", + "Saldo", + "uitstel_van_betaling", + "reservering_aangifte", + "reservering_vergunning", + "zekerheid_douanevervoer", + "zekerheid_accijnsbeweging", + "FRCT", + "economic_operator.id", + "CUSTOMER_ACCOUNT.economic_operator_id", + "contact_details.id", + "CUSTOMER_ACCOUNT.contact_details_id", + "accounting_entity.id", + "ca1_remainder_total_guaranteed.accounting_entity_id", + 
"ca1_remainder_total_guaranteed.id", + "CUSTOMER_ACCOUNT.ca1_id", + "ca3_credit_account.accounting_entity_id", + "ca3_credit_account.id", + "CUSTOMER_ACCOUNT.ca3_id", + "ca4_reservations_declaration.accounting_entity_id", + "ca4_reservations_declaration.id", + "CUSTOMER_ACCOUNT.ca4_id", + "ca5_reservations_permits.accounting_entity_id", + "ca5_reservations_permits.id", + "CUSTOMER_ACCOUNT.ca5_id", + "CA6_GUARANTEE_CUSTOMS_TRANSPORT.accounting_entity_id", + "CA6_GUARANTEE_CUSTOMS_TRANSPORT.id", + "CUSTOMER_ACCOUNT.ca6_id", + "CA7_GUARANTEE_EXCISE_PRODUCTS.accounting_entity_id", + "CA7_GUARANTEE_EXCISE_PRODUCTS.id", + "CUSTOMER_ACCOUNT.ca7_id", + "ca8_frct_per_discharge.CA8_ID", + "ca8_frct.id", + "ca8_frct_per_discharge.accounting_entity_id", + "CUSTOMER_ACCOUNT.ca8_id", + ] diff --git a/test/test_hive.py b/test/test_hive.py index 03068528..94a0c8c2 100644 --- a/test/test_hive.py +++ b/test/test_hive.py @@ -1,26 +1,23 @@ """ Set of unit tests for handling of Apache Hive queries """ -import pytest -from sql_metadata import get_query_columns, get_query_tables +from sql_metadata.parser import Parser def test_insert_overwrite_table(): - assert ["foo_report"] == get_query_tables("INSERT TABLE foo_report") - assert ["foo_report"] == get_query_tables("INSERT OVERWRITE TABLE foo_report") - assert ["foo_report", "bar"] == get_query_tables( + assert ["foo_report"] == Parser("INSERT TABLE foo_report").tables + assert ["foo_report"] == Parser("INSERT OVERWRITE TABLE foo_report").tables + assert ["foo_report", "bar"] == Parser( "INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar" - ) + ).tables - assert ["foo"] == get_query_columns( + assert ["foo"] == Parser( "INSERT OVERWRITE TABLE foo_report SELECT foo FROM bar" - ) + ).columns def test_complex_hive_query(): - pytest.skip("Improve HIVE syntax handling with a new parser (#98)") - # https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML#LanguageManualDML-InsertingdataintoHiveTablesfromqueries dag = """ INSERT 
OVERWRITE TABLE foo_report @@ -46,4 +43,4 @@ def test_complex_hive_query(): "foo_report", "rollup_wiki_beacon_pageviews", "statsdb.dimension_wikis", - ] == get_query_tables(dag) + ] == Parser(dag).tables diff --git a/test/test_limit_and_offset.py b/test/test_limit_and_offset.py new file mode 100644 index 00000000..218218c5 --- /dev/null +++ b/test/test_limit_and_offset.py @@ -0,0 +1,46 @@ +from sql_metadata.parser import Parser + + +def test_no_limit_and_offset(): + assert Parser("SELECT foo_limit FROM bar_offset").limit_and_offset is None + assert ( + Parser("SELECT foo_limit FROM bar_offset /* limit 1000,50 */").limit_and_offset + is None + ) + + +def test_only_limit(): + assert Parser("SELECT foo_limit FROM bar_offset LIMIT 50").limit_and_offset == ( + 50, + 0, + ) + + +def test_limit_and_offset(): + assert Parser( + "SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 1000" + ).limit_and_offset == (50, 1000) + assert Parser( + "SELECT foo_limit FROM bar_offset Limit 50 offset 1000" + ).limit_and_offset == (50, 1000) + + +def test_comma_separated(): + assert Parser( + "SELECT foo_limit FROM bar_offset LIMIT 1000, 50" + ).limit_and_offset == (50, 1000) + parser = Parser("SELECT foo_limit FROM bar_offset LIMIT 1000,50") + assert parser.limit_and_offset == (50, 1000) + assert parser.limit_and_offset != (0, 1000) + + assert Parser( + "SELECT foo_limit FROM bar_offset limit 1000,50" + ).limit_and_offset == (50, 1000) + + assert Parser( + "SELECT /* CategoryPaginationViewer::processSection */ " + "page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` " + "INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) " + "WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' " + "ORDER BY cl_sortkey LIMIT 927600,200" + ).limit_and_offset == (200, 927600) diff --git a/test/test_mssql_server.py b/test/test_mssql_server.py new file mode 100644 index 00000000..baba0e1a --- /dev/null +++ b/test/test_mssql_server.py @@ -0,0 +1,92 @@ +from 
sql_metadata.parser import Parser + + +def test_sql_server_cte(): + """ + Tests support for SQL Server's common table expression (CTE). + + @see https://www.sqlservertutorial.net/sql-server-basics/sql-server-cte/ + """ + assert ( + Parser( + """ + WITH x AS ( + SELECT * FROM n + ) + SELECT + * + FROM x + JOIN y ON x.a = y.a + """.strip() + ).tables + == ["n", "y"] + ) + + assert ( + Parser( + """ + WITH x AS ( + SELECT * FROM n + ) + select + * + FROM x + JOIN y ON x.a = y.a + """.strip() + ).tables + == ["n", "y"] + ) + + assert ( + Parser( + """ + WITH foo AS ( + SELECT * FROM n + ) + update z from foo set z.q = foo.y + """.strip() + ).tables + == ["n", "z"] + ) + + assert ( + Parser( + """ + WITH foo AS ( + SELECT * FROM tab + ) + DELETE FROM z JOIN foo ON z.a = foo.a + """.strip() + ).tables + == ["tab", "z"] + ) + + +def test_sql_server_cte_sales_by_year(): + sales_query = """ +WITH cte_sales AS ( + SELECT + staff_id, + COUNT(*) order_count + FROM + sales.orders + WHERE + YEAR(order_date) = 2018 + GROUP BY + staff_id +) +SELECT + AVG(order_count) average_orders_by_staff +FROM + cte_sales; + """.strip() + + assert Parser(sales_query).tables == ["sales.orders"] + # TODO: Check if average_orders_by_staff should be included, + # if no why order_count is included - what is the rule here? 
+ assert Parser(sales_query).columns == [ + "staff_id", + "order_count", + "order_date", + "average_orders_by_staff", + ] diff --git a/test/test_multiple_subqueries.py b/test/test_multiple_subqueries.py new file mode 100644 index 00000000..fc3960c5 --- /dev/null +++ b/test/test_multiple_subqueries.py @@ -0,0 +1,273 @@ +from sql_metadata import Parser + + +def test_multiple_subqueries(): + query = """ +select main_qry.*, + subdays.DAYS_OFFER1, + subdays.DAYS_OFFER2, + subdays.DAYS_OFFER3 +from ( + select jr.id as PROJECT_ID, + 5 * (DATEDIFF(ifnull(lc.creation_date, now()), jr.creation_date) DIV 7) + + MID('0123444401233334012222340111123400001234000123440', + 7 * WEEKDAY(jr.creation_date) + WEEKDAY(ifnull(lc.creation_date, now())) + 1, 1) as LIFETIME, + count(distinct + case when jra.application_source = 'VERAMA' then jra.id else null end) NUM_APPLICATIONS, + count(distinct jra.id) NUM_CANDIDATES, + sum(case when jro.stage = 'DEAL' then 1 else 0 end) as NUM_CONTRACTED, + sum(ifnull(IS_INTERVIEW, 0)) as NUM_INTERVIEWED, + sum(ifnull(IS_PRESENTATION, 0)) as NUM_OFFERED + from job_request jr + left join job_request_application jra on jr.id = jra.job_request_id + left join job_request_offer jro on jro.job_request_application_id = jra.id + left join lifecycle lc on lc.object_id=jr.id and lc.lifecycle_object_type='JOB_REQUEST' + and lc.event = 'JOB_REQUEST_CLOSED' + left join (select jro2.job_request_application_id, + max(case + when jro2.first_interview_scheduled_date is not null then 1 + else 0 end) as IS_INTERVIEW, + max(case when jro2.first_presented_date is not null then 1 else 0 end) as IS_PRESENTATION + from job_request_offer jro2 + group by 1) jrah2 on jra.id = jrah2.job_request_application_id + left join client u on jr.client_id = u.id + where jr.from_point_break = 0 + and u.name not in ('Test', 'Demo Client') + group by 1, 2) main_qry + left join ( + select PROJECT_ID, + sum(case when RowNo = 1 then days_to_offer else null end) as DAYS_OFFER1, + sum(case when 
RowNo = 2 then days_to_offer else null end) as DAYS_OFFER2, + sum(case when RowNo = 3 then days_to_offer else null end) as DAYS_OFFER3 + from (select PROJECT_ID, + days_to_offer, + (select count(distinct jro.job_request_application_id) + from job_request_offer jro + left join job_request_application jra2 on jro.job_request_application_id = jra2.id + where jra2.job_request_id = PROJECT_ID + and jro.first_presented_date is not null + and jro.first_presented_date <= InitialChangeDate + ) as RowNo + from ( + select jr.id as PROJECT_ID, + 5 * (DATEDIFF(jro.first_presented_date, jr.creation_date) DIV 7) + + MID('0123444401233334012222340111123400001234000123440', + 7 * WEEKDAY(jr.creation_date) + WEEKDAY(jro.first_presented_date) + 1, + 1) as days_to_offer, + jro.job_request_application_id, + jro.first_presented_date as InitialChangeDate + from presentation pr + left join presentation_job_request_offer pjro on pr.id = pjro.presentation_id + left join job_request_offer jro on pjro.job_request_offer_id = jro.id + left join job_request jr on pr.job_request_id = jr.id + where jro.first_presented_date is not null) days_sqry) days_final_qry + group by PROJECT_ID) subdays + on subdays.PROJECT_ID = main_qry.PROJECT_ID +""" + parser = Parser(query) + assert parser.subqueries_names == [ + "jrah2", + "main_qry", + "days_sqry", + "days_final_qry", + "subdays", + ] + assert parser.columns == [ + "main_qry.*", + "subdays.DAYS_OFFER1", # subquery nested resolve? + "subdays.DAYS_OFFER2", # subquery nested resolve? + "subdays.DAYS_OFFER3", # subquery nested resolve? 
+ "job_request.id", + "lifecycle.creation_date", + "job_request.creation_date", + "job_request_application.application_source", + "job_request_application.job_request_id", + "job_request_offer.job_request_application_id", + "job_request_application.id", + "lifecycle.object_id", + "lifecycle.lifecycle_object_type", + "lifecycle.event", + "job_request_offer.first_interview_scheduled_date", + "jrah2.job_request_application_id", # subquery nested resolve? + "job_request.client_id", + "client.id", + "job_request.from_point_break", + "client.name", + "PROJECT_ID", # recursive search? + "RowNo", # subquery name? + "days_to_offer", # should be resoled? + "job_request_offer.first_presented_date", + "InitialChangeDate", # alias of other column + "presentation.id", + "presentation_job_request_offer.presentation_id", + "presentation_job_request_offer.job_request_offer_id", + "job_request_offer.id", + "presentation.job_request_id", + "subdays.PROJECT_ID", # subquery nested resolve? + "main_qry.PROJECT_ID", # subquery nested resolve? + ] + assert parser.columns_without_subqueries == [ + "job_request.id", + "lifecycle.creation_date", + "job_request.creation_date", + "job_request_application.application_source", + "job_request_application.job_request_id", + "job_request_offer.job_request_application_id", + "job_request_application.id", + "lifecycle.object_id", + "lifecycle.lifecycle_object_type", + "lifecycle.event", + "job_request_offer.first_interview_scheduled_date", + "job_request.client_id", + "client.id", + "job_request.from_point_break", + "client.name", + "PROJECT_ID", # recursive search? + "RowNo", # subquery name? + "days_to_offer", # should be resoled? 
+ "job_request_offer.first_presented_date", + "InitialChangeDate", # alias of other column + "presentation.id", + "presentation_job_request_offer.presentation_id", + "presentation_job_request_offer.job_request_offer_id", + "job_request_offer.id", + "presentation.job_request_id", + ] + assert parser.subqueries == { + "days_final_qry": "select PROJECT_ID, days_to_offer, (select count(distinct " + "jro.job_request_application_id) from job_request_offer jro " + "left join job_request_application jra2 on " + "jro.job_request_application_id = jra2.id where " + "jra2.job_request_id = PROJECT_ID and " + "jro.first_presented_date is not null and " + "jro.first_presented_date <= InitialChangeDate) as RowNo " + "from (select jr.id as PROJECT_ID, 5 * " + "(DATEDIFF(jro.first_presented_date, jr.creation_date) DIV " + "7) + " + "MID('0123444401233334012222340111123400001234000123440', 7 " + "* WEEKDAY(jr.creation_date) + " + "WEEKDAY(jro.first_presented_date) + 1, 1) as " + "days_to_offer, jro.job_request_application_id, " + "jro.first_presented_date as InitialChangeDate from " + "presentation pr left join presentation_job_request_offer " + "pjro on pr.id = pjro.presentation_id left join " + "job_request_offer jro on pjro.job_request_offer_id = " + "jro.id left join job_request jr on pr.job_request_id = " + "jr.id where jro.first_presented_date is not null) " + "days_sqry", + "days_sqry": "select jr.id as PROJECT_ID, 5 * " + "(DATEDIFF(jro.first_presented_date, jr.creation_date) DIV 7) + " + "MID('0123444401233334012222340111123400001234000123440', 7 * " + "WEEKDAY(jr.creation_date) + WEEKDAY(jro.first_presented_date) + " + "1, 1) as days_to_offer, jro.job_request_application_id, " + "jro.first_presented_date as InitialChangeDate from presentation " + "pr left join presentation_job_request_offer pjro on pr.id = " + "pjro.presentation_id left join job_request_offer jro on " + "pjro.job_request_offer_id = jro.id left join job_request jr on " + "pr.job_request_id = jr.id where 
jro.first_presented_date is not " + "null", + "jrah2": "select jro2.job_request_application_id, max(case when " + "jro2.first_interview_scheduled_date is not null then 1 else 0 end) " + "as IS_INTERVIEW, max(case when jro2.first_presented_date is not " + "null then 1 else 0 end) as IS_PRESENTATION from job_request_offer " + "jro2 group by 1", + "main_qry": "select jr.id as PROJECT_ID, 5 * " + "(DATEDIFF(ifnull(lc.creation_date, now()), jr.creation_date) DIV " + "7) + MID('0123444401233334012222340111123400001234000123440', 7 " + "* WEEKDAY(jr.creation_date) + WEEKDAY(ifnull(lc.creation_date, " + "now())) + 1, 1) as LIFETIME, count(distinct case when " + "jra.application_source = 'VERAMA' then jra.id else null end) " + "NUM_APPLICATIONS, count(distinct jra.id) NUM_CANDIDATES, " + "sum(case when jro.stage = 'DEAL' then 1 else 0 end) as " + "NUM_CONTRACTED, sum(ifnull(IS_INTERVIEW, 0)) as NUM_INTERVIEWED, " + "sum(ifnull(IS_PRESENTATION, 0)) as NUM_OFFERED from job_request " + "jr left join job_request_application jra on jr.id = " + "jra.job_request_id left join job_request_offer jro on " + "jro.job_request_application_id = jra.id left join lifecycle lc " + "on lc.object_id = jr.id and lc.lifecycle_object_type = " + "'JOB_REQUEST' and lc.event = 'JOB_REQUEST_CLOSED' left join " + "(select jro2.job_request_application_id, max(case when " + "jro2.first_interview_scheduled_date is not null then 1 else 0 " + "end) as IS_INTERVIEW, max(case when jro2.first_presented_date is " + "not null then 1 else 0 end) as IS_PRESENTATION from " + "job_request_offer jro2 group by 1) jrah2 on jra.id = " + "jrah2.job_request_application_id left join client u on " + "jr.client_id = u.id where jr.from_point_break = 0 and u.name not " + "in ('Test', 'Demo Client') group by 1, 2", + "subdays": "select PROJECT_ID, sum(case when RowNo = 1 then days_to_offer " + "else null end) as DAYS_OFFER1, sum(case when RowNo = 2 then " + "days_to_offer else null end) as DAYS_OFFER2, sum(case when RowNo " + 
"= 3 then days_to_offer else null end) as DAYS_OFFER3 from (select " + "PROJECT_ID, days_to_offer, (select count(distinct " + "jro.job_request_application_id) from job_request_offer jro left " + "join job_request_application jra2 on " + "jro.job_request_application_id = jra2.id where " + "jra2.job_request_id = PROJECT_ID and jro.first_presented_date is " + "not null and jro.first_presented_date <= InitialChangeDate) as " + "RowNo from (select jr.id as PROJECT_ID, 5 * " + "(DATEDIFF(jro.first_presented_date, jr.creation_date) DIV 7) + " + "MID('0123444401233334012222340111123400001234000123440', 7 * " + "WEEKDAY(jr.creation_date) + WEEKDAY(jro.first_presented_date) + " + "1, 1) as days_to_offer, jro.job_request_application_id, " + "jro.first_presented_date as InitialChangeDate from presentation " + "pr left join presentation_job_request_offer pjro on pr.id = " + "pjro.presentation_id left join job_request_offer jro on " + "pjro.job_request_offer_id = jro.id left join job_request jr on " + "pr.job_request_id = jr.id where jro.first_presented_date is not " + "null) days_sqry) days_final_qry group by PROJECT_ID", + } + + +def test_multiline_queries(): + query = """ +SELECT +COUNT(1) +FROM +(SELECT +std.task_id as new_task_id +FROM +some_task_detail std +WHERE +std.STATUS = 1 +) a +JOIN ( +SELECT +st.task_id +FROM +some_task st +WHERE +task_type_id = 80 +) as b ON a.new_task_id = b.task_id; + """.strip() + + parser = Parser(query) + assert parser.subqueries_names == ["a", "b"] + assert parser.tables == ["some_task_detail", "some_task"] + assert parser.columns == [ + "some_task_detail.task_id", + "some_task_detail.STATUS", + "some_task.task_id", + "task_type_id", + "a.new_task_id", + "b.task_id", + ] + assert parser.columns_without_subqueries == [ + "some_task_detail.task_id", + "some_task_detail.STATUS", + "some_task.task_id", + "task_type_id", + ] + assert parser.columns_dict == { + "join": ["a.new_task_id", "b.task_id"], + "select": ["some_task_detail.task_id", 
"some_task.task_id"], + "where": ["some_task_detail.STATUS", "task_type_id"], + } + + assert parser.subqueries == { + "a": "SELECT std.task_id as new_task_id FROM some_task_detail std WHERE std.STATUS = 1", + "b": "SELECT st.task_id FROM some_task st WHERE task_type_id = 80", + } + + parser2 = Parser(parser.subqueries["a"]) + assert parser2.tables == ["some_task_detail"] + assert parser2.columns == ["some_task_detail.task_id", "some_task_detail.STATUS"] diff --git a/test/test_normalization.py b/test/test_normalization.py index 47be080d..9767a2da 100644 --- a/test/test_normalization.py +++ b/test/test_normalization.py @@ -1,79 +1,79 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from sql_metadata import generalize_sql, remove_comments_from_sql +from sql_metadata.parser import Parser -def test_generalize_sql(): - assert generalize_sql(None) is None +def test_generalization_of_sql(): + assert Parser().generalize == "" assert ( - remove_comments_from_sql("SELECT /* Test */ foo FROM BAR") + Parser("SELECT /* Test */ foo FROM BAR").without_comments == "SELECT foo FROM BAR" ) assert ( - generalize_sql( + Parser( "UPDATE `category` SET cat_pages = cat_pages + 1,cat_files = cat_files + 1 WHERE cat_title = 'foo'" - ) + ).generalize == "UPDATE `category` SET cat_pages = cat_pages + N,cat_files = cat_files + N WHERE cat_title = X" ) assert ( - generalize_sql( + Parser( "SELECT entity_key FROM `wall_notification_queue` WHERE (wiki_id = ) AND (event_date > '20150105141012')" - ) + ).generalize == "SELECT entity_key FROM `wall_notification_queue` WHERE (wiki_id = ) AND (event_date > X)" ) assert ( - generalize_sql( + Parser( "UPDATE `user` SET user_touched = '20150112143631' WHERE user_id = '25239755'" - ) + ).generalize == "UPDATE `user` SET user_touched = X WHERE user_id = X" ) assert ( - generalize_sql( + Parser( "SELECT /* CategoryDataService::getMostVisited 207.46.13.56 */ page_id,cl_to FROM `page` INNER JOIN `categorylinks` ON ((cl_from = page_id)) 
WHERE cl_to = 'Characters' AND (page_namespace NOT IN(500,6,14)) ORDER BY page_title" - ) + ).generalize == "SELECT page_id,cl_to FROM `page` INNER JOIN `categorylinks` ON ((cl_from = page_id)) WHERE cl_to = X AND (page_namespace NOT IN (XYZ)) ORDER BY page_title" ) assert ( - generalize_sql( + Parser( "SELECT /* ArticleCommentList::getCommentList Dancin'NoViolen... */ page_id,page_title FROM `page` WHERE (page_title LIKE 'Dreams\\_Come\\_True/@comment-%' ) AND page_namespace = '1' ORDER BY page_id DESC" - ) + ).generalize == "SELECT page_id,page_title FROM `page` WHERE (page_title LIKE X ) AND page_namespace = X ORDER BY page_id DESC" ) assert ( - generalize_sql( + Parser( "delete /* DatabaseBase::sourceFile( /usr/wikia/slot1/3690/src/maintenance/cleanupStarter.sql ) CreateWiki scri... */ from text where old_id not in (select rev_text_id from revision)" - ) + ).generalize == "delete from text where old_id not in (select rev_text_id from revision)" ) assert ( - generalize_sql( + Parser( "SELECT /* WallNotifications::getBackupData Craftindiedo */ id,is_read,is_reply,unique_id,entity_key,author_id,notifyeveryone FROM `wall_notification` WHERE user_id = '24944488' AND wiki_id = '1030786' AND unique_id IN ('880987','882618','708228','522330','662055','837815','792393','341504','600103','612640','667267','482428','600389','213400','620177','164442','659210','621286','609757','575865','567668','398132','549770','495396','344814','421448','400650','411028','341771','379461','332587','314176','284499','250207','231714') AND is_hidden = '0' ORDER BY id" - ) + ).generalize == "SELECT id,is_read,is_reply,unique_id,entity_key,author_id,notifyeveryone FROM `wall_notification` WHERE user_id = X AND wiki_id = X AND unique_id IN (XYZ) AND is_hidden = X ORDER BY id" ) # comments with * inside assert ( - generalize_sql( + Parser( "SELECT /* ArticleCommentList::getCommentList *Crashie* */ page_id,page_title FROM `page` WHERE (page_title LIKE 'Dainava/@comment-%' ) AND page_namespace 
= '1201' ORDER BY page_id DESC" - ) + ).generalize == "SELECT page_id,page_title FROM `page` WHERE (page_title LIKE X ) AND page_namespace = X ORDER BY page_id DESC" ) # comments with * inside assert ( - generalize_sql( + Parser( "SELECT /* ListusersData::loadData Lart96 - 413bc6e5-b151-44fd-80bd-3baff733fb91 */ count(0) as cnt FROM `events_local_users` WHERE wiki_id = '7467' AND (user_name != '') AND user_is_closed = '0' AND ( single_group = 'poweruser' or all_groups = '' or all_groups LIKE '%bot' or all_groups LIKE '%bot;%' or all_groups LIKE '%bureaucrat' or all_groups LIKE '%bureaucrat;%' or all_groups LIKE '%sysop' or all_groups LIKE '%sysop;%' or all_groups LIKE '%authenticated' or all_groups LIKE '%authenticated;%' or all_groups LIKE '%bot-global' or all_groups LIKE '%bot-global;%' or all_groups LIKE '%content-reviewer' or all_groups LIKE '%content-reviewer;%' or all_groups LIKE '%council' or all_groups LIKE '%council;%' or all_groups LIKE '%fandom-editor' or all_groups LIKE '%fandom-editor;%' or all_groups LIKE '%helper' or all_groups LIKE '%helper;%' or all_groups LIKE '%restricted-login' or all_groups LIKE '%restricted-login;%' or all_groups LIKE '%restricted-login-exempt' or all_groups LIKE '%restricted-login-exempt;%' or all_groups LIKE '%reviewer' or all_groups LIKE '%reviewer;%' or all_groups LIKE '%staff' or all_groups LIKE '%staff;%' or all_groups LIKE '%translator' or all_groups LIKE '%translator;%' or all_groups LIKE '%util' or all_groups LIKE '%util;%' or all_groups LIKE '%vanguard' or all_groups LIKE '%vanguard;%' or all_groups LIKE '%voldev' or all_groups LIKE '%voldev;%' or all_groups LIKE '%vstf' or all_groups LIKE '%vstf;%' ) AND ( edits >= 5) LIMIT 1 " - ) + ).generalize == "SELECT count(N) as cnt FROM `events_local_users` WHERE wiki_id = X AND (user_name != X) AND user_is_closed = X AND ( single_group = X or all_groups = X or all_groups LIKE X ... 
) AND ( edits >= N) LIMIT N" ) @@ -86,55 +86,55 @@ def test_generalize_sql(): """ assert ( - generalize_sql(sql) + Parser(sql).generalize == "SELECT page_title FROM page WHERE page_namespace = X AND page_title COLLATE LATINN_GENERAL_CI LIKE X" ) # queries with IN + brackets (#21) assert ( - generalize_sql("SELECT foo FROM bar WHERE id IN (123,456, 789)") + Parser("SELECT foo FROM bar WHERE id IN (123,456, 789)").generalize == "SELECT foo FROM bar WHERE id IN (XYZ)" ) assert ( - generalize_sql("SELECT foo FROM bar WHERE id in ( 123, 456, 789 )") + Parser("SELECT foo FROM bar WHERE id in ( 123, 456, 789 )").generalize == "SELECT foo FROM bar WHERE id in (XYZ)" ) assert ( - generalize_sql( + Parser( "SELECT foo FROM bar WHERE slug in ( 'american-horror-story', 'animated-series', 'batman', 'comics', 'dc', 'fallout', 'game-of-thrones', 'hbo', 'horror', 'marvel', 'mcu', 'movie-reviews', 'movie-trailers', 'movies', 'netflix', 'playstation', 'star-wars', 'stranger-things', 'streaming', 'the-simpsons', 'zelda' )" - ) + ).generalize == "SELECT foo FROM bar WHERE slug in (XYZ)" ) assert ( - generalize_sql( + Parser( "select curation_cms.topics.slug from curation_cms.topics where curation_cms.topics.id in ( 87, 86, 79, 77, 76, 73, 72, 70, 71, 69, 68, 66, 65, 64, 62, 63, 2, 57, 17, 1, 22, 49, 30, 55, 15, 3, 48, 43, 24, 47, 45, 10, 50, 39, 36, 8, 34, 25, 13, 6, 4 )" - ) + ).generalize == "select curation_cms.topics.slug from curation_cms.topics where curation_cms.topics.id in (XYZ)" ) def test_generalize_timestamp(): assert ( - generalize_sql( + Parser( # ODBC syntax - https://dev.mysql.com/doc/refman/5.7/en/date-and-time-literals.html "SELECT foo FROM bar WHERE publish_date < {ts '2018-04-05 10:14:33.824'}" - ) + ).generalize == "SELECT foo FROM bar WHERE publish_date < {ts X}" ) def test_generalize_insert(): assert ( - generalize_sql("INSERT INTO bar (foo, test) Values ( 123, 456, 789 )") + Parser("INSERT INTO bar (foo, test) Values ( 123, 456, 789 )").generalize == "INSERT 
INTO bar (foo, test) Values (XYZ)" ) assert ( - generalize_sql( + Parser( "/* 7e6384e5 */ insert into notification_stats.request_info ( type, request_id, title, message, details ) values ( 'action-notification', '51f8a962-bae0-4d25-9341-130658161541', 'RickSanchez15 replied to What''s your overall favourite Season of South Park?.', 'Cool', 'null' )" - ) + ).generalize == "insert into notification_stats.request_info ( type, request_id, title, message, details ) values (XYZ)" ) diff --git a/test/test_postgress.py b/test/test_postgress.py new file mode 100644 index 00000000..5cb7d1e8 --- /dev/null +++ b/test/test_postgress.py @@ -0,0 +1,33 @@ +from sql_metadata import Parser + + +def test_postgress_quoted_names(): + # https://github.com/macbre/sql-metadata/issues/85 + parser = Parser( + 'INSERT INTO "test" ("name") VALUES (\'foo\') RETURNING "test"."id"' + ) + assert ["test"] == parser.tables + assert ["name"] == parser.columns + assert {"insert": ["name"]} == parser.columns_dict + assert "INSERT INTO test (name) VALUES (X) RETURNING test.id" == parser.generalize + assert parser.values == ["foo"] + + parser = Parser( + 'SELECT "test"."id", "test"."name" FROM "test" WHERE "test"."name" = \'foo\' LIMIT 21 FOR UPDATE' + ) + assert ["test"] == parser.tables + assert ["test.id", "test.name"] == parser.columns + assert { + "select": ["test.id", "test.name"], + "where": ["test.name"], + } == parser.columns_dict + assert ( + "SELECT test.id, test.name FROM test WHERE test.name = X LIMIT N FOR UPDATE" + == parser.generalize + ) + + parser = Parser('UPDATE "test" SET "name" = \'bar\' WHERE "test"."id" = 1') + assert ["test"] == parser.tables + assert ["name", "test.id"] == parser.columns + assert {"update": ["name"], "where": ["test.id"]} == parser.columns_dict + assert "UPDATE test SET name = X WHERE test.id = N" == parser.generalize diff --git a/test/test_query.py b/test/test_query.py index 980bbb3f..eb28b4a4 100644 --- a/test/test_query.py +++ b/test/test_query.py @@ -1,283 
+1,60 @@ -import pytest - -from sql_metadata import ( - preprocess_query, - get_query_tokens, - get_query_columns, - get_query_tables, - get_query_limit_and_offset, -) - -from sqlparse.tokens import DML, Keyword +from sql_metadata.parser import Parser def test_get_query_tokens(): - assert get_query_tokens("") == [] + assert Parser("").tokens == [] - tokens = get_query_tokens("SELECT * FROM foo") + tokens = Parser("SELECT * FROM foo").tokens assert len(tokens) == 4 - - assert tokens[0].ttype is DML assert str(tokens[0]) == "SELECT" - assert tokens[2].ttype is Keyword + assert tokens[1].is_wildcard + assert tokens[2].is_keyword assert str(tokens[2]) == "FROM" -def test_preprocess_query(): - assert ( - preprocess_query( - "SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC" - ) - == "SELECT DISTINCT dw.lang FROM `dimension_wikis` INNER JOIN `fact_wam_scores` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC" - ) - - assert ( - preprocess_query( - "SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" - ) - == "SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` left join `fact_wam_scores` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND 
dw.title IS NOT NULL))" - ) - +def test_preprocessing(): # normalize database selector - assert preprocess_query("SELECT foo FROM `db`.`test`") == "SELECT foo FROM db.test" + assert Parser("SELECT foo FROM `db`.`test`").query == "SELECT foo FROM db.test" assert ( - preprocess_query( + Parser( "SELECT r1.wiki_id AS id FROM report_wiki_recent_pageviews AS r1 INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id" - ) + ).query == "SELECT r1.wiki_id AS id FROM report_wiki_recent_pageviews AS r1 INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id" ) # normalize newlines assert ( - preprocess_query("SELECT foo,\nid\nFROM `db`.`test`") + Parser("SELECT foo,\nid\nFROM `db`.`test`").query == "SELECT foo, id FROM db.test" ) - -def test_get_query_tables(): - assert ["test_table"] == get_query_tables("SELECT * FROM `test_table`") - - assert ["0001_test_table"] == get_query_tables("SELECT * FROM `0001_test_table`") - - assert ["test_table"] == get_query_tables("SELECT foo FROM `test_table`") - - assert ["s.t"] == get_query_tables("SELECT * FROM s.t") - - assert ["db.test_table"] == get_query_tables("SELECT foo FROM `db`.`test_table`") - - assert ["test_table"] == get_query_tables("SELECT foo FROM test_table WHERE id = 1") - - assert ["test_table", "second_table"] == get_query_tables( - "SELECT foo FROM test_table, second_table WHERE id = 1" - ) - - assert ["revision", "page", "wikicities_user"] == get_query_tables( - "SELECT rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N" - ) - - assert ["events"] == get_query_tables( - "SELECT COUNT( 0 ) AS cnt, date_format(event_date, '%Y-%m-%d') AS date FROM events WHERE event_date BETWEEN '2017-10-18 00:00:00' AND '2017-10-24 23:59:59' 
AND wiki_id = '1289985' GROUP BY date WITH ROLLUP" - ) - - # complex queries - # @see https://github.com/macbre/query-digest/issues/16 - assert ["report_wiki_recent_pageviews", "dimension_wikis"] == get_query_tables( - "SELECT r.wiki_id AS id, pageviews_Nday AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = X AND r.lang = X AND r.hub_name = X ORDER BY pageviews DESC LIMIT N" - ) - - assert ["dimension_wikis", "fact_wam_scores"] == get_query_tables( - "SELECT DISTINCT dw.lang FROM `dimension_wikis` `dw` INNER JOIN `fact_wam_scores` `fwN` ON ((dw.wiki_id = fwN.wiki_id)) WHERE fwN.time_id = FROM_UNIXTIME(N) ORDER BY dw.lang ASC" - ) - - assert ["fact_wam_scores", "dimension_wikis"] == get_query_tables( - "SELECT count(fwN.wiki_id) as wam_results_total FROM `fact_wam_scores` `fwN` left join `fact_wam_scores` `fwN` ON ((fwN.wiki_id = fwN.wiki_id) AND (fwN.time_id = FROM_UNIXTIME(N))) left join `dimension_wikis` `dw` ON ((fwN.wiki_id = dw.wiki_id)) WHERE (fwN.time_id = FROM_UNIXTIME(N)) AND (dw.url like X OR dw.title like X) AND fwN.vertical_id IN (XYZ) AND dw.lang = X AND (fwN.wiki_id NOT IN (XYZ)) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" - ) - - assert ["revision", "page", "wikicities_cN.user"] == get_query_tables( - "SELECT rev_id,rev_page,rev_text_id,rev_timestamp,rev_comment,rev_user_text,rev_user,rev_minor_edit,rev_deleted,rev_len,rev_parent_id,rev_shaN,page_namespace,page_title,page_id,page_latest,user_name FROM `revision` INNER JOIN `page` ON ((page_id = rev_page)) LEFT JOIN `wikicities_cN`.`user` ON ((rev_user != N) AND (user_id = rev_user)) WHERE rev_id = X LIMIT N" - ) - - # complex queries, take two - # @see https://github.com/macbre/sql-metadata/issues/6 - assert ["foo_pageviews"] == get_query_tables( - "SELECT 1 as c FROM foo_pageviews WHERE time_id = '2018-01-07 00:00:00' AND period_id = '2' LIMIT 1" - ) - - # table aliases - assert ["report_wiki_recent_pageviews", 
"dimension_wikis"] == get_query_tables( - "SELECT r.wiki_id AS id, pageviews_7day AS pageviews FROM report_wiki_recent_pageviews AS r INNER JOIN dimension_wikis AS d ON r.wiki_id = d.wiki_id WHERE d.public = '1' AND r.lang IN ( 'en', 'ru' ) AND r.hub_name = 'gaming' ORDER BY pageviews DESC LIMIT 300" - ) - - # self joins - assert ["fact_wam_scores", "dimension_wikis"] == get_query_tables( - "SELECT count(fw1.wiki_id) as wam_results_total FROM `fact_wam_scores` `fw1` left join `fact_wam_scores` `fw2` ON ((fw1.wiki_id = fw2.wiki_id) AND (fw2.time_id = FROM_UNIXTIME(1466380800))) left join `dimension_wikis` `dw` ON ((fw1.wiki_id = dw.wiki_id)) WHERE (fw1.time_id = FROM_UNIXTIME(1466467200)) AND (dw.url like '%%' OR dw.title like '%%') AND fw1.vertical_id IN ('0','1','2','3','4','5','6','7') AND (fw1.wiki_id NOT IN ('23312','70256','168929','463633','381622','524772','476782','9764','214934','170145','529622','52149','96420','390','468156','690804','197434','29197','88043','37317','466775','402313','169142','746246','119847','57268','1089624')) AND ((dw.url IS NOT NULL AND dw.title IS NOT NULL))" - ) - - assert ["rollup_wiki_pageviews"] == get_query_tables( - "SELECT date_format(time_id,'%Y-%m-%d') AS date, pageviews AS cnt FROM rollup_wiki_pageviews WHERE period_id = '2' AND wiki_id = '1676379' AND time_id BETWEEN '2018-01-08' AND '2018-01-01'" - ) - - # INSERT queries - assert ["0070_insert_ignore_table"] == get_query_tables( - "INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');" - ) - - assert ["0070_insert_ignore_table"] == get_query_tables( - "INSERT into `0070_insert_ignore_table` VALUES (9, '123', '2017-01-01');" - ) - - assert ["foo"] == get_query_tables("INSERT INTO `foo` (id,text) VALUES (X,X)") - - assert ["page_vote"] == get_query_tables( - "INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,time) VALUES ('442001','27574631','20180228130846')" - ) - - # UPDATE queries - assert ["page"] == get_query_tables( 
- "UPDATE `page` SET page_touched = X WHERE page_id = X" - ) - - # ORDER BY - assert ["bar"] == get_query_tables("SELECT foo FROM bar ORDER BY id") - - assert ["bar"] == get_query_tables("SELECT foo FROM bar WHERE id > 20 ORDER BY id") - - assert ["bar"] == get_query_tables("SELECT foo FROM bar ORDER BY id DESC") - - assert ["bar"] == get_query_tables("SELECT foo FROM bar ORDER BY id LIMIT 20") - - # REPLACE queries - assert ["page_props"] == get_query_tables( - "REPLACE INTO `page_props` (pp_page,pp_propname,pp_value) VALUES ('47','infoboxes','')" - ) - - # JOINs - assert ["product_a.users", "product_b.users"] == get_query_tables( - "SELECT a.* FROM product_a.users AS a JOIN product_b.users AS b ON a.ip_address = b.ip_address" - ) - - # database.schema.table formats - assert ["MYDB1.MYSCHEMA1.MYTABLE1"] == get_query_tables( - "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1" - ) - - assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == get_query_tables( - "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1 JOIN MYDB2.MYSCHEMA2.MYTABLE2" - ) - - assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == get_query_tables( - "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1 INNER JOIN MYDB2.MYSCHEMA2.MYTABLE2" - ) - - assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == get_query_tables( - "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1 A LEFT JOIN MYDB2.MYSCHEMA2.MYTABLE2 B ON A.COL = B.COL" - ) - - assert ["MYDB1.MYSCHEMA1.MYTABLE1", "MYDB2.MYSCHEMA2.MYTABLE2"] == get_query_tables( - "SELECT * FROM MYDB1.MYSCHEMA1.MYTABLE1 INNER JOIN MYDB2.MYSCHEMA2.MYTABLE2" - ) - - # handle quoted names - assert ["MYDB.MYTABLE"] == get_query_tables('SELECT COUNT(*) FROM "MYDB".MYTABLE') - - assert ["MYDB.MYTABLE"] == get_query_tables('SELECT COUNT(*) FROM MYDB."MYTABLE"') - - assert ["MYDB.MYTABLE"] == get_query_tables('SELECT COUNT(*) FROM "MYDB"."MYTABLE"') - - assert ["MYDB.MYSCHEMA.MYTABLE"] == get_query_tables( - 'SELECT COUNT(*) FROM "MYDB".MYSCHEMA.MYTABLE' - ) - - assert 
["MYDB.MYSCHEMA.MYTABLE"] == get_query_tables( - 'SELECT COUNT(*) FROM MYDB."MYSCHEMA".MYTABLE' - ) - - assert ["MYDB.MYSCHEMA.MYTABLE"] == get_query_tables( - 'SELECT COUNT(*) FROM MYDB.MYSCHEMA."MYTABLE"' - ) - - assert ["MYDB.MYSCHEMA.MYTABLE"] == get_query_tables( - 'SELECT COUNT(*) FROM "MYDB"."MYSCHEMA"."MYTABLE"' - ) - - # include multiple FROM tables when they prefixed - # @see https://github.com/macbre/sql-metadata/issues/38 - assert ["MYDB1.TABLE1", "MYDB2.TABLE2"] == get_query_tables( - "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM MYDB1.TABLE1 AS A, MYDB2.TABLE2 AS B" - ) - - # test whitespaces in keywords - # @see https://github.com/macbre/sql-metadata/issues/80 - assert ["tab", "tab2"] == get_query_tables( - """select a,b,c from tab full outer \r\n\t join tab2 on (col1 = col2) group - \r\n \t by a, b, c """ + # comments are kept + assert ( + Parser("SELECT /*my random comment*/ foo, id FROM `db`.`test`").query + == "SELECT /*my random comment*/ foo, id FROM db.test" ) def test_case_insensitive(): # case-insensitive handling # https://github.com/macbre/sql-metadata/issues/71 - assert ["abc.foo", "foo", "bar"] == get_query_tables( + assert ["abc.foo", "foo", "bar"] == Parser( "create table abc.foo as SELECT pqr.foo1 , ab.foo2 FROM foo pqr, bar ab" - ) + ).tables - assert ["abc.foo", "foo", "bar"] == get_query_tables( + assert ["abc.foo", "foo", "bar"] == Parser( "create table abc.foo as select pqr.foo1 , ab.foo2 FROM foo pqr, bar ab" - ) + ).tables - assert ["pqr.foo1", "ab.foo2"] == get_query_columns( + assert ["foo.foo1", "bar.foo2"] == Parser( "create table abc.foo as SELECT pqr.foo1 , ab.foo2 FROM foo pqr, bar ab" - ) + ).columns - assert ["pqr.foo1", "ab.foo2"] == get_query_columns( + assert ["foo.foo1", "bar.foo2"] == Parser( "create table abc.foo as select pqr.foo1 , ab.foo2 FROM foo pqr, bar ab" - ) - - -def test_joins(): - assert ["redirect", "page"] == get_query_tables( - "SELECT page_title FROM `redirect` INNER JOIN `page` " - 
"ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))" - ) - - assert ["redirect", "page"] == get_query_tables( - "SELECT page_title FROM `redirect` INNER JOIN `page` `foo` " - "ON (rd_title = 'foo' AND rd_namespace = '100' AND (foo.page_id = rd_from))" - ) - - assert [ - "page_title", - "rd_title", - "rd_namespace", - "page_id", - "rd_from", - ] == get_query_columns( - "SELECT page_title FROM `redirect` INNER JOIN `page` " - "ON (rd_title = 'foo' AND rd_namespace = '100' AND (page_id = rd_from))" - ) - - # see #34 - assert ["foos", "bars"] == get_query_tables( - "SELECT foo FROM `foos` JOIN `bars` ON (foos.id = bars.id)" - ) - - assert ["foos", "bars"] == get_query_tables( - "SELECT foo FROM `foos` FULL JOIN `bars` ON (foos.id = bars.id)" - ) - - assert ["foos", "bars"] == get_query_tables( - "SELECT foo FROM `foos` FULL OUTER JOIN `bars` ON (foos.id = bars.id)" - ) - - assert ["foos", "bars"] == get_query_tables( - "SELECT foo FROM `foos` RIGHT OUTER JOIN `bars` ON (foos.id = bars.id)" - ) - - assert ["foos", "bars"] == get_query_tables( - "SELECT foo FROM `foos` LEFT OUTER JOIN `bars` ON (foos.id = bars.id)" - ) + ).columns def test_handle_force_index(): @@ -287,9 +64,9 @@ def test_handle_force_index(): "AND (page_random >= 0.197372293871) AND cl_to = 'Muppet_Characters' " "ORDER BY page_random LIMIT 1" ) - - assert get_query_tables(query) == ["page", "categorylinks"] - assert get_query_columns(query) == [ + parser = Parser(query) + assert parser.tables == ["page", "categorylinks"] + assert parser.columns == [ "page_title", "page_namespace", "page_id", @@ -298,322 +75,35 @@ def test_handle_force_index(): "page_random", "cl_to", ] - - -def test_get_query_limit_and_offset(): - assert get_query_limit_and_offset("SELECT foo_limit FROM bar_offset") is None - assert ( - get_query_limit_and_offset( - "SELECT foo_limit FROM bar_offset /* limit 1000,50 */" - ) - is None - ) - - assert get_query_limit_and_offset("SELECT foo_limit FROM bar_offset LIMIT 
50") == ( - 50, - 0, - ) - assert get_query_limit_and_offset( - "SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 1000" - ) == (50, 1000) - assert get_query_limit_and_offset( - "SELECT foo_limit FROM bar_offset Limit 50 offset 1000" - ) == (50, 1000) - assert get_query_limit_and_offset( - "SELECT foo_limit FROM bar_offset LIMIT 1000, 50" - ) == (50, 1000) - assert get_query_limit_and_offset( - "SELECT foo_limit FROM bar_offset LIMIT 1000,50" - ) == (50, 1000) - assert get_query_limit_and_offset( - "SELECT foo_limit FROM bar_offset limit 1000,50" - ) == (50, 1000) - - assert get_query_limit_and_offset( - "SELECT /* CategoryPaginationViewer::processSection */ " - "page_namespace,page_title,page_len,page_is_redirect,cl_sortkey_prefix FROM `page` " - "INNER JOIN `categorylinks` FORCE INDEX (cl_sortkey) ON ((cl_from = page_id)) " - "WHERE cl_type = 'page' AND cl_to = 'Spotify/Song' " - "ORDER BY cl_sortkey LIMIT 927600,200" - ) == (200, 927600) + assert parser.columns_dict == { + "select": ["page_title", "page_namespace"], + "join": ["page_id", "cl_from"], + "where": ["page_is_redirect", "page_random", "cl_to"], + "order_by": ["page_random"], + } def test_insert_into_select(): # https://dev.mysql.com/doc/refman/5.7/en/insert-select.html query = "INSERT INTO foo SELECT * FROM bar" - assert get_query_tables(query) == ["foo", "bar"] - assert get_query_columns(query) == ["*"] + assert Parser(query).tables == ["foo", "bar"] + assert Parser(query).columns == ["*"] query = "INSERT INTO foo SELECT id, price FROM bar" - assert get_query_tables(query) == ["foo", "bar"] - assert get_query_columns(query) == ["id", "price"] + assert Parser(query).tables == ["foo", "bar"] + assert Parser(query).columns == ["id", "price"] query = "INSERT INTO foo SELECT id, price FROM bar WHERE qty > 200" - assert get_query_tables(query) == ["foo", "bar"] - assert get_query_columns(query) == ["id", "price", "qty"] - - -def test_cast_and_convert_functions(): - # 
https://dev.mysql.com/doc/refman/8.0/en/cast-functions.html - assert get_query_columns( - "SELECT count(c) as test, id FROM foo where cast(d as bigint) > e" - ) == ["c", "id", "d", "e"] - assert get_query_columns( - "SELECT CONVERT(latin1_column USING utf8) FROM latin1_table;" - ) == ["latin1_column"] + assert Parser(query).tables == ["foo", "bar"] + assert Parser(query).columns == ["id", "price", "qty"] + assert Parser(query).columns_dict == {"select": ["id", "price"], "where": ["qty"]} def test_case_syntax(): # https://dev.mysql.com/doc/refman/8.0/en/case.html - assert get_query_columns( + assert Parser( "select case when p > 0 then 1 else 0 end as cs from c where g > f" - ) == ["p", "g", "f"] - assert get_query_tables( + ).columns == ["p", "g", "f"] + assert Parser( "select case when p > 0 then 1 else 0 end as cs from c where g > f" - ) == ["c"] - - -def test_select_aliases(): - assert get_query_tables("SELECT e.foo FROM bar AS e") == ["bar"] - assert get_query_tables("SELECT e.foo FROM bar e") == ["bar"] - assert get_query_tables("SELECT e.foo FROM (SELECT * FROM bar) AS e") == ["bar"] - assert get_query_tables("SELECT e.foo FROM (SELECT * FROM bar) e") == ["bar"] - - -def test_multiline_queries(): - query = """ -SELECT -COUNT(1) -FROM -(SELECT -task_id -FROM -some_task_detail -WHERE -STATUS = 1 -) a -JOIN ( -SELECT -task_id -FROM -some_task -WHERE -task_type_id = 80 -) b ON a.task_id = b.task_id; - """.strip() - - assert get_query_tables(query) == ["some_task_detail", "some_task"] - # assert get_query_columns(query) == ['task_id', 'STATUS', 'a', 'task_type_id', 'b', 'a.task_id', 'b.task_id'] - - -def test_redshift(): - assert get_query_tables("ALTER TABLE target_table APPEND FROM source_table") == [ - "target_table", - "source_table", - ] - assert get_query_tables("ALTER TABLE x APPEND FROM y") == ["x", "y"] - - -def test_sql_server_cte(): - """ - Tests support for SQL Server's common table expression (CTE). 
- - @see https://www.sqlservertutorial.net/sql-server-basics/sql-server-cte/ - """ - assert ( - get_query_tables( - """ -WITH x AS ( - SELECT * FROM n -) -SELECT - * -FROM x -JOIN y ON x.a = y.a - """.strip() - ) - == ["n", "x", "y"] - ) - - assert ( - get_query_tables( - """ -WITH x AS ( - SELECT * FROM n -) -select - * -FROM x -JOIN y ON x.a = y.a - """.strip() - ) - == ["n", "x", "y"] - ) - - assert ( - get_query_tables( - """ -WITH foo AS ( - SELECT * FROM n -) -update z from foo set z.q = fpp.y - """.strip() - ) - == ["n", "z", "foo"] - ) - - assert ( - get_query_tables( - """ -WITH foo AS ( - SELECT * FROM tab -) -DELETE FROM z JOIN foo ON z.a = foo.a - """.strip() - ) - == ["tab", "z", "foo"] - ) - - -def test_sql_server_cte_sales_by_year(): - sales_query = """ -WITH cte_sales AS ( - SELECT - staff_id, - COUNT(*) order_count - FROM - sales.orders - WHERE - YEAR(order_date) = 2018 - GROUP BY - staff_id -) -SELECT - AVG(order_count) average_orders_by_staff -FROM - cte_sales; - """.strip() - - assert get_query_tables(sales_query) == ["sales.orders", "cte_sales"] - - # TODO - # assert get_query_columns(sales_query) == ['staff_id', 'order_count', 'order_date'] - - -def test_table_name_with_group_by(): - expected_tables = ["SH.sales"] - - assert ( - get_query_tables("SELECT s.cust_id,count(s.cust_id) FROM SH.sales s") - == expected_tables - ) - - assert ( - get_query_tables( - "SELECT s.cust_id,count(s.cust_id) FROM SH.sales s GROUP BY s.cust_id" - ) - == expected_tables - ) - - assert ( - get_query_tables( - """ -SELECT s.cust_id,count(s.cust_id) FROM SH.sales s -GROUP BY s.cust_id HAVING s.cust_id != '1660' AND s.cust_id != '2' - """.strip() - ) - == expected_tables - ) - - -def test_datasets(): - # see https://github.com/macbre/sql-metadata/issues/38 - assert get_query_tables( - "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM TABLE1 AS A, TABLE2 AS B" - ) == ["TABLE1", "TABLE2"] - - assert get_query_tables( - "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 
* B.FIELD1) AS QTY FROM DATASET1.TABLE1, DATASET2.TABLE2" - ) == ["DATASET1.TABLE1", "DATASET2.TABLE2"] - - assert get_query_tables( - "SELECT A.FIELD1, B.FIELD1, (A.FIELD1 * B.FIELD1) AS QTY FROM DATASET1.TABLE1 AS A, DATASET2.TABLE2 AS B" - ) == ["DATASET1.TABLE1", "DATASET2.TABLE2"] - - -def test_table_names_with_dashes(): - assert get_query_tables("SELECT * FROM `schema-with-dash.tablename`") == [ - "schema-with-dash.tablename" - ] - - -def test_queries_with_null_conditions(): - assert get_query_columns( - "SELECT id FROM cm WHERE cm.status = 1 AND cm.OPERATIONDATE IS NULL AND cm.OID IN(123123);" - ) == ["id", "cm.status", "cm.OPERATIONDATE", "cm.OID"] - - assert get_query_columns( - "SELECT id FROM cm WHERE cm.status = 1 AND cm.OPERATIONDATE IS NOT NULL AND cm.OID IN(123123);" - ) == ["id", "cm.status", "cm.OPERATIONDATE", "cm.OID"] - - -def test_queries_with_distinct(): - assert get_query_columns("SELECT DISTINCT DATA.ASSAY_ID FROM foo") == [ - "DATA.ASSAY_ID" - ] - - assert get_query_columns("SELECT UNIQUE DATA.ASSAY_ID FROM foo") == [ - "DATA.ASSAY_ID" - ] - - assert get_query_tables("SELECT DISTINCT DATA.ASSAY_ID FROM foo") == ["foo"] - - -def test_unions(): - # @see https://github.com/macbre/sql-metadata/issues/79 - assert ["tab1", "tab2"] == get_query_tables( - "select col1, col2, col3 from tab1 union all select col4, col5, col6 from tab2" - ) - - # @see https://github.com/macbre/sql-metadata/issues/94 - assert ["d", "g"] == get_query_tables( - "SELECT a,b,c FROM d UNION ALL SELECT e,f FROM g" - ) - - -def test_with_brackets(): - assert ["database1.table1", "database2.table2"] == get_query_tables( - """ - SELECT - "xxxxx" - FROM - (database1.table1 alias - LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx")) - """ - ) - - assert ["inner_table"] == get_query_tables( - """ - SELECT - t.foo - FROM - (SELECT foo FROM inner_table - WHERE bar = '1') t - """ - ) - - -def test_with_with(): - pytest.skip("Improve WITH syntax handling with a new parser 
(#98)") - - assert ["table3", "database2.table2"] == get_query_tables( - """ - WITH - database1.tableFromWith AS SELECT * FROM table3 - SELECT - "xxxxx" - FROM - database1.tableFromWith alias - LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") - """ - ) + ).tables == ["c"] diff --git a/test/test_redshift.py b/test/test_redshift.py new file mode 100644 index 00000000..84fcae59 --- /dev/null +++ b/test/test_redshift.py @@ -0,0 +1,9 @@ +from sql_metadata.parser import Parser + + +def test_redshift(): + assert Parser("ALTER TABLE target_table APPEND FROM source_table").tables == [ + "target_table", + "source_table", + ] + assert Parser("ALTER TABLE x APPEND FROM y").tables == ["x", "y"] diff --git a/test/test_values.py b/test/test_values.py new file mode 100644 index 00000000..f63c8c36 --- /dev/null +++ b/test/test_values.py @@ -0,0 +1,76 @@ +from sql_metadata import Parser + + +def test_getting_values(): + parser = Parser( + "INSERT /* VoteHelper::addVote xxx */ INTO `page_vote` (article_id,user_id,`time`) VALUES ('442001','27574631','20180228130846')" + ) + assert parser.values == ["442001", "27574631", "20180228130846"] + assert parser.values_dict == { + "article_id": "442001", + "user_id": "27574631", + "time": "20180228130846", + } + + # REPLACE queries + parser = Parser( + "REPLACE INTO `page_props` (pp_page,pp_propname,pp_value) VALUES ('47','infoboxes','')" + ) + assert parser.values == ["47", "infoboxes", ""] + assert parser.values_dict == { + "pp_page": "47", + "pp_propname": "infoboxes", + "pp_value": "", + } + + parser = Parser( + "INSERT IGNORE INTO `0070_insert_ignore_table` VALUES (9, 2.15, '123', '2017-01-01');" + ) + assert parser.values == [9, 2.15, "123", "2017-01-01"] + assert parser.values_dict == { + "column_1": 9, + "column_2": 2.15, + "column_3": "123", + "column_4": "2017-01-01", + } + + assert [] == Parser("SELECT * from foo;").values + + assert Parser("SELECT * from foo;").values_dict is None + + parser = Parser( + "INSERT INTO 
`wp_comments` (`comment_post_ID`, `comment_author`, `comment_author_email`, `comment_author_url`, `comment_author_IP`, `comment_date`, `comment_date_gmt`, `comment_content`, `comment_karma`, `comment_approved`, `comment_agent`, `comment_type`, `comment_parent`, `user_id`) VALUES (1, 'test user', '', '', '127.0.0.1', '2021-02-27 03:21:52', '2021-02-27 03:21:52', 'test comment', 0, '0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv: 78.0) Gecko/20100101 Firefox/78.0', 'comment', 0, 0)'," + ) + assert parser.values == [ + 1, + "test user", + "", + "", + "127.0.0.1", + "2021-02-27 03:21:52", + "2021-02-27 03:21:52", + "test comment", + 0, + "0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv: 78.0) Gecko/20100101 Firefox/78.0", + "comment", + 0, + 0, + ] + + assert parser.values_dict == { + "comment_post_ID": 1, + "comment_author": "test user", + "comment_author_email": "", + "comment_author_url": "", + "comment_author_IP": "127.0.0.1", + "comment_date": "2021-02-27 03:21:52", + "comment_date_gmt": "2021-02-27 03:21:52", + "comment_content": "test comment", + "comment_karma": 0, + "comment_approved": "0", + "comment_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv: 78.0) Gecko/20100101 Firefox/78.0", + "comment_type": "comment", + "comment_parent": 0, + "user_id": 0, + } diff --git a/test/test_with_statements.py b/test/test_with_statements.py new file mode 100644 index 00000000..ffb3c617 --- /dev/null +++ b/test/test_with_statements.py @@ -0,0 +1,87 @@ +from sql_metadata import Parser + + +def test_with_statements(): + parser = Parser( + """ +WITH +database1.tableFromWith AS (SELECT aa.* FROM table3 as aa + left join table4 on aa.col1=table4.col2), +test as (select * from table3) +SELECT +"xxxxx" +FROM +database1.tableFromWith alias +LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") +""" + ) + assert ["table3", "table4", "database2.table2"] == parser.tables + + assert ["database1.tableFromWith", "test"] == parser.with_names + + assert ( + 
[ + "database1.tableFromWith", + "database1.tableFromWith2", + "database1.tableFromWith3", + "database1.tableFromWith4", + ] + == Parser( + """ + WITH + database1.tableFromWith AS (SELECT * FROM table3), + database1.tableFromWith2 AS (SELECT * FROM table4), + database1.tableFromWith3 AS (SELECT * FROM table5), + database1.tableFromWith4 AS (SELECT * FROM table6) + SELECT + "xxxxx" + FROM + database1.tableFromWith alias + LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") + """ + ).with_names + ) + + assert ( + ["table3", "table4", "table5", "table6", "database2.table2"] + == Parser( + """ + WITH + database1.tableFromWith AS (SELECT * FROM table3), + database1.tableFromWith2 AS (SELECT * FROM table4), + database1.tableFromWith3 AS (SELECT * FROM table5), + database1.tableFromWith4 AS (SELECT * FROM table6) + SELECT + "xxxxx" + FROM + database1.tableFromWith alias + LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx") + """ + ).tables + ) + + assert ( + ["cte1", "cte2"] + == Parser( + """ +WITH +cte1 AS (SELECT a, b FROM table1), +cte2 AS (SELECT c, d FROM table2) +SELECT b, d FROM cte1 JOIN cte2 +WHERE cte1.a = cte2.c; +""" + ).with_names + ) + + assert ( + ["table1", "table2"] + == Parser( + """ +WITH +cte1 AS (SELECT a, b FROM table1), +cte2 AS (SELECT c, d FROM table2) +SELECT b, d FROM cte1 JOIN cte2 +WHERE cte1.a = cte2.c; +""" + ).tables + )