diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 5bd141a4062..3d89a597eea 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -23,12 +23,12 @@ import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.operations as ibis_ops import bigframes_vendored.ibis.expr.types as ibis_types +import bigframes_vendored.sqlglot.expressions as sge from google.cloud import bigquery import pyarrow as pa from bigframes.core import agg_expressions, rewrite import bigframes.core.agg_expressions as ex_types -import bigframes.core.compile.googlesql import bigframes.core.compile.ibis_compiler.aggregate_compiler as agg_compiler import bigframes.core.compile.ibis_compiler.scalar_op_compiler as op_compilers import bigframes.core.compile.ibis_types @@ -82,13 +82,21 @@ def to_sql( ) if order_by or limit or not is_noop_selection: - sql = ibis_bigquery.Backend().compile(ibis_table) - sql = ( - bigframes.core.compile.googlesql.Select() - .from_(sql) - .select(selection_strings) - .sql() - ) + # selections are (ref.id.sql, name) where ref.id.sql is escaped identifier + to_select = [ + sge.Alias( + this=sge.to_identifier(src, quoted=True), + alias=sge.to_identifier(alias, quoted=True), + ) + if src != alias + else sge.to_identifier(src, quoted=True) + for src, alias in selection_strings + ] + # Use string formatting for FROM clause to avoid re-parsing potentially complex SQL (like ARRAY>) + # that sqlglot might not handle perfectly when parsing BigQuery dialect strings. + select_sql = sge.Select().select(*to_select).sql(dialect="bigquery") + ibis_sql = ibis_bigquery.Backend().compile(ibis_table) + sql = f"{select_sql} FROM ({ibis_sql}) AS `t`" # Single row frames may not have any ordering columns if len(order_by) > 0: @@ -99,7 +107,7 @@ def to_sql( raise TypeError(f"Limit param: {limit} must be an int.") sql += f"\nLIMIT {limit}" else: - sql = ibis_bigquery.Backend().compile(self._to_ibis_expr()) + sql = ibis_bigquery.Backend().compile(ibis_table) return typing.cast(str, sql) @property diff --git a/bigframes/core/compile/googlesql/__init__.py b/bigframes/core/compile/googlesql/__init__.py deleted file mode 100644 index add0c5ec445..00000000000 --- a/bigframes/core/compile/googlesql/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Python classes representing GoogleSQL syntax nodes, adhering to the official syntax: -https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax""" - -from __future__ import annotations - -from bigframes.core.compile.googlesql.datatype import DataType -from bigframes.core.compile.googlesql.expression import ( - _escape_chars, - AliasExpression, - ColumnExpression, - CTEExpression, - identifier, - StarExpression, - TableExpression, -) -from bigframes.core.compile.googlesql.function import Cast -from bigframes.core.compile.googlesql.query import ( - AsAlias, - FromClause, - FromItem, - NonRecursiveCTE, - QueryExpr, - Select, - SelectAll, - SelectExpression, -) - -__all__ = [ - "_escape_chars", - "identifier", - "AliasExpression", - "AsAlias", - "Cast", - "ColumnExpression", - "CTEExpression", - "DataType", - "FromClause", - "FromItem", - "NonRecursiveCTE", - "QueryExpr", - "Select", - "SelectAll", - "SelectExpression", - "StarExpression", - "StringType", - "TableExpression", -] diff --git a/bigframes/core/compile/googlesql/abc.py b/bigframes/core/compile/googlesql/abc.py deleted file mode 100644 index 081836467c2..00000000000 --- a/bigframes/core/compile/googlesql/abc.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import abc - - -class SQLSyntax(abc.ABC): - """Abstract base class provides GoogleSQL syntax.""" - - @abc.abstractmethod - def sql(self): - ... diff --git a/bigframes/core/compile/googlesql/datatype.py b/bigframes/core/compile/googlesql/datatype.py deleted file mode 100644 index ccf3ff4d41b..00000000000 --- a/bigframes/core/compile/googlesql/datatype.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import enum - -"""This module represents all GoogleSQL for BigQuery data types: -https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types""" - - -class DataType(enum.Enum): - STRING = 1 - FLOAT64 = 2 diff --git a/bigframes/core/compile/googlesql/expression.py b/bigframes/core/compile/googlesql/expression.py deleted file mode 100644 index 581ab67718a..00000000000 --- a/bigframes/core/compile/googlesql/expression.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import dataclasses -import typing - -import bigframes.core.compile.googlesql.abc as abc - -"""This module represents GoogleSQL `expression` and its extensions. -Core class: - -* `expression`: Models basic SQL expressions. - -Extended classes (not part of standard GoogleSQL syntax, but added for convenience): - -* `ColumnExpression`: Represents column references. -* `TableExpression`: Represents table references. -* `AliasExpression`: Represents aliased expressions. -* ... -""" - - -@dataclasses.dataclass -class Expression(abc.SQLSyntax): - pass - - -@dataclasses.dataclass -class ColumnExpression(Expression): - name: str - parent: typing.Optional[TableExpression | AliasExpression | CTEExpression] = None - - def sql(self) -> str: - if self.parent is not None: - return f"{self.parent.sql()}.{identifier(self.name)}" - return identifier(self.name) - - -@dataclasses.dataclass -class StarExpression(Expression): - parent: typing.Optional[TableExpression | AliasExpression | CTEExpression] = None - - def sql(self) -> str: - if self.parent is not None: - return f"{self.parent.sql()}.*" - return "*" - - -@dataclasses.dataclass -class TableExpression(Expression): - table_id: str - dataset_id: typing.Optional[str] = None - project_id: typing.Optional[str] = None - - def __post_init__(self): - if self.project_id is not None and self.dataset_id is None: - raise ValueError("The `dataset_id` is missing.") - - def sql(self) -> str: - text = [] - if self.project_id is not None: - text.append(identifier(self.project_id)) - if self.dataset_id is not None: - text.append(identifier(self.dataset_id)) - text.append(identifier(self.table_id)) - return ".".join(text) - - -@dataclasses.dataclass -class AliasExpression(Expression): - alias: str - - def sql(self) -> str: - return identifier(self.alias) - - -@dataclasses.dataclass -class CTEExpression(Expression): - name: str - - def sql(self) -> str: - return identifier(self.name) - - -def identifier(id: str) -> str: - """Return a string representing column reference in a SQL.""" - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers - # Just always escape, otherwise need to check against every reserved sql keyword - return f"`{_escape_chars(id)}`" - - -def _escape_chars(value: str): - """Escapes all special charactesrs""" - # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals - trans_table = str.maketrans( - { - "\a": r"\a", - "\b": r"\b", - "\f": r"\f", - "\n": r"\n", - "\r": r"\r", - "\t": r"\t", - "\v": r"\v", - "\\": r"\\", - "?": r"\?", - '"': r"\"", - "'": r"\'", - "`": r"\`", - } - ) - return value.translate(trans_table) diff --git a/bigframes/core/compile/googlesql/function.py b/bigframes/core/compile/googlesql/function.py deleted file mode 100644 index 19b61f2fc99..00000000000 --- a/bigframes/core/compile/googlesql/function.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import dataclasses - -import bigframes.core.compile.googlesql.datatype as datatype -import bigframes.core.compile.googlesql.expression as expr - -# Conversion functions: -# https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions - - -@dataclasses.dataclass -class Cast(expr.Expression): - """This class represents the `cast` function.""" - - expression: expr.ColumnExpression - type: datatype.DataType - - def sql(self) -> str: - return f"CAST ({self.expression.sql()} AS {self.type.name})" diff --git a/bigframes/core/compile/googlesql/query.py b/bigframes/core/compile/googlesql/query.py deleted file mode 100644 index f591216b3a8..00000000000 --- a/bigframes/core/compile/googlesql/query.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import dataclasses -import typing - -import google.cloud.bigquery as bigquery - -import bigframes.core.compile.googlesql.abc as abc -import bigframes.core.compile.googlesql.expression as expr - -"""This module provides a structured representation of GoogleSQL syntax using nodes. -Each node's name and child nodes are designed to strictly follow the official GoogleSQL -syntax rules outlined in the documentation: -https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax""" - -TABLE_SOURCE_TYPE = typing.Union[str, bigquery.TableReference] - - -@dataclasses.dataclass -class QueryExpr(abc.SQLSyntax): - """This class represents GoogleSQL `query_expr` syntax.""" - - select: Select - with_cte_list: typing.Sequence[NonRecursiveCTE] = () - - def sql(self) -> str: - text = [] - if len(self.with_cte_list) > 0: - with_cte_text = ",\n".join( - [with_cte.sql() for with_cte in self.with_cte_list] - ) - text.append(f"WITH {with_cte_text}") - - text.append(self.select.sql()) - return "\n".join(text) - - -@dataclasses.dataclass -class Select(abc.SQLSyntax): - """This class represents GoogleSQL `select` syntax.""" - - select_list: typing.Sequence[ - typing.Union[SelectExpression, SelectAll] - ] = dataclasses.field(default_factory=list) - from_clause_list: typing.Sequence[FromClause] = dataclasses.field( - default_factory=list - ) - distinct: bool = False - - def select( - self, - columns: typing.Union[ - typing.Iterable[str], typing.Iterable[tuple[str, str]], str, None - ] = None, - distinct: bool = False, - ) -> Select: - if isinstance(columns, str): - columns = [columns] - self.select_list: typing.List[typing.Union[SelectExpression, SelectAll]] = ( - [self._select_field(column) for column in columns] - if columns - else [SelectAll(expression=expr.StarExpression())] - ) - self.distinct = distinct - return self - - def _select_field(self, field) -> SelectExpression: - if isinstance(field, str): - return SelectExpression(expression=expr.ColumnExpression(name=field)) - - else: - alias = ( - expr.AliasExpression(field[1]) - if isinstance(field[1], str) - else field[1] - if (field[0] != field[1]) - else None - ) - return SelectExpression( - expression=expr.ColumnExpression(name=field[0]), alias=alias - ) - - def from_( - self, - sources: typing.Union[TABLE_SOURCE_TYPE, typing.Iterable[TABLE_SOURCE_TYPE]], - ) -> Select: - if (not isinstance(sources, typing.Iterable)) or isinstance(sources, str): - sources = [sources] - self.from_clause_list = [ - FromClause(FromItem.from_source(source)) for source in sources - ] - return self - - def sql(self) -> str: - if (self.select_list is not None) and (not self.select_list): - raise ValueError("Select clause has not been properly initialized.") - - text = ["SELECT"] - - if self.distinct: - text.append("DISTINCT") - - select_list_sql = ",\n".join([select.sql() for select in self.select_list]) - text.append(select_list_sql) - - if self.from_clause_list: - from_clauses_sql = ",\n".join( - [clause.sql() for clause in self.from_clause_list] - ) - text.append(f"FROM\n{from_clauses_sql}") - return "\n".join(text) - - -@dataclasses.dataclass(frozen=True) -class SelectExpression(abc.SQLSyntax): - """This class represents `select_expression`.""" - - expression: expr.ColumnExpression - alias: typing.Optional[expr.AliasExpression] = None - - def sql(self) -> str: - if self.alias is None: - return self.expression.sql() - else: - return f"{self.expression.sql()} AS {self.alias.sql()}" - - -@dataclasses.dataclass -class SelectAll(abc.SQLSyntax): - """This class represents `select_all` (aka. `SELECT *`).""" - - expression: expr.StarExpression - - def sql(self) -> str: - return self.expression.sql() - - -@dataclasses.dataclass -class FromClause(abc.SQLSyntax): - """This class represents GoogleSQL `from_clause` syntax.""" - - from_item: FromItem - - def sql(self) -> str: - return self.from_item.sql() - - -@dataclasses.dataclass -class FromItem(abc.SQLSyntax): - """This class represents GoogleSQL `from_item` syntax.""" - - # Note: Temporarily introduces the `str` type to interact with pre-existing, - # compiled SQL strings. - expression: typing.Union[expr.TableExpression, QueryExpr, str, expr.CTEExpression] - as_alias: typing.Optional[AsAlias] = None - - @classmethod - def from_source( - cls, - subquery_or_tableref: typing.Union[bigquery.TableReference, str], - as_alias: typing.Optional[AsAlias] = None, - ): - if isinstance(subquery_or_tableref, bigquery.TableReference): - return cls( - expression=expr.TableExpression( - table_id=subquery_or_tableref.table_id, - dataset_id=subquery_or_tableref.dataset_id, - project_id=subquery_or_tableref.project, - ), - as_alias=as_alias, - ) - elif isinstance(subquery_or_tableref, str): - return cls( - expression=subquery_or_tableref, - as_alias=as_alias, - ) - else: - raise ValueError("The source must be bigquery.TableReference or str.") - - def sql(self) -> str: - if isinstance(self.expression, (expr.TableExpression, expr.CTEExpression)): - text = self.expression.sql() - elif isinstance(self.expression, str): - text = f"({self.expression})" - elif isinstance(self.expression, QueryExpr): - text = f"({self.expression.sql()})" - else: - raise ValueError( - f"Unsupported expression type {type(self.expression).__name__};" - "expected one of TableExpression, QueryExpr, str, or CTEExpression." - ) - - if self.as_alias is None: - return text - else: - return f"{text} {self.as_alias.sql()}" - - -@dataclasses.dataclass -class NonRecursiveCTE(abc.SQLSyntax): - """This class represents GoogleSQL `non_recursive_cte` syntax.""" - - cte_name: expr.CTEExpression - query_expr: QueryExpr - - def sql(self) -> str: - return f"{self.cte_name.sql()} AS (\n{self.query_expr.sql()}\n)" - - -@dataclasses.dataclass -class AsAlias(abc.SQLSyntax): - """This class represents GoogleSQL `as_alias` syntax.""" - - alias: expr.AliasExpression - - def sql(self) -> str: - return f"AS {self.alias.sql()}" diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 88d01c2a9e6..94ffa39dae5 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -598,6 +598,34 @@ def _select_to_cte(self) -> tuple[sge.Select, sge.Identifier]: return new_select_expr, cte_name +def identifier(id: str) -> str: + """Return a string representing column reference in a SQL.""" + return sge.to_identifier(id, quoted=SQLGlotIR.quoted).sql(dialect=SQLGlotIR.dialect) + + +def _escape_chars(value: str): + """Escapes all special characters""" + # TODO: Reuse _literal's escaping logic instead of re-implementing it here. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + trans_table = str.maketrans( + { + "\a": r"\a", + "\b": r"\b", + "\f": r"\f", + "\n": r"\n", + "\r": r"\r", + "\t": r"\t", + "\v": r"\v", + "\\": r"\\", + "?": r"\?", + '"': r"\"", + "'": r"\'", + "`": r"\`", + } + ) + return value.translate(trans_table) + + def _is_null_literal(expr: sge.Expression) -> bool: """Checks if the given expression is a NULL literal.""" if isinstance(expr, sge.Null): diff --git a/bigframes/core/sql/__init__.py b/bigframes/core/sql/__init__.py index ccd2a16ddcd..1e88aad642d 100644 --- a/bigframes/core/sql/__init__.py +++ b/bigframes/core/sql/__init__.py @@ -23,9 +23,10 @@ import math from typing import cast, Collection, Iterable, Mapping, Optional, TYPE_CHECKING, Union +import bigframes_vendored.sqlglot.expressions as sge import shapely.geometry.base # type: ignore -import bigframes.core.compile.googlesql as googlesql +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir if TYPE_CHECKING: import google.cloud.bigquery as bigquery @@ -65,7 +66,7 @@ def simple_literal(value: Union[SIMPLE_LITERAL_TYPES, None]) -> str: return "NULL" elif isinstance(value, str): # Single quoting seems to work nicer with ibis than double quoting - return f"'{googlesql._escape_chars(value)}'" + return f"'{sqlglot_ir._escape_chars(value)}'" elif isinstance(value, bytes): return repr(value) elif isinstance(value, (bool, int)): @@ -110,15 +111,15 @@ def multi_literal(*values: str): def cast_as_string(column_name: str) -> str: """Return a string representing string casting of a column.""" - return googlesql.Cast( - googlesql.ColumnExpression(column_name), googlesql.DataType.STRING - ).sql() + return sge.Cast(this=sge.to_identifier(column_name, quoted=True), to="STRING").sql( + dialect="bigquery" + ) def to_json_string(column_name: str) -> str: """Return a string representing JSON version of a column.""" - return f"TO_JSON_STRING({googlesql.identifier(column_name)})" + return f"TO_JSON_STRING({sqlglot_ir.identifier(column_name)})" def csv(values: Iterable[str]) -> str: @@ -132,11 +133,29 @@ def infix_op(opname: str, left_arg: str, right_arg: str): def is_distinct_sql(columns: Iterable[str], table_ref: bigquery.TableReference) -> str: + table_expr = sge.Table( + this=sge.Identifier(this=table_ref.table_id, quoted=True), + db=sge.Identifier(this=table_ref.dataset_id, quoted=True), + catalog=sge.Identifier(this=table_ref.project, quoted=True), + ) + to_select = [sge.to_identifier(col, quoted=True) for col in columns] + + full_table_sql = ( + sge.Select().select(*to_select).from_(table_expr).sql(dialect="bigquery") + ) + distinct_table_sql = ( + sge.Select() + .select(*to_select) + .distinct() + .from_(table_expr) + .sql(dialect="bigquery") + ) + is_unique_sql = f"""WITH full_table AS ( - {googlesql.Select().from_(table_ref).select(columns).sql()} + {full_table_sql} ), distinct_table AS ( - {googlesql.Select().from_(table_ref).select(columns, distinct=True).sql()} + {distinct_table_sql} ) SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, @@ -183,7 +202,7 @@ def create_vector_index_ddl( if len(stored_column_names) > 0: escaped_stored = [ - f"{googlesql.identifier(name)}" for name in stored_column_names + f"{sqlglot_ir.identifier(name)}" for name in stored_column_names ] storing = f"STORING({', '.join(escaped_stored)}) " else: @@ -197,8 +216,8 @@ def create_vector_index_ddl( ) return f""" - {create} {googlesql.identifier(index_name)} - ON {googlesql.identifier(table_name)}({googlesql.identifier(column_name)}) + {create} {sqlglot_ir.identifier(index_name)} + ON {sqlglot_ir.identifier(table_name)}({sqlglot_ir.identifier(column_name)}) {storing} OPTIONS({rendered_options}); """ @@ -217,7 +236,7 @@ def create_vector_search_sql( """Encode the VECTOR SEARCH statement for BigQuery Vector Search.""" vector_search_args = [ - f"TABLE {googlesql.identifier(cast(str, base_table))}", + f"TABLE {sqlglot_ir.identifier(cast(str, base_table))}", f"{simple_literal(column_to_search)}", f"({sql_string})", ] diff --git a/bigframes/core/sql/ml.py b/bigframes/core/sql/ml.py index a2a4d32ae84..d27b65a1c79 100644 --- a/bigframes/core/sql/ml.py +++ b/bigframes/core/sql/ml.py @@ -16,7 +16,7 @@ from typing import Any, Dict, List, Mapping, Optional, Union -import bigframes.core.compile.googlesql as googlesql +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.sql import bigframes.core.sql.literals @@ -46,7 +46,7 @@ def create_model_ddl( else: create = "CREATE MODEL " - ddl = f"{create}{googlesql.identifier(model_name)}\n" + ddl = f"{create}{sqlglot_ir.identifier(model_name)}\n" # [TRANSFORM (select_list)] if transform: @@ -66,7 +66,7 @@ def create_model_ddl( if connection_name.upper() == "DEFAULT": ddl += "REMOTE WITH CONNECTION DEFAULT\n" else: - ddl += f"REMOTE WITH CONNECTION {googlesql.identifier(connection_name)}\n" + ddl += f"REMOTE WITH CONNECTION {sqlglot_ir.identifier(connection_name)}\n" # [OPTIONS(model_option_list)] if options: @@ -130,7 +130,7 @@ def evaluate( if confidence_level is not None: struct_options["confidence_level"] = confidence_level - sql = f"SELECT * FROM ML.EVALUATE(MODEL {googlesql.identifier(model_name)}" + sql = f"SELECT * FROM ML.EVALUATE(MODEL {sqlglot_ir.identifier(model_name)}" if table: sql += f", ({table})" @@ -159,7 +159,7 @@ def predict( struct_options["trial_id"] = trial_id sql = ( - f"SELECT * FROM ML.PREDICT(MODEL {googlesql.identifier(model_name)}, ({table})" + f"SELECT * FROM ML.PREDICT(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" ) sql += _build_struct_sql(struct_options) sql += ")\n" @@ -190,7 +190,7 @@ def explain_predict( if approx_feature_contrib is not None: struct_options["approx_feature_contrib"] = approx_feature_contrib - sql = f"SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {googlesql.identifier(model_name)}, ({table})" + sql = f"SELECT * FROM ML.EXPLAIN_PREDICT(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" sql += _build_struct_sql(struct_options) sql += ")\n" return sql @@ -208,7 +208,7 @@ def global_explain( if class_level_explain is not None: struct_options["class_level_explain"] = class_level_explain - sql = f"SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {googlesql.identifier(model_name)}" + sql = f"SELECT * FROM ML.GLOBAL_EXPLAIN(MODEL {sqlglot_ir.identifier(model_name)}" sql += _build_struct_sql(struct_options) sql += ")\n" return sql @@ -221,7 +221,7 @@ def transform( """Encode the ML.TRANSFORM statement. See https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-transform for reference. """ - sql = f"SELECT * FROM ML.TRANSFORM(MODEL {googlesql.identifier(model_name)}, ({table}))\n" + sql = f"SELECT * FROM ML.TRANSFORM(MODEL {sqlglot_ir.identifier(model_name)}, ({table}))\n" return sql @@ -262,7 +262,7 @@ def generate_text( if request_type is not None: struct_options["request_type"] = request_type - sql = f"SELECT * FROM ML.GENERATE_TEXT(MODEL {googlesql.identifier(model_name)}, ({table})" + sql = f"SELECT * FROM ML.GENERATE_TEXT(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" sql += _build_struct_sql(struct_options) sql += ")\n" return sql @@ -290,7 +290,7 @@ def generate_embedding( if output_dimensionality is not None: struct_options["output_dimensionality"] = output_dimensionality - sql = f"SELECT * FROM ML.GENERATE_EMBEDDING(MODEL {googlesql.identifier(model_name)}, ({table})" + sql = f"SELECT * FROM ML.GENERATE_EMBEDDING(MODEL {sqlglot_ir.identifier(model_name)}, ({table})" sql += _build_struct_sql(struct_options) sql += ")\n" return sql diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index f8244fb0d81..9413cd06954 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -27,7 +27,7 @@ import bigframes_vendored.sklearn.compose._column_transformer from google.cloud import bigquery -import bigframes.core.compile.googlesql as sql_utils +import bigframes.core.compile.sqlglot.sqlglot_ir as sql_utils from bigframes.core.logging import log_adapter import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, impute, preprocessing, utils diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 09a46b235d9..d90d23a4747 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -21,7 +21,7 @@ import bigframes_vendored.constants as constants import google.cloud.bigquery -import bigframes.core.compile.googlesql as sql_utils +import bigframes.core.compile.sqlglot.sqlglot_ir as sql_utils import bigframes.core.sql as sql_vals INDENT_STR = " " diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 98b5f194c74..bb472872def 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -32,7 +32,7 @@ import google.cloud.bigquery._job_helpers import google.cloud.bigquery.table -import bigframes.core.compile.googlesql as googlesql +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.events from bigframes.core.logging import log_adapter import bigframes.core.sql @@ -599,7 +599,7 @@ def compile_filters(filters: third_party_pandas_gbq.FiltersType) -> str: operator_str = valid_operators[operator] - column_ref = googlesql.identifier(column) + column_ref = sqlglot_ir.identifier(column) if operator_str in ["IN", "NOT IN"]: value_literal = bigframes.core.sql.multi_literal(*value) else: diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py index 99c13007d85..60d71e1cfed 100644 --- a/bigframes/session/bigquery_session.py +++ b/bigframes/session/bigquery_session.py @@ -24,7 +24,7 @@ import bigframes_vendored.ibis.backends.bigquery.datatypes as ibis_bq import google.cloud.bigquery as bigquery -from bigframes.core.compile import googlesql +import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir import bigframes.core.events from bigframes.session import temporary_storage import bigframes.session._io.bigquery as bfbqio @@ -80,7 +80,7 @@ def create_temp_table( ibis_schema = ibis_bq.BigQuerySchema.to_ibis(list(schema)) fields = [ - f"{googlesql.identifier(name)} {ibis_bq.BigQueryType.from_ibis(ibis_type)}" + f"{sqlglot_ir.identifier(name)} {ibis_bq.BigQueryType.from_ibis(ibis_type)}" for name, ibis_type in ibis_schema.fields.items() ] fields_string = ",".join(fields) @@ -88,12 +88,12 @@ def create_temp_table( cluster_string = "" if cluster_cols: cluster_cols_sql = ", ".join( - f"{googlesql.identifier(cluster_col)}" + f"{sqlglot_ir.identifier(cluster_col)}" for cluster_col in cluster_cols ) cluster_string = f"\nCLUSTER BY {cluster_cols_sql}" - ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" + ddl = f"CREATE TEMP TABLE `_SESSION`.{sqlglot_ir.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" _, job = bfbqio.start_query_with_client( self.bqclient, diff --git a/tests/unit/core/compile/googlesql/__init__.py b/tests/unit/core/compile/googlesql/__init__.py deleted file mode 100644 index 6d5e14bcf4a..00000000000 --- a/tests/unit/core/compile/googlesql/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/unit/core/compile/googlesql/test_expression.py b/tests/unit/core/compile/googlesql/test_expression.py deleted file mode 100644 index e72598b1760..00000000000 --- a/tests/unit/core/compile/googlesql/test_expression.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import bigframes.core.compile.googlesql as sql - - -@pytest.mark.parametrize( - ("table_id", "dataset_id", "project_id", "expected"), - [ - pytest.param("a", None, None, "`a`"), - pytest.param("a", "b", None, "`b`.`a`"), - pytest.param("a", "b", "c", "`c`.`b`.`a`"), - pytest.param("a", None, "c", None, marks=pytest.mark.xfail(raises=ValueError)), - ], -) -def test_table_expression(table_id, dataset_id, project_id, expected): - expr = sql.TableExpression( - table_id=table_id, dataset_id=dataset_id, project_id=project_id - ) - assert expr.sql() == expected - - -def test_escape_chars(): - assert sql._escape_chars("\a\b\f\n\r\t\v\\?'\"`") == r"\a\b\f\n\r\t\v\\\?\'\"\`" diff --git a/tests/unit/core/compile/googlesql/test_function.py b/tests/unit/core/compile/googlesql/test_function.py deleted file mode 100644 index 4edfda6f345..00000000000 --- a/tests/unit/core/compile/googlesql/test_function.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import bigframes.core.compile.googlesql as sql - - -def test_cast(): - col = sql.ColumnExpression("col") - assert sql.Cast(col, sql.DataType.STRING).sql() == "CAST (`col` AS STRING)" - assert sql.Cast(col, sql.DataType.FLOAT64).sql() == "CAST (`col` AS FLOAT64)" diff --git a/tests/unit/core/compile/googlesql/test_query.py b/tests/unit/core/compile/googlesql/test_query.py deleted file mode 100644 index b8d1d024e2b..00000000000 --- a/tests/unit/core/compile/googlesql/test_query.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest.mock import MagicMock - -import google.cloud.bigquery as bigquery -import pytest - -import bigframes.core.compile.googlesql as sql - - -@pytest.mark.parametrize( - ("table_id", "dataset_id", "project_id", "expected"), - [ - pytest.param("a", None, None, "`a`"), - pytest.param("a", "b", None, "`b`.`a`"), - pytest.param("a", "b", "c", "`c`.`b`.`a`"), - pytest.param("a", None, "c", None, marks=pytest.mark.xfail(raises=ValueError)), - ], -) -def test_table_expression(table_id, dataset_id, project_id, expected): - expr = sql.TableExpression( - table_id=table_id, dataset_id=dataset_id, project_id=project_id - ) - assert expr.sql() == expected - - -@pytest.mark.parametrize( - ("table_name", "alias", "expected"), - [ - pytest.param("a", None, "`a`"), - pytest.param("a", "aa", "`a` AS `aa`"), - ], -) -def test_from_item_w_table_name(table_name, alias, expected): - expr = sql.FromItem( - sql.TableExpression(table_id=table_name), - as_alias=None - if alias is None - else sql.AsAlias(sql.AliasExpression(alias=alias)), - ) - assert expr.sql() == expected - - -def test_from_item_w_query_expr(): - from_clause = sql.FromClause( - sql.FromItem(expression=sql.TableExpression(table_id="table_a")) - ) - select = sql.Select( - select_list=[sql.SelectAll(sql.StarExpression())], - from_clause_list=[from_clause], - ) - query_expr = sql.QueryExpr(select=select) - expected = "SELECT\n*\nFROM\n`table_a`" - - # A QueryExpr object - expr = sql.FromItem(expression=query_expr) - assert expr.sql() == f"({expected})" - - # A str object - expr = sql.FromItem(expression=expected) - assert expr.sql() == f"({expected})" - - -def test_from_item_w_cte(): - expr = sql.FromItem(expression=sql.CTEExpression("test")) - assert expr.sql() == "`test`" - - -def test_from_item_w_table_ref(): - mock_table_ref = MagicMock(spec=bigquery.TableReference) - mock_table_ref.table_id = "mock_table" - mock_table_ref.dataset_id = "mock_dataset" - mock_table_ref.project = "mock_project" - - from_item = sql.FromItem.from_source(mock_table_ref) - - assert from_item.sql() == "`mock_project`.`mock_dataset`.`mock_table`" - - -@pytest.mark.parametrize( - ("col_name", "alias", "expected"), - [ - pytest.param("a", None, "`a`"), - pytest.param("a", "aa", "`a` AS `aa`"), - ], -) -def test_select_expression(col_name, alias, expected): - expr = sql.SelectExpression( - expression=sql.ColumnExpression(col_name), - alias=None if alias is None else sql.AliasExpression(alias=alias), - ) - assert expr.sql() == expected - - -def test_select(): - select_1 = sql.SelectExpression(expression=sql.ColumnExpression("a")) - select_2 = sql.SelectExpression( - expression=sql.ColumnExpression("b"), alias=sql.AliasExpression(alias="bb") - ) - from_1 = sql.FromItem(expression=sql.TableExpression(table_id="table_a")) - from_2 = sql.FromItem( - expression="SELECT * FROM project.table_b", - as_alias=sql.AsAlias(sql.AliasExpression(alias="table_b")), - ) - expr = sql.Select( - select_list=[select_1, select_2], - from_clause_list=[sql.FromClause(from_1), sql.FromClause(from_2)], - ) - expected = "SELECT\n`a`,\n`b` AS `bb`\nFROM\n`table_a`,\n(SELECT * FROM project.table_b) AS `table_b`" - - assert expr.sql() == expected - - -@pytest.mark.parametrize( - "columns, source, expected", - [ - ( - ["a", "b", "c"], - "select * from test", - "SELECT\nDISTINCT\n`a`,\n`b`,\n`c`\nFROM\n(select * from test)", - ), - ( - "a", - "select * from test", - "SELECT\nDISTINCT\n`a`\nFROM\n(select * from test)", - ), - ], -) -def test_select_from_str(columns, source, expected): - expr = sql.Select().from_(source).select(columns, distinct=True) - assert expr.sql() == expected - - -@pytest.mark.parametrize( - ("columns", "distinct", "expected"), - [ - pytest.param( - ["a", "b", "c"], - True, - "SELECT\nDISTINCT\n`a`,\n`b`,\n`c`\nFROM\n`mock_project`.`mock_dataset`.`mock_table`", - ), - pytest.param( - None, - True, - "SELECT\nDISTINCT\n*\nFROM\n`mock_project`.`mock_dataset`.`mock_table`", - ), - pytest.param( - None, False, "SELECT\n*\nFROM\n`mock_project`.`mock_dataset`.`mock_table`" - ), - ], -) -def test_select_from_table_ref(columns, distinct, expected): - mock_table_ref = MagicMock(spec=bigquery.TableReference) - mock_table_ref.table_id = "mock_table" - mock_table_ref.dataset_id = "mock_dataset" - mock_table_ref.project = "mock_project" - - expr = sql.Select().from_(mock_table_ref).select(columns, distinct=distinct) - assert expr.sql() == expected - - -def test_query_expr_w_cte(): - # Test a simple SELECT query. - from_clause1 = sql.FromClause( - sql.FromItem(expression=sql.TableExpression(table_id="table_a")) - ) - select1 = sql.Select( - select_list=[sql.SelectAll(sql.StarExpression())], - from_clause_list=[from_clause1], - ) - query1 = sql.QueryExpr(select=select1) - query1_sql = "SELECT\n*\nFROM\n`table_a`" - assert query1.sql() == query1_sql - - # Test a query with CTE statements. - cte1 = sql.NonRecursiveCTE(cte_name=sql.CTEExpression("a"), query_expr=query1) - cte2 = sql.NonRecursiveCTE(cte_name=sql.CTEExpression("b"), query_expr=query1) - - cte1_sql = f"`a` AS (\n{query1_sql}\n)" - cte2_sql = f"`b` AS (\n{query1_sql}\n)" - assert cte1.sql() == cte1_sql - assert cte2.sql() == cte2_sql - - with_cte_list = [cte1, cte2] - select2 = sql.Select( - select_list=[ - sql.SelectExpression( - sql.ColumnExpression(parent=cte1.cte_name, name="column_x") - ), - sql.SelectAll(sql.StarExpression(parent=cte2.cte_name)), - ], - from_clause_list=[ - sql.FromClause(sql.FromItem(expression=cte1.cte_name)), - sql.FromClause(sql.FromItem(expression=cte2.cte_name)), - ], - distinct=True, - ) - select2_sql = "SELECT\nDISTINCT\n`a`.`column_x`,\n`b`.*\nFROM\n`a`,\n`b`" - assert select2.sql() == select2_sql - - query2 = sql.QueryExpr(select=select2, with_cte_list=with_cte_list) - query2_sql = f"WITH {cte1_sql},\n{cte2_sql}\n{select2_sql}" - assert query2.sql() == query2_sql - - -def test_identifier(): - assert sql.identifier("\aa") == r"`\aa`" - - -def test_escape_chars(): - assert sql._escape_chars("\a\b\f\n\r\t\v\\?'\"`") == r"\a\b\f\n\r\t\v\\\?\'\"\`"