From cbebeaf400ad2700e68ae5bab8a24475e52a7b45 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 May 2024 23:20:26 +0000 Subject: [PATCH 01/10] fix: Improve escaping of literals and identifiers --- bigframes/core/blocks.py | 8 +- bigframes/core/compile/compiled.py | 28 +--- bigframes/core/sql.py | 150 +++++++++++++++--- bigframes/session/_io/bigquery/__init__.py | 39 +++-- .../session/_io/bigquery/read_gbq_table.py | 28 ++-- 5 files changed, 172 insertions(+), 81 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 58b8515418..6546068a07 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2114,16 +2114,14 @@ def _get_rows_as_json_values(self) -> Block: pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types) if pandas_row_dtype is None: pandas_row_dtype = "object" - pandas_row_dtype = sql.quote(str(pandas_row_dtype)) + pandas_row_dtype = sql.simple_literal(str(pandas_row_dtype)) # create a json column representing row through SQL manipulation row_json_column_name = guid.generate_guid() select_columns = ( [ordering_column_name] + list(self.index_columns) + [row_json_column_name] ) - select_columns_csv = sql.csv( - [sql.column_reference(col) for col in select_columns] - ) + select_columns_csv = sql.csv([sql.identifier(col) for col in select_columns]) json_sql = f"""\ With T0 AS ( {textwrap.indent(expr_sql, " ")} @@ -2136,7 +2134,7 @@ def _get_rows_as_json_values(self) -> Block: "values", [{column_references_csv}], "indexlength", {index_columns_count}, "dtype", {pandas_row_dtype} - ) AS {row_json_column_name} FROM T0 + ) AS {sql.identifier(row_json_column_name)} FROM T0 ) SELECT {select_columns_csv} FROM T1 """ diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index cc1d6baaa1..b57e0c4d35 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -16,9 +16,8 @@ import abc import functools import itertools -import textwrap import typing -from typing import Collection, Iterable, Literal, Optional, Sequence +from typing import Collection, Literal, Optional, Sequence import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis @@ -40,6 +39,7 @@ OrderingExpression, ) import bigframes.core.schema as schemata +import bigframes.core.sql from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec import bigframes.dtypes import bigframes.operations.aggregations as agg_ops @@ -821,15 +821,13 @@ def to_sql( ) ) output_columns = [ - col_id_overrides.get(col) if (col in col_id_overrides) else col - for col in baked_ir.column_ids + col_id_overrides.get(col, col) for col in baked_ir.column_ids ] - selection = ", ".join(map(lambda col_id: f"`{col_id}`", output_columns)) + sql = bigframes.core.sql.select_from(output_columns, sql) - sql = textwrap.dedent(f"SELECT {selection}\n" "FROM (\n" f"{sql}\n" ")\n") # Single row frames may not have any ordering columns if len(baked_ir._ordering.all_ordering_columns) > 0: - order_by_clause = baked_ir._ordering_clause( + order_by_clause = bigframes.core.sql.ordering_clause( baked_ir._ordering.all_ordering_columns ) sql += f"{order_by_clause}\n" @@ -843,22 +841,6 @@ def to_sql( ) return typing.cast(str, sql) - def _ordering_clause(self, ordering: Iterable[OrderingExpression]) -> str: - parts = [] - for col_ref in ordering: - asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" - null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" - ordering_expr = col_ref.scalar_expression - 
# We don't know how to compile scalar expressions in isolation - if ordering_expr.is_const: - # Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here. - continue - if not isinstance(ordering_expr, ex.UnboundVariableExpression): - raise ValueError("Expected direct column reference.") - part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" - parts.append(part) - return f"ORDER BY {' ,'.join(parts)}" - def _to_ibis_expr( self, *, diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 31ee5f9064..d4166eab9e 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -16,44 +16,148 @@ Utility functions for SQL construction. """ -from typing import Iterable +import datetime +import math +import textwrap +from typing import Iterable, TYPE_CHECKING +# Literals and identifiers matching this pattern can be unquoted +unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$" -def quote(value: str): - """Return quoted input string.""" - # Let's use repr which also escapes any special characters - # - # >>> for val in [ - # ... "123", - # ... "str with no special chars", - # ... "str with special chars.,'\"/\\" - # ... ]: - # ... print(f"{val} -> {repr(val)}") - # ... - # 123 -> '123' - # str with no special chars -> 'str with no special chars' - # str with special chars.,'"/\ -> 'str with special chars.,\'"/\\' +if TYPE_CHECKING: + import google.cloud.bigquery as bigquery - return repr(value) + import bigframes.core.ordering -def column_reference(column_name: str): - """Return a string representing column reference in a SQL.""" +### Writing SELECT expressions +def select_from(columns: Iterable[str], from_sql: str, distinct: bool = False): + selection = ", ".join(map(identifier, columns)) + distinct_clause = "DISTINCT " if distinct else "" + + return textwrap.dedent( + f"SELECT {distinct_clause}{selection}\nFROM (\n" f"{from_sql}\n" ")\n" + ) + + +def select_star(from_sql): + return textwrap.dedent(f"SELECT * FROM (\n" f"{from_sql}\n" ")\n") + + +def is_distinct_sql(columns: Iterable[str], table_sql: str) -> str: + is_unique_sql = f"""WITH full_table AS ( + {select_from(columns, table_sql)} + ), + distinct_table AS ( + {select_from(columns, table_sql, distinct=True)} + ) + + SELECT (SELECT COUNT(*) FROM full_table) == (SELECT COUNT(*) FROM distinct_table) AS `is_unique` + """ + return is_unique_sql + - return f"`{column_name}`" +def ordering_clause( + ordering: Iterable[bigframes.core.ordering.OrderingExpression], +) -> str: + import bigframes.core.expression + parts = [] + for col_ref in ordering: + asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" + null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" + ordering_expr = col_ref.scalar_expression + # We don't know how to compile scalar expressions in isolation + if ordering_expr.is_const: + # Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here. + continue + assert isinstance( + ordering_expr, bigframes.core.expression.UnboundVariableExpression + ) + part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" + parts.append(part) + return f"ORDER BY {' ,'.join(parts)}" -def cast_as_string(column_name: str): + +def snapshot_clause(time_travel_timestamp: datetime.datetime): + return f"FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())})" + + +### Writing SQL Values (literals, column references, table references, etc.) 
+def simple_literal(value: str | int | bool | float): + """Return quoted input string.""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals + if isinstance(value, str): + return f'"{escape_special_characters(value)}"' + elif isinstance(value, (bool, int)): + return str(value) + elif isinstance(value, float): + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#floating_point_literals + if math.isnan(value): + return 'CAST("nan" as FLOAT)' + if value == math.inf: + return 'CAST("+inf" as FLOAT)' + if value == -math.inf: + return 'CAST("-inf" as FLOAT)' + return str(value) + else: + raise ValueError(f"Cannot produce literal for {value}") + + +def multi_literal(*values: str): + literal_strings = [simple_literal(i) for i in values] + return "(" + ", ".join(literal_strings) + ")" + + +def identifier(id: str) -> str: + """Return a string representing column reference in a SQL.""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers + # Just always escape, otherwise need to check against every reserved sql keyword + return f"`{escape_special_characters(id)}`" + + +def escape_special_characters(value: str): + """Escapes all special charactesrs""" + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#string_and_bytes_literals + trans_table = str.maketrans( + { + "\a": r"\a", + "\b": r"\b", + "\f": r"\f", + "\n": r"\n", + "\r": r"\r", + "\t": r"\t", + "\v": r"\v", + "\\": r"\\", + "?": r"\?", + '"': r"\"", + "'": r"\'", + "`": r"\`", + } + ) + return value.translate(trans_table) + + +def cast_as_string(column_name: str) -> str: """Return a string representing string casting of a column.""" - return f"CAST({column_reference(column_name)} AS STRING)" + return f"CAST({identifier(column_name)} AS STRING)" -def csv(values: Iterable[str], quoted=False): +def csv(values: Iterable[str], quoted=False) -> str: """Return a string of comma separated values.""" if quoted: - values = [quote(val) for val in values] + values = [simple_literal(val) for val in values] return ", ".join(values) + + +def table_reference(table_ref: bigquery.TableReference) -> str: + return f"`{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}`" + + +def infix_op(opname: str, left_arg: str, right_arg: str): + # Maybe should add parentheses?? 
+ return f"{left_arg} {opname} {right_arg}" diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 98e0dac1e8..550057819e 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -31,6 +31,7 @@ import bigframes from bigframes.core import log_adapter +import bigframes.core.sql import bigframes.formatting_helpers as formatting_helpers IO_ORDERING_ID = "bqdf_row_nums" @@ -348,7 +349,6 @@ def to_query( else: select_clause = "SELECT *" - where_clause = "" if filters: valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { "in": "IN", @@ -368,12 +368,12 @@ def to_query( ): filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) - or_expressions = [] + or_expression = "" for group in filters: if not isinstance(group, Iterable): group = [group] - and_expressions = [] + and_expression = "" for filter_item in group: if not isinstance(filter_item, tuple) or (len(filter_item) != 3): raise ValueError( @@ -392,17 +392,32 @@ def to_query( operator_str = valid_operators[operator] + column_ref = bigframes.core.sql.identifier(column) if operator_str in ["IN", "NOT IN"]: - value_list = ", ".join([repr(v) for v in value]) - expression = f"`{column}` {operator_str} ({value_list})" + value_literal = bigframes.core.sql.multi_literal(value) else: - expression = f"`{column}` {operator_str} {repr(value)}" - and_expressions.append(expression) - - or_expressions.append(" AND ".join(and_expressions)) - - if or_expressions: - where_clause = " WHERE " + " OR ".join(or_expressions) + value_literal = bigframes.core.sql.simple_literal(value) + expression = bigframes.core.sql.infix_op( + operator_str, column_ref, value_literal + ) + if and_expression: + and_expression = bigframes.core.sql.infix_op( + "AND", and_expression, expression + ) + else: + and_expression = expression + + if or_expression: + or_expression = bigframes.core.sql.infix_op( + "OR", or_expression, and_expression + ) + else: + or_expression = and_expression + + if or_expression: + where_clause = f" WHERE {or_expression}" + else: + where_clause = "" full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" return full_query diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index f6c1463e6c..655398b4a8 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -40,6 +40,7 @@ import bigframes.core.compile import bigframes.core.guid as guid import bigframes.core.ordering as order +import bigframes.core.sql import bigframes.dtypes import bigframes.session._io.bigquery.read_gbq_table import bigframes.session.clients @@ -133,14 +134,16 @@ def _create_time_travel_sql( """Query a table via 'time travel' for consistent reads.""" # If we have an anonymous query results table, it can't be modified and # there isn't any BigQuery time travel. 
+ selection = bigframes.core.sql.select_star( + bigframes.core.sql.table_reference(table_ref) + ) if table_ref.dataset_id.startswith("_"): - return f"SELECT * FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}`" + return selection return textwrap.dedent( f""" - SELECT * - FROM `{table_ref.project}`.`{table_ref.dataset_id}`.`{table_ref.table_id}` - FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())}) + {selection} + {bigframes.core.sql.snapshot_clause(time_travel_timestamp)} """ ) @@ -168,25 +171,14 @@ def _check_index_uniqueness( index_cols: List[str], api_name: str, ) -> bool: - distinct_table = table.select(*index_cols).distinct() - is_unique_sql = f"""WITH full_table AS ( - {ibis_client.compile(table)} - ), - distinct_table AS ( - {ibis_client.compile(distinct_table)} - ) - - SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, - (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` - """ + table_sql = ibis_client.compile(table) + is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table_sql) job_config = bigquery.QueryJobConfig() job_config.labels["bigframes-api"] = api_name results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) row = next(iter(results)) - total_count = row["total_count"] - distinct_count = row["distinct_count"] - return total_count == distinct_count + return row["is_unique"] def _get_primary_keys( From c16294c3c846b017717d7333345c8c99364f7800 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 10 May 2024 23:33:29 +0000 Subject: [PATCH 02/10] fix type annotations with future --- bigframes/core/sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index d4166eab9e..e3fac3b14b 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations """ Utility functions for SQL construction. 
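
Note on PATCH 02: `from __future__ import annotations` is what lets the PEP 604 union annotations introduced in PATCH 01 (for example `simple_literal(value: str | int | bool | float)`) import cleanly on Python 3.9, where `str | int` would otherwise be evaluated at function-definition time and raise `TypeError`. A minimal, standalone sketch of that behaviour (illustrative only, not part of the series; the function body below is a placeholder, not the real implementation):

from __future__ import annotations  # PEP 563: annotations are kept as unevaluated strings


# Mirrors the signature style used by bigframes.core.sql.simple_literal.
def takes_a_union(value: str | int | bool | float) -> str:
    # On Python 3.9 *without* the future import, defining this function fails with:
    #   TypeError: unsupported operand type(s) for |: 'type' and 'type'
    return repr(value)


print(takes_a_union(3.5))  # prints: 3.5
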
From 79bf3624774055a4d5b0cf553d53ef3e1530ef75 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 11 May 2024 00:07:01 +0000 Subject: [PATCH 03/10] fix select generation --- bigframes/core/sql.py | 107 +++++++++--------- bigframes/session/__init__.py | 1 + .../session/_io/bigquery/read_gbq_table.py | 12 +- 3 files changed, 60 insertions(+), 60 deletions(-) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index e3fac3b14b..8d063171d0 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -32,59 +32,6 @@ import bigframes.core.ordering -### Writing SELECT expressions -def select_from(columns: Iterable[str], from_sql: str, distinct: bool = False): - selection = ", ".join(map(identifier, columns)) - distinct_clause = "DISTINCT " if distinct else "" - - return textwrap.dedent( - f"SELECT {distinct_clause}{selection}\nFROM (\n" f"{from_sql}\n" ")\n" - ) - - -def select_star(from_sql): - return textwrap.dedent(f"SELECT * FROM (\n" f"{from_sql}\n" ")\n") - - -def is_distinct_sql(columns: Iterable[str], table_sql: str) -> str: - is_unique_sql = f"""WITH full_table AS ( - {select_from(columns, table_sql)} - ), - distinct_table AS ( - {select_from(columns, table_sql, distinct=True)} - ) - - SELECT (SELECT COUNT(*) FROM full_table) == (SELECT COUNT(*) FROM distinct_table) AS `is_unique` - """ - return is_unique_sql - - -def ordering_clause( - ordering: Iterable[bigframes.core.ordering.OrderingExpression], -) -> str: - import bigframes.core.expression - - parts = [] - for col_ref in ordering: - asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" - null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" - ordering_expr = col_ref.scalar_expression - # We don't know how to compile scalar expressions in isolation - if ordering_expr.is_const: - # Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here. - continue - assert isinstance( - ordering_expr, bigframes.core.expression.UnboundVariableExpression - ) - part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" - parts.append(part) - return f"ORDER BY {' ,'.join(parts)}" - - -def snapshot_clause(time_travel_timestamp: datetime.datetime): - return f"FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())})" - - ### Writing SQL Values (literals, column references, table references, etc.) def simple_literal(value: str | int | bool | float): """Return quoted input string.""" @@ -162,3 +109,57 @@ def table_reference(table_ref: bigquery.TableReference) -> str: def infix_op(opname: str, left_arg: str, right_arg: str): # Maybe should add parentheses?? 
return f"{left_arg} {opname} {right_arg}" + + +### Writing SELECT expressions +def select_from(columns: Iterable[str], subquery: str, distinct: bool = False): + selection = ", ".join(map(identifier, columns)) + distinct_clause = "DISTINCT " if distinct else "" + + return textwrap.dedent( + f"SELECT {distinct_clause}{selection}\nFROM (\n" f"{subquery}\n" ")\n" + ) + + +def select_table(table_ref: bigquery.TableReference): + return textwrap.dedent(f"SELECT * FROM {table_reference(table_ref)}") + + +def is_distinct_sql(columns: Iterable[str], table_sql: str) -> str: + is_unique_sql = f"""WITH full_table AS ( + {select_from(columns, table_sql)} + ), + distinct_table AS ( + {select_from(columns, table_sql, distinct=True)} + ) + + SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, + (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` + """ + return is_unique_sql + + +def ordering_clause( + ordering: Iterable[bigframes.core.ordering.OrderingExpression], +) -> str: + import bigframes.core.expression + + parts = [] + for col_ref in ordering: + asc_desc = "ASC" if col_ref.direction.is_ascending else "DESC" + null_clause = "NULLS LAST" if col_ref.na_last else "NULLS FIRST" + ordering_expr = col_ref.scalar_expression + # We don't know how to compile scalar expressions in isolation + if ordering_expr.is_const: + # Probably shouldn't have constants in ordering definition, but best to ignore if somehow they end up here. + continue + assert isinstance( + ordering_expr, bigframes.core.expression.UnboundVariableExpression + ) + part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" + parts.append(part) + return f"ORDER BY {' ,'.join(parts)}" + + +def snapshot_clause(time_travel_timestamp: datetime.datetime): + return f"FOR SYSTEM_TIME AS OF TIMESTAMP({repr(time_travel_timestamp.isoformat())})" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 473fc4f098..9f11ecd175 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1694,6 +1694,7 @@ def _start_query( Starts BigQuery query job and waits for results. """ job_config = self._prepare_query_job_config(job_config) + print(sql) return bigframes.session._io.bigquery.start_query_with_client( self.bqclient, sql, job_config, max_results, timeout ) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 655398b4a8..9f6247de36 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -134,9 +134,7 @@ def _create_time_travel_sql( """Query a table via 'time travel' for consistent reads.""" # If we have an anonymous query results table, it can't be modified and # there isn't any BigQuery time travel. - selection = bigframes.core.sql.select_star( - bigframes.core.sql.table_reference(table_ref) - ) + selection = bigframes.core.sql.select_table(table_ref) if table_ref.dataset_id.startswith("_"): return selection @@ -154,9 +152,9 @@ def get_ibis_time_travel_table( time_travel_timestamp: datetime.datetime, ) -> ibis_types.Table: try: - return ibis_client.sql( - _create_time_travel_sql(table_ref, time_travel_timestamp) - ) + sql = _create_time_travel_sql(table_ref, time_travel_timestamp) + print(sql) + return ibis_client.sql(sql) except google.api_core.exceptions.Forbidden as ex: # Ibis does a dry run to get the types of the columns from the SQL. 
if "Drive credentials" in ex.message: @@ -178,7 +176,7 @@ def _check_index_uniqueness( results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) row = next(iter(results)) - return row["is_unique"] + return row["total_count"] == row["distinct_count"] def _get_primary_keys( From 3bfed54523203236af1f3362820683c1b4c53e4f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 11 May 2024 00:35:46 +0000 Subject: [PATCH 04/10] fix to_query --- bigframes/session/_io/bigquery/__init__.py | 23 ++++++++++------------ tests/system/small/test_series.py | 6 ++++++ tests/unit/session/test_io_bigquery.py | 10 +++++----- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 550057819e..0c43a4596e 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -349,6 +349,7 @@ def to_query( else: select_clause = "SELECT *" + filter_string = "" if filters: valid_operators: Mapping[third_party_pandas_gbq.FilterOps, str] = { "in": "IN", @@ -368,7 +369,6 @@ def to_query( ): filters = typing.cast(third_party_pandas_gbq.FiltersType, [filters]) - or_expression = "" for group in filters: if not isinstance(group, Iterable): group = [group] @@ -394,7 +394,7 @@ def to_query( column_ref = bigframes.core.sql.identifier(column) if operator_str in ["IN", "NOT IN"]: - value_literal = bigframes.core.sql.multi_literal(value) + value_literal = bigframes.core.sql.multi_literal(*value) else: value_literal = bigframes.core.sql.simple_literal(value) expression = bigframes.core.sql.infix_op( @@ -407,17 +407,14 @@ def to_query( else: and_expression = expression - if or_expression: - or_expression = bigframes.core.sql.infix_op( - "OR", or_expression, and_expression + if filter_string: + filter_string = bigframes.core.sql.infix_op( + "OR", filter_string, and_expression ) else: - or_expression = and_expression + filter_string = and_expression - if or_expression: - where_clause = f" WHERE {or_expression}" - else: - where_clause = "" - - full_query = f"{select_clause} FROM {sub_query} AS sub{where_clause}" - return full_query + if filter_string: + return f"{select_clause} FROM {sub_query} AS sub WHERE {filter_string}" + else: + return f"{select_clause} FROM {sub_query} AS sub" diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index beb99b1ada..3025cb40c4 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -33,6 +33,12 @@ ) +def test_series_unicode_name(): + pd_series = pd.Series([1, 1, 2, 3, 5, 8, 13], name="fibonacci\"`'") + bf_result = series.Series(pd_series).to_pandas() + pd.testing.assert_series_equal(bf_result, pd_series) + + def test_series_construct_copy(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = series.Series( diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 9da085e824..031431a086 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -236,8 +236,8 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) ], ( "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " - "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " - "'こんにちは')" + '`rowindex` NOT IN (0, 6) OR `string_col` IN ("Hello, World!", ' + '"こんにちは")' ), id="table-all_params-filter_or_operation", ), @@ -257,7 +257,7 @@ def test_bq_schema_to_sql(schema: 
Iterable[bigquery.SchemaField], expected: str) rowindex, string_col, FROM `test_table` AS t - ) AS sub WHERE `rowindex` < 4 AND `string_col` = 'Hello, World!'""", + ) AS sub WHERE `rowindex` < 4 AND `string_col` = \"Hello, World!\"""", id="subquery-all_params-filter_and_operation", ), pytest.param( @@ -273,7 +273,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], [], [("date_col", ">", "2022-10-20")], - "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", + 'SELECT * FROM `test_table` AS sub WHERE `date_col` > "2022-10-20"', id="table-filter", ), pytest.param( @@ -289,7 +289,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], [], [("_TABLE_SUFFIX", ">", "2022-10-20")], - "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", + 'SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > "2022-10-20"', id="wildcard-filter", ), ], From 9057d91cd98d1ba4ba3eb0d731eee7eae70a2c27 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 13 May 2024 18:06:55 +0000 Subject: [PATCH 05/10] restore separate quoting of each table ref component --- bigframes/core/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 8d063171d0..05538c88f2 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -103,7 +103,7 @@ def csv(values: Iterable[str], quoted=False) -> str: def table_reference(table_ref: bigquery.TableReference) -> str: - return f"`{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}`" + return f"`{escape_special_characters(table_ref.project)}`.`{escape_special_characters(table_ref.dataset_id)}`.`{escape_special_characters(table_ref.table_id)}`" def infix_op(opname: str, left_arg: str, right_arg: str): From 55b72d03b617e143c59103f68a9cc8e0c064ac6a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 13 May 2024 20:57:33 +0000 Subject: [PATCH 06/10] fix apply axis 1 value lists --- bigframes/core/blocks.py | 6 ++++-- bigframes/core/sql.py | 6 +----- tests/system/small/test_remote_function.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6546068a07..830fa74f0b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2096,7 +2096,7 @@ def _get_rows_as_json_values(self) -> Block: ) column_names.append(serialized_column_name) - column_names_csv = sql.csv(column_names, quoted=True) + column_names_csv = sql.csv(map(sql.simple_literal, column_names)) # index columns count index_columns_count = len(self.index_columns) @@ -2108,7 +2108,9 @@ def _get_rows_as_json_values(self) -> Block: # types of the columns to serialize for the row column_types = list(self.index.dtypes) + list(self.dtypes) - column_types_csv = sql.csv([str(typ) for typ in column_types], quoted=True) + column_types_csv = sql.csv( + [sql.simple_literal(str(typ)) for typ in column_types] + ) # row dtype to use for deserializing the row as pandas series pandas_row_dtype = bigframes.dtypes.lcd_type(*column_types) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 05538c88f2..0db1db3dd0 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -93,12 +93,8 @@ def cast_as_string(column_name: str) -> str: return f"CAST({identifier(column_name)} AS STRING)" -def csv(values: Iterable[str], quoted=False) -> str: +def csv(values: Iterable[str]) -> str: """Return a string of comma separated values.""" - - if quoted: - values = 
[simple_literal(val) for val in values] - return ", ".join(values) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 9c60c821a7..979fade100 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -724,7 +724,7 @@ def add_ints(row): ) -@pytest.mark.flaky(retries=2, delay=120) +# @pytest.mark.flaky(retries=2, delay=120) def test_df_apply_axis_1_ordering(session, scalars_dfs): columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] ordering_columns = ["bool_col", "int64_col"] From 1ec2710367e17e250ec276044b173d47137ef7bc Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 13 May 2024 21:23:52 +0000 Subject: [PATCH 07/10] switch to single quoting, remove prints --- bigframes/core/sql.py | 3 ++- bigframes/session/_io/bigquery/read_gbq_table.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 0db1db3dd0..3ad06610b6 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -37,7 +37,8 @@ def simple_literal(value: str | int | bool | float): """Return quoted input string.""" # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals if isinstance(value, str): - return f'"{escape_special_characters(value)}"' + # Single quoting seems to work nicer with ibis than double quoting + return f"'{escape_special_characters(value)}'" elif isinstance(value, (bool, int)): return str(value) elif isinstance(value, float): diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 9f6247de36..2879f6a31f 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -153,7 +153,6 @@ def get_ibis_time_travel_table( ) -> ibis_types.Table: try: sql = _create_time_travel_sql(table_ref, time_travel_timestamp) - print(sql) return ibis_client.sql(sql) except google.api_core.exceptions.Forbidden as ex: # Ibis does a dry run to get the types of the columns from the SQL. 
From f06d3ffee90007c0d1efff0a52d9a521cca4cd85 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 13 May 2024 21:45:27 +0000 Subject: [PATCH 08/10] fix unit test quoting --- tests/unit/session/test_io_bigquery.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 5d6bb75dd1..57f9e00363 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -228,8 +228,8 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) ], ( "SELECT `row_index`, `string_col` FROM `test_table` AS sub WHERE " - '`rowindex` NOT IN (0, 6) OR `string_col` IN ("Hello, World!", ' - '"こんにちは")' + "`rowindex` NOT IN (0, 6) OR `string_col` IN ('Hello, World!', " + "'こんにちは')" ), id="table-all_params-filter_or_operation", ), @@ -249,7 +249,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) rowindex, string_col, FROM `test_table` AS t - ) AS sub WHERE `rowindex` < 4 AND `string_col` = \"Hello, World!\"""", + ) AS sub WHERE `rowindex` < 4 AND `string_col` = \'Hello, World!\'""", id="subquery-all_params-filter_and_operation", ), pytest.param( @@ -265,7 +265,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], [], [("date_col", ">", "2022-10-20")], - 'SELECT * FROM `test_table` AS sub WHERE `date_col` > "2022-10-20"', + "SELECT * FROM `test_table` AS sub WHERE `date_col` > '2022-10-20'", id="table-filter", ), pytest.param( @@ -281,7 +281,7 @@ def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str) [], [], [("_TABLE_SUFFIX", ">", "2022-10-20")], - 'SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > "2022-10-20"', + "SELECT * FROM `test_table*` AS sub WHERE `_TABLE_SUFFIX` > '2022-10-20'", id="wildcard-filter", ), ], From 50da1d32ebf5169409105bce98908992bb4f3ab8 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 15 May 2024 19:33:00 +0000 Subject: [PATCH 09/10] remove unnecessary test --- tests/system/small/test_series.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 29ad17ca62..fa514784c0 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -33,12 +33,6 @@ ) -def test_series_unicode_name(): - pd_series = pd.Series([1, 1, 2, 3, 5, 8, 13], name="fibonacci\"`'") - bf_result = series.Series(pd_series).to_pandas() - pd.testing.assert_series_equal(bf_result, pd_series) - - def test_series_construct_copy(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = series.Series( From 9dc937eba89f7a600abf36fceb0163a148aa643f Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 15 May 2024 20:59:48 +0000 Subject: [PATCH 10/10] revert unintended changes --- bigframes/session/__init__.py | 1 - tests/system/large/test_remote_function.py | 2 +- tests/system/small/test_remote_function.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5895bd994d..727269e7ee 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1701,7 +1701,6 @@ def _start_query( Starts BigQuery query job and waits for results. 
""" job_config = self._prepare_query_job_config(job_config) - print(sql) return bigframes.session._io.bigquery.start_query_with_client( self.bqclient, sql, job_config, max_results, timeout ) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 0fa1d90e8b..4aa62c0f6d 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -221,7 +221,7 @@ def stringify(x): ) -# @pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2, delay=120) def test_remote_function_binop(session, scalars_dfs, dataset_id, bq_cf_connection): try: diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 979fade100..9c60c821a7 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -724,7 +724,7 @@ def add_ints(row): ) -# @pytest.mark.flaky(retries=2, delay=120) +@pytest.mark.flaky(retries=2, delay=120) def test_df_apply_axis_1_ordering(session, scalars_dfs): columns = ["bool_col", "int64_col", "int64_too", "float64_col", "string_col"] ordering_columns = ["bool_col", "int64_col"]