From aadbfb16bf14b8808ab20a3884b51a438abfea5d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 16 Nov 2023 01:38:19 +0000 Subject: [PATCH 1/3] fix: correctly handle null values when initializing fingerprint ordering --- bigframes/session/__init__.py | 10 +++++++--- tests/system/small/test_dataframe.py | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 069bd5d260..72af1090b8 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1120,8 +1120,9 @@ def _create_total_ordering( ordering_hash_part = guid.generate_guid("bigframes_ordering_") ordering_rand_part = guid.generate_guid("bigframes_ordering_") + # All inputs into hash must be non-null or resulting hash will be null str_values = list( - map(lambda col: _convert_to_string(table[col]), table.columns) + map(lambda col: _convert_to_nonnull_string(table[col]), table.columns) ) full_row_str = ( str_values[0].concat(*str_values[1:]) @@ -1419,7 +1420,7 @@ def _can_cluster_bq(field: bigquery.SchemaField): ) -def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: +def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringColumn: col_type = column.type() if ( col_type.is_numeric() @@ -1436,4 +1437,7 @@ def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: # TO_JSON_STRING works with all data types, but isn't the most efficient # Needed for JSON, STRUCT and ARRAY datatypes result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore - return typing.cast(ibis_types.StringColumn, result) + # Escape backslashes and use backslash as delineator + return ibis.literal("\\") + typing.cast(ibis_types.StringColumn, result).replace( + "\\", "\\\\" + ).fillna("") diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e522878229..a0cf25807c 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2703,6 +2703,14 @@ def test_sample(scalars_dfs, frac, n, random_state): assert bf_result.shape[1] == scalars_df.shape[1] +def test_sample_determinism(penguins_df_default_index): + df = penguins_df_default_index.sample(n=100, random_state=12345).head(15) + bf_result = df.to_pandas() + bf_result2 = df.to_pandas() + + pandas.testing.assert_frame_equal(bf_result, bf_result2) + + def test_sample_raises_value_error(scalars_dfs): scalars_df, _ = scalars_dfs with pytest.raises( From 7744e2b7d398822d0a93596d217c27b0601a39fd Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 16 Nov 2023 01:59:41 +0000 Subject: [PATCH 2/3] fix mypy --- bigframes/session/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 72af1090b8..344500230e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1438,6 +1438,7 @@ def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringCo # Needed for JSON, STRUCT and ARRAY datatypes result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore # Escape backslashes and use backslash as delineator - return ibis.literal("\\") + typing.cast(ibis_types.StringColumn, result).replace( + escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace( "\\", "\\\\" - ).fillna("") + ) + return typing.cast(ibis_types.StringColumn, ibis.literal("\\") + escaped) From 2764510a7a7feb4cb97cbd670794a66555119062 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 16 Nov 2023 02:07:05 +0000 Subject: [PATCH 3/3] fix mypy 2 --- bigframes/session/__init__.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 344500230e..928123ce74 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1420,7 +1420,7 @@ def _can_cluster_bq(field: bigquery.SchemaField): ) -def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringColumn: +def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue: col_type = column.type() if ( col_type.is_numeric() @@ -1438,7 +1438,5 @@ def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringCo # Needed for JSON, STRUCT and ARRAY datatypes result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore # Escape backslashes and use backslash as delineator - escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace( - "\\", "\\\\" - ) - return typing.cast(ibis_types.StringColumn, ibis.literal("\\") + escaped) + escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore + return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped)