diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 069bd5d260..928123ce74 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1120,8 +1120,9 @@ def _create_total_ordering( ordering_hash_part = guid.generate_guid("bigframes_ordering_") ordering_rand_part = guid.generate_guid("bigframes_ordering_") + # All inputs into hash must be non-null or resulting hash will be null str_values = list( - map(lambda col: _convert_to_string(table[col]), table.columns) + map(lambda col: _convert_to_nonnull_string(table[col]), table.columns) ) full_row_str = ( str_values[0].concat(*str_values[1:]) @@ -1419,7 +1420,7 @@ def _can_cluster_bq(field: bigquery.SchemaField): ) -def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: +def _convert_to_nonnull_string(column: ibis_types.Column) -> ibis_types.StringValue: col_type = column.type() if ( col_type.is_numeric() @@ -1436,4 +1437,6 @@ def _convert_to_string(column: ibis_types.Column) -> ibis_types.StringColumn: # TO_JSON_STRING works with all data types, but isn't the most efficient # Needed for JSON, STRUCT and ARRAY datatypes result = vendored_ibis_ops.ToJsonString(column).to_expr() # type: ignore - return typing.cast(ibis_types.StringColumn, result) + # Escape backslashes and use backslash as delimiter + escaped = typing.cast(ibis_types.StringColumn, result.fillna("")).replace("\\", "\\\\") # type: ignore + return typing.cast(ibis_types.StringColumn, ibis.literal("\\")).concat(escaped) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e522878229..a0cf25807c 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2703,6 +2703,14 @@ def test_sample(scalars_dfs, frac, n, random_state): assert bf_result.shape[1] == scalars_df.shape[1] +def test_sample_determinism(penguins_df_default_index): + df = penguins_df_default_index.sample(n=100, random_state=12345).head(15) + 
bf_result = df.to_pandas() + bf_result2 = df.to_pandas() + + pandas.testing.assert_frame_equal(bf_result, bf_result2) + + def test_sample_raises_value_error(scalars_dfs): scalars_df, _ = scalars_dfs with pytest.raises(