googleapis · sycai · Sep 17, 2024 · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024
@@ -0,0 +1,3 @@
+{"rowindex": 0, "int_list_col": [1],        "bool_list_col": [true],        "float_list_col": [1.2, 2.3],        "date_list_col": ["2021-07-21"],                "date_time_list_col": ["2021-07-21 11:39:45"], "numeric_list_col": [1.2, 2.3, 3.4], "string_list_col": ["abc", "de", "f"]}
+{"rowindex": 1, "int_list_col": [1,2],      "bool_list_col": [true, false], "float_list_col": [1.1],             "date_list_col": ["2021-07-21", "1987-03-28"],  "date_time_list_col": ["1999-03-14 17:22:00"], "numeric_list_col": [5.5, 2.3],      "string_list_col": ["a", "bc", "de"]}
+{"rowindex": 2, "int_list_col": [1,2,3],    "bool_list_col": [true],        "float_list_col": [0.5, -1.9, 2.3],  "date_list_col": ["2017-08-01", "2004-11-22"],  "date_time_list_col": ["1979-06-03 03:20:45"], "numeric_list_col": [1.7],           "string_list_col": ["", "a"]}
@@ -0,0 +1,42 @@
+[
+    {
+        "name": "rowindex",
+        "type": "INTEGER",
+        "mode": "REQUIRED"
+    },
+    {
+        "name": "int_list_col",
+        "type": "INTEGER",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "bool_list_col",
+        "type": "BOOLEAN",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "float_list_col",
+        "type": "FLOAT",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "date_list_col",
+        "type": "DATE",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "date_time_list_col",
+        "type": "DATETIME",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "numeric_list_col",
+        "type": "NUMERIC",
+        "mode": "REPEATED"
+    },
+    {
+        "name": "string_list_col",
+        "type": "STRING",
+        "mode": "REPEATED"
+    }
+]
@@ -39,6 +39,7 @@
 import bigframes
 import bigframes.dataframe
 import bigframes.pandas as bpd
+import bigframes.series
 import tests.system.utils

 # Use this to control the number of cloud functions being deleted in a single
@@ -294,6 +295,7 @@ def load_test_data_tables(
        ("scalars", "scalars_schema.json", "scalars.jsonl"),
        ("scalars_too", "scalars_schema.json", "scalars.jsonl"),
        ("nested", "nested_schema.json", "nested.jsonl"),
+        ("repeated", "repeated_schema.json", "repeated.jsonl"),
        ("penguins", "penguins_schema.json", "penguins.jsonl"),
        ("time_series", "time_series_schema.json", "time_series.jsonl"),
        ("hockey_players", "hockey_players.json", "hockey_players.jsonl"),
@@ -370,6 +372,11 @@ def nested_table_id(test_data_tables) -> str:
    return test_data_tables["nested"]


+@pytest.fixture(scope="session")
+def repeated_table_id(test_data_tables) -> str:
+    """Returns the ``test_data_tables`` entry stored under the "repeated" key."""
+    return test_data_tables["repeated"]
+
+
 @pytest.fixture(scope="session")
 def penguins_table_id(test_data_tables) -> str:
    return test_data_tables["penguins"]
@@ -410,6 +417,26 @@ def nested_pandas_df() -> pd.DataFrame:
    return df


+@pytest.fixture(scope="session")
+def repeated_df(
+    repeated_table_id: str, session: bigframes.Session
+) -> bigframes.dataframe.DataFrame:
+    """Returns a bigframes DataFrame containing columns of list type.
+
+    The data is read from ``repeated_table_id`` with ``session.read_gbq`` and
+    indexed by the ``rowindex`` column.
+    """
+    return session.read_gbq(repeated_table_id, index_col="rowindex")
+
+
+@pytest.fixture(scope="session")
+def repeated_pandas_df() -> pd.DataFrame:
+    """Returns a pandas DataFrame containing columns of list type.
+
+    The data is loaded from the line-delimited ``repeated.jsonl`` file under
+    ``DATA_DIR`` and indexed by the ``rowindex`` column.
+    """
+
+    df = pd.read_json(
+        DATA_DIR / "repeated.jsonl",
+        lines=True,
+    )
+    df = df.set_index("rowindex")
+    return df
+
+
 @pytest.fixture(scope="session")
 def scalars_df_default_index(
    scalars_df_index: bigframes.dataframe.DataFrame,

@@ -18,8 +18,6 @@
 import pyarrow as pa
 import pytest

-import bigframes.pandas as bpd
-
 from ...utils import assert_series_equal


@@ -32,19 +30,34 @@
        pytest.param(slice(0, 2, None), id="default_step_slice"),
    ],
 )
-def test_getitem(key):
+@pytest.mark.parametrize(
+    ("column_name", "dtype"),
+    [
+        pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))),
+        pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))),
+        pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))),
+        pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))),
+        pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))),
+        pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))),
+        pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))),
+    ],
+)
+def test_getitem(key, column_name, dtype, repeated_df, repeated_pandas_df):
    if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"):
        pytest.skip(
            "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data"
        )
-    data = [[1], [2, 3], [4, 5, 6]]
-    s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
-    pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))

-    bf_result = s.list[key].to_pandas()
-    pd_result = pd_s.list[key]
+    bf_result = repeated_df[column_name].list[key].to_pandas()
+    pd_result = repeated_pandas_df[column_name].astype(dtype).list[key]

-    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+    assert_series_equal(
+        pd_result,
+        bf_result,
+        check_dtype=False,
+        check_index_type=False,
+        check_names=False,
+    )


 @pytest.mark.parametrize(
@@ -60,24 +73,36 @@ def test_getitem(key):
        (slice(0, 2, 2), pytest.raises(NotImplementedError)),
    ],
 )
-def test_getitem_notsupported(key, expectation):
-    data = [[1], [2, 3], [4, 5, 6]]
-    s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
-
+def test_getitem_notsupported(key, expectation, repeated_df):
    with expectation as e:
-        assert s.list[key] == e
+        assert repeated_df["int_list_col"].list[key] == e


-def test_len():
+@pytest.mark.parametrize(
+    ("column_name", "dtype"),
+    [
+        pytest.param("int_list_col", pd.ArrowDtype(pa.list_(pa.int64()))),
+        pytest.param("bool_list_col", pd.ArrowDtype(pa.list_(pa.bool_()))),
+        pytest.param("float_list_col", pd.ArrowDtype(pa.list_(pa.float64()))),
+        pytest.param("date_list_col", pd.ArrowDtype(pa.list_(pa.date32()))),
+        pytest.param("date_time_list_col", pd.ArrowDtype(pa.list_(pa.timestamp("us")))),
+        pytest.param("numeric_list_col", pd.ArrowDtype(pa.list_(pa.decimal128(38, 9)))),
+        pytest.param("string_list_col", pd.ArrowDtype(pa.list_(pa.string()))),
+    ],
+)
+def test_len(column_name, dtype, repeated_df, repeated_pandas_df):
+    """Checks that ``Series.list.len()`` agrees with pandas on each list column.
+
+    The bigframes result is computed from the ``repeated_df`` column; the pandas
+    reference is computed from the same ``repeated_pandas_df`` column cast to
+    the parametrized Arrow list ``dtype``. Dtypes, index types and names are
+    not compared. The test is skipped on pandas versions older than 2.2.0.
+    """
    if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"):
        pytest.skip(
            "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data"
        )
-    data = [[], [1], [1, 2], [1, 2, 3]]
-    s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
-    pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))

-    bf_result = s.list.len().to_pandas()
-    pd_result = pd_s.list.len()
+    bf_result = repeated_df[column_name].list.len().to_pandas()
+    pd_result = repeated_pandas_df[column_name].astype(dtype).list.len()

-    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)
+    assert_series_equal(
+        pd_result,
+        bf_result,
+        check_dtype=False,
+        check_index_type=False,
+        check_names=False,
+    )
@@ -615,21 +615,28 @@ def test_getitem_w_string(scalars_dfs, index):
 @pytest.mark.parametrize(
    ("index"),
    [
-        pytest.param(2, id="int"),
+        pytest.param(0, id="int"),
        pytest.param(slice(None, None, None), id="default_start_slice"),
        pytest.param(slice(0, None, 1), id="default_stop_slice"),
        pytest.param(slice(0, 2, None), id="default_step_slice"),
        pytest.param(slice(0, 0, None), id="single_one_slice"),
    ],
 )
-def test_getitem_w_array(index):
-    data = [[1], [2, 3], [], [4, 5, 6]]
-    s = bpd.Series(data)
-    pd_s = pd.Series(data)
-
-    bf_result = s.str[index].to_pandas()
-    pd_result = pd_s.str[index]
-    # Skip dtype checks here because pandas returns `int64` while BF returns `Int64`.
+@pytest.mark.parametrize(
+    "column_name",
+    [
+        pytest.param("int_list_col"),
+        pytest.param("bool_list_col"),
+        pytest.param("float_list_col"),
+        pytest.param("string_list_col"),
+        # date, date_time and numeric columns are excluded because their default
+        # dtypes differ between pandas and bigframes.
+    ],
+)
+def test_getitem_w_array(index, column_name, repeated_df, repeated_pandas_df):
+    """Checks that ``.str[index]`` on list columns agrees with pandas.
+
+    Runs for every combination of the parametrized ``index`` (an int or a
+    slice) and ``column_name``, comparing the ``repeated_df`` column against
+    the same column of ``repeated_pandas_df``.
+    """
+    bf_result = repeated_df[column_name].str[index].to_pandas()
+    pd_result = repeated_pandas_df[column_name].str[index]
+
+    # Dtypes and index types are not compared; e.g. pandas returns `int64`
+    # where bigframes returns `Int64`.
    assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False)