Description
Describe the bug
Hello,
I'm currently working with scikit-learn version 1.6, and I encountered a regression that wasn't present in version 1.4.
The minimal code below (see "Steps/Code to Reproduce") computes two features: the cumulative (expanding) mean of `age` and of weight (column `poids`), each grouped by `id`. Each transformation function returns a `pandas.Series`.
When I run this code with scikit-learn 1.6, I get the error shown in the traceback below: `ValueError: Shape of passed values is (20, 1), indices imply (10, 1)`.
After investigation, I found that the issue occurs because each transformer returns a Series, not a DataFrame. If I update the functions to return DataFrame objects instead, the error disappears.
Interestingly, in scikit-learn 1.4, the same code works correctly even when the functions return Series.
Do you have any explanation for why this changed between versions 1.4 and 1.6?
Thanks in advance for your help!
Steps/Code to Reproduce
import pandas as pd
from sklearn.pipeline import FunctionTransformer, FeatureUnion
import numpy as np
def compute_cumulative_mean_age(df: pd.DataFrame) -> pd.Series:
    """Return the per-``id`` expanding mean of the ``age`` column.

    The ``age`` values are cast to float, grouped by ``df["id"]``, and an
    expanding mean is taken inside each group. The ``id`` index level is
    then dropped and the values are reindexed on ``df.index`` so that the
    result lines up with the input rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame providing the ``id`` and ``age`` columns.

    Returns
    -------
    pd.Series
        Series named ``cumulative_mean_age``, indexed like ``df``.
    """
    ages = df["age"].astype(float)
    running_mean = ages.groupby(df["id"]).expanding().mean()
    # Drop the "id" index level and restore the original row order.
    aligned = running_mean.droplevel(level="id").reindex(df.index)
    return aligned.rename("cumulative_mean_age")
def compute_cumulative_mean_weight(df: pd.DataFrame) -> pd.Series:
    """Return the per-``id`` expanding mean of the ``poids`` (weight) column.

    The ``poids`` values are cast to float, grouped by ``df["id"]``, and an
    expanding mean is taken inside each group. The ``id`` index level is
    then dropped and the values are reindexed on ``df.index`` so that the
    result lines up with the input rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame providing the ``id`` and ``poids`` columns.

    Returns
    -------
    pd.Series
        Series named ``cumulative_mean_weight``, indexed like ``df``.
    """
    weights = df["poids"].astype(float)
    running_mean = weights.groupby(df["id"]).expanding().mean()
    # Drop the "id" index level and restore the original row order.
    aligned = running_mean.droplevel(level="id").reindex(df.index)
    return aligned.rename("cumulative_mean_weight")
def compute_features(df: pd.DataFrame) -> pd.DataFrame:
    """Compute both cumulative-mean features through a ``FeatureUnion``.

    Each feature function is wrapped in a ``FunctionTransformer`` and the
    two are combined in a ``FeatureUnion`` configured with
    ``set_output(transform="pandas")``. The union is fitted and applied to
    ``df`` and the result is cast to float.

    NOTE: this is the reproducer from the report. Both wrapped functions
    return a ``pandas.Series``, which the report identifies as the trigger
    for the ``ValueError`` under scikit-learn 1.6.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame handed to ``fit_transform``.

    Returns
    -------
    pd.DataFrame
        The union's output, cast to float.
    """
    feature_functions = (
        ("cumulative_mean_age", compute_cumulative_mean_age),
        ("cumulative_mean_weight", compute_cumulative_mean_weight),
    )
    transformer_list = [
        (name, FunctionTransformer(func)) for name, func in feature_functions
    ]
    union = FeatureUnion(transformer_list).set_output(transform="pandas")
    combined = union.fit_transform(X=df)
    return combined.astype(float)
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Entry point of the reproducer: delegate to ``compute_features``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame, passed through unchanged.

    Returns
    -------
    pd.DataFrame
        Whatever ``compute_features(df)`` returns.
    """
    features = compute_features(df)
    return features
if __name__ == "__main__":
    # Fixed seed so the randomly drawn columns are reproducible.
    np.random.seed(42)
    ids = [1, 2, 3, 1, 4, 5, 6, 6, 7, 8]
    # Draw order is kept (ages first, then weights) so that the seeded
    # values match the original script.
    ages = np.random.randint(18, 70, size=10)
    weights = np.random.randint(50, 100, size=10)
    df = pd.DataFrame({"id": ids, "age": ages, "poids": weights})
    print(transform(df))
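Expected Results
The code should run without error, as it does with scikit-learn 1.4. Instead, scikit-learn 1.6 produces the following traceback: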
Traceback (most recent call last):
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\tmp.py", line 73, in <module>
print(transform(df=df))
^^^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\tmp.py", line 55, in transform
return compute_features(
^^^^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\tmp.py", line 45, in compute_features
.fit_transform(
^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 332, in wrapped
return _wrap_data_with_container(method, data_to_wrap, X, self)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 307, in _wrap_data_with_container
return adapter.create_container(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 135, in create_container
X_output = _create_pandas_dataframe_from_non_pandas_container(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\.venv\Lib\site-packages\sklearn\utils\fixes.py", line 428, in _create_pandas_dataframe_from_non_pandas_container
return pd.DataFrame(X, index=index, copy=copy)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\.venv\Lib\site-packages\pandas\core\frame.py", line 722, in __init__
mgr = ndarray_to_mgr(
^^^^^^^^^^^^^^^
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\.venv\Lib\site-packages\pandas\core\internals\construction.py", line 349, in ndarray_to_mgr
_check_values_indices_shape_match(values, index, columns)
File "C:\Users\XXX\PycharmProjects\fraude_detection_pec_audio\.venv\Lib\site-packages\pandas\core\internals\construction.py", line 420, in _check_values_indices_shape_match
raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
ValueError: Shape of passed values is (20, 1), indices imply (10, 1)
Actual Results
raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
ValueError: Shape of passed values is (20, 1), indices imply (10, 1)
Versions
sklearn: 1.6.0
numpy: 1.26.4
pandas: 1.5.3