Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

MNT Use copy=False when creating DataFrames #26272

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
sklearn/datasets/_arff_parser.py — 6 changes: 4 additions & 2 deletions
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def _io_to_generator(gzip_file):

# calculate chunksize
first_row = next(arff_container["data"])
first_df = pd.DataFrame([first_row], columns=columns_names)
first_df = pd.DataFrame([first_row], columns=columns_names, copy=False)

row_bytes = first_df.memory_usage(deep=True).sum()
chunksize = get_chunk_n_rows(row_bytes)
Expand All @@ -196,7 +196,9 @@ def _io_to_generator(gzip_file):
columns_to_keep = [col for col in columns_names if col in columns_to_select]
dfs = [first_df[columns_to_keep]]
for data in _chunk_generator(arff_container["data"], chunksize):
dfs.append(pd.DataFrame(data, columns=columns_names)[columns_to_keep])
dfs.append(
pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep]
)
frame = pd.concat(dfs, ignore_index=True)
del dfs, first_df

Expand Down
sklearn/datasets/_base.py — 2 changes: 1 addition & 1 deletion
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _convert_data_dataframe(
):
pd = check_pandas_support("{} with as_frame=True".format(caller_name))
if not sparse_data:
data_df = pd.DataFrame(data, columns=feature_names)
data_df = pd.DataFrame(data, columns=feature_names, copy=False)
else:
data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names)

Expand Down
sklearn/utils/_set_output.py — 2 changes: 1 addition & 1 deletion
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _wrap_in_pandas_container(
data_to_wrap.columns = columns
return data_to_wrap

return pd.DataFrame(data_to_wrap, index=index, columns=columns)
return pd.DataFrame(data_to_wrap, index=index, columns=columns, copy=False)


def _get_output_config(method, estimator=None):
Expand Down
sklearn/utils/_testing.py — 2 changes: 1 addition & 1 deletion
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ def _convert_container(container, constructor_name, columns_name=None, dtype=Non
return sp.sparse.csr_matrix(container, dtype=dtype)
elif constructor_name == "dataframe":
pd = pytest.importorskip("pandas")
return pd.DataFrame(container, columns=columns_name, dtype=dtype)
return pd.DataFrame(container, columns=columns_name, dtype=dtype, copy=False)
elif constructor_name == "series":
pd = pytest.importorskip("pandas")
return pd.Series(container, dtype=dtype)
Expand Down
sklearn/utils/estimator_checks.py — 22 changes: 11 additions & 11 deletions
Original file line number Diff line number Diff line change
Expand Up @@ -925,11 +925,11 @@ def check_sample_weights_pandas_series(name, estimator_orig):
[3, 4],
]
)
X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X))
X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X), copy=False)
y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
weights = pd.Series([1] * 12)
if _safe_tags(estimator, key="multioutput_only"):
y = pd.DataFrame(y)
y = pd.DataFrame(y, copy=False)
try:
estimator.fit(X, y, sample_weight=weights)
except ValueError:
Expand Down Expand Up @@ -3218,10 +3218,10 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type):

y_ = np.asarray(y)
if y_.ndim == 1:
y_ = pd.Series(y_)
y_ = pd.Series(y_, copy=False)
else:
y_ = pd.DataFrame(y_)
X_ = pd.DataFrame(np.asarray(X))
y_ = pd.DataFrame(y_, copy=False)
X_ = pd.DataFrame(np.asarray(X), copy=False)

except ImportError:
raise SkipTest(
Expand Down Expand Up @@ -3897,7 +3897,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
n_samples, n_features = X_orig.shape

names = np.array([f"col_{i}" for i in range(n_features)])
X = pd.DataFrame(X_orig, columns=names)
X = pd.DataFrame(X_orig, columns=names, copy=False)

if is_regressor(estimator):
y = rng.normal(size=n_samples)
Expand Down Expand Up @@ -3985,7 +3985,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
early_stopping_enabled = any(value is True for value in params.values())

for invalid_name, additional_message in invalid_names:
X_bad = pd.DataFrame(X, columns=invalid_name)
X_bad = pd.DataFrame(X, columns=invalid_name, copy=False)

expected_msg = re.escape(
"The feature names should match those that were passed during fit.\n"
Expand Down Expand Up @@ -4094,7 +4094,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig):
y_[::2, 1] *= 2

feature_names_in = [f"col{i}" for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names_in)
df = pd.DataFrame(X, columns=feature_names_in, copy=False)
X_transform = transformer.fit_transform(df, y=y_)

# error is raised when `input_features` do not match feature_names_in
Expand Down Expand Up @@ -4324,7 +4324,7 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas):
# We always rely on the output of `get_feature_names_out` of the
# transformer used to generate the dataframe as a ground-truth of the
# columns.
expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas)
expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False)

try:
pd.testing.assert_frame_equal(df_trans, expected_dataframe)
Expand Down Expand Up @@ -4359,7 +4359,7 @@ def check_set_output_transform_pandas(name, transformer_orig):
set_random_state(transformer)

feature_names_in = [f"col{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names_in)
df = pd.DataFrame(X, columns=feature_names_in, copy=False)

transformer_default = clone(transformer).set_output(transform="default")
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)
Expand Down Expand Up @@ -4401,7 +4401,7 @@ def check_global_ouptut_transform_pandas(name, transformer_orig):
set_random_state(transformer)

feature_names_in = [f"col{i}" for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names_in)
df = pd.DataFrame(X, columns=feature_names_in, copy=False)

transformer_default = clone(transformer).set_output(transform="default")
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)
Expand Down
Morty Proxy: This is a proxified and sanitized view of the page; visit the original site.