From 930afc34f5f0ef059fc5dbc20c9aec60bb1fe5bd Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 31 Oct 2020 11:25:25 -0700 Subject: [PATCH 01/29] Backport PR #37520: DOC: Start v1.1.5 release notes (#37537) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.4.rst | 2 +- doc/source/whatsnew/v1.1.5.rst | 36 ++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.1.5.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index b8abc71ca64a2..ae9228b04f44b 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.5 v1.1.4 v1.1.3 v1.1.2 diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index fb8687b8ba42c..6353dbfafc9f1 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -52,4 +52,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.3..v1.1.4|HEAD +.. contributors:: v1.1.3..v1.1.4 diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst new file mode 100644 index 0000000000000..cf728d94b2a55 --- /dev/null +++ b/doc/source/whatsnew/v1.1.5.rst @@ -0,0 +1,36 @@ +.. _whatsnew_115: + +What's new in 1.1.5 (??) +------------------------ + +These are the changes in pandas 1.1.5. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_115.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_115.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_115.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.1.4..v1.1.5|HEAD From d6014169138e5b3447c6b1c2b58c4e815e77bb91 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 4 Nov 2020 15:54:21 +0000 Subject: [PATCH 02/29] Backport PR #37461 on branch 1.1.x: BUG: Metadata propagation for groupby iterator (#37628) Co-authored-by: Janus --- doc/source/whatsnew/v1.1.5.rst | 2 +- pandas/core/groupby/ops.py | 15 ++++++++++++--- pandas/tests/groupby/test_groupby_subclass.py | 9 +++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index cf728d94b2a55..a122154904996 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -23,7 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3aaeef3b63760..5ea4f0cbb6a0c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -131,9 +131,16 @@ def get_iterator(self, data: FrameOrSeries, axis: int = 0): splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() for key, (i, group) in zip(keys, splitter): - yield key, group + yield key, group.__finalize__(data, method="groupby") def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": + """ + Returns + ------- + Generator yielding subsetted objects + + __finalize__ has not been called for the subsetted objects returned. 
+ """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -955,7 +962,8 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return type(sdata)(mgr, name=sdata.name, fastpath=True) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr, name=sdata.name, fastpath=True) class FrameSplitter(DataSplitter): @@ -971,7 +979,8 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return type(sdata)(mgr) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr) def get_splitter( diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 7271911c5f80f..a97f4d95a677d 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -51,6 +51,15 @@ def test_groupby_preserves_subclass(obj, groupby_func): tm.assert_series_equal(result1, result2) +def test_groupby_preserves_metadata(): + # GH-37343 + custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]}) + assert "testattr" in custom_df._metadata + custom_df.testattr = "hello" + for _, group_df in custom_df.groupby("c"): + assert group_df.testattr == "hello" + + @pytest.mark.parametrize( "obj", [DataFrame, tm.SubclassedDataFrame], ) From 88945abb5e8bfe092f3fa140b18e7deabc3f679b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 9 Nov 2020 12:49:09 +0000 Subject: [PATCH 03/29] Backport PR #37657 on branch 1.1.x: BUG: unpickling modifies Block.ndim (#37713) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/internals/managers.py | 9 ++++++--- pandas/tests/io/test_pickle.py | 
12 ++++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index a122154904996..e0fa68e3b9f80 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -24,6 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) +- Bug in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c9ac9cb0f140a..4c52343d08513 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -284,14 +284,17 @@ def __getstate__(self): return axes_array, block_values, block_items, extra_state def __setstate__(self, state): - def unpickle_block(values, mgr_locs): - return make_block(values, placement=mgr_locs) + def unpickle_block(values, mgr_locs, ndim: int): + # TODO(EA2D): ndim would be unnecessary with 2D EAs + return make_block(values, placement=mgr_locs, ndim=ndim) if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: state = state[3]["0.14.1"] self.axes = [ensure_index(ax) for ax in state["axes"]] + ndim = len(self.axes) self.blocks = tuple( - unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"] + unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) + for b in state["blocks"] ) else: raise NotImplementedError("pre-0.14.1 pickles are no longer supported") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index e4d43db7834e3..376091c62619b 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -477,3 +477,15 @@ def test_read_pickle_with_subclass(): tm.assert_series_equal(result[0], expected[0]) assert isinstance(result[1], MyTz) + + +def test_pickle_preserves_block_ndim(): + # GH#37631 + ser = 
pd.Series(list("abc")).astype("category").iloc[[0]] + res = tm.round_trip_pickle(ser) + + assert res._mgr.blocks[0].ndim == 1 + assert res._mgr.blocks[0].shape == (1,) + + # GH#37631 OP issue was about indexing, underlying problem was pickle + tm.assert_series_equal(res[[True]], ser) From 3b86ff4fc39af22fe211ff0fefb6d90ad41b35b1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 10 Nov 2020 14:52:28 +0000 Subject: [PATCH 04/29] Backport PR #37661 on branch 1.1.x: BUG: RollingGroupby when groupby key is in the index (#37741) Co-authored-by: Matthew Roeschke --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/window/rolling.py | 35 ++++++++++++++++++++++------- pandas/tests/window/test_grouper.py | 32 +++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index e0fa68e3b9f80..a29ae1912e338 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -25,6 +25,7 @@ Bug fixes ~~~~~~~~~ - Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) - Bug in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) +- Bug in :class:`RollingGroupby` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 617c43e0a59ed..ce7281988e105 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2193,20 +2193,39 @@ def _apply( use_numba_cache, **kwargs, ) - # Cannot use _wrap_outputs because we calculate the result all at once - # Compose MultiIndex result from grouping levels then rolling level - # Aggregate the MultiIndex data as tuples then the level names - grouped_object_index = self.obj.index - grouped_index_name = [*grouped_object_index.names] - groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] - result_index_names = groupby_keys + grouped_index_name + # Reconstruct the resulting MultiIndex from tuples + # 1st set of levels = group by labels + # 2nd set of levels = original index + # Ignore 2nd set of levels if a group by label includes an index level + result_index_names = [ + grouping.name for grouping in self._groupby.grouper._groupings + ] + grouped_object_index = None + + column_keys = [ + key + for key in result_index_names + if key not in self.obj.index.names or key is None + ] + + if len(column_keys) == len(result_index_names): + grouped_object_index = self.obj.index + grouped_index_name = [*grouped_object_index.names] + result_index_names += grouped_index_name + else: + # Our result will have still kept the column in the result + result = result.drop(columns=column_keys, errors="ignore") result_index_data = [] for key, values in self._groupby.grouper.indices.items(): for value in values: data = [ *com.maybe_make_list(key), - *com.maybe_make_list(grouped_object_index[value]), + *com.maybe_make_list( + grouped_object_index[value] + if grouped_object_index is not None + else [] + ), ] result_index_data.append(tuple(data)) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 493a844ca7a44..da31fbaddc6e4 100644 
--- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Series +from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm from pandas.core.groupby.groupby import get_groupby @@ -449,3 +449,33 @@ def test_groupby_rolling_no_sort(self): index=pd.MultiIndex.from_tuples([(2, 0), (1, 1)], names=["foo", None]), ) tm.assert_frame_equal(result, expected) + + def test_groupby_rolling_group_keys(self): + # GH 37641 + arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] + index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) + + s = Series([1, 2, 3], index=index) + result = s.groupby(["idx1", "idx2"], group_keys=False).rolling(1).mean() + expected = Series( + [1.0, 2.0, 3.0], + index=MultiIndex.from_tuples( + [("val1", "val1"), ("val1", "val1"), ("val2", "val2")], + names=["idx1", "idx2"], + ), + ) + tm.assert_series_equal(result, expected) + + def test_groupby_rolling_index_level_and_column_label(self): + arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]] + index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2")) + + df = DataFrame({"A": [1, 1, 2], "B": range(3)}, index=index) + result = df.groupby(["idx1", "A"]).rolling(1).mean() + expected = DataFrame( + {"B": [0.0, 1.0, 2.0]}, + index=MultiIndex.from_tuples( + [("val1", 1), ("val1", 1), ("val2", 2)], names=["idx1", "A"] + ), + ) + tm.assert_frame_equal(result, expected) From 18e7b411a0f0af56110c1e865a4dc25c132aeaad Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 13 Nov 2020 01:04:32 -0800 Subject: [PATCH 05/29] Backport PR #37780: BUG: adding Timedelta to DatetimeIndex raising incorrectly (#37806) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.5.rst | 2 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/tests/indexes/datetimes/test_misc.py | 14 +++++++++++++- 3 files changed, 15 
insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index a29ae1912e338..3b1f64e730830 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c6945e2f78b5a..a10912aa45baa 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1253,7 +1253,7 @@ def _add_timedeltalike_scalar(self, other): # adding a scalar preserves freq new_freq = self.freq - return type(self)(new_values, dtype=self.dtype, freq=new_freq) + return type(self)._simple_new(new_values, dtype=self.dtype, freq=new_freq) def _add_timedelta_arraylike(self, other): """ diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 51841727d510b..7bf9455252d49 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -7,7 +7,7 @@ import pytest import pandas as pd -from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets +from pandas import DatetimeIndex, Index, Timedelta, Timestamp, date_range, offsets import pandas._testing as tm @@ -408,3 +408,15 @@ def test_isocalendar_returns_correct_values_close_to_new_year_with_tz(): dtype="UInt32", ) tm.assert_frame_equal(result, expected_data_frame) + + +def test_add_timedelta_preserves_freq(): + # GH#37295 should hold for any DTI with freq=None or Tick freq + tz = "Canada/Eastern" + dti = date_range( + start=Timestamp("2019-03-26 00:00:00-0400", tz=tz), + end=Timestamp("2020-10-17 00:00:00-0400", tz=tz), + freq="D", + ) + result = dti + Timedelta(days=1) + assert 
result.freq == dti.freq From 8f7d1d503baa787fee4aa981257fbe7db930ba7b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 13 Nov 2020 10:43:39 -0800 Subject: [PATCH 06/29] Backport PR #37812: CI: The `set-env` and `add-path` commands are deprecated and will be disabled on November 16th. (#37814) Co-authored-by: Simon Hawkins --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 149acef72db26..af9f41062d096 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: steps: - name: Setting conda path - run: echo "::add-path::${HOME}/miniconda3/bin" + run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH - name: Checkout uses: actions/checkout@v1 @@ -104,7 +104,7 @@ jobs: steps: - name: Setting conda path - run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}" + run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH - name: Checkout uses: actions/checkout@v1 From 74688649de86d09b0547d6ebb0db14acd1a6f969 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 14 Nov 2020 10:30:19 +0000 Subject: [PATCH 07/29] Backport PR #37801 on branch 1.1.x: REGR: SeriesGroupBy where index has a tuple name fails (#37829) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/series.py | 6 +++--- pandas/tests/groupby/test_groupby.py | 10 ++++++++++ pandas/tests/series/indexing/test_indexing.py | 4 ++-- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 3b1f64e730830..2a598c489e809 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -15,6 +15,7 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) +- Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 18a201674db65..277aa1d095350 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -889,7 +889,7 @@ def __getitem__(self, key): return result - except KeyError: + except (KeyError, TypeError): if isinstance(key, tuple) and isinstance(self.index, MultiIndex): # We still have the corner case where a tuple is a key # in the first level of our MultiIndex @@ -953,7 +953,7 @@ def _get_values_tuple(self, key): return result if not isinstance(self.index, MultiIndex): - raise ValueError("key of type tuple not found and not a MultiIndex") + raise KeyError("key of type tuple not found and not a MultiIndex") # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) @@ -1009,7 +1009,7 @@ def __setitem__(self, key, value): except TypeError as err: if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): - raise ValueError( + raise KeyError( "key of type tuple not found and not a MultiIndex" ) from err diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bdb283ae445b1..35eab708412b8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2055,3 +2055,13 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected + + +def test_groupby_series_with_tuple_name(): + # GH 37755 + ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=("a", "a")) + ser.index.name = ("b", "b") + result = ser.groupby(level=0).last() + expected = 
Series([2, 4], index=[1, 2], name=("a", "a")) + expected.index.name = ("b", "b") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index fbdac2bb2d8e8..3b66939d9ddd2 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -387,9 +387,9 @@ def test_2d_to_1d_assignment_raises(): def test_basic_getitem_setitem_corner(datetime_series): # invalid tuples, e.g. td.ts[:, None] vs. td.ts[:, 2] msg = "key of type tuple not found and not a MultiIndex" - with pytest.raises(ValueError, match=msg): + with pytest.raises(KeyError, match=msg): datetime_series[:, 2] - with pytest.raises(ValueError, match=msg): + with pytest.raises(KeyError, match=msg): datetime_series[:, 2] = 2 # weird lists. [slice(0, 5)] will work but not two slices From 77681d6bea186b99351b80dc409a8765bd34fb17 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sun, 15 Nov 2020 11:31:48 +0100 Subject: [PATCH 08/29] Backport PR #37787 on branch 1.1.x (Fix regression for loc and __setitem__ when one-dimensional tuple was given for MultiIndex) (#37849) --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/indexing.py | 4 ++-- pandas/tests/indexing/multiindex/test_loc.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 2a598c489e809..323342cb43950 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) - Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`) +- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` 
when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d21ff6ee17537..c33cb396e576b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -641,9 +641,9 @@ def _ensure_listlike_indexer(self, key, axis=None): if self.ndim != 2: return - if isinstance(key, tuple): + if isinstance(key, tuple) and not isinstance(self.obj.index, ABCMultiIndex): # key may be a tuple if we are .loc - # in that case, set key to the column part of key + # if index is not a MultiIndex, set key to column part key = key[column_axis] axis = column_axis diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 95a23a9bcf63b..4ee7e34bc4a0c 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -288,6 +288,24 @@ def convert_nested_indexer(indexer_type, keys): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("klass", [Series, DataFrame]) + def test_multiindex_loc_one_dimensional_tuple(self, klass): + # GH#37711 + mi = MultiIndex.from_tuples([("a", "A"), ("b", "A")]) + obj = klass([1, 2], index=mi) + obj.loc[("a",)] = 0 + expected = klass([0, 2], index=mi) + tm.assert_equal(obj, expected) + + @pytest.mark.parametrize("indexer", [("a",), ("a")]) + def test_multiindex_one_dimensional_tuple_columns(self, indexer): + # GH#37711 + mi = MultiIndex.from_tuples([("a", "A"), ("b", "A")]) + obj = DataFrame([1, 2], index=mi) + obj.loc[indexer, :] = 0 + expected = DataFrame([0, 2], index=mi) + tm.assert_frame_equal(obj, expected) + @pytest.mark.parametrize( "indexer, pos", From 288e319c1f39079adc8ea5c9a652b7015d1130a4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 25 Nov 2020 10:52:56 +0000 Subject: [PATCH 09/29] Backport PR #37986 on branch 1.1.x: REGR: fix inplace operations for 
EAs with non-EA arg (#38035) Co-authored-by: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> --- doc/source/whatsnew/v1.1.5.rst | 2 +- pandas/core/ops/methods.py | 3 ++- .../tests/scalar/timedelta/test_arithmetic.py | 1 + pandas/tests/series/test_arithmetic.py | 27 +++++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 323342cb43950..609c3650c8cc2 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -17,7 +17,7 @@ Fixed regressions - Regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) - Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) -- +- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index 6a44178e3c704..17223d6a54d4a 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -3,6 +3,7 @@ """ import operator +from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.ops.roperator import ( @@ -97,7 +98,7 @@ def f(self, other): if ( self.ndim == 1 and result._indexed_same(self) - and result.dtype == self.dtype + and is_dtype_equal(result.dtype, self.dtype) ): # GH#36498 this inplace op can _actually_ be inplace. 
self._values[:] = result._values diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index cb33f99d9bd91..2757766724156 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -429,6 +429,7 @@ def test_td_div_numeric_scalar(self): _is_numpy_dev and not compat.PY39, raises=RuntimeWarning, reason="https://github.com/pandas-dev/pandas/issues/31992", + strict=False, ), ), float("nan"), diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index ef2bafd4ea2ad..7394f15555f7b 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -687,3 +687,30 @@ def test_datetime_understood(self): result = series - offset expected = pd.Series(pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"])) tm.assert_series_equal(result, expected) + + +class TestInplaceOperations: + @pytest.mark.parametrize( + "dtype1, dtype2, dtype_expected, dtype_mul", + ( + ("Int64", "Int64", "Int64", "Int64"), + ("float", "float", "float", "float"), + ("Int64", "float", "float", "float"), + ), + ) + def test_series_inplace_ops(self, dtype1, dtype2, dtype_expected, dtype_mul): + # GH 37910 + + ser1 = Series([1], dtype=dtype1) + ser2 = Series([2], dtype=dtype2) + ser1 += ser2 + expected = Series([3], dtype=dtype_expected) + tm.assert_series_equal(ser1, expected) + + ser1 -= ser2 + expected = Series([1], dtype=dtype_expected) + tm.assert_series_equal(ser1, expected) + + ser1 *= ser2 + expected = Series([2], dtype=dtype_mul) + tm.assert_series_equal(ser1, expected) From 345e9fc76d39403088cc7c71a8a03ff834aa2e94 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 25 Nov 2020 11:51:02 +0000 Subject: [PATCH 10/29] Backport PR #37039: CI: move py39 build to conda #33948 (#38059) Co-authored-by: Fangchen Li --- .travis.yml | 10 +--------- ci/azure/posix.yml | 5 +++++ ci/build39.sh | 13 ------------- 
ci/deps/azure-39.yaml | 17 +++++++++++++++++ ci/setup_env.sh | 5 ----- 5 files changed, 23 insertions(+), 27 deletions(-) delete mode 100755 ci/build39.sh create mode 100644 ci/deps/azure-39.yaml diff --git a/.travis.yml b/.travis.yml index 1e5ea21b0f2d9..f43f4a1d16ff8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,14 +29,6 @@ matrix: fast_finish: true include: - # In allowed failures - - dist: bionic - python: 3.9-dev - env: - - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" - - env: - - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)" - - env: - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)" @@ -92,7 +84,7 @@ install: script: - echo "script start" - echo "$JOB" - - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi + - source activate pandas-dev - ci/run_tests.sh after_script: diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index f716974f6add1..457a1f3f507d4 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -65,6 +65,11 @@ jobs: PANDAS_TESTING_MODE: "deprecate" EXTRA_APT: "xsel" + py39: + ENV_FILE: ci/deps/azure-39.yaml + CONDA_PY: "39" + PATTERN: "not slow and not network and not clipboard" + steps: - script: | if [ "$(uname)" == "Linux" ]; then diff --git a/ci/build39.sh b/ci/build39.sh deleted file mode 100755 index f2ef11d5a71f4..0000000000000 --- a/ci/build39.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -e -# Special build for python3.9 until numpy puts its own wheels up - -sudo apt-get install build-essential gcc xvfb -pip install --no-deps -U pip wheel setuptools -pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis - -python setup.py build_ext -inplace -python -m pip install --no-build-isolation -e . 
- -python -c "import sys; print(sys.version_info)" -python -c "import pandas as pd" -python -c "import hypothesis" diff --git a/ci/deps/azure-39.yaml b/ci/deps/azure-39.yaml new file mode 100644 index 0000000000000..67edc83a9d738 --- /dev/null +++ b/ci/deps/azure-39.yaml @@ -0,0 +1,17 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.9.* + + # tools + - cython>=0.29.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - numpy + - python-dateutil + - pytz diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 065f9e56ea171..9adb6fe674099 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,10 +1,5 @@ #!/bin/bash -e -if [ "$JOB" == "3.9-dev" ]; then - /bin/bash ci/build39.sh - exit 0 -fi - # edit the locale file if needed if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" From 6d5d7dff4b1ef958a38860a07726e373562a55d4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 25 Nov 2020 12:48:13 +0000 Subject: [PATCH 11/29] BUG: pytables in py39 (#38041) (#38061) Co-authored-by: jbrockmendel --- ci/deps/azure-39.yaml | 5 +++++ doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/computation/pytables.py | 4 ++++ pandas/tests/io/pytables/test_store.py | 2 +- 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/deps/azure-39.yaml b/ci/deps/azure-39.yaml index 67edc83a9d738..c4c84e73fa684 100644 --- a/ci/deps/azure-39.yaml +++ b/ci/deps/azure-39.yaml @@ -15,3 +15,8 @@ dependencies: - numpy - python-dateutil - pytz + + # optional dependencies + - pytables + - scipy + - pyarrow=1.0 diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 609c3650c8cc2..dd88f79371d65 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -28,6 +28,7 @@ Bug fixes - Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) - Bug in indexing on a :class:`Series` with 
``CategoricalDtype`` after unpickling (:issue:`37631`) - Bug in :class:`RollingGroupby` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) +- Bug in pytables methods in python 3.9 (:issue:`38041`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 001eb1789007f..a3389a80b017a 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -426,6 +426,10 @@ def visit_Subscript(self, node, **kwargs): except AttributeError: pass + if isinstance(slobj, Term): + # In py39 np.ndarray lookups with Term containing int raise + slobj = slobj.value + try: return self.const_type(value[slobj], self.env) except TypeError as err: diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index df014171be817..290828daacd9c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -4500,7 +4500,7 @@ def test_categorical(self, setup_path): # Appending must have the same categories df3 = df.copy() - df3["s"].cat.remove_unused_categories(inplace=True) + df3["s"] = df3["s"].cat.remove_unused_categories() with pytest.raises(ValueError): store.append("df3", df3) From 6201a09fa1029cd0c856064b5d310a109dbb1111 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 26 Nov 2020 05:56:28 -0800 Subject: [PATCH 12/29] Backport PR #38064: DOC: tidy 1.1.5 release notes (#38081) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.5.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index dd88f79371d65..a8bbf692a72e5 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -14,10 +14,13 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ -- Regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) +- Fixed regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`) - Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) - Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) +- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) +- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) +- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) .. --------------------------------------------------------------------------- @@ -25,11 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in metadata propagation for ``groupby`` iterator (:issue:`37343`) -- Bug in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) -- Bug in :class:`RollingGroupby` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Bug in pytables methods in python 3.9 (:issue:`38041`) -- .. 
--------------------------------------------------------------------------- From 6845248cee8e57629ea4061b2c132cd79127ec1d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 26 Nov 2020 13:59:14 +0000 Subject: [PATCH 13/29] CI: troubleshoot travis-36-cov on 1.1.x (#37311) --- .travis.yml | 2 +- .../{travis-36-cov.yaml => travis-37-cov.yaml} | 15 ++++++++------- pandas/tests/groupby/test_categorical.py | 9 --------- 3 files changed, 9 insertions(+), 17 deletions(-) rename ci/deps/{travis-36-cov.yaml => travis-37-cov.yaml} (79%) diff --git a/.travis.yml b/.travis.yml index f43f4a1d16ff8..f16b19814874e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,7 +46,7 @@ matrix: # Enabling Deprecations when running tests # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs # See pandas/_testing.py for more details. - - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" + - JOB="3.7, coverage" ENV_FILE="ci/deps/travis-37-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" services: - mysql - postgresql diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-37-cov.yaml similarity index 79% rename from ci/deps/travis-36-cov.yaml rename to ci/deps/travis-37-cov.yaml index 8c8db106af05c..c89b42ef06a2e 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -1,9 +1,8 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.21 @@ -22,18 +21,21 @@ dependencies: - geopandas - html5lib - matplotlib - - moto + - moto>=1.3.14 + - flask - nomkl - numexpr - - numpy=1.15.* + - numpy=1.16.* - odfpy - openpyxl - pandas-gbq + - google-cloud-bigquery>=1.27.2 # GH 36436 - psycopg2 - - pyarrow>=0.13.0 - - pymysql + - pyarrow>=0.15.0 + - 
pymysql<0.10.0 # temporary pin, GH 36465 - pytables - python-snappy + - python-dateutil - pytz - s3fs>=0.4.0 - scikit-learn @@ -49,5 +51,4 @@ dependencies: - brotlipy - coverage - pandas-datareader - - python-dateutil - pyxlsb diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0d447a70b540d..eb3847e9ac19f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import PY37, is_platform_windows - import pandas as pd from pandas import ( Categorical, @@ -13,7 +11,6 @@ Index, MultiIndex, Series, - _np_version_under1p17, qcut, ) import pandas._testing as tm @@ -244,12 +241,6 @@ def test_level_get_group(observed): tm.assert_frame_equal(result, expected) -# GH#21636 flaky on py37; may be related to older numpy, see discussion -# https://github.com/MacPython/pandas-wheels/pull/64 -@pytest.mark.xfail( - PY37 and _np_version_under1p17 and not is_platform_windows(), - reason="Flaky, GH-27902", -) @pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 From c53df5f0ea396c132ceacbe9d1efe5ab8de482e4 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 27 Nov 2020 19:47:54 +0000 Subject: [PATCH 14/29] Backport PR #34407 on branch 1.1.x: REGR: revert "CLN: _consolidate_inplace less" / fix regression in fillna() (#38115) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/generic.py | 6 ++++++ pandas/core/internals/managers.py | 6 ++++++ pandas/tests/frame/test_missing.py | 11 +++++++++++ 4 files changed, 24 insertions(+) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index a8bbf692a72e5..29b0e99a3a356 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) - Fixed 
regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) +- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index be85ab251c0c3..1c6248ad71b62 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3484,6 +3484,8 @@ class animal locomotion if axis == 1: return self[key] + self._consolidate_inplace() + index = self.index if isinstance(index, MultiIndex): loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) @@ -6011,6 +6013,8 @@ def fillna( inplace = validate_bool_kwarg(inplace, "inplace") value, method = validate_fillna_kwargs(value, method) + self._consolidate_inplace() + # set the default here, so functions examining the signaure # can detect if something was set (e.g. 
in groupby) (GH9221) if axis is None: @@ -6449,6 +6453,8 @@ def replace( if not is_bool(regex) and to_replace is not None: raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") + self._consolidate_inplace() + if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4c52343d08513..67bf2584bb84e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -417,6 +417,7 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: def quantile( self, axis: int = 0, + consolidate: bool = True, transposed: bool = False, interpolation="linear", qs=None, @@ -430,6 +431,8 @@ def quantile( Parameters ---------- axis: reduction axis, default 0 + consolidate: bool, default True. Join together blocks having same + dtype transposed: bool, default False we are holding transposed data interpolation : type of interpolation, default 'linear' @@ -444,6 +447,9 @@ def quantile( # simplify some of the code here and in the blocks assert self.ndim >= 2 + if consolidate: + self._consolidate_inplace() + def get_axe(block, qs, axes): # Because Series dispatches to DataFrame, we will always have # block.ndim == 2 diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index b4f91590e09d1..0f5048dde3250 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -717,3 +717,14 @@ def test_fill_corner(self, float_frame, float_string_frame): # TODO(wesm): unused? 
result = empty_float.fillna(value=0) # noqa + + +def test_fillna_nonconsolidated_frame(): + # https://github.com/pandas-dev/pandas/issues/36495 + df = DataFrame( + [[1, 1, 1, 1.0], [2, 2, 2, 2.0], [3, 3, 3, 3.0]], + columns=["i1", "i2", "i3", "f1"], + ) + df_nonconsol = df.pivot("i1", "i2") + result = df_nonconsol.fillna(0) + assert result.isna().sum().sum() == 0 From a627c707616bf5f36d8a50bbec580932ae652a31 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 Nov 2020 14:31:02 -0800 Subject: [PATCH 15/29] Backport PR #38094: REGR: fix regression in groupby aggregation with out-of-bounds datetimes (#38123) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/_libs/reduction.pyx | 4 +++- .../tests/groupby/aggregate/test_aggregate.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 29b0e99a3a356..9be1ff7e836ea 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) - Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) - Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) +- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). 
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 7b36bc8baf891..0d6f7f955b217 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -46,7 +46,9 @@ cdef class _BaseGrouper: Slider islider, Slider vslider): if cached_typ is None: cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) + cached_typ = self.typ( + vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name + ) else: # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 127d3fadee555..9d67397d2ccb9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1,6 +1,7 @@ """ test .agg behavior / note that .apply is tested generally in test_groupby.py """ +import datetime import functools from functools import partial @@ -1089,3 +1090,21 @@ def test_agg_no_suffix_index(): result = df["A"].agg(["sum", lambda x: x.sum(), lambda x: x.sum()]) expected = pd.Series([12, 12, 12], index=["sum", "", ""], name="A") tm.assert_series_equal(result, expected) + + +def test_aggregate_datetime_objects(): + # https://github.com/pandas-dev/pandas/issues/36003 + # ensure we don't raise an error but keep object dtype for out-of-bounds + # datetimes + df = DataFrame( + { + "A": ["X", "Y"], + "B": [ + datetime.datetime(2005, 1, 1, 10, 30, 23, 540000), + datetime.datetime(3005, 1, 1, 10, 30, 23, 540000), + ], + } + ) + result = df.groupby("A").B.max() + expected = df.set_index("A")["B"] + tm.assert_series_equal(result, expected) From 45c1016a09cf5849b39f9c6aed01325ad26eba4e Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 Nov 2020 15:35:31 -0800 Subject: [PATCH 16/29] Backport PR #38087: BLD: Only enable -Werror in the 
CI jobs (#38124) Co-authored-by: Uwe L. Korn --- ci/setup_env.sh | 6 ++++++ doc/source/whatsnew/v1.1.5.rst | 8 ++++++++ setup.py | 9 +++++---- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 9adb6fe674099..d725075b85fc6 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -111,6 +111,12 @@ fi echo "activate pandas-dev" source activate pandas-dev +# Explicitly set an environment variable indicating that this is pandas' CI environment. +# +# This allows us to enable things like -Werror that shouldn't be activated in +# downstream CI jobs that may also build pandas from source. +export PANDAS_CI=1 + echo echo "remove any installed pandas package" echo "w/o removing anything else" diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 9be1ff7e836ea..46c4ad4f35fe4 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -34,6 +34,14 @@ Bug fixes .. --------------------------------------------------------------------------- +.. _whatsnew_115.other: + +Other +~~~~~ +- Only set ``-Werror`` as a compiler flag in the CI jobs (:issue:`33315`, :issue:`33314`) + +.. --------------------------------------------------------------------------- + .. 
_whatsnew_115.contributors: Contributors diff --git a/setup.py b/setup.py index 5555592de45e0..915847c2936bb 100755 --- a/setup.py +++ b/setup.py @@ -427,15 +427,16 @@ def run(self): endian_macro = [("__LITTLE_ENDIAN__", "1")] +extra_compile_args = [] +extra_link_args = [] if is_platform_windows(): - extra_compile_args = [] - extra_link_args = [] if debugging_symbols_requested: extra_compile_args.append("/Z7") extra_link_args.append("/DEBUG") else: - extra_compile_args = ["-Werror"] - extra_link_args = [] + # PANDAS_CI=1 is set by ci/setup_env.sh + if os.environ.get("PANDAS_CI", "0") == "1": + extra_compile_args.append("-Werror") if debugging_symbols_requested: extra_compile_args.append("-g") From 8a2b8e264f4025a2666a7efd4ec5689776e42473 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 30 Nov 2020 12:13:24 +0000 Subject: [PATCH 17/29] Backport PR #36927: BUG: Fix duplicates in intersection of multiindexes (#38155) Co-authored-by: patrick <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/indexes/base.py | 9 ++++++--- pandas/core/indexes/multi.py | 8 ++++++-- pandas/core/ops/__init__.py | 6 +++++- pandas/core/reshape/merge.py | 9 +++++++-- pandas/tests/indexes/multi/test_setops.py | 23 +++++++++++++++++++++++ pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexes/test_setops.py | 10 ++++++++++ pandas/tests/reshape/merge/test_merge.py | 2 +- 9 files changed, 60 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 46c4ad4f35fe4..edc2f7327abfc 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -23,6 +23,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - 
Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). +- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e4dee2b0a08ce..b0f64bd76a174 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2654,7 +2654,7 @@ def intersection(self, other, sort=False): self._assert_can_do_setop(other) other = ensure_index(other) - if self.equals(other): + if self.equals(other) and not self.has_duplicates: return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): @@ -2672,7 +2672,7 @@ def intersection(self, other, sort=False): except TypeError: pass else: - return self._wrap_setop_result(other, result) + return self._wrap_setop_result(other, algos.unique1d(result)) try: indexer = Index(rvals).get_indexer(lvals) @@ -2683,13 +2683,16 @@ def intersection(self, other, sort=False): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - taken = other.take(indexer) + taken = other.take(indexer).unique() res_name = get_op_result_name(self, other) if sort is None: taken = algos.safe_sort(taken.values) return self._shallow_copy(taken, name=res_name) + # Intersection has to be unique + assert algos.unique(taken._values).shape == taken._values.shape + taken.name = res_name return taken diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b9ba823ca1b0b..6ad82e81e7c30 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3398,6 +3398,8 @@ def intersection(self, other, sort=False): other, result_names = self._convert_can_do_setop(other) if self.equals(other): + if self.has_duplicates: + return self.unique() return 
self if not is_object_dtype(other.dtype): @@ -3416,10 +3418,12 @@ def intersection(self, other, sort=False): uniq_tuples = None # flag whether _inner_indexer was successful if self.is_monotonic and other.is_monotonic: try: - uniq_tuples = self._inner_indexer(lvals, rvals)[0] - sort = False # uniq_tuples is already sorted + inner_tuples = self._inner_indexer(lvals, rvals)[0] + sort = False # inner_tuples is already sorted except TypeError: pass + else: + uniq_tuples = algos.unique(inner_tuples) if uniq_tuples is None: other_uniq = set(rvals) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 60f3d23aaed13..5e1b8cd8dc8ca 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -539,7 +539,11 @@ def _should_reindex_frame_op( if fill_value is None and level is None and axis is default_axis: # TODO: any other cases we should handle here? cols = left.columns.intersection(right.columns) - if not (cols.equals(left.columns) and cols.equals(right.columns)): + + # Intersection is always unique so we have to check the unique columns + left_uniques = left.columns.unique() + right_uniques = right.columns.unique() + if not (cols.equals(left_uniques) and cols.equals(right_uniques)): return True return False diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2349cb1dcc0c7..f43a33d088df0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1209,7 +1209,9 @@ def _validate_specification(self): raise MergeError("Must pass left_on or left_index=True") else: # use the common columns - common_cols = self.left.columns.intersection(self.right.columns) + left_cols = self.left.columns + right_cols = self.right.columns + common_cols = left_cols.intersection(right_cols) if len(common_cols) == 0: raise MergeError( "No common columns to perform merge on. 
" @@ -1218,7 +1220,10 @@ def _validate_specification(self): f"left_index={self.left_index}, " f"right_index={self.right_index}" ) - if not common_cols.is_unique: + if ( + not left_cols.join(common_cols, how="inner").is_unique + or not right_cols.join(common_cols, how="inner").is_unique + ): raise MergeError(f"Data columns not unique: {repr(common_cols)}") self.left_on = self.right_on = common_cols elif self.on is not None: diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index d7427ee622977..8637c4cb8bffb 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -375,3 +375,26 @@ def test_setops_disallow_true(method): with pytest.raises(ValueError, match="The 'sort' keyword only takes"): getattr(idx1, method)(idx2, sort=True) + + +@pytest.mark.parametrize( + ("tuples", "exp_tuples"), + [ + ([("val1", "test1")], [("val1", "test1")]), + ([("val1", "test1"), ("val1", "test1")], [("val1", "test1")]), + ( + [("val2", "test2"), ("val1", "test1")], + [("val2", "test2"), ("val1", "test1")], + ), + ], +) +def test_intersect_with_duplicates(tuples, exp_tuples): + # GH#36915 + left = MultiIndex.from_tuples(tuples, names=["first", "second"]) + right = MultiIndex.from_tuples( + [("val1", "test1"), ("val1", "test1"), ("val2", "test2")], + names=["first", "second"], + ) + result = left.intersection(right) + expected = MultiIndex.from_tuples(exp_tuples, names=["first", "second"]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 59ee88117a984..a8dedffae0e65 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -688,7 +688,7 @@ def test_intersection_monotonic(self, index2, keeps_name, sort): @pytest.mark.parametrize( "index2,expected_arr", - [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), ["A", "B", "A"])], + [(Index(["B", "D"]), ["B"]), (Index(["B", "D", "A"]), 
["A", "B"])], ) def test_intersection_non_monotonic_non_unique(self, index2, expected_arr, sort): # non-monotonic non-unique diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 1a40fe550be61..26d7c14b46e08 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -95,3 +95,13 @@ def test_union_dtypes(left, right, expected): b = pd.Index([], dtype=right) result = (a | b).dtype assert result == expected + + +@pytest.mark.parametrize("values", [[1, 2, 2, 3], [3, 3]]) +def test_intersection_duplicates(values): + # GH#31326 + a = pd.Index(values) + b = pd.Index([3, 3]) + result = a.intersection(b) + expected = pd.Index([3]) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4fd3c688b8771..491ec97e5dee9 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -742,7 +742,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) From 2daf4cd8fe97067e0470e1ec8c0ef6d11a895b61 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 30 Nov 2020 18:27:05 +0000 Subject: [PATCH 18/29] Backport PR #38120: API: preserve freq in DTI/TDI.factorize (#38185) Co-authored-by: jbrockmendel --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/algorithms.py | 28 +++++++++- pandas/core/arrays/datetimelike.py | 14 +++++ pandas/core/arrays/period.py | 4 ++ .../tests/indexes/datetimes/test_datetime.py | 51 +++++++++++++------ .../indexes/timedeltas/test_timedelta.py | 11 +++- .../indexing/multiindex/test_multiindex.py | 10 ++++ pandas/tests/window/common.py | 1 - 8 files changed, 100 insertions(+), 20 deletions(-) diff --git 
a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index edc2f7327abfc..4770ab37e08d2 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`) - Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`) - Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) +- Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`) - Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 48d4fe65942fe..32b5eae25ff5d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -46,11 +46,13 @@ pandas_dtype, ) from pandas.core.dtypes.generic import ( + ABCDatetimeArray, ABCExtensionArray, ABCIndex, ABCIndexClass, ABCMultiIndex, ABCSeries, + ABCTimedeltaArray, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -191,8 +193,16 @@ def _reconstruct_data( ------- ExtensionArray or np.ndarray """ + if isinstance(values, ABCExtensionArray) and values.dtype == dtype: + # Catch DatetimeArray/TimedeltaArray + return values + if is_extension_array_dtype(dtype): - values = dtype.construct_array_type()._from_sequence(values) + cls = dtype.construct_array_type() + if isinstance(values, cls) and values.dtype == dtype: + return values + + values = 
cls._from_sequence(values) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) @@ -654,6 +664,8 @@ def factorize( values = _ensure_arraylike(values) original = values + if not isinstance(values, ABCMultiIndex): + values = extract_array(values, extract_numpy=True) # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques # of values, assign na_sentinel=-1 to replace code value for NaN. @@ -662,8 +674,20 @@ def factorize( na_sentinel = -1 dropna = False + if ( + isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) + and values.freq is not None + ): + codes, uniques = values.factorize(sort=sort) + if isinstance(original, ABCIndexClass): + uniques = original._shallow_copy(uniques, name=None) + elif isinstance(original, ABCSeries): + from pandas import Index + + uniques = Index(uniques) + return codes, uniques + if is_extension_array_dtype(values.dtype): - values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a10912aa45baa..a9fe95c0892e6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1660,6 +1660,20 @@ def mean(self, skipna=True): # Don't have to worry about NA `result`, since no NA went in. return self._box_func(result) + # -------------------------------------------------------------- + + def factorize(self, na_sentinel=-1, sort: bool = False): + if self.freq is not None: + # We must be unique, so can short-circuit (and retain freq) + codes = np.arange(len(self), dtype=np.intp) + uniques = self.copy() # TODO: copy or view? 
+ if sort and self.freq.n < 0: + codes = codes[::-1] + uniques = uniques[::-1] + return codes, uniques + # FIXME: shouldn't get here; we are ignoring sort + return super().factorize(na_sentinel=na_sentinel) + DatetimeLikeArrayMixin._add_comparison_ops() diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index fe78481d99d30..4d117a31255da 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -48,6 +48,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays.base import ExtensionArray import pandas.core.common as com @@ -766,6 +767,9 @@ def _check_timedeltalike_freq_compat(self, other): raise raise_on_incompatible(self, other) + def factorize(self, na_sentinel=-1): + return ExtensionArray.factorize(self, na_sentinel=na_sentinel) + def raise_on_incompatible(left, right): """ diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 7bb1d98086a91..e6758df2d3d93 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -271,10 +271,12 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq # tz must be preserved idx1 = idx1.tz_localize("Asia/Tokyo") @@ -283,6 +285,7 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq idx2 = pd.DatetimeIndex( ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"] @@ -293,21 +296,31 @@ def test_factorize(self): arr, idx = idx2.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + 
assert idx.freq == exp_idx.freq exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"]) arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) + arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq - def test_factorize_tz(self, tz_naive_fixture): + def test_factorize_tz(self, tz_naive_fixture, index_or_series): tz = tz_naive_fixture # GH#13750 base = pd.date_range("2016-11-05", freq="H", periods=100, tz=tz) @@ -315,27 +328,33 @@ def test_factorize_tz(self, tz_naive_fixture): exp_arr = np.arange(100, dtype=np.intp).repeat(5) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, exp_arr) - expected = base._with_freq(None) - tm.assert_index_equal(res, expected) + obj = index_or_series(idx) + + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, exp_arr) + expected = base._with_freq(None) + tm.assert_index_equal(res, expected) + assert res.freq == expected.freq - def test_factorize_dst(self): + def test_factorize_dst(self, index_or_series): # GH 13750 idx = pd.date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, 
idx) + if index_or_series is Index: + assert res.freq == idx.freq idx = pd.date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern") + obj = index_or_series(idx) - for obj in [idx, pd.Series(idx)]: - arr, res = obj.factorize() - tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) - tm.assert_index_equal(res, idx) + arr, res = obj.factorize() + tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + if index_or_series is Index: + assert res.freq == idx.freq @pytest.mark.parametrize( "arr, expected", diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 4a1749ff734c1..ef1e599d13221 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -75,17 +75,26 @@ def test_factorize(self): arr, idx = idx1.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq arr, idx = idx1.factorize(sort=True) tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + assert idx.freq == exp_idx.freq - # freq must be preserved + def test_factorize_preserves_freq(self): + # GH#38120 freq should be preserved idx3 = timedelta_range("1 day", periods=4, freq="s") exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq + + arr, idx = pd.factorize(idx3) + tm.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(idx, idx3) + assert idx.freq == idx3.freq def test_sort_values(self): diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 4565d79c632de..162be4e0740d6 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -91,3 +91,13 @@ def 
test_multiindex_get_loc_list_raises(self): msg = "unhashable type" with pytest.raises(TypeError, match=msg): idx.get_loc([]) + + def test_multiindex_with_datatime_level_preserves_freq(self): + # https://github.com/pandas-dev/pandas/issues/35563 + idx = Index(range(2), name="A") + dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B") + mi = MultiIndex.from_product([idx, dti]) + df = DataFrame(np.random.randn(14, 2), index=mi) + result = df.loc[0].index + tm.assert_index_equal(result, dti) + assert result.freq == dti.freq diff --git a/pandas/tests/window/common.py b/pandas/tests/window/common.py index 7e0be331ec8d5..d6b80a803a88d 100644 --- a/pandas/tests/window/common.py +++ b/pandas/tests/window/common.py @@ -12,7 +12,6 @@ def get_result(obj, obj2=None): result = result.loc[(slice(None), 1), 5] result.index = result.index.droplevel(1) expected = get_result(frame[1], frame[5]) - expected.index = expected.index._with_freq(None) tm.assert_series_equal(result, expected, check_names=False) From ec30ff713c7177c10be5fba0bae9cff645b33859 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 30 Nov 2020 13:00:00 -0800 Subject: [PATCH 19/29] Backport PR #38172: CI: update tests for numpy 1.20 change to floordiv (#38191) Co-authored-by: jbrockmendel --- .../tests/arrays/integer/test_arithmetic.py | 6 ++- .../tests/arrays/sparse/test_arithmetics.py | 45 +++++++++++++++---- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index f549a7caeab1d..d8052cd8134c8 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p20 + import pandas as pd import pandas._testing as tm from pandas.core.arrays import integer_array @@ -197,7 +199,9 @@ def 
test_arith_coerce_scalar(data, all_arithmetic_operators): result = op(s, other) expected = op(s.astype(float), other) # rfloordiv results in nan instead of inf - if all_arithmetic_operators == "__rfloordiv__": + if all_arithmetic_operators == "__rfloordiv__" and _np_version_under1p20: + # for numpy 1.20 https://github.com/numpy/numpy/pull/16161 + # updated floordiv, now matches our behavior defined in core.ops expected[(expected == np.inf) | (expected == -np.inf)] = np.nan tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index c9f1dd7f589fc..61f4e3e50d09d 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p20 + import pandas as pd import pandas._testing as tm from pandas.core import ops @@ -116,9 +118,15 @@ def _check_logical_ops(self, a, b, a_dense, b_dense): @pytest.mark.parametrize("scalar", [0, 1, 3]) @pytest.mark.parametrize("fill_value", [None, 0, 2]) def test_float_scalar( - self, kind, mix, all_arithmetic_functions, fill_value, scalar + self, kind, mix, all_arithmetic_functions, fill_value, scalar, request ): op = all_arithmetic_functions + + if not _np_version_under1p20: + if op in [operator.floordiv, ops.rfloordiv]: + mark = pytest.mark.xfail(strict=False, reason="GH#38172") + request.node.add_marker(mark) + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) a = self._klass(values, kind=kind, fill_value=fill_value) @@ -142,15 +150,11 @@ def test_float_scalar_comparison(self, kind): self._check_comparison_ops(a, 0, values, 0) self._check_comparison_ops(a, 3, values, 3) - def test_float_same_index(self, kind, mix, all_arithmetic_functions): + def test_float_same_index_without_nans( + self, kind, mix, all_arithmetic_functions, request + ): # when sp_index are the same op = 
all_arithmetic_functions - values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - - a = self._klass(values, kind=kind) - b = self._klass(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues, mix, op) values = self._base([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0]) rvalues = self._base([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0]) @@ -159,6 +163,24 @@ def test_float_same_index(self, kind, mix, all_arithmetic_functions): b = self._klass(rvalues, kind=kind, fill_value=0) self._check_numeric_ops(a, b, values, rvalues, mix, op) + def test_float_same_index_with_nans( + self, kind, mix, all_arithmetic_functions, request + ): + # when sp_index are the same + op = all_arithmetic_functions + + if not _np_version_under1p20: + if op in [operator.floordiv, ops.rfloordiv]: + mark = pytest.mark.xfail(strict=False, reason="GH#38172") + request.node.add_marker(mark) + + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues, mix, op) + def test_float_same_index_comparison(self, kind): # when sp_index are the same values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) @@ -324,9 +346,14 @@ def test_bool_array_logical(self, kind, fill_value): b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) - def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions): + def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request): op = all_arithmetic_functions + if not _np_version_under1p20: + if op in [operator.floordiv, ops.rfloordiv] and mix: + mark = pytest.mark.xfail(strict=True, reason="GH#38172") + request.node.add_marker(mark) + rdtype = "int64" values = 
self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) From b376fb94c5cb160375475bed667ee540735e3df1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Dec 2020 13:53:00 +0100 Subject: [PATCH 20/29] Backport PR #38099: BUG: fix wrong error message in deprecated 2D indexing of Series with datetime values (#38210) --- pandas/core/internals/blocks.py | 22 ++++++++++++++++++++ pandas/core/series.py | 3 ++- pandas/tests/series/indexing/test_getitem.py | 16 +++++++++++--- 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 12806170a9f6d..709924d800ebc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2295,6 +2295,28 @@ def quantile(self, qs, interpolation="linear", axis=0): aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) return self.make_block_same_class(aware, ndim=res_blk.ndim) + def _check_ndim(self, values, ndim): + """ + ndim inference and validation. + + This is overridden by the DatetimeTZBlock to check the case of 2D + data (values.ndim == 2), which should only be allowed if ndim is + also 2. + The case of 1D array is still allowed with both ndim of 1 or 2, as + is the case for other EAs. Therefore, we are only checking + `values.ndim > ndim` instead of `values.ndim != ndim` as for + consolidated blocks. + """ + if ndim is None: + ndim = values.ndim + + if values.ndim > ndim: + raise ValueError( + "Wrong number of dimensions. " + f"values.ndim != ndim [{values.ndim} != {ndim}]" + ) + return ndim + class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () diff --git a/pandas/core/series.py b/pandas/core/series.py index 277aa1d095350..00fcd44d11cb4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -967,7 +967,8 @@ def _get_values(self, indexer): except ValueError: # mpl compat if we look up e.g.
ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack - return self._values[indexer] + # the asarray is needed to avoid returning a 2D DatetimeArray + return np.asarray(self._values)[indexer] def _get_value(self, label, takeable: bool = False): """ diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index cf03dfb8ca9b7..246f27b566c43 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -133,10 +133,20 @@ def test_getitem_generator(string_series): tm.assert_series_equal(result2, expected) -def test_getitem_ndim_deprecated(): - s = pd.Series([0, 1]) +@pytest.mark.parametrize( + "series", + [ + Series([0, 1]), + Series(date_range("2012-01-01", periods=2)), + Series(date_range("2012-01-01", periods=2, tz="CET")), + ], +) +def test_getitem_ndim_deprecated(series): with tm.assert_produces_warning(FutureWarning): - s[:, None] + result = series[:, None] + + expected = np.asarray(series)[:, None] + tm.assert_numpy_array_equal(result, expected) def test_getitem_assignment_series_aligment(): From 993557b42a5b5ffb0bb7d47462fe534b257b8a60 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 1 Dec 2020 12:57:48 +0000 Subject: [PATCH 21/29] Backport PR #38057: PERF: fix regression in creation of resulting index in RollingGroupby (#38211) Co-authored-by: Joris Van den Bossche --- asv_bench/benchmarks/rolling.py | 14 ++++++ doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/window/rolling.py | 37 ++++++++------ pandas/tests/window/test_grouper.py | 75 +++++++++++++++++++++++++++-- 4 files changed, 107 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index f0dd908f81043..0075b7095aa93 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -216,4 +216,18 @@ def time_rolling_offset(self, method): getattr(self.groupby_roll_offset, method)() +class GroupbyLargeGroups: 
+ # https://github.com/pandas-dev/pandas/issues/38038 + # specific example where the rolling operation on a larger dataframe + # is relatively cheap (few but large groups), but creation of + # MultiIndex of result can be expensive + + def setup(self): + N = 100000 + self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)}) + + def time_rolling_multiindex_creation(self): + self.df.groupby("A").rolling(3).mean() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 4770ab37e08d2..0e2e510147603 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). +- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`) - Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ce7281988e105..237c29a9b0c26 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2216,22 +2216,29 @@ def _apply( # Our result will have still kept the column in the result result = result.drop(columns=column_keys, errors="ignore") - result_index_data = [] - for key, values in self._groupby.grouper.indices.items(): - for value in values: - data = [ - *com.maybe_make_list(key), - *com.maybe_make_list( - grouped_object_index[value] - if grouped_object_index is not None - else [] - ), - ] - result_index_data.append(tuple(data)) - - result_index = MultiIndex.from_tuples( - result_index_data, names=result_index_names + codes = self._groupby.grouper.codes + levels = self._groupby.grouper.levels + + group_indices = self._groupby.grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + codes = [c.take(indexer) for c in codes] + + # if the index of the original dataframe needs to be preserved, append + # this index (but reordered) to the codes/levels from the groupby + if grouped_object_index is not None: + idx = grouped_object_index.take(indexer) + if not isinstance(idx, MultiIndex): + idx = MultiIndex.from_arrays([idx]) + codes.extend(list(idx.codes)) + levels.extend(list(idx.levels)) + + result_index = MultiIndex( + levels, codes, names=result_index_names, verify_integrity=False ) + result.index = result_index return result diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index da31fbaddc6e4..a5f759fb90dab 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, Index, MultiIndex, Series import 
pandas._testing as tm from pandas.core.groupby.groupby import get_groupby @@ -396,14 +396,25 @@ def test_groupby_rolling_index_changed(self, func): def test_groupby_rolling_empty_frame(self): # GH 36197 - expected = pd.DataFrame({"s1": []}) + expected = DataFrame({"s1": []}) result = expected.groupby("s1").rolling(window=1).sum() - expected.index = pd.MultiIndex.from_tuples([], names=["s1", None]) + # GH-38057 from_tuples gives empty object dtype, we now get float/int levels + # expected.index = MultiIndex.from_tuples([], names=["s1", None]) + expected.index = MultiIndex.from_product( + [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None] + ) tm.assert_frame_equal(result, expected) - expected = pd.DataFrame({"s1": [], "s2": []}) + expected = DataFrame({"s1": [], "s2": []}) result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() - expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None]) + expected.index = MultiIndex.from_product( + [ + Index([], dtype="float64"), + Index([], dtype="float64"), + Index([], dtype="int64"), + ], + names=["s1", "s2", None], + ) tm.assert_frame_equal(result, expected) def test_groupby_rolling_string_index(self): @@ -479,3 +490,57 @@ def test_groupby_rolling_index_level_and_column_label(self): ), ) tm.assert_frame_equal(result, expected) + + def test_groupby_rolling_resulting_multiindex(self): + # a few different cases checking the created MultiIndex of the result + # https://github.com/pandas-dev/pandas/pull/38057 + + # grouping by 1 columns -> 2-level MI as result + df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4}) + result = df.groupby("b").rolling(3).mean() + expected_index = MultiIndex.from_tuples( + [(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)], + names=["b", None], + ) + tm.assert_index_equal(result.index, expected_index) + + # grouping by 2 columns -> 3-level MI as result + df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3}) + result = 
df.groupby(["b", "c"]).rolling(2).sum() + expected_index = MultiIndex.from_tuples( + [ + (1, 1, 0), + (1, 1, 4), + (1, 1, 8), + (1, 3, 2), + (1, 3, 6), + (1, 3, 10), + (2, 2, 1), + (2, 2, 5), + (2, 2, 9), + (2, 4, 3), + (2, 4, 7), + (2, 4, 11), + ], + names=["b", "c", None], + ) + tm.assert_index_equal(result.index, expected_index) + + # grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result + df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2}) + df = df.set_index("c", append=True) + result = df.groupby("b").rolling(3).mean() + expected_index = MultiIndex.from_tuples( + [ + (1, 0, 1), + (1, 2, 3), + (1, 4, 1), + (1, 6, 3), + (2, 1, 2), + (2, 3, 4), + (2, 5, 2), + (2, 7, 4), + ], + names=["b", None, "c"], + ) + tm.assert_index_equal(result.index, expected_index) From 884691347a97849c77209f6e2c298c5d91591fc0 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 2 Dec 2020 03:16:47 -0800 Subject: [PATCH 22/29] Backport PR #38228: CI: pin pip to 20.2 on numpy-dev build (#38230) Co-authored-by: Joris Van den Bossche --- ci/deps/azure-37-numpydev.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 7248b1740058f..b2009e84df12e 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -12,7 +12,7 @@ dependencies: # pandas dependencies - pytz - - pip + - pip=20.2 - pip: - cython==0.29.21 # GH#34014 - "git+git://github.com/dateutil/dateutil.git" From ee1828847b7edfc1236c696424aba56934dc9f7d Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 2 Dec 2020 03:38:40 -0800 Subject: [PATCH 23/29] Backport PR #38209: CI/TST: fix CI with numpy dev for changed error message / dev version (#38231) Co-authored-by: Joris Van den Bossche --- pandas/tests/series/indexing/test_indexing.py | 3 +-- 
pandas/tests/util/test_show_versions.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 3b66939d9ddd2..9d5491fe2f5fc 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -369,8 +369,7 @@ def test_2d_to_1d_assignment_raises(): msg = "|".join( [ - r"shape mismatch: value array of shape \(2,2\) could not be " - r"broadcast to indexing result of shape \(2,\)", + r"shape mismatch: value array of shape \(2,2\)", r"cannot reshape array of size 4 into shape \(2,\)", ] ) diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index fe5fc3e21d960..4ea3ebe5000ad 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -39,7 +39,7 @@ def test_show_versions(capsys): assert re.search(r"commit\s*:\s[0-9a-f]{40}\n", result) # check required dependency - assert re.search(r"numpy\s*:\s([0-9\.\+a-f]|dev)+\n", result) + assert re.search(r"numpy\s*:\s([0-9\.\+a-f\_]|dev)+\n", result) # check optional dependency assert re.search(r"pyarrow\s*:\s([0-9\.]+|None)\n", result) From 3e926800cfd5fbcbaee32062784c6eead2686168 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 3 Dec 2020 12:02:27 +0000 Subject: [PATCH 24/29] Backport PR #38244: REGR: unstack on 'int' dtype prevent fillna to work (#38259) --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/internals/blocks.py | 2 +- pandas/tests/frame/test_reshape.py | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 0e2e510147603..fbb12cb38448a 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand 
(:issue:`37910`) - Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) - Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`) +- Fixed regression in :meth:`DataFrame.unstack` with columns with integer dtype (:issue:`37115`) - Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) - Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 709924d800ebc..944652924611e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1410,7 +1410,7 @@ def _unstack(self, unstacker, fill_value, new_placement): new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [self.make_block_same_class(new_values, placement=new_placement)] + blocks = [make_block(new_values, placement=new_placement)] return blocks, mask def quantile(self, qs, interpolation="linear", axis: int = 0): diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 1b452658cc219..506a5a5706673 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1315,3 +1315,26 @@ def test_stack_positional_level_duplicate_column_names(): expected = pd.DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns) tm.assert_frame_equal(result, expected) + + +def test_unstack_with_missing_int_cast_to_float(): + # https://github.com/pandas-dev/pandas/issues/37115 + df = DataFrame( + {"a": ["A", "A", "B"], "b": ["ca", "cb", "cb"], "v": [10] * 3} + ).set_index(["a", "b"]) + + # add another int column to get 2 blocks + df["is_"] = 1 + assert 
len(df._mgr.blocks) == 2 + + result = df.unstack("b") + result[("is_", "ca")] = result[("is_", "ca")].fillna(0) + + expected = DataFrame( + [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]], + index=Index(["A", "B"], dtype="object", name="a"), + columns=MultiIndex.from_tuples( + [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")], names=[None, "b"], + ), + ) + tm.assert_frame_equal(result, expected) From 039aabad00e9c5996a7b8f59720deb2ed67c83dc Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 4 Dec 2020 12:03:19 -0800 Subject: [PATCH 25/29] Backport PR #38272: BUG: DataFrame.apply with axis=1 and EA dtype (#38295) Co-authored-by: jbrockmendel --- pandas/core/apply.py | 27 +++++++++++++++----- pandas/tests/frame/apply/test_frame_apply.py | 6 +++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index fd7ffd1b54a70..af4717498aa19 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -9,7 +9,12 @@ from pandas._typing import Axis from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence +from pandas.core.dtypes.common import ( + is_dict_like, + is_extension_array_dtype, + is_list_like, + is_sequence, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype @@ -407,12 +412,20 @@ def series_generator(self): mgr = ser._mgr blk = mgr.blocks[0] - for (arr, name) in zip(values, self.index): - # GH#35462 re-pin mgr in case setitem changed it - ser._mgr = mgr - blk.values = arr - ser.name = name - yield ser + if is_extension_array_dtype(blk.dtype): + # values will be incorrect for this block + # TODO(EA2D): special case would be unnecessary with 2D EAs + obj = self.obj + for i in range(len(obj)): + yield obj._ixs(i, axis=0) + + else: + for (arr, name) in zip(values, self.index): + # GH#35462 re-pin mgr in 
case setitem changed it + ser._mgr = mgr + blk.values = arr + ser.name = name + yield ser @property def result_index(self) -> "Index": diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index a89a20fc69ef8..73a80d048ba00 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -60,6 +60,12 @@ def test_apply(self, float_frame): assert isinstance(df["c0"].dtype, CategoricalDtype) assert isinstance(df["c1"].dtype, CategoricalDtype) + def test_apply_axis1_with_ea(self): + # GH#36785 + df = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]}) + result = df.apply(lambda x: x, axis=1) + tm.assert_frame_equal(result, df) + def test_apply_mixed_datetimelike(self): # mixed datetimelike # GH 7778 From 247ecefd639d047053fd838cc639f775bc9bdbdd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 6 Dec 2020 20:32:57 +0000 Subject: [PATCH 26/29] Backport PR #38330: REGR: Groupby first/last/nth treats None as an observation (#38333) Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/_libs/groupby.pyx | 12 ++++-------- pandas/tests/groupby/test_nth.py | 20 ++++++++++++++++++++ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index fbb12cb38448a..7164830392f35 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -27,6 +27,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`). 
- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`) - Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`) +- Fixed regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` where ``None`` was considered a non-NA value (:issue:`38286`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index a83634aad3ce2..5215bde281652 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -926,9 +926,7 @@ def group_last(rank_t[:, :] out, for j in range(K): val = values[i, j] - # None should not be treated like other NA-like - # so that it won't be converted to nan - if not checknull(val) or val is None: + if not checknull(val): # NB: use _treat_as_na here once # conditional-nogil is available. nobs[lab, j] += 1 @@ -937,7 +935,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = NAN + out[i, j] = None else: out[i, j] = resx[i, j] else: @@ -1021,9 +1019,7 @@ def group_nth(rank_t[:, :] out, for j in range(K): val = values[i, j] - # None should not be treated like other NA-like - # so that it won't be converted to nan - if not checknull(val) or val is None: + if not checknull(val): # NB: use _treat_as_na here once # conditional-nogil is available. 
nobs[lab, j] += 1 @@ -1033,7 +1029,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): if nobs[i, j] == 0: - out[i, j] = NAN + out[i, j] = None else: out[i, j] = resx[i, j] diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 0cbfbad85a8b6..559b5116e4240 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -105,6 +105,26 @@ def test_first_last_with_None(method): tm.assert_frame_equal(result, df) +@pytest.mark.parametrize("method", ["first", "last"]) +@pytest.mark.parametrize( + "df, expected", + [ + ( + DataFrame({"id": "a", "value": [None, "foo", np.nan]}), + DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")), + ), + ( + DataFrame({"id": "a", "value": [np.nan]}, dtype=object), + DataFrame({"value": [None]}, index=Index(["a"], name="id")), + ), + ], +) +def test_first_last_with_None_expanded(method, df, expected): + # GH 32800, 38286 + result = getattr(df.groupby("id"), method)() + tm.assert_frame_equal(result, expected) + + def test_first_last_nth_dtypes(df_mixed_floats): df = df_mixed_floats.copy() From 66923e159dee6506b210a37b7f6ba184877a2269 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 7 Dec 2020 02:39:12 -0800 Subject: [PATCH 27/29] Backport PR #38332: REGR: Fix Index construction from Sparse["datetime64[ns]"] (#38341) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.5.rst | 1 + pandas/core/arrays/datetimes.py | 7 ++++++- pandas/tests/indexes/datetimes/test_constructors.py | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 7164830392f35..cfdeeb61d8f61 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand 
(:issue:`37910`) - Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`) - Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`) +- Fixed regression in :class:`Index` constructor raising an ``AttributeError`` when passed a :class:`SparseArray` with datetime64 values (:issue:`35843`) - Fixed regression in :meth:`DataFrame.unstack` with columns with integer dtype (:issue:`37115`) - Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`) - Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d674b1c476d2c..8d6016ff41c7f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -36,6 +36,7 @@ is_float_dtype, is_object_dtype, is_period_dtype, + is_sparse, is_string_dtype, is_timedelta64_dtype, pandas_dtype, @@ -1937,7 +1938,11 @@ def sequence_to_dt64ns( data, copy = maybe_convert_dtype(data, copy) data_dtype = getattr(data, "dtype", None) - if is_object_dtype(data_dtype) or is_string_dtype(data_dtype): + if ( + is_object_dtype(data_dtype) + or is_string_dtype(data_dtype) + or is_sparse(data_dtype) + ): # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index c150e7901c86a..fe5692224d8b9 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -97,6 +97,17 @@ def test_dti_with_timedelta64_data_raises(self): with pytest.raises(TypeError, match=msg): to_datetime(pd.TimedeltaIndex(data)) + def test_constructor_from_sparse_array(self): + #
https://github.com/pandas-dev/pandas/issues/35843 + values = [ + Timestamp("2012-05-01T01:00:00.000000"), + Timestamp("2016-05-01T01:00:00.000000"), + ] + arr = pd.arrays.SparseArray(values) + result = Index(arr) + expected = DatetimeIndex(values) + tm.assert_index_equal(result, expected) + def test_construction_caching(self): df = pd.DataFrame( From 69b65716541213dabcca665345d3d112cb70e635 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 7 Dec 2020 02:46:08 -0800 Subject: [PATCH 28/29] Backport PR #38318: DOC: 1.1.5 release date (#38342) Co-authored-by: Simon Hawkins --- doc/source/whatsnew/v1.1.5.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index cfdeeb61d8f61..002e1f85f4127 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -1,7 +1,7 @@ .. _whatsnew_115: -What's new in 1.1.5 (??) ------------------------- +What's new in 1.1.5 (December 07, 2020) +--------------------------------------- These are the changes in pandas 1.1.5. See :ref:`release` for a full changelog including other versions of pandas. From b5958ee1999e9aead1938c0bba2b674378807b3d Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Mon, 7 Dec 2020 11:42:10 +0000 Subject: [PATCH 29/29] RLS: 1.1.5