diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml new file mode 100644 index 0000000000000..c2f332cc5454a --- /dev/null +++ b/.github/workflows/database.yml @@ -0,0 +1,186 @@ +name: Database + +on: + push: + branches: [master] + pull_request: + branches: + - master + - 1.2.x + +env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: ((not slow and not network and not clipboard) or (single and db)) + +jobs: + Linux_py37_locale: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + env: + ENV_FILE: ci/deps/actions-37-locale.yaml + LOCALE_OVERRIDE: zh_CN.UTF-8 + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Cache conda + uses: actions/cache@v1 + env: + CACHE_NUMBER: 0 + with: + path: ~/conda_pkgs_dir + key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ + hashFiles('${{ env.ENV_FILE }}') }} + + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: pandas-dev + channel-priority: strict + environment-file: ${{ env.ENV_FILE }} + use-only-tar-bz2: true + + - name: Environment Detail + run: | + conda info + conda list + + - name: Build Pandas + run: | + python setup.py build_ext -j 2 + python -m pip install -e . --no-build-isolation --no-use-pep517 + + - name: Test + run: ci/run_tests.sh + if: always() + + - name: Build Version + run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + + - name: Publish test results + uses: actions/upload-artifact@master + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Print skipped tests + run: python ci/print_skipped.py + + Linux_py37_cov: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + env: + ENV_FILE: ci/deps/actions-37-cov.yaml + PANDAS_TESTING_MODE: deprecate + COVERAGE: true + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + steps: + - name: Checkout + uses: actions/checkout@v1 + + - name: Cache conda + uses: actions/cache@v1 + env: + CACHE_NUMBER: 0 + with: + path: ~/conda_pkgs_dir + key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ + hashFiles('${{ env.ENV_FILE }}') }} + + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: pandas-dev + channel-priority: strict + environment-file: ${{ env.ENV_FILE }} + use-only-tar-bz2: true + + - name: Environment Detail + run: | + conda info + conda list + + - name: Build Pandas + run: | + python setup.py build_ext -j 2 + python -m pip install -e . 
--no-build-isolation --no-use-pep517 + + - name: Test + run: ci/run_tests.sh + if: always() + + - name: Build Version + run: pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + + - name: Publish test results + uses: actions/upload-artifact@master + with: + name: Test results + path: test-data.xml + if: failure() + + - name: Print skipped tests + run: python ci/print_skipped.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 717334bfe1299..90d65327ea980 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -144,6 +144,11 @@ repos: \#\ type:\s?ignore(?!\[) language: pygrep types: [python] + - id: np-bool + name: Check for use of np.bool instead of np.bool_ + entry: np\.bool[^_8] + language: pygrep + types_or: [python, cython, rst] - id: no-os-remove name: Check code for instances of os.remove entry: os\.remove diff --git a/.travis.yml b/.travis.yml index 31edc4872e907..8ede978074a9c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,13 +16,13 @@ services: # travis cache --delete inside the project directory from the travis command line client # The cache directories will be deleted if anything in ci/ changes in a commit cache: + apt: true ccache: true directories: - $HOME/.cache # cython cache env: global: - - PYTEST_WORKERS="auto" # create a github personal access token # cd pandas-dev/pandas # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas @@ -35,25 +35,10 @@ matrix: fast_finish: true include: - - env: - - JOB="3.7, locale" ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" - services: - - mysql - - postgresql - - arch: arm64 env: - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" - - env: - # Enabling Deprecations when running tests - # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs - # See pandas/_testing.py for more details. 
- - JOB="3.7, coverage" ENV_FILE="ci/deps/travis-37-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" - services: - - mysql - - postgresql - allow_failures: # Moved to allowed_failures 2020-09-29 due to timeouts https://github.com/pandas-dev/pandas/issues/36719 - arch: arm64 diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6ce63ff8badca..6cc8e15786795 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -625,7 +625,7 @@ class TransformBools: def setup(self): N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) - transitions = np.zeros(N, dtype=np.bool) + transitions = np.zeros(N, dtype=np.bool_) transitions[transition_points] = True self.g = transitions.cumsum() self.df = DataFrame({"signal": np.random.rand(N)}) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..96f02d37db1e1 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -40,7 +40,7 @@ def time_write_excel(self, engine): class ReadExcel: - params = ["xlrd", "openpyxl", "odf"] + params = ["openpyxl", "odf"] param_names = ["engine"] fname_excel = "spreadsheet.xlsx" fname_odf = "spreadsheet.ods" diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/actions-37-cov.yaml similarity index 97% rename from ci/deps/travis-37-cov.yaml rename to ci/deps/actions-37-cov.yaml index b68ff0672888a..5381caaa242cf 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/actions-37-cov.yaml @@ -15,7 +15,7 @@ dependencies: - beautifulsoup4 - botocore>=1.11 - dask - - fastparquet>=0.3.2 + - fastparquet>=0.4.0 - fsspec>=0.7.4 - gcsfs>=0.6.0 - geopandas diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/actions-37-locale.yaml similarity index 86% rename from ci/deps/travis-37-locale.yaml rename to ci/deps/actions-37-locale.yaml index 60a92c4dfd3c6..b18ce37d05ca0 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/actions-37-locale.yaml @@ -1,6 +1,5 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - python=3.7.* @@ -18,9 +17,9 @@ dependencies: # optional - beautifulsoup4 - - blosc=1.15.0 + - blosc=1.17.0 - python-blosc - - fastparquet=0.3.2 + - fastparquet=0.4.0 - html5lib - ipython - jinja2 @@ -31,7 +30,7 @@ dependencies: - openpyxl - pandas-gbq - google-cloud-bigquery>=1.27.2 # GH 36436 - - pyarrow>=0.17 + - pyarrow=0.17 # GH 38803 - pytables>=3.5.1 - scipy - xarray=0.12.3 @@ -43,5 +42,5 @@ dependencies: # sql - psycopg2=2.7 - - pymysql=0.7.11 + - pymysql=0.8.1 - sqlalchemy=1.3.0 diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index 90cd11037e472..15d503e8fd0a5 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -18,6 +18,7 @@ dependencies: - html5lib - ipython - jinja2 + - jedi<0.18.0 - lxml - matplotlib <3.3.0 - moto diff --git a/ci/deps/azure-38-slow.yaml b/ci/deps/azure-38-slow.yaml index 9651837f26114..fd40f40294b7f 100644 --- a/ci/deps/azure-38-slow.yaml +++ b/ci/deps/azure-38-slow.yaml @@ -1,6 +1,5 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - python=3.8.* diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 08693e02aa8d3..661d8813d32d2 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.3.2 + - fastparquet>=0.4.0 - flask - fsspec>=0.8.0 
- matplotlib=3.1.3 diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 43cd631890330..403d182e3d3e5 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -1,3 +1,10 @@ +/* Override some aspects of the pydata-sphinx-theme */ + +:root { + /* Use softer blue from bootstrap's default info color */ + --color-info: 23, 162, 184; +} + /* Getting started index page */ .intro-card { diff --git a/doc/source/conf.py b/doc/source/conf.py index 951a6d4043786..8ab1c8c2f3428 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -427,7 +427,7 @@ ipython_warning_is_error = False -ipython_exec_lines = [ +ipython_execlines = [ "import numpy as np", "import pandas as pd", # This ensures correct rendering on system with console encoding != utf8 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index c823ad01f10bf..9c070efa694d4 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -263,12 +263,12 @@ Jinja2 2.10 Conditional formatting with DataFra PyQt4 Clipboard I/O PyQt5 Clipboard I/O PyTables 3.5.1 HDF5-based reading / writing -SQLAlchemy 1.2.8 SQL support for databases other than sqlite +SQLAlchemy 1.3.0 SQL support for databases other than sqlite SciPy 1.12.0 Miscellaneous statistical functions xlsxwriter 1.0.2 Excel writing -blosc 1.15.0 Compression for HDF5 +blosc 1.17.0 Compression for HDF5 fsspec 0.7.4 Handling files aside from local and HTTP -fastparquet 0.3.2 Parquet reading / writing +fastparquet 0.4.0 Parquet reading / writing gcsfs 0.6.0 Google Cloud Storage access html5lib 1.0.1 HTML parser for read_html (see :ref:`note `) lxml 4.3.0 HTML parser for read_html (see :ref:`note `) @@ -278,7 +278,7 @@ openpyxl 2.6.0 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing -pymysql 0.7.11 MySQL engine for sqlalchemy +pymysql 0.8.1 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index ffecaa222e1f9..8d38c12252df4 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2229,7 +2229,7 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python dft1 = pd.DataFrame({"a": [1, 0, 1], "b": [4, 5, 6], "c": [7, 8, 9]}) - dft1 = dft1.astype({"a": np.bool, "c": np.float64}) + dft1 = dft1.astype({"a": np.bool_, "c": np.float64}) dft1 dft1.dtypes diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 5a6f56388dee5..77791b4b7e491 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -1406,7 +1406,7 @@ Often it's useful to obtain the lower (or upper) triangular form of a correlatio df = pd.DataFrame(np.random.random(size=(100, 5))) corr_mat = df.corr() - mask = np.tril(np.ones_like(corr_mat, dtype=np.bool), k=-1) + mask = np.tril(np.ones_like(corr_mat, dtype=np.bool_), k=-1) corr_mat.where(mask) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index eba097cd8c345..a78af82ba4db8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2430,16 +2430,14 @@ Read a URL with no options: .. 
ipython:: python

-   url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
+   url = (
+       "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
+       "pandas/tests/io/data/html/spam.html"
+   )
    dfs = pd.read_html(url)
    dfs

-.. note::
-
-   The data from the above URL changes every Monday so the resulting data above
-   and the data below may be slightly different.
-
-Read in the content of the file from the above URL and pass it to ``read_html``
+Read in the content of the "banklist.html" file and pass it to ``read_html``
 as a string:

 .. ipython:: python
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
index 310857faec436..55e3971502c0a 100644
--- a/doc/source/whatsnew/index.rst
+++ b/doc/source/whatsnew/index.rst
@@ -16,6 +16,7 @@ Version 1.2
 .. toctree::
    :maxdepth: 2

+   v1.2.1
    v1.2.0

 Version 1.1
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index e054ac830ce41..64552b104c053 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -716,6 +716,19 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once

     df.apply(func, axis=1)

+.. _whatsnew_110.api_breaking:
+
+Backwards incompatible API changes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _whatsnew_110.api_breaking.testing.check_freq:
+
+Added ``check_freq`` argument to ``testing.assert_frame_equal`` and ``testing.assert_series_equal``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``check_freq`` argument was added to :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal` in pandas 1.1.0 and defaults to ``True``. :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal` now raise ``AssertionError`` if the indexes do not have the same frequency. Before pandas 1.1.0, the index frequency was not checked.
+
+
 Increased minimum versions for dependencies
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
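For context, the effect of the ``check_freq`` note added above is roughly the following (an illustrative sketch, not part of the patch, using the public ``pd.testing`` namespace):

    >>> import pandas as pd
    >>> left = pd.Series([1, 2, 3], index=pd.date_range("2021-01-01", periods=3, freq="D"))
    >>> right = left.copy()
    >>> right.index.freq = None
    >>> pd.testing.assert_series_equal(left, right)  # AssertionError: freq "D" vs None
    >>> pd.testing.assert_series_equal(left, right, check_freq=False)  # passes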
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 93cb1939a4e84..dfd23309faaef 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -286,6 +286,8 @@ Other enhancements
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
 - Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`)
 - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
+  This change has been reverted in pandas 1.2.1, and the behaviour of not aligning DataFrames
+  is deprecated instead; see :ref:`the 1.2.1 release notes <whatsnew_121.ufunc_deprecation>`.
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
 - :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use nullable dtypes that use ``pd.NA`` as missing value indicator where possible for the resulting DataFrame (default is ``False``, and only applicable for ``engine="pyarrow"``) (:issue:`31242`)
@@ -536,6 +538,14 @@ Deprecations
 - The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`)
 - The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`)

+**Calling NumPy ufuncs on non-aligned DataFrames**
+
+Calling NumPy ufuncs on non-aligned DataFrames changed behaviour in pandas
+1.2.0 (to align the inputs before calling the ufunc), but this change was
+reverted in pandas 1.2.1. The behaviour of not aligning is now deprecated
+instead; see :ref:`the 1.2.1 release notes <whatsnew_121.ufunc_deprecation>` for
+more details.
+
 .. ---------------------------------------------------------------------------
@@ -751,7 +761,7 @@ Plotting

 - Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`)
 - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes caused a ``ValueError`` (:issue:`21003`)
-- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where ticks positions were assigned by value order instead of using the actual value for numeric or a smart ordering for string (:issue:`26186`, :issue:`11465`)
+- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where ticks positions were assigned by value order instead of using the actual value for numeric or a smart ordering for string (:issue:`26186`, :issue:`11465`). This fix has been reverted in pandas 1.2.1, see :doc:`v1.2.1`
 - Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`)
 - Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` was throwing a :exc:`ValueError` when the Series or DataFrame was indexed by a :class:`.TimedeltaIndex` with a fixed frequency and the x-axis lower limit was greater than the upper limit (:issue:`37454`)
@@ -859,4 +869,4 @@ Other

 Contributors
 ~~~~~~~~~~~~

-.. contributors:: v1.1.5..v1.2.0|HEAD
+.. contributors:: v1.1.5..v1.2.0
diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst
new file mode 100644
index 0000000000000..474970601022c
--- /dev/null
+++ b/doc/source/whatsnew/v1.2.1.rst
@@ -0,0 +1,147 @@
+.. _whatsnew_121:
+
+What's new in 1.2.1 (January 20, 2021)
+--------------------------------------
+
+These are the changes in pandas 1.2.1. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_121.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in :meth:`~DataFrame.to_csv` that created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`)
+- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamReaderWriter`` in binary mode instead of in text mode (:issue:`39247`)
+- Fixed regression in :meth:`read_csv` and other read functions where the encoding error policy (``errors``) did not default to ``"replace"`` when no encoding was specified (:issue:`38989`)
+- Fixed regression in :func:`read_excel` with non-rawbyte file handles (:issue:`38788`)
+- Fixed regression in :meth:`DataFrame.to_stata` not removing the created file when an error occurred (:issue:`39202`)
+- Fixed regression in ``DataFrame.__setitem__`` raising ``ValueError`` when expanding a :class:`DataFrame` and the new column name is of the form ``"0 - name"`` (:issue:`39010`)
+- Fixed regression in setting with :meth:`DataFrame.loc` raising ``ValueError`` when the :class:`DataFrame` has unsorted :class:`MultiIndex` columns and the indexer is a scalar (:issue:`38601`)
+- Fixed regression in setting with :meth:`DataFrame.loc` raising ``KeyError`` with a :class:`MultiIndex` and a list-like columns indexer enlarging the :class:`DataFrame` (:issue:`39147`)
+- Fixed regression in :meth:`~DataFrame.groupby` with a :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
+- Fixed regression in :meth:`.GroupBy.sem` where the presence of non-numeric columns would cause an error instead of being dropped (:issue:`38774`)
+- Fixed regression in :meth:`.DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`)
+- Fixed regression in :meth:`DataFrame.groupby` when aggregating an ``ExtensionDtype`` that could fail for non-numeric values (:issue:`38980`)
+- Fixed regression in :meth:`.Rolling.skew` and :meth:`.Rolling.kurt` modifying the object inplace (:issue:`38908`)
+- Fixed regression in :meth:`DataFrame.any` and :meth:`DataFrame.all` not returning a result for tz-aware ``datetime64`` columns (:issue:`38723`)
+- Fixed regression in :meth:`DataFrame.apply` with ``axis=1`` using the str accessor in the applied function (:issue:`38979`)
+- Fixed regression in :meth:`DataFrame.replace` raising ``ValueError`` when the :class:`DataFrame` has dtype ``bytes`` (:issue:`38900`)
+- Fixed regression in :meth:`Series.fillna` that raised ``RecursionError`` with ``datetime64[ns, UTC]`` dtype (:issue:`38851`)
+- Fixed regression in comparisons between ``NaT`` and ``datetime.date`` objects incorrectly returning ``True`` (:issue:`39151`)
+- Fixed regression in calling NumPy :func:`~numpy.ufunc.accumulate` ufuncs on DataFrames, e.g. ``np.maximum.accumulate(df)`` (:issue:`39259`)
+- Fixed regression in the repr of float-like strings of an ``object`` dtype having trailing zeros truncated after the decimal (:issue:`38708`)
+- Fixed regression that raised ``AttributeError`` with PyArrow versions [0.16.0, 1.0.0) (:issue:`38801`)
+- Fixed regression in :func:`pandas.testing.assert_frame_equal` raising ``TypeError`` with ``check_like=True`` when :class:`Index` or columns have mixed dtype (:issue:`39168`)
+
+We have reverted a commit that resulted in several plotting related regressions in pandas 1.2.0 (:issue:`38969`, :issue:`38736`, :issue:`38865`, :issue:`38947` and :issue:`39126`).
+As a result, bugs reported as fixed in pandas 1.2.0 related to inconsistent tick labeling in bar plots are again present (:issue:`26186` and :issue:`11465`)
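The ``NaT`` / ``datetime.date`` fix listed above corresponds to the ``nattype.pyx`` change later in this patch; a minimal sketch of the new behaviour (illustrative, not part of the patch):

    >>> import datetime
    >>> import pandas as pd
    >>> pd.NaT == datetime.date(2021, 1, 1)   # 1.2.0 incorrectly returned True
    False
    >>> pd.NaT < datetime.date(2021, 1, 1)    # returns False and emits a FutureWarning
    False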
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_121.ufunc_deprecation:
+
+Calling NumPy ufuncs on non-aligned DataFrames
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Before pandas 1.2.0, calling a NumPy ufunc on non-aligned DataFrames (or
+a DataFrame / Series combination) would ignore the indices, only match
+the inputs by shape, and use the index/columns of the first DataFrame for
+the result:
+
+.. code-block:: python
+
+    >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1])
+    >>> df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2])
+    >>> df1
+       a  b
+    0  1  3
+    1  2  4
+    >>> df2
+       a  b
+    1  1  3
+    2  2  4
+
+    >>> np.add(df1, df2)
+       a  b
+    0  2  6
+    1  4  8
+
+This contrasts with how other pandas operations work, which first align
+the inputs:
+
+.. code-block:: python
+
+    >>> df1 + df2
+         a    b
+    0  NaN  NaN
+    1  3.0  7.0
+    2  NaN  NaN
+
+In pandas 1.2.0, we refactored how NumPy ufuncs are called on DataFrames, and
+this started to align the inputs first (:issue:`39184`), as already happens in
+other pandas operations and for ufuncs called on Series objects.
+
+For pandas 1.2.1, we restored the previous behaviour to avoid a breaking
+change, but the above example of ``np.add(df1, df2)`` with non-aligned inputs
+will now raise a warning, and a future pandas 2.0 release will start
+aligning the inputs first (:issue:`39184`). Calling a NumPy ufunc on Series
+objects (e.g. ``np.add(s1, s2)``) already aligns and continues to do so.
+
+To avoid the warning and keep the current behaviour of ignoring the indices,
+convert one of the arguments to a NumPy array:
+
+.. code-block:: python
+
+    >>> np.add(df1, np.asarray(df2))
+       a  b
+    0  2  6
+    1  4  8
+
+To obtain the future behaviour and silence the warning, you can align manually
+before passing the arguments to the ufunc:
+
+.. code-block:: python
+
+    >>> df1, df2 = df1.align(df2)
+    >>> np.add(df1, df2)
+         a    b
+    0  NaN  NaN
+    1  3.0  7.0
+    2  NaN  NaN
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_121.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+
+- Bug in :meth:`read_csv` with ``float_precision="high"`` that caused a segfault or wrong parsing of long exponent strings. This resulted in a regression in some cases as the default for ``float_precision`` was changed in pandas 1.2.0 (:issue:`38753`)
+- Bug in :func:`read_csv` not closing an opened file handle when a ``csv.Error`` or ``UnicodeDecodeError`` occurred while initializing (:issue:`39024`)
+- Bug in :func:`pandas.testing.assert_index_equal` raising ``TypeError`` with ``check_order=False`` when :class:`Index` has mixed dtype (:issue:`39168`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_121.other:
+
+Other
+~~~~~
+
+- The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
+- Bumped minimum fastparquet version to 0.4.0 to avoid ``AttributeError`` from numba (:issue:`38344`)
+- Bumped minimum pymysql version to 0.8.1 to avoid test failures (:issue:`38344`)
+- Fixed build failure on macOS 11 in Python 3.9.1 (:issue:`38766`)
+- Added reference to the backwards incompatible ``check_freq`` argument of :func:`testing.assert_frame_equal` and :func:`testing.assert_series_equal` in the :ref:`pandas 1.1.0 what's new <whatsnew_110.api_breaking.testing.check_freq>` (:issue:`34050`)
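The ``_AXIS_NAMES`` / ``_AXIS_NUMBERS`` entry above maps to the ``_hidden_attrs`` change in ``pandas/core/generic.py`` further down in this patch; hiding the attributes removes them from ``dir()`` while attribute access keeps working, still emitting the existing deprecation warning. A sketch (illustrative, not part of the patch):

    >>> import pandas as pd
    >>> "_AXIS_NAMES" in dir(pd.DataFrame({"a": [1]}))
    False
    >>> pd.DataFrame({"a": [1]})._AXIS_NAMES   # FutureWarning, but still accessible
    {0: 'index', 1: 'columns'}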
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_121.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.2.0..v1.2.1|HEAD
diff --git a/environment.yml b/environment.yml
index b99b856187fb6..6f3f81d8a4d77 100644
--- a/environment.yml
+++ b/environment.yml
@@ -68,7 +68,7 @@ dependencies:

   # unused (required indirectly may be?)
   - ipywidgets
-  - nbformat
+  - nbformat=5.0.8
   - notebook>=5.7.5
   - pip
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 88144330c1fe9..4ddbd6cf3ae60 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1733,7 +1733,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     // Process string of digits.
     num_digits = 0;
     n = 0;
-    while (isdigit_ascii(*p)) {
+    while (num_digits < max_digits && isdigit_ascii(*p)) {
         n = n * 10 + (*p - '0');
         num_digits++;
         p++;
@@ -1754,10 +1754,13 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     } else if (exponent > 0) {
         number *= e[exponent];
     } else if (exponent < -308) {  // Subnormal
-        if (exponent < -616)  // Prevent invalid array access.
+        if (exponent < -616) {  // Prevent invalid array access.
             number = 0.;
-        number /= e[-308 - exponent];
-        number /= e[308];
+        } else {
+            number /= e[-308 - exponent];
+            number /= e[308];
+        }
+
     } else {
         number /= e[-exponent];
     }
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index 561143f48e0ec..3a61de62daf39 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -1,4 +1,7 @@
+import warnings
+
 from cpython.datetime cimport (
+    PyDate_Check,
     PyDateTime_Check,
     PyDateTime_IMPORT,
     PyDelta_Check,
@@ -125,6 +128,21 @@ cdef class _NaT(datetime):
                 return NotImplemented
             return result

+        elif PyDate_Check(other):
+            # GH#39151 don't defer to datetime.date object
+            if op == Py_EQ:
+                return False
+            if op == Py_NE:
+                return True
+            warnings.warn(
+                "Comparison of NaT with datetime.date is deprecated in "
+                "order to match the standard library behavior. 
" + "In a future version these will be considered non-comparable.", + FutureWarning, + stacklevel=1, + ) + return False + return NotImplemented def __add__(self, other): diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 54a09a6d2ede7..882674a5c5c92 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -523,7 +523,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, float64_t x = 0, xx = 0, xxx = 0 int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e - ndarray[float64_t] output, mean_array + ndarray[float64_t] output, mean_array, values_copy bint is_monotonic_increasing_bounds minp = max(minp, 3) @@ -532,10 +532,11 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ) output = np.empty(N, dtype=float) min_val = np.nanmin(values) + values_copy = np.copy(values) with nogil: for i in range(0, N): - val = values[i] + val = values_copy[i] if notnan(val): nobs_mean += 1 sum_val += val @@ -544,7 +545,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, if min_val - mean_val > -1e5: mean_val = round(mean_val) for i in range(0, N): - values[i] = values[i] - mean_val + values_copy[i] = values_copy[i] - mean_val for i in range(0, N): @@ -556,7 +557,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - val = values[j] + val = values_copy[j] add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add) @@ -566,13 +567,13 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, # and removed # calculate deletes for j in range(start[i - 1], s): - val = values[j] + val = values_copy[j] remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, &compensation_xx_remove, &compensation_xxx_remove) # calculate adds for j in range(end[i - 1], e): - val = values[j] + val = values_copy[j] add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add) @@ -703,7 +704,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 - ndarray[float64_t] output + ndarray[float64_t] output, values_copy bint is_monotonic_increasing_bounds minp = max(minp, 4) @@ -711,11 +712,12 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, start, end ) output = np.empty(N, dtype=float) + values_copy = np.copy(values) min_val = np.nanmin(values) with nogil: for i in range(0, N): - val = values[i] + val = values_copy[i] if notnan(val): nobs_mean += 1 sum_val += val @@ -724,7 +726,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, if min_val - mean_val > -1e4: mean_val = round(mean_val) for i in range(0, N): - values[i] = values[i] - mean_val + values_copy[i] = values_copy[i] - mean_val for i in range(0, N): @@ -736,7 +738,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add, &compensation_xxxx_add) @@ -746,13 +748,13 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, # and removed # calculate deletes for j in range(start[i - 1], s): - 
remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_remove, &compensation_xx_remove, &compensation_xxx_remove, &compensation_xxxx_remove) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx, &compensation_x_add, &compensation_xx_add, &compensation_xxx_add, &compensation_xxxx_add) diff --git a/pandas/_testing.py b/pandas/_testing.py index 73b1dcf31979f..224c8d540c6bb 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -6,11 +6,13 @@ import gzip import operator import os +from pathlib import Path +import random import re from shutil import rmtree import string import tempfile -from typing import Any, Callable, ContextManager, List, Optional, Type, Union, cast +from typing import IO, Any, Callable, ContextManager, List, Optional, Type, Union, cast import warnings import zipfile @@ -57,7 +59,7 @@ Series, bdate_range, ) -from pandas.core.algorithms import take_1d +from pandas.core.algorithms import safe_sort, take_1d from pandas.core.arrays import ( DatetimeArray, ExtensionArray, @@ -578,66 +580,48 @@ def close(fignum=None): @contextmanager -def ensure_clean(filename=None, return_filelike=False, **kwargs): +def ensure_clean(filename=None, return_filelike: bool = False, **kwargs: Any): """ Gets a temporary path and agrees to remove on close. + This implementation does not use tempfile.mkstemp to avoid having a file handle. + If the code using the returned path wants to delete the file itself, windows + requires that no program has a file handle to it. + Parameters ---------- filename : str (optional) - if None, creates a temporary file which is then removed when out of - scope. if passed, creates temporary file with filename as ending. + suffix of the created file. return_filelike : bool (default False) if True, returns a file-like which is *always* cleaned. Necessary for savefig and other functions which want to append extensions. **kwargs - Additional keywords passed in for creating a temporary file. - :meth:`tempFile.TemporaryFile` is used when `return_filelike` is ``True``. - :meth:`tempfile.mkstemp` is used when `return_filelike` is ``False``. - Note that the `filename` parameter will be passed in as the `suffix` - argument to either function. + Additional keywords are passed to open(). - See Also - -------- - tempfile.TemporaryFile - tempfile.mkstemp """ - filename = filename or "" - fd = None - - kwargs["suffix"] = filename - - if return_filelike: - f = tempfile.TemporaryFile(**kwargs) + folder = Path(tempfile.gettempdir()) - try: - yield f - finally: - f.close() - else: - # Don't generate tempfile if using a path with directory specified. 
- if len(os.path.dirname(filename)): - raise ValueError("Can't pass a qualified name to ensure_clean()") + if filename is None: + filename = "" + filename = ( + "".join(random.choices(string.ascii_letters + string.digits, k=30)) + filename + ) + path = folder / filename - try: - fd, filename = tempfile.mkstemp(**kwargs) - except UnicodeEncodeError: - import pytest + path.touch() - pytest.skip("no unicode file names on this system") + handle_or_str: Union[str, IO] = str(path) + if return_filelike: + kwargs.setdefault("mode", "w+b") + handle_or_str = open(path, **kwargs) - try: - yield filename - finally: - try: - os.close(fd) - except OSError: - print(f"Couldn't close file descriptor: {fd} (file: {filename})") - try: - if os.path.exists(filename): - os.remove(filename) - except OSError as e: - print(f"Exception on removing file: {e}") + try: + yield handle_or_str + finally: + if not isinstance(handle_or_str, str): + handle_or_str.close() + if path.is_file(): + path.unlink() @contextmanager @@ -804,8 +788,8 @@ def _get_ilevel_values(index, level): # If order doesn't matter then sort the index entries if not check_order: - left = left.sort_values() - right = right.sort_values() + left = Index(safe_sort(left)) + right = Index(safe_sort(right)) # MultiIndex special comparison for little-friendly error messages if left.nlevels > 1: @@ -1334,6 +1318,8 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + + .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. @@ -1578,6 +1564,8 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + + .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. rtol : float, default 1e-5 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1d411f3b1b287..58384405a5cab 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1981,7 +1981,13 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): elif is_integer_dtype(dtype): # We have to cast in order to be able to hold np.nan - dtype = np.float64 + + # int8, int16 are incompatible with float64, + # see https://github.com/cython/cython/issues/2646 + if arr.dtype.name in ["int8", "int16"]: + dtype = np.float32 + else: + dtype = np.float64 orig_ndim = arr.ndim if orig_ndim == 1: diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 6b28f8f135769..cb185dcf78f63 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -149,6 +149,85 @@ def __rpow__(self, other): return self._arith_method(other, roperator.rpow) +# ----------------------------------------------------------------------------- +# Helpers to implement __array_ufunc__ + + +def _is_aligned(frame, other): + """ + Helper to check if a DataFrame is aligned with another DataFrame or Series. + """ + from pandas import DataFrame + + if isinstance(other, DataFrame): + return frame._indexed_same(other) + else: + # Series -> match index + return frame.columns.equals(other.index) + + +def _maybe_fallback(ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): + """ + In the future DataFrame, inputs to ufuncs will be aligned before applying + the ufunc, but for now we ignore the index but raise a warning if behaviour + would change in the future. 
+
+    This helper detects the case where a warning is needed and then falls back
+    to applying the ufunc on arrays to avoid alignment.
+
+    See https://github.com/pandas-dev/pandas/pull/39239
+    """
+    from pandas import DataFrame
+    from pandas.core.generic import NDFrame
+
+    n_alignable = sum(isinstance(x, NDFrame) for x in inputs)
+    n_frames = sum(isinstance(x, DataFrame) for x in inputs)
+
+    if n_alignable >= 2 and n_frames >= 1:
+        # if there are 2 alignable inputs (Series or DataFrame), of which at least 1
+        # is a DataFrame -> we would have had no alignment before -> warn that this
+        # will align in the future
+
+        # the first frame is what determines the output index/columns in pandas < 1.2
+        first_frame = next(x for x in inputs if isinstance(x, DataFrame))
+
+        # check if the objects are aligned or not
+        non_aligned = sum(
+            not _is_aligned(first_frame, x) for x in inputs if isinstance(x, NDFrame)
+        )
+
+        # if at least one is not aligned -> warn and fall back to array behaviour
+        if non_aligned:
+            warnings.warn(
+                "Calling a ufunc on non-aligned DataFrames (or DataFrame/Series "
+                "combination). Currently, the indices are ignored and the result "
+                "takes the index/columns of the first DataFrame. In the future, "
+                "the DataFrames/Series will be aligned before applying the ufunc.\n"
+                "Convert one of the arguments to a NumPy array "
+                "(e.g. 'ufunc(df1, np.asarray(df2))') to keep the current behaviour, "
+                "or align manually (e.g. 'df1, df2 = df1.align(df2)') before passing to "
+                "the ufunc to obtain the future behaviour and silence this warning.",
+                FutureWarning,
+                stacklevel=4,
+            )
+
+            # keep the first dataframe of the inputs, other DataFrame/Series is
+            # converted to array for fallback behaviour
+            new_inputs = []
+            for x in inputs:
+                if x is first_frame:
+                    new_inputs.append(x)
+                elif isinstance(x, NDFrame):
+                    new_inputs.append(np.asarray(x))
+                else:
+                    new_inputs.append(x)
+
+            # call the ufunc on those transformed inputs
+            return getattr(ufunc, method)(*new_inputs, **kwargs)
+
+    # signal that we didn't fall back / execute the ufunc yet
+    return NotImplemented
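With this helper in place, a ufunc call on misaligned frames warns and falls back to positional matching, while converting one argument to an array bypasses the check because only one input is then alignable. A sketch (illustrative, mirroring the release-note example above):

    >>> import numpy as np
    >>> import pandas as pd
    >>> df1 = pd.DataFrame({"a": [1, 2]}, index=[0, 1])
    >>> df2 = pd.DataFrame({"a": [1, 2]}, index=[1, 2])
    >>> np.add(df1, df2)               # FutureWarning via _maybe_fallback, indices ignored
       a
    0  2
    1  4
    >>> np.add(df1, np.asarray(df2))   # no warning: only one alignable input
       a
    0  2
    1  4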
+
+
 def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
     """
     Compatibility with numpy ufuncs.
@@ -162,6 +241,11 @@ def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any)

     cls = type(self)

+    # for backwards compatibility check and potentially fallback for non-aligned frames
+    result = _maybe_fallback(ufunc, method, *inputs, **kwargs)
+    if result is not NotImplemented:
+        return result
+
     # for binary ops, use our custom dunder methods
     result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
     if result is not NotImplemented:
@@ -274,8 +358,14 @@ def reconstruct(result):
         result = getattr(ufunc, method)(*inputs, **kwargs)
     else:
         # ufunc(dataframe)
-        mgr = inputs[0]._mgr
-        result = mgr.apply(getattr(ufunc, method))
+        if method == "__call__":
+            # for np.<ufunc>(..) calls
+            mgr = inputs[0]._mgr
+            result = mgr.apply(getattr(ufunc, method))
+        else:
+            # otherwise specific ufunc methods (e.g. np.<ufunc>.accumulate(..))
+            # Those can have an axis keyword and thus can't be called block-by-block
+            result = getattr(ufunc, method)(np.asarray(inputs[0]), **kwargs)

     if ufunc.nout > 1:  # type: ignore[attr-defined]
         result = tuple(reconstruct(x) for x in result)
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index be9864731842d..2f2f8efc0c360 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -1621,6 +1621,17 @@ def floor(self, freq, ambiguous="raise", nonexistent="raise"):
     def ceil(self, freq, ambiguous="raise", nonexistent="raise"):
         return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent)

+    # --------------------------------------------------------------
+    # Reductions
+
+    def any(self, *, axis: Optional[int] = None, skipna: bool = True):
+        # GH#34479 discussion of desired behavior long-term
+        return nanops.nanany(self._ndarray, axis=axis, skipna=skipna, mask=self.isna())
+
+    def all(self, *, axis: Optional[int] = None, skipna: bool = True):
+        # GH#34479 discussion of desired behavior long-term
+        return nanops.nanall(self._ndarray, axis=axis, skipna=skipna, mask=self.isna())
+
     # --------------------------------------------------------------
     # Frequency Methods
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 184fbc050036b..7d3806fe11bd2 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -29,13 +29,12 @@
 except ImportError:
     pa = None
 else:
-    # our min supported version of pyarrow, 0.15.1, does not have a compute
-    # module
-    try:
+    # PyArrow backed StringArrays are available starting at 1.0.0, but this
+    # file is imported even when pyarrow is < 1.0.0, before pyarrow.compute
+    # and its compute functions existed. 
GH38801 + if LooseVersion(pa.__version__) >= "1.0.0": import pyarrow.compute as pc - except ImportError: - pass - else: + ARROW_CMP_FUNCS = { "eq": pc.equal, "ne": pc.not_equal, diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index d8b0ad739b056..73cf20979a8ad 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1382,7 +1382,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: return False try: dtype = get_dtype(arr_or_dtype) - except TypeError: + except (TypeError, ValueError): return False if isinstance(arr_or_dtype, CategoricalDtype): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e43edf1e6577e..8da3bae190f82 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -172,7 +172,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _hidden_attrs: FrozenSet[str] = frozenset(["get_values", "tshift"]) + _hidden_attrs: FrozenSet[str] = frozenset( + ["_AXIS_NAMES", "_AXIS_NUMBERS", "get_values", "tshift"] + ) _metadata: List[str] = [] _is_copy = None _mgr: BlockManager @@ -10887,8 +10889,10 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): # [assignment] cls.all = all # type: ignore[assignment] + # error: Argument 1 to "doc" has incompatible type "Optional[str]"; expected + # "Union[str, Callable[..., Any]]" @doc( - NDFrame.mad, + NDFrame.mad.__doc__, # type: ignore[arg-type] desc="Return the mean absolute deviation of the values " "over the requested axis.", name1=name1, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 23f0e178130be..1272ea7547209 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1606,12 +1606,11 @@ def sem(self, ddof: int = 1): if result.ndim == 1: result /= np.sqrt(self.count()) else: - cols = result.columns.get_indexer_for( - result.columns.difference(self.exclusions).unique() - ) - result.iloc[:, cols] = result.iloc[:, cols] / np.sqrt( - self.count().iloc[:, cols] - ) + cols = result.columns.difference(self.exclusions).unique() + counts = self.count() + result_ilocs = result.columns.get_indexer_for(cols) + count_ilocs = counts.columns.get_indexer_for(cols) + result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs]) return result @final diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 40ef7199406fe..17584ffc5b1bf 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -582,13 +582,8 @@ def indices(self): if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices - # Return a dictionary of {group label: [indices belonging to the group label]} - # respecting whether sort was specified - codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) - return { - category: np.flatnonzero(codes == i) - for i, category in enumerate(Index(uniques)) - } + values = Categorical(self.grouper) + return values._reverse_indexer() @property def codes(self) -> np.ndarray: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7724e3930f7df..b86d54024c62d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -53,6 +53,7 @@ is_timedelta64_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.generic import ABCCategoricalIndex from pandas.core.dtypes.missing import isna, maybe_fill import pandas.core.algorithms as algorithms @@ -244,6 +245,11 @@ def apply(self, f: F, data: 
FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ + if len(self.groupings) == 1 and isinstance( + self.result_index, ABCCategoricalIndex + ): + # This shows unused categories in indices GH#38642 + return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) @@ -537,7 +543,9 @@ def _ea_wrap_cython_operation( result = type(orig_values)._from_sequence(res_values) return result - raise NotImplementedError(values.dtype) + raise NotImplementedError( + f"function is not implemented for this dtype: {values.dtype}" + ) @final def _cython_operation( diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e7cf8cae28b88..94ddbbdf589d4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.common import ( is_array_like, + is_bool_dtype, is_hashable, is_integer, is_iterator, @@ -659,9 +660,9 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None): if self.ndim != 2: return - if isinstance(key, tuple) and not isinstance(self.obj.index, ABCMultiIndex): + if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc - # if index is not a MultiIndex, set key to column part + # if length of key is > 1 set key to column part key = key[column_axis] axis = column_axis @@ -1925,12 +1926,14 @@ def _ensure_iterable_column_indexer(self, column_indexer): """ Ensure that our column indexer is something that can be iterated over. """ - # Ensure we have something we can iterate over if is_integer(column_indexer): ilocs = [column_indexer] elif isinstance(column_indexer, slice): - ri = Index(range(len(self.obj.columns))) - ilocs = ri[column_indexer] + ilocs = np.arange(len(self.obj.columns))[column_indexer] + elif isinstance(column_indexer, np.ndarray) and is_bool_dtype( + column_indexer.dtype + ): + ilocs = np.arange(len(column_indexer))[column_indexer] else: ilocs = column_indexer return ilocs diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fe07823a80783..32aade97c8736 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2148,7 +2148,13 @@ def _can_hold_element(self, element: Any) -> bool: class DatetimeLikeBlockMixin(Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" - _can_hold_na = True + @property + def _holder(self): + return DatetimeArray + + @property + def fill_value(self): + return np.datetime64("NaT", "ns") def get_values(self, dtype=None): """ @@ -2216,8 +2222,10 @@ def to_native_types(self, na_rep="NaT", **kwargs): class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True - _holder = DatetimeArray - fill_value = np.datetime64("NaT", "ns") + + @property + def _can_hold_na(self): + return True def _maybe_coerce_values(self, values): """ @@ -2308,17 +2316,17 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): is_extension = True internal_values = Block.internal_values - - _holder = DatetimeBlock._holder _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types diff = DatetimeBlock.diff - fillna = DatetimeBlock.fillna # i.e. 
Block.fillna - fill_value = DatetimeBlock.fill_value - _can_hold_na = DatetimeBlock._can_hold_na + fill_value = np.datetime64("NaT", "ns") array_values = ExtensionBlock.array_values + @property + def _holder(self): + return DatetimeArray + def _maybe_coerce_values(self, values): """ Input validation for values passed to __init__. Ensure that @@ -2383,6 +2391,17 @@ def external_values(self): # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) + def fillna(self, value, limit=None, inplace=False, downcast=None): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + if self._can_hold_element(value): + return super().fillna(value, limit, inplace, downcast) + + # different timezones, or a non-tz + return self.astype(object).fillna( + value, limit=limit, inplace=inplace, downcast=downcast + ) + def quantile(self, qs, interpolation="linear", axis=0): naive = self.values.view("M8[ns]") @@ -2419,9 +2438,11 @@ def _check_ndim(self, values, ndim): return ndim -class TimeDeltaBlock(DatetimeLikeBlockMixin): +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () is_timedelta = True + _can_hold_na = True + is_numeric = False fill_value = np.timedelta64("NaT", "ns") def _maybe_coerce_values(self, values): @@ -2482,7 +2503,7 @@ class ObjectBlock(Block): _can_hold_na = True def _maybe_coerce_values(self, values): - if issubclass(values.dtype.type, str): + if issubclass(values.dtype.type, (str, bytes)): values = np.array(values, dtype=object) return values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 93ab207d8ce12..9aebacd740526 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1916,7 +1916,7 @@ def _consolidate(blocks): merged_blocks = _merge_blocks( list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate ) - new_blocks.extend(merged_blocks) + new_blocks = extend_blocks(merged_blocks, new_blocks) return new_blocks diff --git a/pandas/core/series.py b/pandas/core/series.py index 1449b78ee91d8..b4e8696ad9e13 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4622,6 +4622,15 @@ def isin(self, values) -> "Series": 4 True 5 False Name: animal, dtype: bool + + Strings and integers are distinct and are therefore not comparable: + + >>> pd.Series([1]).isin(['1']) + 0 False + dtype: bool + >>> pd.Series([1.1]).isin(['1.1']) + 0 False + dtype: bool """ result = algorithms.isin(self._values, values) return self._constructor(result, index=self.index).__finalize__( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 2713b76189157..ca12012ec135f 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -109,7 +109,7 @@ def wrapper(self, *args, **kwargs): def _map_and_wrap(name, docstring): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): - result = getattr(self._array, f"_str_{name}")() + result = getattr(self._data.array, f"_str_{name}")() return self._wrap_result(result) wrapper.__doc__ = docstring @@ -154,8 +154,7 @@ def __init__(self, data): self._inferred_dtype = self._validate(data) self._is_categorical = is_categorical_dtype(data.dtype) self._is_string = isinstance(data.dtype, StringDtype) - array = data.array - self._array = array + self._data = data self._index = self._name = None if isinstance(data, ABCSeries): @@ -219,7 +218,7 @@ def _validate(data): return inferred_dtype def __getitem__(self, 
key): - result = self._array._str_getitem(key) + result = self._data.array._str_getitem(key) return self._wrap_result(result) def __iter__(self): @@ -744,13 +743,13 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) @forbid_nonstring_types(["bytes"]) def split(self, pat=None, n=-1, expand=False): - result = self._array._str_split(pat, n, expand) + result = self._data.array._str_split(pat, n, expand) return self._wrap_result(result, returns_string=expand, expand=expand) @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) @forbid_nonstring_types(["bytes"]) def rsplit(self, pat=None, n=-1, expand=False): - result = self._array._str_rsplit(pat, n=n) + result = self._data.array._str_rsplit(pat, n=n) return self._wrap_result(result, expand=expand, returns_string=expand) _shared_docs[ @@ -846,7 +845,7 @@ def rsplit(self, pat=None, n=-1, expand=False): ) @forbid_nonstring_types(["bytes"]) def partition(self, sep=" ", expand=True): - result = self._array._str_partition(sep, expand) + result = self._data.array._str_partition(sep, expand) return self._wrap_result(result, expand=expand, returns_string=expand) @Appender( @@ -860,7 +859,7 @@ def partition(self, sep=" ", expand=True): ) @forbid_nonstring_types(["bytes"]) def rpartition(self, sep=" ", expand=True): - result = self._array._str_rpartition(sep, expand) + result = self._data.array._str_rpartition(sep, expand) return self._wrap_result(result, expand=expand, returns_string=expand) def get(self, i): @@ -914,7 +913,7 @@ def get(self, i): 5 None dtype: object """ - result = self._array._str_get(i) + result = self._data.array._str_get(i) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -980,7 +979,7 @@ def join(self, sep): 4 NaN dtype: object """ - result = self._array._str_join(sep) + result = self._data.array._str_join(sep) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1108,7 +1107,7 @@ def contains(self, pat, case=True, flags=0, na=None, regex=True): 4 False dtype: bool """ - result = self._array._str_contains(pat, case, flags, na, regex) + result = self._data.array._str_contains(pat, case, flags, na, regex) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1140,7 +1139,7 @@ def match(self, pat, case=True, flags=0, na=None): re.match. extract : Extract matched groups. """ - result = self._array._str_match(pat, case=case, flags=flags, na=na) + result = self._data.array._str_match(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1173,7 +1172,7 @@ def fullmatch(self, pat, case=True, flags=0, na=None): matches the regular expression. extract : Extract matched groups. 
""" - result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na) + result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -1309,7 +1308,7 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): ) warnings.warn(msg, FutureWarning, stacklevel=3) regex = True - result = self._array._str_replace( + result = self._data.array._str_replace( pat, repl, n=n, case=case, flags=flags, regex=regex ) return self._wrap_result(result) @@ -1355,7 +1354,7 @@ def repeat(self, repeats): 2 ccc dtype: object """ - result = self._array._str_repeat(repeats) + result = self._data.array._str_repeat(repeats) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1423,7 +1422,7 @@ def pad(self, width, side="left", fillchar=" "): msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - result = self._array._str_pad(width, side=side, fillchar=fillchar) + result = self._data.array._str_pad(width, side=side, fillchar=fillchar) return self._wrap_result(result) _shared_docs[ @@ -1597,7 +1596,7 @@ def slice(self, start=None, stop=None, step=None): 2 cm dtype: object """ - result = self._array._str_slice(start, stop, step) + result = self._data.array._str_slice(start, stop, step) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1673,7 +1672,7 @@ def slice_replace(self, start=None, stop=None, repl=None): 4 aXde dtype: object """ - result = self._array._str_slice_replace(start, stop, repl) + result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) def decode(self, encoding, errors="strict"): @@ -1699,7 +1698,7 @@ def decode(self, encoding, errors="strict"): else: decoder = codecs.getdecoder(encoding) f = lambda x: decoder(x, errors)[0] - arr = self._array + arr = self._data.array # assert isinstance(arr, (StringArray,)) result = arr._str_map(f) return self._wrap_result(result) @@ -1720,7 +1719,7 @@ def encode(self, encoding, errors="strict"): ------- encoded : Series/Index of objects """ - result = self._array._str_encode(encoding, errors) + result = self._data.array._str_encode(encoding, errors) return self._wrap_result(result, returns_string=False) _shared_docs[ @@ -1798,7 +1797,7 @@ def encode(self, encoding, errors="strict"): ) @forbid_nonstring_types(["bytes"]) def strip(self, to_strip=None): - result = self._array._str_strip(to_strip) + result = self._data.array._str_strip(to_strip) return self._wrap_result(result) @Appender( @@ -1807,7 +1806,7 @@ def strip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) def lstrip(self, to_strip=None): - result = self._array._str_lstrip(to_strip) + result = self._data.array._str_lstrip(to_strip) return self._wrap_result(result) @Appender( @@ -1816,7 +1815,7 @@ def lstrip(self, to_strip=None): ) @forbid_nonstring_types(["bytes"]) def rstrip(self, to_strip=None): - result = self._array._str_rstrip(to_strip) + result = self._data.array._str_rstrip(to_strip) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1875,7 +1874,7 @@ def wrap(self, width, **kwargs): 1 another line\nto be\nwrapped dtype: object """ - result = self._array._str_wrap(width, **kwargs) + result = self._data.array._str_wrap(width, **kwargs) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -1917,7 +1916,7 @@ def get_dummies(self, sep="|"): """ # we need to cast to Series of strings as only that has all # 
methods available for making the dummies... - result, name = self._array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep) return self._wrap_result( result, name=name, @@ -1944,7 +1943,7 @@ def translate(self, table): ------- Series or Index """ - result = self._array._str_translate(table) + result = self._data.array._str_translate(table) return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) @@ -2012,7 +2011,7 @@ def count(self, pat, flags=0): >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') """ - result = self._array._str_count(pat, flags) + result = self._data.array._str_count(pat, flags) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2069,7 +2068,7 @@ def startswith(self, pat, na=None): 3 False dtype: bool """ - result = self._array._str_startswith(pat, na=na) + result = self._data.array._str_startswith(pat, na=na) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2126,7 +2125,7 @@ def endswith(self, pat, na=None): 3 False dtype: bool """ - result = self._array._str_endswith(pat, na=na) + result = self._data.array._str_endswith(pat, na=na) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2219,7 +2218,7 @@ def findall(self, pat, flags=0): 2 [b, b] dtype: object """ - result = self._array._str_findall(pat, flags) + result = self._data.array._str_findall(pat, flags) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2426,7 +2425,7 @@ def find(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_find(sub, start, end) + result = self._data.array._str_find(sub, start, end) return self._wrap_result(result, returns_string=False) @Appender( @@ -2443,7 +2442,7 @@ def rfind(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_rfind(sub, start=start, end=end) + result = self._data.array._str_rfind(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) @@ -2463,7 +2462,7 @@ def normalize(self, form): ------- normalized : Series/Index of objects """ - result = self._array._str_normalize(form) + result = self._data.array._str_normalize(form) return self._wrap_result(result) _shared_docs[ @@ -2510,7 +2509,7 @@ def index(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_index(sub, start=start, end=end) + result = self._data.array._str_index(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) @Appender( @@ -2528,7 +2527,7 @@ def rindex(self, sub, start=0, end=None): msg = f"expected a string object, not {type(sub).__name__}" raise TypeError(msg) - result = self._array._str_rindex(sub, start=start, end=end) + result = self._data.array._str_rindex(sub, start=start, end=end) return self._wrap_result(result, returns_string=False) def len(self): @@ -2577,7 +2576,7 @@ def len(self): 5 3.0 dtype: float64 """ - result = self._array._str_len() + result = self._data.array._str_len() return self._wrap_result(result, returns_string=False) _shared_docs[ @@ -2677,37 +2676,37 @@ def len(self): @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) @forbid_nonstring_types(["bytes"]) def lower(self): - 
result = self._array._str_lower() + result = self._data.array._str_lower() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) @forbid_nonstring_types(["bytes"]) def upper(self): - result = self._array._str_upper() + result = self._data.array._str_upper() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["title"]) @forbid_nonstring_types(["bytes"]) def title(self): - result = self._array._str_title() + result = self._data.array._str_title() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) @forbid_nonstring_types(["bytes"]) def capitalize(self): - result = self._array._str_capitalize() + result = self._data.array._str_capitalize() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) @forbid_nonstring_types(["bytes"]) def swapcase(self): - result = self._array._str_swapcase() + result = self._data.array._str_swapcase() return self._wrap_result(result) @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) @forbid_nonstring_types(["bytes"]) def casefold(self): - result = self._array._str_casefold() + result = self._data.array._str_casefold() return self._wrap_result(result) _shared_docs[ diff --git a/pandas/io/common.py b/pandas/io/common.py index 64c5d3173fe0a..90622ef0c0f2c 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,13 +1,14 @@ """Common IO api utilities""" import bz2 +import codecs from collections import abc import dataclasses import gzip -from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper +from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOWrapper import mmap import os -from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union, cast from urllib.parse import ( urljoin, urlparse as parse_url, @@ -547,8 +548,7 @@ def get_handle( Returns the dataclass IOHandles """ # Windows does not default to utf-8. Set to utf-8 for a consistent behavior - if encoding is None: - encoding = "utf-8" + encoding_passed, encoding = encoding, encoding or "utf-8" # read_csv does not know whether the buffer is opened in binary/text mode if _is_binary_mode(path_or_buf, mode) and "b" not in mode: @@ -635,6 +635,9 @@ def get_handle( # Check whether the filename is to be opened in binary mode. # Binary mode does not support 'encoding' and 'newline'. 
if ioargs.encoding and "b" not in ioargs.mode: + if errors is None and encoding_passed is None: + # ignore errors when no encoding is specified + errors = "replace" # Encoding handle = open( handle, @@ -707,17 +710,36 @@ def __init__( archive_name: Optional[str] = None, **kwargs, ): - if mode in ["wb", "rb"]: - mode = mode.replace("b", "") + mode = mode.replace("b", "") self.archive_name = archive_name + self.multiple_write_buffer: Optional[Union[StringIO, BytesIO]] = None + kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} kwargs_zip.update(kwargs) + super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] def write(self, data): + # buffer multiple write calls, write on flush + if self.multiple_write_buffer is None: + self.multiple_write_buffer = ( + BytesIO() if isinstance(data, bytes) else StringIO() + ) + self.multiple_write_buffer.write(data) + + def flush(self) -> None: + # write to actual handle and close write buffer + if self.multiple_write_buffer is None or self.multiple_write_buffer.closed: + return + # ZipFile needs a non-empty string archive_name = self.archive_name or self.filename or "zip" - super().writestr(archive_name, data) + with self.multiple_write_buffer: + super().writestr(archive_name, self.multiple_write_buffer.getvalue()) + + def close(self): + self.flush() + super().close() @property def closed(self): @@ -823,9 +845,12 @@ def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: """Whether the handle is opened in binary mode""" + # classes that expect string but have 'b' in mode + text_classes = (codecs.StreamReaderWriter,) + if isinstance(handle, text_classes): + return False + # classes that expect bytes - binary_classes = [BufferedIOBase, RawIOBase] + binary_classes = (BufferedIOBase, RawIOBase) - return isinstance(handle, tuple(binary_classes)) or "b" in getattr( - handle, "mode", mode - ) + return isinstance(handle, binary_classes) or "b" in getattr(handle, "mode", mode) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 221e8b9ccfb14..5be8dbf152309 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1051,16 +1051,11 @@ def __init__( xlrd_version = LooseVersion(xlrd.__version__) - if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): - ext = inspect_excel_format( - content=path_or_buffer, storage_options=storage_options - ) - elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): ext = "xls" else: - # path_or_buffer is path-like, use stringified path ext = inspect_excel_format( - path=str(self._io), storage_options=storage_options + content=path_or_buffer, storage_options=storage_options ) if engine is None: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index db34b882a3c35..d0b821a3679bb 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1305,7 +1305,7 @@ def _format(x): if not is_float_type[i] and leading_space: fmt_values.append(f" {_format(v)}") elif is_float_type[i]: - fmt_values.append(float_format(v)) + fmt_values.append(_trim_zeros_single_float(float_format(v))) else: if leading_space is False: # False specifically, so that the default is @@ -1315,8 +1315,6 @@ def _format(x): tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) - fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") - return fmt_values @@ -1832,11 +1830,25 @@ def 
_trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[s return padded +def _trim_zeros_single_float(str_float: str) -> str: + """ + Trims trailing zeros after a decimal point, + leaving just one if necessary. + """ + str_float = str_float.rstrip("0") + if str_float.endswith("."): + str_float += "0" + + return str_float + + def _trim_zeros_float( str_floats: Union[np.ndarray, List[str]], decimal: str = "." ) -> List[str]: """ - Trims zeros, leaving just one before the decimal points if need be. + Trims the maximum number of trailing zeros equally from + all numbers containing decimals, leaving just one if + necessary. """ trimmed = str_floats number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$") diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index fcbf7ec3897fc..d99abbea90a51 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2288,7 +2288,11 @@ def __init__(self, f: Union[FilePathOrBuffer, List], **kwds): self._open_handles(f, kwds) assert self.handles is not None assert hasattr(self.handles.handle, "readline") - self._make_reader(self.handles.handle) + try: + self._make_reader(self.handles.handle) + except (csv.Error, UnicodeDecodeError): + self.close() + raise # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6f296d3c8d92f..b7fe630af90ef 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -13,7 +13,6 @@ import datetime from io import BytesIO import os -from pathlib import Path import struct import sys from typing import Any, AnyStr, Dict, List, Optional, Sequence, Tuple, Union, cast @@ -2462,8 +2461,8 @@ def write_file(self) -> None: if self.handles.compression["method"] is not None: # ZipFile creates a file (with the same name) for each write call. # Write it first into a buffer and then write the buffer to the ZipFile. - self._output_file = self.handles.handle - self.handles.handle = BytesIO() + self._output_file, self.handles.handle = self.handles.handle, BytesIO() + self.handles.created_handles.append(self.handles.handle) try: self._write_header( @@ -2484,20 +2483,21 @@ def write_file(self) -> None: self._write_value_labels() self._write_file_close_tag() self._write_map() - except Exception as exc: self._close() - if isinstance(self._fname, (str, Path)): + except Exception as exc: + self.handles.close() + if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile( + self._fname + ): try: os.unlink(self._fname) except OSError: warnings.warn( f"This save was not successful but {self._fname} could not " - "be deleted. This file is not valid.", + "be deleted. 
This file is not valid.", ResourceWarning, ) raise exc - else: - self._close() def _close(self) -> None: """ @@ -2509,11 +2509,8 @@ def _close(self) -> None: # write compression if self._output_file is not None: assert isinstance(self.handles.handle, BytesIO) - bio = self.handles.handle - bio.seek(0) - self.handles.handle = self._output_file - self.handles.handle.write(bio.read()) # type: ignore[arg-type] - bio.close() + bio, self.handles.handle = self.handles.handle, self._output_file + self.handles.handle.write(bio.getvalue()) # type: ignore[arg-type] def _write_map(self) -> None: """No-op, future compatibility""" diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 1a22e5629ebe8..00fd0efb48530 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1370,6 +1370,7 @@ def __init__(self, data, **kwargs): self.bar_width = kwargs.pop("width", 0.5) pos = kwargs.pop("position", 0.5) kwargs.setdefault("align", "center") + self.tick_pos = np.arange(len(data)) self.bottom = kwargs.pop("bottom", 0) self.left = kwargs.pop("left", 0) @@ -1392,16 +1393,7 @@ def __init__(self, data, **kwargs): self.tickoffset = self.bar_width * pos self.lim_offset = 0 - if isinstance(self.data.index, ABCMultiIndex): - if kwargs["ax"] is not None and kwargs["ax"].has_data(): - warnings.warn( - "Redrawing a bar plot with a MultiIndex is not supported " - + "and may lead to inconsistent label positions.", - UserWarning, - ) - self.ax_index = np.arange(len(data)) - else: - self.ax_index = self.data.index + self.ax_pos = self.tick_pos - self.tickoffset def _args_adjust(self): if is_list_like(self.bottom): @@ -1428,15 +1420,6 @@ def _make_plot(self): for i, (label, y) in enumerate(self._iter_data(fillna=0)): ax = self._get_ax(i) - - if self.orientation == "vertical": - ax.xaxis.update_units(self.ax_index) - self.tick_pos = ax.convert_xunits(self.ax_index).astype(np.int) - elif self.orientation == "horizontal": - ax.yaxis.update_units(self.ax_index) - self.tick_pos = ax.convert_yunits(self.ax_index).astype(np.int) - self.ax_pos = self.tick_pos - self.tickoffset - kwds = self.kwds.copy() if self._is_series: kwds["color"] = colors @@ -1508,8 +1491,8 @@ def _post_plot_logic(self, ax: "Axes", data): str_index = [pprint_thing(key) for key in range(data.shape[0])] name = self._get_index_name() - s_edge = self.ax_pos.min() - 0.25 + self.lim_offset - e_edge = self.ax_pos.max() + 0.25 + self.bar_width + self.lim_offset + s_edge = self.ax_pos[0] - 0.25 + self.lim_offset + e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset self._decorate_ticks(ax, name, str_index, s_edge, e_edge) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 19d80b714a674..128f505402eff 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -557,6 +557,11 @@ def test_is_bool_dtype(): assert com.is_bool_dtype("boolean") +def test_is_bool_dtype_numpy_error(): + # GH39010 + assert not com.is_bool_dtype("0 - Name") + + @pytest.mark.filterwarnings("ignore:'is_extension_type' is deprecated:FutureWarning") @pytest.mark.parametrize( "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 94d0ef7bbea84..c81304695f353 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -33,6 +33,22 @@ def test_groupby_extension_agg(self, as_index, 
data_for_grouping): expected = expected.reset_index() self.assert_frame_equal(result, expected) + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + + expected = df.iloc[[0, 2, 4, 7]] + expected = expected.set_index("A") + + result = df.groupby("A").agg({"B": "first"}) + self.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + self.assert_frame_equal(result, expected) + + result = df.groupby("A").first() + self.assert_frame_equal(result, expected) + def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 233b658d29782..08768bda312ba 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -197,6 +197,10 @@ class TestGroupby(BaseDecimal, base.BaseGroupbyTests): def test_groupby_apply_identity(self, data_for_grouping): super().test_groupby_apply_identity(data_for_grouping) + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") + def test_groupby_agg_extension(self, data_for_grouping): + super().test_groupby_agg_extension(data_for_grouping) + class TestSetitem(BaseDecimal, base.BaseSetitemTests): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 3a5e49796c53b..164a39498ec73 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -313,6 +313,10 @@ def test_groupby_extension_apply(self): def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") + def test_groupby_agg_extension(self, data_for_grouping): + super().test_groupby_agg_extension(data_for_grouping) + class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): def test_error(self, data, all_arithmetic_operators): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index ced7ea9261310..86a0bc9213256 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -291,6 +291,22 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): expected = expected.reset_index() self.assert_frame_equal(result, expected) + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + + expected = df.iloc[[0, 2, 4]] + expected = expected.set_index("A") + + result = df.groupby("A").agg({"B": "first"}) + self.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + self.assert_frame_equal(result, expected) + + result = df.groupby("A").first() + self.assert_frame_equal(result, expected) + def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 49eb570c4ffe0..32dfe5858d1d0 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ 
b/pandas/tests/frame/indexing/test_indexing.py @@ -1682,6 +1682,21 @@ def test_getitem_interval_index_partial_indexing(self): res = df.loc[:, 0.5] tm.assert_series_equal(res, expected) + @pytest.mark.parametrize("indexer", ["A", ["A"], ("A", slice(None))]) + def test_setitem_unsorted_multiindex_columns(self, indexer): + # GH#38601 + mi = MultiIndex.from_tuples([("A", 4), ("B", "3"), ("A", "2")]) + df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=mi) + obj = df.copy() + obj.loc[:, indexer] = np.zeros((2, 2), dtype=int) + expected = DataFrame([[0, 2, 0], [0, 5, 0]], columns=mi) + tm.assert_frame_equal(obj, expected) + + df = df.sort_index(1) + df.loc[:, indexer] = np.zeros((2, 2), dtype=int) + expected = expected.sort_index(1) + tm.assert_frame_equal(df, expected) + class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 19d2f8301037a..cedef4784e4a1 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -356,6 +356,13 @@ def test_setitem_listlike_views(self): expected = Series([100, 2, 3], name="a") tm.assert_series_equal(ser, expected) + def test_setitem_string_column_numpy_dtype_raising(self): + # GH#39010 + df = DataFrame([[1, 2], [3, 4]]) + df["0 - Name"] = [5, 6] + expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"]) + tm.assert_frame_equal(df, expected) + class TestDataFrameSetItemSlicing: def test_setitem_slice_position(self): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ab750bca7e069..1b570028964df 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1636,3 +1636,10 @@ def test_replace_unicode(self): result = df1.replace(columns_values_map) expected = DataFrame({"positive": np.ones(3)}) tm.assert_frame_equal(result, expected) + + def test_replace_bytes(self, frame_or_series): + # GH#38900 + obj = frame_or_series(["o"]).astype("|S") + expected = obj.copy() + obj = obj.replace({None: np.nan}) + tm.assert_equal(obj, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 157c8687808b3..a7e2fa760b7e4 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -1,7 +1,6 @@ from copy import deepcopy import inspect import pydoc -import warnings import numpy as np import pytest @@ -330,19 +329,17 @@ def test_set_flags(self, allows_duplicate_labels, frame_or_series): result.iloc[key] = 10 assert obj.iloc[key] == 0 - @skip_if_no("jinja2") def test_constructor_expanddim_lookup(self): # GH#33628 accessing _constructor_expanddim should not # raise NotImplementedError df = DataFrame() - with warnings.catch_warnings(record=True) as wrn: - # _AXIS_NUMBERS, _AXIS_NAMES lookups - inspect.getmembers(df) - - # some versions give FutureWarning, others DeprecationWarning - assert len(wrn) - assert any(x.category in [FutureWarning, DeprecationWarning] for x in wrn) - with pytest.raises(NotImplementedError, match="Not supported for DataFrames!"): df._constructor_expanddim(np.arange(27).reshape(3, 3, 3)) + + @skip_if_no("jinja2") + def test_inspect_getmembers(self): + # GH38740 + df = DataFrame() + with tm.assert_produces_warning(None): + inspect.getmembers(df) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index d33d91f2cefca..d843d4b0e9504 100644 --- a/pandas/tests/frame/test_reductions.py 
+++ b/pandas/tests/frame/test_reductions.py @@ -1091,9 +1091,13 @@ def test_any_all_bool_only(self): (np.all, {"A": Series([0, 1], dtype=int)}, False), (np.any, {"A": Series([0, 1], dtype=int)}, True), pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False), + pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False), pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True), + pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True), + pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True), + pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False), pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py index 81c0dc65b4e97..83fd3db72a90c 100644 --- a/pandas/tests/frame/test_ufunc.py +++ b/pandas/tests/frame/test_ufunc.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -70,12 +72,19 @@ def test_binary_input_aligns_columns(dtype_a, dtype_b): dtype_b["C"] = dtype_b.pop("B") df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b) - result = np.heaviside(df1, df2) - expected = np.heaviside( - np.array([[1, 3, np.nan], [2, 4, np.nan]]), - np.array([[1, np.nan, 3], [2, np.nan, 4]]), - ) - expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"]) + with tm.assert_produces_warning(FutureWarning): + result = np.heaviside(df1, df2) + # Expected future behaviour: + # expected = np.heaviside( + # np.array([[1, 3, np.nan], [2, 4, np.nan]]), + # np.array([[1, np.nan, 3], [2, np.nan, 4]]), + # ) + # expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"]) + expected = pd.DataFrame([[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + # ensure the expected is the same when applying with numpy array + result = np.heaviside(df1, df2.values) tm.assert_frame_equal(result, expected) @@ -85,27 +94,149 @@ def test_binary_input_aligns_index(dtype): pytest.xfail(reason="Extension / mixed with multiple inputs not implemented.") df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).astype(dtype) df2 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "c"]).astype(dtype) - result = np.heaviside(df1, df2) - expected = np.heaviside( - np.array([[1, 3], [3, 4], [np.nan, np.nan]]), - np.array([[1, 3], [np.nan, np.nan], [3, 4]]), + with tm.assert_produces_warning(FutureWarning): + result = np.heaviside(df1, df2) + # Expected future behaviour: + # expected = np.heaviside( + # np.array([[1, 3], [3, 4], [np.nan, np.nan]]), + # np.array([[1, 3], [np.nan, np.nan], [3, 4]]), + # ) + # # TODO(FloatArray): this will be Float64Dtype. + # expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"]) + expected = pd.DataFrame( + [[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"], index=["a", "b"] ) - # TODO(FloatArray): this will be Float64Dtype. 
- expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"]) tm.assert_frame_equal(result, expected) + # ensure the expected is the same when applying with numpy array + result = np.heaviside(df1, df2.values) + tm.assert_frame_equal(result, expected) + +@pytest.mark.filterwarnings("ignore:Calling a ufunc on non-aligned:FutureWarning") def test_binary_frame_series_raises(): # We don't currently implement df = pd.DataFrame({"A": [1, 2]}) - with pytest.raises(NotImplementedError, match="logaddexp"): + # with pytest.raises(NotImplementedError, match="logaddexp"): + with pytest.raises(ValueError, match=""): np.logaddexp(df, df["A"]) - with pytest.raises(NotImplementedError, match="logaddexp"): + # with pytest.raises(NotImplementedError, match="logaddexp"): + with pytest.raises(ValueError, match=""): np.logaddexp(df["A"], df) +def test_unary_accumulate_axis(): + # https://github.com/pandas-dev/pandas/issues/39259 + df = pd.DataFrame({"a": [1, 3, 2, 4]}) + result = np.maximum.accumulate(df) + expected = pd.DataFrame({"a": [1, 3, 3, 4]}) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"a": [1, 3, 2, 4], "b": [0.1, 4.0, 3.0, 2.0]}) + result = np.maximum.accumulate(df) + # in theory could preserve int dtype for default axis=0 + expected = pd.DataFrame({"a": [1.0, 3.0, 3.0, 4.0], "b": [0.1, 4.0, 4.0, 4.0]}) + tm.assert_frame_equal(result, expected) + + result = np.maximum.accumulate(df, axis=0) + tm.assert_frame_equal(result, expected) + + result = np.maximum.accumulate(df, axis=1) + expected = pd.DataFrame({"a": [1.0, 3.0, 2.0, 4.0], "b": [1.0, 4.0, 3.0, 4.0]}) + tm.assert_frame_equal(result, expected) + + def test_frame_outer_deprecated(): df = pd.DataFrame({"A": [1, 2]}) with tm.assert_produces_warning(FutureWarning): np.subtract.outer(df, df) + + +def test_alignment_deprecation(): + # https://github.com/pandas-dev/pandas/issues/39184 + df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = pd.DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) + s1 = pd.Series([1, 2], index=["a", "b"]) + s2 = pd.Series([1, 2], index=["b", "c"]) + + # binary dataframe / dataframe + expected = pd.DataFrame({"a": [2, 4, 6], "b": [8, 10, 12]}) + + with tm.assert_produces_warning(None): + # aligned -> no warning! + result = np.add(df1, df1) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + # non-aligned -> warns + result = np.add(df1, df2) + tm.assert_frame_equal(result, expected) + + result = np.add(df1, df2.values) + tm.assert_frame_equal(result, expected) + + result = np.add(df1.values, df2) + expected = pd.DataFrame({"b": [2, 4, 6], "c": [8, 10, 12]}) + tm.assert_frame_equal(result, expected) + + # binary dataframe / series + expected = pd.DataFrame({"a": [2, 3, 4], "b": [6, 7, 8]}) + + with tm.assert_produces_warning(None): + # aligned -> no warning! 
+ result = np.add(df1, s1) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + result = np.add(df1, s2) + tm.assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + result = np.add(s2, df1) + tm.assert_frame_equal(result, expected) + + result = np.add(df1, s2.values) + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("numba", "0.46.0") +def test_alignment_deprecation_many_inputs(): + # https://github.com/pandas-dev/pandas/issues/39184 + # test that the deprecation also works with > 2 inputs -> using a numba + # written ufunc for this because numpy itself doesn't have such ufuncs + from numba import float64, vectorize + + @vectorize([float64(float64, float64, float64)]) + def my_ufunc(x, y, z): + return x + y + z + + df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df2 = pd.DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) + df3 = pd.DataFrame({"a": [1, 2, 3], "c": [4, 5, 6]}) + + with tm.assert_produces_warning(FutureWarning): + result = my_ufunc(df1, df2, df3) + expected = pd.DataFrame([[3.0, 12.0], [6.0, 15.0], [9.0, 18.0]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + # all aligned -> no warning + with tm.assert_produces_warning(None): + result = my_ufunc(df1, df1, df1) + tm.assert_frame_equal(result, expected) + + # mixed frame / arrays + with tm.assert_produces_warning(FutureWarning): + result = my_ufunc(df1, df2, df3.values) + tm.assert_frame_equal(result, expected) + + # single frame -> no warning + with tm.assert_produces_warning(None): + result = my_ufunc(df1, df2.values, df3.values) + tm.assert_frame_equal(result, expected) + + # takes indices of first frame + with tm.assert_produces_warning(FutureWarning): + result = my_ufunc(df1.values, df2, df3) + expected = expected.set_axis(["b", "c"], axis=1) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8cf77ca6335f4..f0bc58cbf07bf 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1678,3 +1678,23 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( df_grp = df.groupby(["a", "b"], observed=observed) result = getattr(df_grp, func)() tm.assert_frame_equal(result, expected) + + +def test_groupby_categorical_indices_unused_categories(): + # GH#38642 + df = DataFrame( + { + "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]), + "col": range(3), + } + ) + grouped = df.groupby("key", sort=False) + result = grouped.indices + expected = { + "b": np.array([0, 1], dtype="int64"), + "a": np.array([2], dtype="int64"), + "c": np.array([], dtype="int64"), + } + assert result.keys() == expected.keys() + for key in result.keys(): + tm.assert_numpy_array_equal(result[key], expected[key]) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7c179a79513fa..a260aaf6e057d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -842,6 +842,14 @@ def test_omit_nuisance(df): grouped.agg(lambda x: x.sum(0, numeric_only=False)) +def test_omit_nuisance_sem(df): + # GH 38774 - sem should work with nuisance columns + grouped = df.groupby("A") + result = grouped.sem() + expected = df.loc[:, ["A", "C", "D"]].groupby("A").sem() + tm.assert_frame_equal(result, expected) + + def test_omit_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1689,64 +1697,6 @@ def 
test_sort(x): g.apply(test_sort) -def test_group_shift_with_null_key(): - # This test is designed to replicate the segfault in issue #13813. - n_rows = 1200 - - # Generate a moderately large dataframe with occasional missing - # values in column `B`, and then group by [`A`, `B`]. This should - # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partially missing. - df = DataFrame( - [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1) - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_with_fill_value(): - # GH #24128 - n_rows = 24 - df = DataFrame( - [(i % 12, i % 3, i) for i in range(n_rows)], - dtype=float, - columns=["A", "B", "Z"], - index=None, - ) - g = df.groupby(["A", "B"]) - - expected = DataFrame( - [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], - dtype=float, - columns=["Z"], - index=None, - ) - result = g.shift(-1, fill_value=0)[["Z"]] - - tm.assert_frame_equal(result, expected) - - -def test_group_shift_lose_timezone(): - # GH 30134 - now_dt = Timestamp.utcnow() - df = DataFrame({"a": [1, 1], "date": now_dt}) - result = df.groupby("a").shift(0).iloc[0] - expected = Series({"date": now_dt}, name=result.name) - tm.assert_series_equal(result, expected) - - def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = DataFrame( diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py new file mode 100644 index 0000000000000..1410038274152 --- /dev/null +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -0,0 +1,106 @@ +import numpy as np +import pytest + +from pandas import DataFrame, NaT, Series, Timedelta, Timestamp +import pandas._testing as tm + + +def test_group_shift_with_null_key(): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g.grouper.group_info` exactly + # at those places, where the group-by key is partially missing. 
+ df = DataFrame( + [(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1) + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_with_fill_value(): + # GH #24128 + n_rows = 24 + df = DataFrame( + [(i % 12, i % 3, i) for i in range(n_rows)], + dtype=float, + columns=["A", "B", "Z"], + index=None, + ) + g = df.groupby(["A", "B"]) + + expected = DataFrame( + [(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)], + dtype=float, + columns=["Z"], + index=None, + ) + result = g.shift(-1, fill_value=0)[["Z"]] + + tm.assert_frame_equal(result, expected) + + +def test_group_shift_lose_timezone(): + # GH 30134 + now_dt = Timestamp.utcnow() + df = DataFrame({"a": [1, 1], "date": now_dt}) + result = df.groupby("a").shift(0).iloc[0] + expected = Series({"date": now_dt}, name=result.name) + tm.assert_series_equal(result, expected) + + +def test_group_diff_real(any_real_dtype): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_dtype) + result = df.groupby("a")["b"].diff() + exp_dtype = "float" + if any_real_dtype in ["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [ + Timestamp("2013-01-01"), + Timestamp("2013-01-02"), + Timestamp("2013-01-03"), + ], + [Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")], + ], +) +def test_group_diff_datetimelike(data): + df = DataFrame({"a": [1, 2, 2], "b": data}) + result = df.groupby("a")["b"].diff() + expected = Series([NaT, NaT, Timedelta("1 days")], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_bool(): + df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]}) + result = df.groupby("a")["b"].diff() + expected = Series([np.nan, np.nan, np.nan, False, False], name="b") + tm.assert_series_equal(result, expected) + + +def test_group_diff_object_raises(object_dtype): + df = DataFrame( + {"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype + ) + with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"): + df.groupby("a")["b"].diff() diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 42525fc575397..f381a3b205e8c 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -305,6 +305,21 @@ def test_multiindex_one_dimensional_tuple_columns(self, indexer): expected = DataFrame([0, 2], index=mi) tm.assert_frame_equal(obj, expected) + @pytest.mark.parametrize( + "indexer, exp_value", [(slice(None), 1.0), ((1, 2), np.nan)] + ) + def test_multiindex_setitem_columns_enlarging(self, indexer, exp_value): + # GH#39147 + mi = MultiIndex.from_tuples([(1, 2), (3, 4)]) + df = DataFrame([[1, 2], [3, 4]], index=mi, columns=["a", "b"]) + df.loc[indexer, ["c", "d"]] = 1.0 + expected = DataFrame( + [[1, 2, 1.0, 1.0], [3, 4, exp_value, exp_value]], + index=mi, + columns=["a", "b", "c", "d"], + ) + tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize( "indexer, pos", diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 
df1250cee8b00..8b1a96f694e71 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -657,6 +657,22 @@ def test_read_from_s3_url(self, read_ext, s3_resource, s3so): local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) + def test_read_from_s3_object(self, read_ext, s3_resource, s3so): + # GH 38788 + # Bucket "pandas-test" created in tests/io/conftest.py + with open("test1" + read_ext, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) + + import s3fs + + s3 = s3fs.S3FileSystem(**s3so) + + with s3.open("s3://pandas-test/test1" + read_ext) as f: + url_table = pd.read_excel(f) + + local_table = pd.read_excel("test1" + read_ext) + tm.assert_frame_equal(url_table, local_table) + @pytest.mark.slow def test_read_from_file_url(self, read_ext, datapath): diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 197738330efe1..6a2ac2f6003d7 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -492,7 +492,7 @@ def test_float_types(self, np_type, path): @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, np_type, path): - # Test np.bool values read come back as float. + # Test np.bool8 and np.bool_ values read come back as float. df = DataFrame([1, 0, True, False], dtype=np_type) df.to_excel(path, "test1") @@ -657,30 +657,27 @@ def test_excel_date_datetime_format(self, engine, ext, path): ) with tm.ensure_clean(ext) as filename2: - writer1 = ExcelWriter(path) - writer2 = ExcelWriter( + with ExcelWriter(path) as writer1: + df.to_excel(writer1, "test1") + + with ExcelWriter( filename2, date_format="DD.MM.YYYY", datetime_format="DD.MM.YYYY HH-MM-SS", - ) - - df.to_excel(writer1, "test1") - df.to_excel(writer2, "test1") - - writer1.close() - writer2.close() + ) as writer2: + df.to_excel(writer2, "test1") - reader1 = ExcelFile(path) - reader2 = ExcelFile(filename2) + with ExcelFile(path) as reader1: + rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) - rs1 = pd.read_excel(reader1, sheet_name="test1", index_col=0) - rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + with ExcelFile(filename2) as reader2: + rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) - tm.assert_frame_equal(rs1, rs2) + tm.assert_frame_equal(rs1, rs2) - # Since the reader returns a datetime object for dates, - # we need to use df_expected to check the result. - tm.assert_frame_equal(rs2, df_expected) + # Since the reader returns a datetime object for dates, + # we need to use df_expected to check the result. 
+ tm.assert_frame_equal(rs2, df_expected) def test_to_excel_interval_no_labels(self, path): # see gh-19242 @@ -862,7 +859,7 @@ def test_to_excel_unicode_filename(self, ext, path): f = open(filename, "wb") except UnicodeEncodeError: pytest.skip("No unicode file names on this system") - else: + finally: f.close() df = DataFrame( @@ -872,15 +869,15 @@ def test_to_excel_unicode_filename(self, ext, path): ) df.to_excel(filename, "test1", float_format="%.2f") - reader = ExcelFile(filename) - result = pd.read_excel(reader, sheet_name="test1", index_col=0) + with ExcelFile(filename) as reader: + result = pd.read_excel(reader, sheet_name="test1", index_col=0) - expected = DataFrame( - [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], - index=["A", "B"], - columns=["X", "Y", "Z"], - ) - tm.assert_frame_equal(result, expected) + expected = DataFrame( + [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], + index=["A", "B"], + columns=["X", "Y", "Z"], + ) + tm.assert_frame_equal(result, expected) # FIXME: dont leave commented-out # def test_to_excel_header_styling_xls(self, engine, ext): @@ -1374,8 +1371,8 @@ def test_excelfile_fspath(self): with tm.ensure_clean("foo.xlsx") as path: df = DataFrame({"A": [1, 2]}) df.to_excel(path) - xl = ExcelFile(path) - result = os.fspath(xl) + with ExcelFile(path) as xl: + result = os.fspath(xl) assert result == path def test_excelwriter_fspath(self): diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index fe85849c6dcca..b0b07045a9156 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2002,6 +2002,25 @@ def test_float_trim_zeros(self): assert ("+10" in line) or skip skip = False + @pytest.mark.parametrize( + "data, expected", + [ + (["3.50"], "0 3.50\ndtype: object"), + ([1.20, "1.00"], "0 1.2\n1 1.00\ndtype: object"), + ([np.nan], "0 NaN\ndtype: float64"), + ([None], "0 None\ndtype: object"), + (["3.50", np.nan], "0 3.50\n1 NaN\ndtype: object"), + ([3.50, np.nan], "0 3.5\n1 NaN\ndtype: float64"), + ([3.50, np.nan, "3.50"], "0 3.5\n1 NaN\n2 3.50\ndtype: object"), + ([3.50, None, "3.50"], "0 3.5\n1 None\n2 3.50\ndtype: object"), + ], + ) + def test_repr_str_float_truncation(self, data, expected): + # GH#38708 + series = Series(data) + result = repr(series) + assert result == expected + def test_dict_entries(self): df = DataFrame({"A": [{"a": 1, "b": 2}]}) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index a9673ded7c377..ef4de5961a696 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -545,12 +545,12 @@ def test_to_csv_zip_arguments(self, compression, archive_name): df.to_csv( path, compression={"method": compression, "archive_name": archive_name} ) - zp = ZipFile(path) - expected_arcname = path if archive_name is None else archive_name - expected_arcname = os.path.basename(expected_arcname) - assert len(zp.filelist) == 1 - archived_file = os.path.basename(zp.filelist[0].filename) - assert archived_file == expected_arcname + with ZipFile(path) as zp: + expected_arcname = path if archive_name is None else archive_name + expected_arcname = os.path.basename(expected_arcname) + assert len(zp.filelist) == 1 + archived_file = os.path.basename(zp.filelist[0].filename) + assert archived_file == expected_arcname @pytest.mark.parametrize("df_new_type", ["Int64"]) def test_to_csv_na_rep_long_string(self, df_new_type): @@ -640,3 +640,25 @@ def test_to_csv_encoding_binary_handle(self, mode): 
handle.seek(0) assert handle.read().startswith(b'\xef\xbb\xbf""') + + +def test_to_csv_iterative_compression_name(compression): + # GH 38714 + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + df.to_csv(path, compression=compression, chunksize=1) + tm.assert_frame_equal( + pd.read_csv(path, compression=compression, index_col=0), df + ) + + +def test_to_csv_iterative_compression_buffer(compression): + # GH 38714 + df = tm.makeDataFrame() + with io.BytesIO() as buffer: + df.to_csv(buffer, compression=compression, chunksize=1) + buffer.seek(0) + tm.assert_frame_equal( + pd.read_csv(buffer, compression=compression, index_col=0), df + ) + assert not buffer.closed diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index e8893b4c02238..ec098353960d7 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -97,6 +97,33 @@ def python_parser_only(request): return request.param +def _get_all_parser_float_precision_combinations(): + """ + Return all allowable parser and float precision + combinations and corresponding ids. + """ + params = [] + ids = [] + for parser, parser_id in zip(_all_parsers, _all_parser_ids): + for precision in parser.float_precision_choices: + params.append((parser, precision)) + ids.append(f"{parser_id}-{precision}") + + return {"params": params, "ids": ids} + + +@pytest.fixture( + params=_get_all_parser_float_precision_combinations()["params"], + ids=_get_all_parser_float_precision_combinations()["ids"], +) +def all_parsers_all_precisions(request): + """ + Fixture for all allowable combinations of parser + and float precision + """ + return request.param + + _utf_values = [8, 16, 32] _encoding_seps = ["", "-", "_"] diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index c8ed0d75b13a2..8871ea7205a46 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -8,13 +8,16 @@ from inspect import signature from io import BytesIO, StringIO import os +from pathlib import Path import platform from urllib.error import URLError +import warnings import numpy as np import pytest from pandas._libs.tslib import Timestamp +from pandas.compat import is_platform_linux from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td @@ -1258,15 +1261,14 @@ def test_float_parser(all_parsers): tm.assert_frame_equal(result, expected) -def test_scientific_no_exponent(all_parsers): +def test_scientific_no_exponent(all_parsers_all_precisions): # see gh-12215 df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) data = df.to_csv(index=False) - parser = all_parsers + parser, precision = all_parsers_all_precisions - for precision in parser.float_precision_choices: - df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) - tm.assert_frame_equal(df_roundtrip, df) + df_roundtrip = parser.read_csv(StringIO(data), float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) @pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) @@ -1350,6 +1352,35 @@ def test_numeric_range_too_wide(all_parsers, exp_data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("neg_exp", [-617, -100000, -99999999999999999]) +def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{neg_exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) 
+ expected = DataFrame({"data": [0.0]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) +def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): + # GH#38753 + parser, precision = all_parsers_all_precisions + data = f"data\n10E{exp}" + result = parser.read_csv(StringIO(data), float_precision=precision) + if precision == "round_trip": + if exp == 999999999999999999 and is_platform_linux(): + mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") + request.node.add_marker(mark) + + value = np.inf if exp > 0 else 0.0 + expected = DataFrame({"data": [value]}) + else: + expected = DataFrame({"data": [f"10E{exp}"]}) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("iterator", [True, False]) def test_empty_with_nrows_chunksize(all_parsers, iterator): # see gh-9535 @@ -2340,3 +2371,22 @@ def test_context_manageri_user_provided(all_parsers, datapath): assert False except AssertionError: assert not reader._engine.handles.handle.closed + + +@td.check_file_leaks +def test_open_file(all_parsers): + # GH 39024 + parser = all_parsers + if parser.engine == "c": + pytest.skip() + + with tm.ensure_clean() as path: + file = Path(path) + file.write_bytes(b"\xe4\na\n1") + + # should not trigger a ResourceWarning + warnings.simplefilter("always", category=ResourceWarning) + with warnings.catch_warnings(record=True) as record: + with pytest.raises(csv.Error, match="Could not determine delimiter"): + parser.read_csv(file, sep=None) + assert len(record) == 0, record[0].message diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 97f82b9a01a9a..11e14ac61a831 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -208,6 +208,7 @@ def test_read_s3_fails(self, s3so): with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") + @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) def test_write_s3_csv_fails(self, tips_df, s3so): # GH 32486 # Attempting to write to an invalid S3 path should raise @@ -223,6 +224,7 @@ def test_write_s3_csv_fails(self, tips_df, s3so): "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so ) + @pytest.mark.xfail(reason="GH#39155 s3fs upgrade", strict=False) @td.skip_if_no("pyarrow") def test_write_s3_parquet_fails(self, tips_df, s3so): # GH 27679 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 34cb00e89ea0c..80e2b36764ba0 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -1,6 +1,7 @@ """ Tests for the pandas.io.common functionalities """ +import codecs from io import StringIO import mmap import os @@ -418,3 +419,27 @@ def test_is_fsspec_url(): assert not icom.is_fsspec_url("random:pandas/somethingelse.com") assert not icom.is_fsspec_url("/local/path") assert not icom.is_fsspec_url("relative/local/path") + + +def test_default_errors(): + # GH 38989 + with tm.ensure_clean() as path: + file = Path(path) + file.write_bytes(b"\xe4\na\n1") + tm.assert_frame_equal(pd.read_csv(file, skiprows=[0]), pd.DataFrame({"a": [1]})) + + +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +@pytest.mark.parametrize("format", ["csv", "json"]) +def test_codecs_encoding(encoding, format): + # GH39247 + expected = tm.makeDataFrame() + with tm.ensure_clean() as path: + with codecs.open(path, mode="w", encoding=encoding) as handle: + getattr(expected, f"to_{format}")(handle) + with 
codecs.open(path, mode="r", encoding=encoding) as handle: + if format == "csv": + df = pd.read_csv(handle, index_col=0) + else: + df = pd.read_json(handle) + tm.assert_frame_equal(expected, df) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index ba8b1a8a0679d..aed1aaedf2fa3 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -129,6 +129,7 @@ def test_to_html_compat(self): res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0] tm.assert_frame_equal(res, df) + @pytest.mark.xfail(reason="Html file was removed") @tm.network def test_banklist_url_positional_match(self): url = "https://www.fdic.gov/bank/individual/failed/banklist.html" @@ -142,6 +143,7 @@ def test_banklist_url_positional_match(self): assert_framelist_equal(df1, df2) + @pytest.mark.xfail(reason="Html file was removed") @tm.network def test_banklist_url(self): url = "https://www.fdic.gov/bank/individual/failed/banklist.html" diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 99e7c3061d670..a9357ef89de92 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -880,7 +880,7 @@ def test_timezone_aware_index(self, pa, timezone_aware_date_list): # this use-case sets the resolution to 1 minute check_round_trip(df, pa, check_dtype=False) - @td.skip_if_no("pyarrow", min_version="0.17") + @td.skip_if_no("pyarrow", min_version="1.0.0") def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 df = pd.DataFrame({"a": list(range(0, 3))}) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 15f65a9ce46c6..16d4bc65094f8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1194,7 +1194,7 @@ def test_sql_open_close(self): @pytest.mark.skipif(SQLALCHEMY_INSTALLED, reason="SQLAlchemy is installed") def test_con_string_import_error(self): - conn = "mysql://root@localhost/pandas_nosetest" + conn = "mysql://root@localhost/pandas" msg = "Using URI string without sqlalchemy installed" with pytest.raises(ImportError, match=msg): sql.read_sql("SELECT * FROM iris", conn) @@ -1931,11 +1931,12 @@ class _TestMySQLAlchemy: """ flavor = "mysql" + port = 3306 @classmethod def connect(cls): return sqlalchemy.create_engine( - f"mysql+{cls.driver}://root@localhost/pandas_nosetest", + f"mysql+{cls.driver}://root@localhost:{cls.port}/pandas", connect_args=cls.connect_args, ) @@ -2000,11 +2001,12 @@ class _TestPostgreSQLAlchemy: """ flavor = "postgresql" + port = 5432 @classmethod def connect(cls): return sqlalchemy.create_engine( - f"postgresql+{cls.driver}://postgres@localhost/pandas_nosetest" + f"postgresql+{cls.driver}://postgres:postgres@localhost:{cls.port}/pandas" ) @classmethod @@ -2620,7 +2622,7 @@ class TestXMySQL(MySQLMixIn): @pytest.fixture(autouse=True, scope="class") def setup_class(cls): pymysql = pytest.importorskip("pymysql") - pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas") try: pymysql.connect(read_default_group="pandas") except pymysql.ProgrammingError as err: @@ -2640,7 +2642,7 @@ def setup_class(cls): @pytest.fixture(autouse=True) def setup_method(self, request, datapath): pymysql = pytest.importorskip("pymysql") - pymysql.connect(host="localhost", user="root", passwd="", db="pandas_nosetest") + pymysql.connect(host="localhost", user="root", passwd="", db="pandas") try: pymysql.connect(read_default_group="pandas") except 
pymysql.ProgrammingError as err: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 24944281419c3..0f9321fd4f96a 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -550,6 +550,7 @@ def test_invalid_timestamp(self, version): msg = "time_stamp should be datetime type" with pytest.raises(ValueError, match=msg): original.to_stata(path, time_stamp=time_stamp, version=version) + assert not os.path.isfile(path) def test_numeric_column_names(self): original = DataFrame(np.reshape(np.arange(25.0), (5, 5))) @@ -1916,10 +1917,10 @@ def test_compression_dict(method, file_ext): compression = {"method": method, "archive_name": archive_name} df.to_stata(path, compression=compression) if method == "zip" or file_ext == "zip": - zp = zipfile.ZipFile(path, "r") - assert len(zp.filelist) == 1 - assert zp.filelist[0].filename == archive_name - fp = io.BytesIO(zp.read(zp.filelist[0])) + with zipfile.ZipFile(path, "r") as zp: + assert len(zp.filelist) == 1 + assert zp.filelist[0].filename == archive_name + fp = io.BytesIO(zp.read(zp.filelist[0])) else: fp = path reread = read_stata(fp, index_col="index") diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index c66334065ea63..68e693cdb85e2 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -2176,80 +2176,6 @@ def test_xlabel_ylabel_dataframe_plane_plot(self, kind, xlabel, ylabel): assert ax.get_xlabel() == (xcol if xlabel is None else xlabel) assert ax.get_ylabel() == (ycol if ylabel is None else ylabel) - @pytest.mark.parametrize("method", ["bar", "barh"]) - def test_bar_ticklabel_consistence(self, method): - # Draw two consecutiv bar plot with consistent ticklabels - # The labels positions should not move between two drawing on the same axis - # GH: 26186 - def get_main_axis(ax): - if method == "barh": - return ax.yaxis - elif method == "bar": - return ax.xaxis - - # Plot the first bar plot - data = {"A": 0, "B": 3, "C": -4} - df = DataFrame.from_dict(data, orient="index", columns=["Value"]) - ax = getattr(df.plot, method)() - ax.get_figure().canvas.draw() - - # Retrieve the label positions for the first drawing - xticklabels = [t.get_text() for t in get_main_axis(ax).get_ticklabels()] - label_positions_1 = dict(zip(xticklabels, get_main_axis(ax).get_ticklocs())) - - # Modify the dataframe order and values and plot on same axis - df = df.sort_values("Value") * -2 - ax = getattr(df.plot, method)(ax=ax, color="red") - ax.get_figure().canvas.draw() - - # Retrieve the label positions for the second drawing - xticklabels = [t.get_text() for t in get_main_axis(ax).get_ticklabels()] - label_positions_2 = dict(zip(xticklabels, get_main_axis(ax).get_ticklocs())) - - # Assert that the label positions did not change between the plotting - assert label_positions_1 == label_positions_2 - - def test_bar_numeric(self): - # Bar plot with numeric index have tick location values equal to index - # values - # GH: 11465 - df = DataFrame(np.random.rand(10), index=np.arange(10, 20)) - ax = df.plot.bar() - ticklocs = ax.xaxis.get_ticklocs() - expected = np.arange(10, 20, dtype=np.int64) - tm.assert_numpy_array_equal(ticklocs, expected) - - def test_bar_multiindex(self): - # Test from pandas/doc/source/user_guide/visualization.rst - # at section Plotting With Error Bars - # Related to issue GH: 26186 - - ix3 = pd.MultiIndex.from_arrays( - [ - ["a", "a", "a", "a", "b", "b", "b", "b"], - ["foo", "foo", "bar", "bar", "foo", 
"foo", "bar", "bar"], - ], - names=["letter", "word"], - ) - - df3 = DataFrame( - {"data1": [3, 2, 4, 3, 2, 4, 3, 2], "data2": [6, 5, 7, 5, 4, 5, 6, 5]}, - index=ix3, - ) - - # Group by index labels and take the means and standard deviations - # for each group - gp3 = df3.groupby(level=("letter", "word")) - means = gp3.mean() - errors = gp3.std() - - # No assertion we just ensure that we can plot a MultiIndex bar plot - # and are getting a UserWarning if redrawing - with tm.assert_produces_warning(None): - ax = means.plot.bar(yerr=errors, capsize=4) - with tm.assert_produces_warning(UserWarning): - means.plot.bar(yerr=errors, capsize=4, ax=ax) - def _generate_4_axes_via_gridspec(): import matplotlib as mpl diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 8c2297699807d..94afa204db891 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -17,6 +17,7 @@ Timedelta, TimedeltaIndex, Timestamp, + date_range, isna, timedelta_range, to_timedelta, @@ -923,6 +924,48 @@ def test_any_axis1_bool_only(self): expected = Series([True, False]) tm.assert_series_equal(result, expected) + def test_any_all_datetimelike(self): + # GH#38723 these may not be the desired long-term behavior (GH#34479) + # but in the interim should be internally consistent + dta = date_range("1995-01-02", periods=3)._data + ser = Series(dta) + df = DataFrame(ser) + + assert dta.all() + assert dta.any() + + assert ser.all() + assert ser.any() + + assert df.any().all() + assert df.all().all() + + dta = dta.tz_localize("UTC") + ser = Series(dta) + df = DataFrame(ser) + + assert dta.all() + assert dta.any() + + assert ser.all() + assert ser.any() + + assert df.any().all() + assert df.all().all() + + tda = dta - dta[0] + ser = Series(tda) + df = DataFrame(ser) + + assert tda.any() + assert not tda.all() + + assert ser.any() + assert not ser.all() + + assert df.any().all() + assert not df.all().any() + def test_timedelta64_analytics(self): # index min/max diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 2ea7602b00206..20de0effc30e1 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -575,6 +575,40 @@ def test_nat_comparisons_invalid(other, op): op(other, NaT) +def test_compare_date(): + # GH#39151 comparing NaT with date object is deprecated + # See also: tests.scalar.timestamps.test_comparisons::test_compare_date + + dt = Timestamp.now().to_pydatetime().date() + + for left, right in [(NaT, dt), (dt, NaT)]: + assert not left == right + assert left != right + + with tm.assert_produces_warning(FutureWarning): + assert not left < right + with tm.assert_produces_warning(FutureWarning): + assert not left <= right + with tm.assert_produces_warning(FutureWarning): + assert not left > right + with tm.assert_produces_warning(FutureWarning): + assert not left >= right + + # Once the deprecation is enforced, the following assertions + # can be enabled: + # assert not left == right + # assert left != right + # + # with pytest.raises(TypeError): + # left < right + # with pytest.raises(TypeError): + # left <= right + # with pytest.raises(TypeError): + # left > right + # with pytest.raises(TypeError): + # left >= right + + @pytest.mark.parametrize( "obj", [ diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index aaa58cdb390f7..c2219e9fd45a6 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ 
diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py
index aaa58cdb390f7..c2219e9fd45a6 100644
--- a/pandas/tests/series/methods/test_fillna.py
+++ b/pandas/tests/series/methods/test_fillna.py
@@ -1,4 +1,4 @@
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone

 import numpy as np
 import pytest
@@ -13,6 +13,7 @@
     Series,
     Timedelta,
     Timestamp,
+    date_range,
     isna,
 )
 import pandas._testing as tm
@@ -711,6 +712,14 @@ def test_fillna_method_and_limit_invalid(self):
             with pytest.raises(ValueError, match=msg):
                 ser.fillna(1, limit=limit, method=method)

+    def test_fillna_datetime64_with_timezone_tzinfo(self):
+        # https://github.com/pandas-dev/pandas/issues/38851
+        s = Series(date_range("2020", periods=3, tz="UTC"))
+        expected = s.astype(object)
+        s[1] = NaT
+        result = s.fillna(datetime(2020, 1, 2, tzinfo=timezone.utc))
+        tm.assert_series_equal(result, expected)
+

 class TestFillnaPad:
     def test_fillna_bug(self):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index ae01093fbadbf..ac97ff7af262d 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -2409,3 +2409,10 @@ def test_diff_ea_axis(self):
         msg = "cannot diff DatetimeArray on axis=1"
         with pytest.raises(ValueError, match=msg):
             algos.diff(dta, 1, axis=1)
+
+    @pytest.mark.parametrize("dtype", ["int8", "int16"])
+    def test_diff_low_precision_int(self, dtype):
+        arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
+        result = algos.diff(arr, 1)
+        expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 538a52d84b73a..a15b2d03079d4 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -3670,3 +3670,11 @@ def test_str_get_stringarray_multiple_nans():
     result = s.str.get(2)
     expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"]))
     tm.assert_series_equal(result, expected)
+
+
+def test_str_accessor_in_apply_func():
+    # https://github.com/pandas-dev/pandas/issues/38979
+    df = DataFrame(zip("abc", "def"))
+    expected = Series(["A/D", "B/E", "C/F"])
+    result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py
index 8034ace479a62..bf80a1410e7d9 100644
--- a/pandas/tests/util/test_assert_frame_equal.py
+++ b/pandas/tests/util/test_assert_frame_equal.py
@@ -299,3 +299,9 @@ def test_allows_duplicate_labels():
     with pytest.raises(AssertionError, match="=5.7.5 pip blosc
diff --git a/setup.py b/setup.py
index a25fe95e025b3..f9c4a1158fee0 100755
--- a/setup.py
+++ b/setup.py
@@ -435,7 +435,7 @@ def run(self):
             "MACOSX_DEPLOYMENT_TARGET", current_system
         )
         if (
-            LooseVersion(python_target) < "10.9"
+            LooseVersion(str(python_target)) < "10.9"
             and LooseVersion(current_system) >= "10.9"
         ):
             os.environ["MACOSX_DEPLOYMENT_TARGET"] = "10.9"
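The `test_str_accessor_in_apply_func` addition covers GH#38979: rows handed to an `axis=1` apply are object-dtype Series, so the `.str` accessor has to keep working on them. The same behavior outside the test harness:

    # Rows seen by an axis=1 apply are object-dtype Series, so .str applies.
    import pandas as pd

    df = pd.DataFrame(zip("abc", "def"))  # two object columns, 0 and 1
    out = df.apply(lambda row: "/".join(row.str.upper()), axis=1)
    print(out.tolist())  # ['A/D', 'B/E', 'C/F']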